diff options
author | Jonathan Gray <jsg@cvs.openbsd.org> | 2018-10-23 05:47:28 +0000 |
---|---|---|
committer | Jonathan Gray <jsg@cvs.openbsd.org> | 2018-10-23 05:47:28 +0000 |
commit | f6666e4c3977a5d74f3da7464672ea48e44dff4b (patch) | |
tree | 21df237f03b2aabc094d514d3d8f0d13c4f00f4e /lib/mesa/src/gallium/drivers/r600 | |
parent | 6f93ffbe5713be116f31de00cb562a09606a779c (diff) |
Import Mesa 17.3.9
Diffstat (limited to 'lib/mesa/src/gallium/drivers/r600')
21 files changed, 13564 insertions, 1 deletions
diff --git a/lib/mesa/src/gallium/drivers/r600/cayman_msaa.c b/lib/mesa/src/gallium/drivers/r600/cayman_msaa.c new file mode 100644 index 000000000..6bc307a4b --- /dev/null +++ b/lib/mesa/src/gallium/drivers/r600/cayman_msaa.c @@ -0,0 +1,270 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: Marek Olšák <maraeo@gmail.com> + * + */ + +#include "r600_cs.h" +#include "evergreend.h" + +/* 2xMSAA + * There are two locations (4, 4), (-4, -4). */ +const uint32_t eg_sample_locs_2x[4] = { + FILL_SREG(4, 4, -4, -4, 4, 4, -4, -4), + FILL_SREG(4, 4, -4, -4, 4, 4, -4, -4), + FILL_SREG(4, 4, -4, -4, 4, 4, -4, -4), + FILL_SREG(4, 4, -4, -4, 4, 4, -4, -4), +}; +const unsigned eg_max_dist_2x = 4; +/* 4xMSAA + * There are 4 locations: (-2, 6), (6, -2), (-6, 2), (2, 6). */ +const uint32_t eg_sample_locs_4x[4] = { + FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6), + FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6), + FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6), + FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6), +}; +const unsigned eg_max_dist_4x = 6; + +/* Cayman 8xMSAA */ +static const uint32_t cm_sample_locs_8x[] = { + FILL_SREG( 1, -3, -1, 3, 5, 1, -3, -5), + FILL_SREG( 1, -3, -1, 3, 5, 1, -3, -5), + FILL_SREG( 1, -3, -1, 3, 5, 1, -3, -5), + FILL_SREG( 1, -3, -1, 3, 5, 1, -3, -5), + FILL_SREG(-5, 5, -7, -1, 3, 7, 7, -7), + FILL_SREG(-5, 5, -7, -1, 3, 7, 7, -7), + FILL_SREG(-5, 5, -7, -1, 3, 7, 7, -7), + FILL_SREG(-5, 5, -7, -1, 3, 7, 7, -7), +}; +static const unsigned cm_max_dist_8x = 8; +/* Cayman 16xMSAA */ +static const uint32_t cm_sample_locs_16x[] = { + FILL_SREG( 1, 1, -1, -3, -3, 2, 4, -1), + FILL_SREG( 1, 1, -1, -3, -3, 2, 4, -1), + FILL_SREG( 1, 1, -1, -3, -3, 2, 4, -1), + FILL_SREG( 1, 1, -1, -3, -3, 2, 4, -1), + FILL_SREG(-5, -2, 2, 5, 5, 3, 3, -5), + FILL_SREG(-5, -2, 2, 5, 5, 3, 3, -5), + FILL_SREG(-5, -2, 2, 5, 5, 3, 3, -5), + FILL_SREG(-5, -2, 2, 5, 5, 3, 3, -5), + FILL_SREG(-2, 6, 0, -7, -4, -6, -6, 4), + FILL_SREG(-2, 6, 0, -7, -4, -6, -6, 4), + FILL_SREG(-2, 6, 0, -7, -4, -6, -6, 4), + FILL_SREG(-2, 6, 0, -7, -4, -6, -6, 4), + FILL_SREG(-8, 0, 7, -4, 6, 7, -7, -8), + FILL_SREG(-8, 0, 7, -4, 6, 7, -7, -8), + FILL_SREG(-8, 0, 7, -4, 6, 7, -7, -8), + FILL_SREG(-8, 0, 7, -4, 6, 7, -7, -8), +}; +static const unsigned cm_max_dist_16x = 8; + +void cayman_get_sample_position(struct pipe_context *ctx, unsigned sample_count, + unsigned sample_index, float *out_value) +{ + int offset, index; + struct { + int idx:4; + } val; + switch (sample_count) { + case 1: + default: + out_value[0] = out_value[1] = 0.5; + break; + case 2: + offset = 4 * (sample_index * 2); + val.idx = (eg_sample_locs_2x[0] >> offset) & 0xf; + out_value[0] = (float)(val.idx + 8) / 16.0f; + val.idx = (eg_sample_locs_2x[0] >> (offset + 4)) & 0xf; + out_value[1] = (float)(val.idx + 8) / 16.0f; + break; + case 4: + offset = 4 * (sample_index * 2); + val.idx = (eg_sample_locs_4x[0] >> offset) & 0xf; + out_value[0] = (float)(val.idx + 8) / 16.0f; + val.idx = (eg_sample_locs_4x[0] >> (offset + 4)) & 0xf; + out_value[1] = (float)(val.idx + 8) / 16.0f; + break; + case 8: + offset = 4 * (sample_index % 4 * 2); + index = (sample_index / 4) * 4; + val.idx = (cm_sample_locs_8x[index] >> offset) & 0xf; + out_value[0] = (float)(val.idx + 8) / 16.0f; + val.idx = (cm_sample_locs_8x[index] >> (offset + 4)) & 0xf; + out_value[1] = (float)(val.idx + 8) / 16.0f; + break; + case 16: + offset = 4 * (sample_index % 4 * 2); + index = (sample_index / 4) * 4; + val.idx = (cm_sample_locs_16x[index] >> offset) & 0xf; + out_value[0] = (float)(val.idx + 8) / 16.0f; + val.idx = (cm_sample_locs_16x[index] >> (offset + 4)) & 0xf; + out_value[1] = (float)(val.idx + 8) / 16.0f; + break; + } +} + +void cayman_init_msaa(struct pipe_context *ctx) +{ + struct r600_common_context *rctx = (struct r600_common_context*)ctx; + int i; + + cayman_get_sample_position(ctx, 1, 0, rctx->sample_locations_1x[0]); + + for (i = 0; i < 2; i++) + cayman_get_sample_position(ctx, 2, i, rctx->sample_locations_2x[i]); + for (i = 0; i < 4; i++) + cayman_get_sample_position(ctx, 4, i, rctx->sample_locations_4x[i]); + for (i = 0; i < 8; i++) + cayman_get_sample_position(ctx, 8, i, rctx->sample_locations_8x[i]); + for (i = 0; i < 16; i++) + cayman_get_sample_position(ctx, 16, i, rctx->sample_locations_16x[i]); +} + +void cayman_emit_msaa_sample_locs(struct radeon_winsys_cs *cs, int nr_samples) +{ + switch (nr_samples) { + default: + case 1: + radeon_set_context_reg(cs, CM_R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 0); + radeon_set_context_reg(cs, CM_R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, 0); + radeon_set_context_reg(cs, CM_R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, 0); + radeon_set_context_reg(cs, CM_R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, 0); + break; + case 2: + radeon_set_context_reg(cs, CM_R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, eg_sample_locs_2x[0]); + radeon_set_context_reg(cs, CM_R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, eg_sample_locs_2x[1]); + radeon_set_context_reg(cs, CM_R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, eg_sample_locs_2x[2]); + radeon_set_context_reg(cs, CM_R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, eg_sample_locs_2x[3]); + break; + case 4: + radeon_set_context_reg(cs, CM_R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, eg_sample_locs_4x[0]); + radeon_set_context_reg(cs, CM_R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, eg_sample_locs_4x[1]); + radeon_set_context_reg(cs, CM_R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, eg_sample_locs_4x[2]); + radeon_set_context_reg(cs, CM_R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, eg_sample_locs_4x[3]); + break; + case 8: + radeon_set_context_reg_seq(cs, CM_R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 14); + radeon_emit(cs, cm_sample_locs_8x[0]); + radeon_emit(cs, cm_sample_locs_8x[4]); + radeon_emit(cs, 0); + radeon_emit(cs, 0); + radeon_emit(cs, cm_sample_locs_8x[1]); + radeon_emit(cs, cm_sample_locs_8x[5]); + radeon_emit(cs, 0); + radeon_emit(cs, 0); + radeon_emit(cs, cm_sample_locs_8x[2]); + radeon_emit(cs, cm_sample_locs_8x[6]); + radeon_emit(cs, 0); + radeon_emit(cs, 0); + radeon_emit(cs, cm_sample_locs_8x[3]); + radeon_emit(cs, cm_sample_locs_8x[7]); + break; + case 16: + radeon_set_context_reg_seq(cs, CM_R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 16); + radeon_emit(cs, cm_sample_locs_16x[0]); + radeon_emit(cs, cm_sample_locs_16x[4]); + radeon_emit(cs, cm_sample_locs_16x[8]); + radeon_emit(cs, cm_sample_locs_16x[12]); + radeon_emit(cs, cm_sample_locs_16x[1]); + radeon_emit(cs, cm_sample_locs_16x[5]); + radeon_emit(cs, cm_sample_locs_16x[9]); + radeon_emit(cs, cm_sample_locs_16x[13]); + radeon_emit(cs, cm_sample_locs_16x[2]); + radeon_emit(cs, cm_sample_locs_16x[6]); + radeon_emit(cs, cm_sample_locs_16x[10]); + radeon_emit(cs, cm_sample_locs_16x[14]); + radeon_emit(cs, cm_sample_locs_16x[3]); + radeon_emit(cs, cm_sample_locs_16x[7]); + radeon_emit(cs, cm_sample_locs_16x[11]); + radeon_emit(cs, cm_sample_locs_16x[15]); + break; + } +} + +void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples, + int ps_iter_samples, int overrast_samples, + unsigned sc_mode_cntl_1) +{ + int setup_samples = nr_samples > 1 ? nr_samples : + overrast_samples > 1 ? overrast_samples : 0; + /* Required by OpenGL line rasterization. + * + * TODO: We should also enable perpendicular endcaps for AA lines, + * but that requires implementing line stippling in the pixel + * shader. SC can only do line stippling with axis-aligned + * endcaps. + */ + unsigned sc_line_cntl = S_028BDC_DX10_DIAMOND_TEST_ENA(1); + + if (setup_samples > 1) { + /* indexed by log2(nr_samples) */ + unsigned max_dist[] = { + 0, + eg_max_dist_2x, + eg_max_dist_4x, + cm_max_dist_8x, + cm_max_dist_16x + }; + unsigned log_samples = util_logbase2(setup_samples); + unsigned log_ps_iter_samples = + util_logbase2(util_next_power_of_two(ps_iter_samples)); + + radeon_set_context_reg_seq(cs, CM_R_028BDC_PA_SC_LINE_CNTL, 2); + radeon_emit(cs, sc_line_cntl | + S_028BDC_EXPAND_LINE_WIDTH(1)); /* CM_R_028BDC_PA_SC_LINE_CNTL */ + radeon_emit(cs, S_028BE0_MSAA_NUM_SAMPLES(log_samples) | + S_028BE0_MAX_SAMPLE_DIST(max_dist[log_samples]) | + S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples)); /* CM_R_028BE0_PA_SC_AA_CONFIG */ + + if (nr_samples > 1) { + radeon_set_context_reg(cs, CM_R_028804_DB_EQAA, + S_028804_MAX_ANCHOR_SAMPLES(log_samples) | + S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) | + S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) | + S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples) | + S_028804_HIGH_QUALITY_INTERSECTIONS(1) | + S_028804_STATIC_ANCHOR_ASSOCIATIONS(1)); + radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, + EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1) | + sc_mode_cntl_1); + } else if (overrast_samples > 1) { + radeon_set_context_reg(cs, CM_R_028804_DB_EQAA, + S_028804_HIGH_QUALITY_INTERSECTIONS(1) | + S_028804_STATIC_ANCHOR_ASSOCIATIONS(1) | + S_028804_OVERRASTERIZATION_AMOUNT(log_samples)); + radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, + sc_mode_cntl_1); + } + } else { + radeon_set_context_reg_seq(cs, CM_R_028BDC_PA_SC_LINE_CNTL, 2); + radeon_emit(cs, sc_line_cntl); /* CM_R_028BDC_PA_SC_LINE_CNTL */ + radeon_emit(cs, 0); /* CM_R_028BE0_PA_SC_AA_CONFIG */ + + radeon_set_context_reg(cs, CM_R_028804_DB_EQAA, + S_028804_HIGH_QUALITY_INTERSECTIONS(1) | + S_028804_STATIC_ANCHOR_ASSOCIATIONS(1)); + radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, + sc_mode_cntl_1); + } +} diff --git a/lib/mesa/src/gallium/drivers/r600/r600_buffer_common.c b/lib/mesa/src/gallium/drivers/r600/r600_buffer_common.c new file mode 100644 index 000000000..a6e3b7fcf --- /dev/null +++ b/lib/mesa/src/gallium/drivers/r600/r600_buffer_common.c @@ -0,0 +1,685 @@ +/* + * Copyright 2013 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Marek Olšák + */ + +#include "r600_cs.h" +#include "util/u_memory.h" +#include "util/u_upload_mgr.h" +#include <inttypes.h> +#include <stdio.h> + +bool r600_rings_is_buffer_referenced(struct r600_common_context *ctx, + struct pb_buffer *buf, + enum radeon_bo_usage usage) +{ + if (ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, buf, usage)) { + return true; + } + if (radeon_emitted(ctx->dma.cs, 0) && + ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, buf, usage)) { + return true; + } + return false; +} + +void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx, + struct r600_resource *resource, + unsigned usage) +{ + enum radeon_bo_usage rusage = RADEON_USAGE_READWRITE; + bool busy = false; + + assert(!(resource->flags & RADEON_FLAG_SPARSE)); + + if (usage & PIPE_TRANSFER_UNSYNCHRONIZED) { + return ctx->ws->buffer_map(resource->buf, NULL, usage); + } + + if (!(usage & PIPE_TRANSFER_WRITE)) { + /* have to wait for the last write */ + rusage = RADEON_USAGE_WRITE; + } + + if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) && + ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, + resource->buf, rusage)) { + if (usage & PIPE_TRANSFER_DONTBLOCK) { + ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + return NULL; + } else { + ctx->gfx.flush(ctx, 0, NULL); + busy = true; + } + } + if (radeon_emitted(ctx->dma.cs, 0) && + ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, + resource->buf, rusage)) { + if (usage & PIPE_TRANSFER_DONTBLOCK) { + ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + return NULL; + } else { + ctx->dma.flush(ctx, 0, NULL); + busy = true; + } + } + + if (busy || !ctx->ws->buffer_wait(resource->buf, 0, rusage)) { + if (usage & PIPE_TRANSFER_DONTBLOCK) { + return NULL; + } else { + /* We will be wait for the GPU. Wait for any offloaded + * CS flush to complete to avoid busy-waiting in the winsys. */ + ctx->ws->cs_sync_flush(ctx->gfx.cs); + if (ctx->dma.cs) + ctx->ws->cs_sync_flush(ctx->dma.cs); + } + } + + /* Setting the CS to NULL will prevent doing checks we have done already. */ + return ctx->ws->buffer_map(resource->buf, NULL, usage); +} + +void r600_init_resource_fields(struct r600_common_screen *rscreen, + struct r600_resource *res, + uint64_t size, unsigned alignment) +{ + struct r600_texture *rtex = (struct r600_texture*)res; + + res->bo_size = size; + res->bo_alignment = alignment; + res->flags = 0; + res->texture_handle_allocated = false; + res->image_handle_allocated = false; + + switch (res->b.b.usage) { + case PIPE_USAGE_STREAM: + res->flags = RADEON_FLAG_GTT_WC; + /* fall through */ + case PIPE_USAGE_STAGING: + /* Transfers are likely to occur more often with these + * resources. */ + res->domains = RADEON_DOMAIN_GTT; + break; + case PIPE_USAGE_DYNAMIC: + /* Older kernels didn't always flush the HDP cache before + * CS execution + */ + if (rscreen->info.drm_major == 2 && + rscreen->info.drm_minor < 40) { + res->domains = RADEON_DOMAIN_GTT; + res->flags |= RADEON_FLAG_GTT_WC; + break; + } + /* fall through */ + case PIPE_USAGE_DEFAULT: + case PIPE_USAGE_IMMUTABLE: + default: + /* Not listing GTT here improves performance in some + * apps. */ + res->domains = RADEON_DOMAIN_VRAM; + res->flags |= RADEON_FLAG_GTT_WC; + break; + } + + if (res->b.b.target == PIPE_BUFFER && + res->b.b.flags & (PIPE_RESOURCE_FLAG_MAP_PERSISTENT | + PIPE_RESOURCE_FLAG_MAP_COHERENT)) { + /* Use GTT for all persistent mappings with older + * kernels, because they didn't always flush the HDP + * cache before CS execution. + * + * Write-combined CPU mappings are fine, the kernel + * ensures all CPU writes finish before the GPU + * executes a command stream. + */ + if (rscreen->info.drm_major == 2 && + rscreen->info.drm_minor < 40) + res->domains = RADEON_DOMAIN_GTT; + } + + /* Tiled textures are unmappable. Always put them in VRAM. */ + if ((res->b.b.target != PIPE_BUFFER && !rtex->surface.is_linear) || + res->flags & R600_RESOURCE_FLAG_UNMAPPABLE) { + res->domains = RADEON_DOMAIN_VRAM; + res->flags |= RADEON_FLAG_NO_CPU_ACCESS | + RADEON_FLAG_GTT_WC; + } + + /* Only displayable single-sample textures can be shared between + * processes. */ + if (res->b.b.target == PIPE_BUFFER || + res->b.b.nr_samples >= 2 || + (rtex->surface.micro_tile_mode != RADEON_MICRO_MODE_DISPLAY && + /* Raven doesn't use display micro mode for 32bpp, so check this: */ + !(res->b.b.bind & PIPE_BIND_SCANOUT))) + res->flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING; + + /* If VRAM is just stolen system memory, allow both VRAM and + * GTT, whichever has free space. If a buffer is evicted from + * VRAM to GTT, it will stay there. + * + * DRM 3.6.0 has good BO move throttling, so we can allow VRAM-only + * placements even with a low amount of stolen VRAM. + */ + if (!rscreen->info.has_dedicated_vram && + (rscreen->info.drm_major < 3 || rscreen->info.drm_minor < 6) && + res->domains == RADEON_DOMAIN_VRAM) { + res->domains = RADEON_DOMAIN_VRAM_GTT; + res->flags &= ~RADEON_FLAG_NO_CPU_ACCESS; /* disallowed with VRAM_GTT */ + } + + if (rscreen->debug_flags & DBG_NO_WC) + res->flags &= ~RADEON_FLAG_GTT_WC; + + if (res->b.b.bind & PIPE_BIND_SHARED) + res->flags |= RADEON_FLAG_NO_SUBALLOC; + + /* Set expected VRAM and GART usage for the buffer. */ + res->vram_usage = 0; + res->gart_usage = 0; + + if (res->domains & RADEON_DOMAIN_VRAM) + res->vram_usage = size; + else if (res->domains & RADEON_DOMAIN_GTT) + res->gart_usage = size; +} + +bool r600_alloc_resource(struct r600_common_screen *rscreen, + struct r600_resource *res) +{ + struct pb_buffer *old_buf, *new_buf; + + /* Allocate a new resource. */ + new_buf = rscreen->ws->buffer_create(rscreen->ws, res->bo_size, + res->bo_alignment, + res->domains, res->flags); + if (!new_buf) { + return false; + } + + /* Replace the pointer such that if res->buf wasn't NULL, it won't be + * NULL. This should prevent crashes with multiple contexts using + * the same buffer where one of the contexts invalidates it while + * the others are using it. */ + old_buf = res->buf; + res->buf = new_buf; /* should be atomic */ + + if (rscreen->info.has_virtual_memory) + res->gpu_address = rscreen->ws->buffer_get_virtual_address(res->buf); + else + res->gpu_address = 0; + + pb_reference(&old_buf, NULL); + + util_range_set_empty(&res->valid_buffer_range); + + /* Print debug information. */ + if (rscreen->debug_flags & DBG_VM && res->b.b.target == PIPE_BUFFER) { + fprintf(stderr, "VM start=0x%"PRIX64" end=0x%"PRIX64" | Buffer %"PRIu64" bytes\n", + res->gpu_address, res->gpu_address + res->buf->size, + res->buf->size); + } + return true; +} + +static void r600_buffer_destroy(struct pipe_screen *screen, + struct pipe_resource *buf) +{ + struct r600_resource *rbuffer = r600_resource(buf); + + threaded_resource_deinit(buf); + util_range_destroy(&rbuffer->valid_buffer_range); + pb_reference(&rbuffer->buf, NULL); + FREE(rbuffer); +} + +static bool +r600_invalidate_buffer(struct r600_common_context *rctx, + struct r600_resource *rbuffer) +{ + /* Shared buffers can't be reallocated. */ + if (rbuffer->b.is_shared) + return false; + + /* Sparse buffers can't be reallocated. */ + if (rbuffer->flags & RADEON_FLAG_SPARSE) + return false; + + /* In AMD_pinned_memory, the user pointer association only gets + * broken when the buffer is explicitly re-allocated. + */ + if (rbuffer->b.is_user_ptr) + return false; + + /* Check if mapping this buffer would cause waiting for the GPU. */ + if (r600_rings_is_buffer_referenced(rctx, rbuffer->buf, RADEON_USAGE_READWRITE) || + !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) { + rctx->invalidate_buffer(&rctx->b, &rbuffer->b.b); + } else { + util_range_set_empty(&rbuffer->valid_buffer_range); + } + + return true; +} + +/* Replace the storage of dst with src. */ +void r600_replace_buffer_storage(struct pipe_context *ctx, + struct pipe_resource *dst, + struct pipe_resource *src) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + struct r600_resource *rdst = r600_resource(dst); + struct r600_resource *rsrc = r600_resource(src); + uint64_t old_gpu_address = rdst->gpu_address; + + pb_reference(&rdst->buf, rsrc->buf); + rdst->gpu_address = rsrc->gpu_address; + rdst->b.b.bind = rsrc->b.b.bind; + rdst->flags = rsrc->flags; + + assert(rdst->vram_usage == rsrc->vram_usage); + assert(rdst->gart_usage == rsrc->gart_usage); + assert(rdst->bo_size == rsrc->bo_size); + assert(rdst->bo_alignment == rsrc->bo_alignment); + assert(rdst->domains == rsrc->domains); + + rctx->rebind_buffer(ctx, dst, old_gpu_address); +} + +void r600_invalidate_resource(struct pipe_context *ctx, + struct pipe_resource *resource) +{ + struct r600_common_context *rctx = (struct r600_common_context*)ctx; + struct r600_resource *rbuffer = r600_resource(resource); + + /* We currently only do anyting here for buffers */ + if (resource->target == PIPE_BUFFER) + (void)r600_invalidate_buffer(rctx, rbuffer); +} + +static void *r600_buffer_get_transfer(struct pipe_context *ctx, + struct pipe_resource *resource, + unsigned usage, + const struct pipe_box *box, + struct pipe_transfer **ptransfer, + void *data, struct r600_resource *staging, + unsigned offset) +{ + struct r600_common_context *rctx = (struct r600_common_context*)ctx; + struct r600_transfer *transfer; + + if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC) + transfer = slab_alloc(&rctx->pool_transfers_unsync); + else + transfer = slab_alloc(&rctx->pool_transfers); + + transfer->b.b.resource = NULL; + pipe_resource_reference(&transfer->b.b.resource, resource); + transfer->b.b.level = 0; + transfer->b.b.usage = usage; + transfer->b.b.box = *box; + transfer->b.b.stride = 0; + transfer->b.b.layer_stride = 0; + transfer->b.staging = NULL; + transfer->offset = offset; + transfer->staging = staging; + *ptransfer = &transfer->b.b; + return data; +} + +static bool r600_can_dma_copy_buffer(struct r600_common_context *rctx, + unsigned dstx, unsigned srcx, unsigned size) +{ + bool dword_aligned = !(dstx % 4) && !(srcx % 4) && !(size % 4); + + return rctx->screen->has_cp_dma || + (dword_aligned && (rctx->dma.cs || + rctx->screen->has_streamout)); + +} + +static void *r600_buffer_transfer_map(struct pipe_context *ctx, + struct pipe_resource *resource, + unsigned level, + unsigned usage, + const struct pipe_box *box, + struct pipe_transfer **ptransfer) +{ + struct r600_common_context *rctx = (struct r600_common_context*)ctx; + struct r600_common_screen *rscreen = (struct r600_common_screen*)ctx->screen; + struct r600_resource *rbuffer = r600_resource(resource); + uint8_t *data; + + assert(box->x + box->width <= resource->width0); + + /* From GL_AMD_pinned_memory issues: + * + * 4) Is glMapBuffer on a shared buffer guaranteed to return the + * same system address which was specified at creation time? + * + * RESOLVED: NO. The GL implementation might return a different + * virtual mapping of that memory, although the same physical + * page will be used. + * + * So don't ever use staging buffers. + */ + if (rbuffer->b.is_user_ptr) + usage |= PIPE_TRANSFER_PERSISTENT; + + /* See if the buffer range being mapped has never been initialized, + * in which case it can be mapped unsynchronized. */ + if (!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED | + TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED)) && + usage & PIPE_TRANSFER_WRITE && + !rbuffer->b.is_shared && + !util_ranges_intersect(&rbuffer->valid_buffer_range, box->x, box->x + box->width)) { + usage |= PIPE_TRANSFER_UNSYNCHRONIZED; + } + + /* If discarding the entire range, discard the whole resource instead. */ + if (usage & PIPE_TRANSFER_DISCARD_RANGE && + box->x == 0 && box->width == resource->width0) { + usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE; + } + + if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE && + !(usage & (PIPE_TRANSFER_UNSYNCHRONIZED | + TC_TRANSFER_MAP_NO_INVALIDATE))) { + assert(usage & PIPE_TRANSFER_WRITE); + + if (r600_invalidate_buffer(rctx, rbuffer)) { + /* At this point, the buffer is always idle. */ + usage |= PIPE_TRANSFER_UNSYNCHRONIZED; + } else { + /* Fall back to a temporary buffer. */ + usage |= PIPE_TRANSFER_DISCARD_RANGE; + } + } + + if ((usage & PIPE_TRANSFER_DISCARD_RANGE) && + !(rscreen->debug_flags & DBG_NO_DISCARD_RANGE) && + ((!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED | + PIPE_TRANSFER_PERSISTENT)) && + r600_can_dma_copy_buffer(rctx, box->x, 0, box->width)) || + (rbuffer->flags & RADEON_FLAG_SPARSE))) { + assert(usage & PIPE_TRANSFER_WRITE); + + /* Check if mapping this buffer would cause waiting for the GPU. + */ + if (rbuffer->flags & RADEON_FLAG_SPARSE || + r600_rings_is_buffer_referenced(rctx, rbuffer->buf, RADEON_USAGE_READWRITE) || + !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) { + /* Do a wait-free write-only transfer using a temporary buffer. */ + unsigned offset; + struct r600_resource *staging = NULL; + + u_upload_alloc(ctx->stream_uploader, 0, + box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT), + rctx->screen->info.tcc_cache_line_size, + &offset, (struct pipe_resource**)&staging, + (void**)&data); + + if (staging) { + data += box->x % R600_MAP_BUFFER_ALIGNMENT; + return r600_buffer_get_transfer(ctx, resource, usage, box, + ptransfer, data, staging, offset); + } else if (rbuffer->flags & RADEON_FLAG_SPARSE) { + return NULL; + } + } else { + /* At this point, the buffer is always idle (we checked it above). */ + usage |= PIPE_TRANSFER_UNSYNCHRONIZED; + } + } + /* Use a staging buffer in cached GTT for reads. */ + else if (((usage & PIPE_TRANSFER_READ) && + !(usage & PIPE_TRANSFER_PERSISTENT) && + (rbuffer->domains & RADEON_DOMAIN_VRAM || + rbuffer->flags & RADEON_FLAG_GTT_WC) && + r600_can_dma_copy_buffer(rctx, 0, box->x, box->width)) || + (rbuffer->flags & RADEON_FLAG_SPARSE)) { + struct r600_resource *staging; + + assert(!(usage & TC_TRANSFER_MAP_THREADED_UNSYNC)); + staging = (struct r600_resource*) pipe_buffer_create( + ctx->screen, 0, PIPE_USAGE_STAGING, + box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT)); + if (staging) { + /* Copy the VRAM buffer to the staging buffer. */ + rctx->dma_copy(ctx, &staging->b.b, 0, + box->x % R600_MAP_BUFFER_ALIGNMENT, + 0, 0, resource, 0, box); + + data = r600_buffer_map_sync_with_rings(rctx, staging, + usage & ~PIPE_TRANSFER_UNSYNCHRONIZED); + if (!data) { + r600_resource_reference(&staging, NULL); + return NULL; + } + data += box->x % R600_MAP_BUFFER_ALIGNMENT; + + return r600_buffer_get_transfer(ctx, resource, usage, box, + ptransfer, data, staging, 0); + } else if (rbuffer->flags & RADEON_FLAG_SPARSE) { + return NULL; + } + } + + data = r600_buffer_map_sync_with_rings(rctx, rbuffer, usage); + if (!data) { + return NULL; + } + data += box->x; + + return r600_buffer_get_transfer(ctx, resource, usage, box, + ptransfer, data, NULL, 0); +} + +static void r600_buffer_do_flush_region(struct pipe_context *ctx, + struct pipe_transfer *transfer, + const struct pipe_box *box) +{ + struct r600_transfer *rtransfer = (struct r600_transfer*)transfer; + struct r600_resource *rbuffer = r600_resource(transfer->resource); + + if (rtransfer->staging) { + struct pipe_resource *dst, *src; + unsigned soffset; + struct pipe_box dma_box; + + dst = transfer->resource; + src = &rtransfer->staging->b.b; + soffset = rtransfer->offset + box->x % R600_MAP_BUFFER_ALIGNMENT; + + u_box_1d(soffset, box->width, &dma_box); + + /* Copy the staging buffer into the original one. */ + ctx->resource_copy_region(ctx, dst, 0, box->x, 0, 0, src, 0, &dma_box); + } + + util_range_add(&rbuffer->valid_buffer_range, box->x, + box->x + box->width); +} + +static void r600_buffer_flush_region(struct pipe_context *ctx, + struct pipe_transfer *transfer, + const struct pipe_box *rel_box) +{ + unsigned required_usage = PIPE_TRANSFER_WRITE | + PIPE_TRANSFER_FLUSH_EXPLICIT; + + if ((transfer->usage & required_usage) == required_usage) { + struct pipe_box box; + + u_box_1d(transfer->box.x + rel_box->x, rel_box->width, &box); + r600_buffer_do_flush_region(ctx, transfer, &box); + } +} + +static void r600_buffer_transfer_unmap(struct pipe_context *ctx, + struct pipe_transfer *transfer) +{ + struct r600_common_context *rctx = (struct r600_common_context*)ctx; + struct r600_transfer *rtransfer = (struct r600_transfer*)transfer; + + if (transfer->usage & PIPE_TRANSFER_WRITE && + !(transfer->usage & PIPE_TRANSFER_FLUSH_EXPLICIT)) + r600_buffer_do_flush_region(ctx, transfer, &transfer->box); + + r600_resource_reference(&rtransfer->staging, NULL); + assert(rtransfer->b.staging == NULL); /* for threaded context only */ + pipe_resource_reference(&transfer->resource, NULL); + + /* Don't use pool_transfers_unsync. We are always in the driver + * thread. */ + slab_free(&rctx->pool_transfers, transfer); +} + +void r600_buffer_subdata(struct pipe_context *ctx, + struct pipe_resource *buffer, + unsigned usage, unsigned offset, + unsigned size, const void *data) +{ + struct pipe_transfer *transfer = NULL; + struct pipe_box box; + uint8_t *map = NULL; + + u_box_1d(offset, size, &box); + map = r600_buffer_transfer_map(ctx, buffer, 0, + PIPE_TRANSFER_WRITE | + PIPE_TRANSFER_DISCARD_RANGE | + usage, + &box, &transfer); + if (!map) + return; + + memcpy(map, data, size); + r600_buffer_transfer_unmap(ctx, transfer); +} + +static const struct u_resource_vtbl r600_buffer_vtbl = +{ + NULL, /* get_handle */ + r600_buffer_destroy, /* resource_destroy */ + r600_buffer_transfer_map, /* transfer_map */ + r600_buffer_flush_region, /* transfer_flush_region */ + r600_buffer_transfer_unmap, /* transfer_unmap */ +}; + +static struct r600_resource * +r600_alloc_buffer_struct(struct pipe_screen *screen, + const struct pipe_resource *templ) +{ + struct r600_resource *rbuffer; + + rbuffer = MALLOC_STRUCT(r600_resource); + + rbuffer->b.b = *templ; + rbuffer->b.b.next = NULL; + pipe_reference_init(&rbuffer->b.b.reference, 1); + rbuffer->b.b.screen = screen; + + rbuffer->b.vtbl = &r600_buffer_vtbl; + threaded_resource_init(&rbuffer->b.b); + + rbuffer->buf = NULL; + rbuffer->bind_history = 0; + util_range_init(&rbuffer->valid_buffer_range); + return rbuffer; +} + +struct pipe_resource *r600_buffer_create(struct pipe_screen *screen, + const struct pipe_resource *templ, + unsigned alignment) +{ + struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; + struct r600_resource *rbuffer = r600_alloc_buffer_struct(screen, templ); + + r600_init_resource_fields(rscreen, rbuffer, templ->width0, alignment); + + if (templ->flags & PIPE_RESOURCE_FLAG_SPARSE) + rbuffer->flags |= RADEON_FLAG_SPARSE; + + if (!r600_alloc_resource(rscreen, rbuffer)) { + FREE(rbuffer); + return NULL; + } + return &rbuffer->b.b; +} + +struct pipe_resource *r600_aligned_buffer_create(struct pipe_screen *screen, + unsigned flags, + unsigned usage, + unsigned size, + unsigned alignment) +{ + struct pipe_resource buffer; + + memset(&buffer, 0, sizeof buffer); + buffer.target = PIPE_BUFFER; + buffer.format = PIPE_FORMAT_R8_UNORM; + buffer.bind = 0; + buffer.usage = usage; + buffer.flags = flags; + buffer.width0 = size; + buffer.height0 = 1; + buffer.depth0 = 1; + buffer.array_size = 1; + return r600_buffer_create(screen, &buffer, alignment); +} + +struct pipe_resource * +r600_buffer_from_user_memory(struct pipe_screen *screen, + const struct pipe_resource *templ, + void *user_memory) +{ + struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; + struct radeon_winsys *ws = rscreen->ws; + struct r600_resource *rbuffer = r600_alloc_buffer_struct(screen, templ); + + rbuffer->domains = RADEON_DOMAIN_GTT; + rbuffer->flags = 0; + rbuffer->b.is_user_ptr = true; + util_range_add(&rbuffer->valid_buffer_range, 0, templ->width0); + util_range_add(&rbuffer->b.valid_buffer_range, 0, templ->width0); + + /* Convert a user pointer to a buffer. */ + rbuffer->buf = ws->buffer_from_ptr(ws, user_memory, templ->width0); + if (!rbuffer->buf) { + FREE(rbuffer); + return NULL; + } + + if (rscreen->info.has_virtual_memory) + rbuffer->gpu_address = + ws->buffer_get_virtual_address(rbuffer->buf); + else + rbuffer->gpu_address = 0; + + rbuffer->vram_usage = 0; + rbuffer->gart_usage = templ->width0; + + return &rbuffer->b.b; +} diff --git a/lib/mesa/src/gallium/drivers/r600/r600_cs.h b/lib/mesa/src/gallium/drivers/r600/r600_cs.h new file mode 100644 index 000000000..0efae09f3 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/r600/r600_cs.h @@ -0,0 +1,209 @@ +/* + * Copyright 2013 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: Marek Olšák <maraeo@gmail.com> + */ + +/** + * This file contains helpers for writing commands to commands streams. + */ + +#ifndef R600_CS_H +#define R600_CS_H + +#include "r600_pipe_common.h" +#include "r600d_common.h" + +/** + * Return true if there is enough memory in VRAM and GTT for the buffers + * added so far. + * + * \param vram VRAM memory size not added to the buffer list yet + * \param gtt GTT memory size not added to the buffer list yet + */ +static inline bool +radeon_cs_memory_below_limit(struct r600_common_screen *screen, + struct radeon_winsys_cs *cs, + uint64_t vram, uint64_t gtt) +{ + vram += cs->used_vram; + gtt += cs->used_gart; + + /* Anything that goes above the VRAM size should go to GTT. */ + if (vram > screen->info.vram_size) + gtt += vram - screen->info.vram_size; + + /* Now we just need to check if we have enough GTT. */ + return gtt < screen->info.gart_size * 0.7; +} + +/** + * Add a buffer to the buffer list for the given command stream (CS). + * + * All buffers used by a CS must be added to the list. This tells the kernel + * driver which buffers are used by GPU commands. Other buffers can + * be swapped out (not accessible) during execution. + * + * The buffer list becomes empty after every context flush and must be + * rebuilt. + */ +static inline unsigned radeon_add_to_buffer_list(struct r600_common_context *rctx, + struct r600_ring *ring, + struct r600_resource *rbo, + enum radeon_bo_usage usage, + enum radeon_bo_priority priority) +{ + assert(usage); + return rctx->ws->cs_add_buffer( + ring->cs, rbo->buf, + (enum radeon_bo_usage)(usage | RADEON_USAGE_SYNCHRONIZED), + rbo->domains, priority) * 4; +} + +/** + * Same as above, but also checks memory usage and flushes the context + * accordingly. + * + * When this SHOULD NOT be used: + * + * - if r600_context_add_resource_size has been called for the buffer + * followed by *_need_cs_space for checking the memory usage + * + * - if r600_need_dma_space has been called for the buffer + * + * - when emitting state packets and draw packets (because preceding packets + * can't be re-emitted at that point) + * + * - if shader resource "enabled_mask" is not up-to-date or there is + * a different constraint disallowing a context flush + */ +static inline unsigned +radeon_add_to_buffer_list_check_mem(struct r600_common_context *rctx, + struct r600_ring *ring, + struct r600_resource *rbo, + enum radeon_bo_usage usage, + enum radeon_bo_priority priority, + bool check_mem) +{ + if (check_mem && + !radeon_cs_memory_below_limit(rctx->screen, ring->cs, + rctx->vram + rbo->vram_usage, + rctx->gtt + rbo->gart_usage)) + ring->flush(rctx, RADEON_FLUSH_ASYNC, NULL); + + return radeon_add_to_buffer_list(rctx, ring, rbo, usage, priority); +} + +static inline void r600_emit_reloc(struct r600_common_context *rctx, + struct r600_ring *ring, struct r600_resource *rbo, + enum radeon_bo_usage usage, + enum radeon_bo_priority priority) +{ + struct radeon_winsys_cs *cs = ring->cs; + bool has_vm = ((struct r600_common_screen*)rctx->b.screen)->info.has_virtual_memory; + unsigned reloc = radeon_add_to_buffer_list(rctx, ring, rbo, usage, priority); + + if (!has_vm) { + radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(cs, reloc); + } +} + +static inline void radeon_set_config_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) +{ + assert(reg < R600_CONTEXT_REG_OFFSET); + assert(cs->current.cdw + 2 + num <= cs->current.max_dw); + radeon_emit(cs, PKT3(PKT3_SET_CONFIG_REG, num, 0)); + radeon_emit(cs, (reg - R600_CONFIG_REG_OFFSET) >> 2); +} + +static inline void radeon_set_config_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) +{ + radeon_set_config_reg_seq(cs, reg, 1); + radeon_emit(cs, value); +} + +static inline void radeon_set_context_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) +{ + assert(reg >= R600_CONTEXT_REG_OFFSET); + assert(cs->current.cdw + 2 + num <= cs->current.max_dw); + radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, num, 0)); + radeon_emit(cs, (reg - R600_CONTEXT_REG_OFFSET) >> 2); +} + +static inline void radeon_set_context_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) +{ + radeon_set_context_reg_seq(cs, reg, 1); + radeon_emit(cs, value); +} + +static inline void radeon_set_context_reg_idx(struct radeon_winsys_cs *cs, + unsigned reg, unsigned idx, + unsigned value) +{ + assert(reg >= R600_CONTEXT_REG_OFFSET); + assert(cs->current.cdw + 3 <= cs->current.max_dw); + radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, 1, 0)); + radeon_emit(cs, (reg - R600_CONTEXT_REG_OFFSET) >> 2 | (idx << 28)); + radeon_emit(cs, value); +} + +static inline void radeon_set_sh_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) +{ + assert(reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END); + assert(cs->current.cdw + 2 + num <= cs->current.max_dw); + radeon_emit(cs, PKT3(PKT3_SET_SH_REG, num, 0)); + radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2); +} + +static inline void radeon_set_sh_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) +{ + radeon_set_sh_reg_seq(cs, reg, 1); + radeon_emit(cs, value); +} + +static inline void radeon_set_uconfig_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num) +{ + assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END); + assert(cs->current.cdw + 2 + num <= cs->current.max_dw); + radeon_emit(cs, PKT3(PKT3_SET_UCONFIG_REG, num, 0)); + radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2); +} + +static inline void radeon_set_uconfig_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value) +{ + radeon_set_uconfig_reg_seq(cs, reg, 1); + radeon_emit(cs, value); +} + +static inline void radeon_set_uconfig_reg_idx(struct radeon_winsys_cs *cs, + unsigned reg, unsigned idx, + unsigned value) +{ + assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END); + assert(cs->current.cdw + 3 <= cs->current.max_dw); + radeon_emit(cs, PKT3(PKT3_SET_UCONFIG_REG, 1, 0)); + radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2 | (idx << 28)); + radeon_emit(cs, value); +} + +#endif diff --git a/lib/mesa/src/gallium/drivers/r600/r600_gpu_load.c b/lib/mesa/src/gallium/drivers/r600/r600_gpu_load.c new file mode 100644 index 000000000..c15fb9dfa --- /dev/null +++ b/lib/mesa/src/gallium/drivers/r600/r600_gpu_load.c @@ -0,0 +1,263 @@ +/* + * Copyright 2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: Marek Olšák <maraeo@gmail.com> + * + */ + +/* The GPU load is measured as follows. + * + * There is a thread which samples the GRBM_STATUS register at a certain + * frequency and the "busy" or "idle" counter is incremented based on + * whether the GUI_ACTIVE bit is set or not. + * + * Then, the user can sample the counters twice and calculate the average + * GPU load between the two samples. + */ + +#include "r600_pipe_common.h" +#include "r600_query.h" +#include "os/os_time.h" + +/* For good accuracy at 1000 fps or lower. This will be inaccurate for higher + * fps (there are too few samples per frame). */ +#define SAMPLES_PER_SEC 10000 + +#define GRBM_STATUS 0x8010 +#define TA_BUSY(x) (((x) >> 14) & 0x1) +#define GDS_BUSY(x) (((x) >> 15) & 0x1) +#define VGT_BUSY(x) (((x) >> 17) & 0x1) +#define IA_BUSY(x) (((x) >> 19) & 0x1) +#define SX_BUSY(x) (((x) >> 20) & 0x1) +#define WD_BUSY(x) (((x) >> 21) & 0x1) +#define SPI_BUSY(x) (((x) >> 22) & 0x1) +#define BCI_BUSY(x) (((x) >> 23) & 0x1) +#define SC_BUSY(x) (((x) >> 24) & 0x1) +#define PA_BUSY(x) (((x) >> 25) & 0x1) +#define DB_BUSY(x) (((x) >> 26) & 0x1) +#define CP_BUSY(x) (((x) >> 29) & 0x1) +#define CB_BUSY(x) (((x) >> 30) & 0x1) +#define GUI_ACTIVE(x) (((x) >> 31) & 0x1) + +#define SRBM_STATUS2 0x0e4c +#define SDMA_BUSY(x) (((x) >> 5) & 0x1) + +#define CP_STAT 0x8680 +#define PFP_BUSY(x) (((x) >> 15) & 0x1) +#define MEQ_BUSY(x) (((x) >> 16) & 0x1) +#define ME_BUSY(x) (((x) >> 17) & 0x1) +#define SURFACE_SYNC_BUSY(x) (((x) >> 21) & 0x1) +#define DMA_BUSY(x) (((x) >> 22) & 0x1) +#define SCRATCH_RAM_BUSY(x) (((x) >> 24) & 0x1) + +#define IDENTITY(x) x + +#define UPDATE_COUNTER(field, mask) \ + do { \ + if (mask(value)) \ + p_atomic_inc(&counters->named.field.busy); \ + else \ + p_atomic_inc(&counters->named.field.idle); \ + } while (0) + +static void r600_update_mmio_counters(struct r600_common_screen *rscreen, + union r600_mmio_counters *counters) +{ + uint32_t value = 0; + bool gui_busy, sdma_busy = false; + + /* GRBM_STATUS */ + rscreen->ws->read_registers(rscreen->ws, GRBM_STATUS, 1, &value); + + UPDATE_COUNTER(ta, TA_BUSY); + UPDATE_COUNTER(gds, GDS_BUSY); + UPDATE_COUNTER(vgt, VGT_BUSY); + UPDATE_COUNTER(ia, IA_BUSY); + UPDATE_COUNTER(sx, SX_BUSY); + UPDATE_COUNTER(wd, WD_BUSY); + UPDATE_COUNTER(spi, SPI_BUSY); + UPDATE_COUNTER(bci, BCI_BUSY); + UPDATE_COUNTER(sc, SC_BUSY); + UPDATE_COUNTER(pa, PA_BUSY); + UPDATE_COUNTER(db, DB_BUSY); + UPDATE_COUNTER(cp, CP_BUSY); + UPDATE_COUNTER(cb, CB_BUSY); + UPDATE_COUNTER(gui, GUI_ACTIVE); + gui_busy = GUI_ACTIVE(value); + + value = gui_busy || sdma_busy; + UPDATE_COUNTER(gpu, IDENTITY); +} + +#undef UPDATE_COUNTER + +static int +r600_gpu_load_thread(void *param) +{ + struct r600_common_screen *rscreen = (struct r600_common_screen*)param; + const int period_us = 1000000 / SAMPLES_PER_SEC; + int sleep_us = period_us; + int64_t cur_time, last_time = os_time_get(); + + while (!p_atomic_read(&rscreen->gpu_load_stop_thread)) { + if (sleep_us) + os_time_sleep(sleep_us); + + /* Make sure we sleep the ideal amount of time to match + * the expected frequency. */ + cur_time = os_time_get(); + + if (os_time_timeout(last_time, last_time + period_us, + cur_time)) + sleep_us = MAX2(sleep_us - 1, 1); + else + sleep_us += 1; + + /*printf("Hz: %.1f\n", 1000000.0 / (cur_time - last_time));*/ + last_time = cur_time; + + /* Update the counters. */ + r600_update_mmio_counters(rscreen, &rscreen->mmio_counters); + } + p_atomic_dec(&rscreen->gpu_load_stop_thread); + return 0; +} + +void r600_gpu_load_kill_thread(struct r600_common_screen *rscreen) +{ + if (!rscreen->gpu_load_thread) + return; + + p_atomic_inc(&rscreen->gpu_load_stop_thread); + thrd_join(rscreen->gpu_load_thread, NULL); + rscreen->gpu_load_thread = 0; +} + +static uint64_t r600_read_mmio_counter(struct r600_common_screen *rscreen, + unsigned busy_index) +{ + /* Start the thread if needed. */ + if (!rscreen->gpu_load_thread) { + mtx_lock(&rscreen->gpu_load_mutex); + /* Check again inside the mutex. */ + if (!rscreen->gpu_load_thread) + rscreen->gpu_load_thread = + u_thread_create(r600_gpu_load_thread, rscreen); + mtx_unlock(&rscreen->gpu_load_mutex); + } + + unsigned busy = p_atomic_read(&rscreen->mmio_counters.array[busy_index]); + unsigned idle = p_atomic_read(&rscreen->mmio_counters.array[busy_index + 1]); + + return busy | ((uint64_t)idle << 32); +} + +static unsigned r600_end_mmio_counter(struct r600_common_screen *rscreen, + uint64_t begin, unsigned busy_index) +{ + uint64_t end = r600_read_mmio_counter(rscreen, busy_index); + unsigned busy = (end & 0xffffffff) - (begin & 0xffffffff); + unsigned idle = (end >> 32) - (begin >> 32); + + /* Calculate the % of time the busy counter was being incremented. + * + * If no counters were incremented, return the current counter status. + * It's for the case when the load is queried faster than + * the counters are updated. + */ + if (idle || busy) { + return busy*100 / (busy + idle); + } else { + union r600_mmio_counters counters; + + memset(&counters, 0, sizeof(counters)); + r600_update_mmio_counters(rscreen, &counters); + return counters.array[busy_index] ? 100 : 0; + } +} + +#define BUSY_INDEX(rscreen, field) (&rscreen->mmio_counters.named.field.busy - \ + rscreen->mmio_counters.array) + +static unsigned busy_index_from_type(struct r600_common_screen *rscreen, + unsigned type) +{ + switch (type) { + case R600_QUERY_GPU_LOAD: + return BUSY_INDEX(rscreen, gpu); + case R600_QUERY_GPU_SHADERS_BUSY: + return BUSY_INDEX(rscreen, spi); + case R600_QUERY_GPU_TA_BUSY: + return BUSY_INDEX(rscreen, ta); + case R600_QUERY_GPU_GDS_BUSY: + return BUSY_INDEX(rscreen, gds); + case R600_QUERY_GPU_VGT_BUSY: + return BUSY_INDEX(rscreen, vgt); + case R600_QUERY_GPU_IA_BUSY: + return BUSY_INDEX(rscreen, ia); + case R600_QUERY_GPU_SX_BUSY: + return BUSY_INDEX(rscreen, sx); + case R600_QUERY_GPU_WD_BUSY: + return BUSY_INDEX(rscreen, wd); + case R600_QUERY_GPU_BCI_BUSY: + return BUSY_INDEX(rscreen, bci); + case R600_QUERY_GPU_SC_BUSY: + return BUSY_INDEX(rscreen, sc); + case R600_QUERY_GPU_PA_BUSY: + return BUSY_INDEX(rscreen, pa); + case R600_QUERY_GPU_DB_BUSY: + return BUSY_INDEX(rscreen, db); + case R600_QUERY_GPU_CP_BUSY: + return BUSY_INDEX(rscreen, cp); + case R600_QUERY_GPU_CB_BUSY: + return BUSY_INDEX(rscreen, cb); + case R600_QUERY_GPU_SDMA_BUSY: + return BUSY_INDEX(rscreen, sdma); + case R600_QUERY_GPU_PFP_BUSY: + return BUSY_INDEX(rscreen, pfp); + case R600_QUERY_GPU_MEQ_BUSY: + return BUSY_INDEX(rscreen, meq); + case R600_QUERY_GPU_ME_BUSY: + return BUSY_INDEX(rscreen, me); + case R600_QUERY_GPU_SURF_SYNC_BUSY: + return BUSY_INDEX(rscreen, surf_sync); + case R600_QUERY_GPU_CP_DMA_BUSY: + return BUSY_INDEX(rscreen, cp_dma); + case R600_QUERY_GPU_SCRATCH_RAM_BUSY: + return BUSY_INDEX(rscreen, scratch_ram); + default: + unreachable("invalid query type"); + } +} + +uint64_t r600_begin_counter(struct r600_common_screen *rscreen, unsigned type) +{ + unsigned busy_index = busy_index_from_type(rscreen, type); + return r600_read_mmio_counter(rscreen, busy_index); +} + +unsigned r600_end_counter(struct r600_common_screen *rscreen, unsigned type, + uint64_t begin) +{ + unsigned busy_index = busy_index_from_type(rscreen, type); + return r600_end_mmio_counter(rscreen, begin, busy_index); +} diff --git a/lib/mesa/src/gallium/drivers/r600/r600_perfcounter.c b/lib/mesa/src/gallium/drivers/r600/r600_perfcounter.c new file mode 100644 index 000000000..f186acb05 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/r600/r600_perfcounter.c @@ -0,0 +1,649 @@ +/* + * Copyright 2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Nicolai Hähnle <nicolai.haehnle@amd.com> + * + */ + +#include "util/u_memory.h" +#include "r600_query.h" +#include "r600_pipe_common.h" +#include "r600d_common.h" + +/* Max counters per HW block */ +#define R600_QUERY_MAX_COUNTERS 16 + +static struct r600_perfcounter_block * +lookup_counter(struct r600_perfcounters *pc, unsigned index, + unsigned *base_gid, unsigned *sub_index) +{ + struct r600_perfcounter_block *block = pc->blocks; + unsigned bid; + + *base_gid = 0; + for (bid = 0; bid < pc->num_blocks; ++bid, ++block) { + unsigned total = block->num_groups * block->num_selectors; + + if (index < total) { + *sub_index = index; + return block; + } + + index -= total; + *base_gid += block->num_groups; + } + + return NULL; +} + +static struct r600_perfcounter_block * +lookup_group(struct r600_perfcounters *pc, unsigned *index) +{ + unsigned bid; + struct r600_perfcounter_block *block = pc->blocks; + + for (bid = 0; bid < pc->num_blocks; ++bid, ++block) { + if (*index < block->num_groups) + return block; + *index -= block->num_groups; + } + + return NULL; +} + +struct r600_pc_group { + struct r600_pc_group *next; + struct r600_perfcounter_block *block; + unsigned sub_gid; /* only used during init */ + unsigned result_base; /* only used during init */ + int se; + int instance; + unsigned num_counters; + unsigned selectors[R600_QUERY_MAX_COUNTERS]; +}; + +struct r600_pc_counter { + unsigned base; + unsigned qwords; + unsigned stride; /* in uint64s */ +}; + +#define R600_PC_SHADERS_WINDOWING (1 << 31) + +struct r600_query_pc { + struct r600_query_hw b; + + unsigned shaders; + unsigned num_counters; + struct r600_pc_counter *counters; + struct r600_pc_group *groups; +}; + +static void r600_pc_query_destroy(struct r600_common_screen *rscreen, + struct r600_query *rquery) +{ + struct r600_query_pc *query = (struct r600_query_pc *)rquery; + + while (query->groups) { + struct r600_pc_group *group = query->groups; + query->groups = group->next; + FREE(group); + } + + FREE(query->counters); + + r600_query_hw_destroy(rscreen, rquery); +} + +static bool r600_pc_query_prepare_buffer(struct r600_common_screen *screen, + struct r600_query_hw *hwquery, + struct r600_resource *buffer) +{ + /* no-op */ + return true; +} + +static void r600_pc_query_emit_start(struct r600_common_context *ctx, + struct r600_query_hw *hwquery, + struct r600_resource *buffer, uint64_t va) +{ + struct r600_perfcounters *pc = ctx->screen->perfcounters; + struct r600_query_pc *query = (struct r600_query_pc *)hwquery; + struct r600_pc_group *group; + int current_se = -1; + int current_instance = -1; + + if (query->shaders) + pc->emit_shaders(ctx, query->shaders); + + for (group = query->groups; group; group = group->next) { + struct r600_perfcounter_block *block = group->block; + + if (group->se != current_se || group->instance != current_instance) { + current_se = group->se; + current_instance = group->instance; + pc->emit_instance(ctx, group->se, group->instance); + } + + pc->emit_select(ctx, block, group->num_counters, group->selectors); + } + + if (current_se != -1 || current_instance != -1) + pc->emit_instance(ctx, -1, -1); + + pc->emit_start(ctx, buffer, va); +} + +static void r600_pc_query_emit_stop(struct r600_common_context *ctx, + struct r600_query_hw *hwquery, + struct r600_resource *buffer, uint64_t va) +{ + struct r600_perfcounters *pc = ctx->screen->perfcounters; + struct r600_query_pc *query = (struct r600_query_pc *)hwquery; + struct r600_pc_group *group; + + pc->emit_stop(ctx, buffer, va); + + for (group = query->groups; group; group = group->next) { + struct r600_perfcounter_block *block = group->block; + unsigned se = group->se >= 0 ? group->se : 0; + unsigned se_end = se + 1; + + if ((block->flags & R600_PC_BLOCK_SE) && (group->se < 0)) + se_end = ctx->screen->info.max_se; + + do { + unsigned instance = group->instance >= 0 ? group->instance : 0; + + do { + pc->emit_instance(ctx, se, instance); + pc->emit_read(ctx, block, + group->num_counters, group->selectors, + buffer, va); + va += sizeof(uint64_t) * group->num_counters; + } while (group->instance < 0 && ++instance < block->num_instances); + } while (++se < se_end); + } + + pc->emit_instance(ctx, -1, -1); +} + +static void r600_pc_query_clear_result(struct r600_query_hw *hwquery, + union pipe_query_result *result) +{ + struct r600_query_pc *query = (struct r600_query_pc *)hwquery; + + memset(result, 0, sizeof(result->batch[0]) * query->num_counters); +} + +static void r600_pc_query_add_result(struct r600_common_screen *rscreen, + struct r600_query_hw *hwquery, + void *buffer, + union pipe_query_result *result) +{ + struct r600_query_pc *query = (struct r600_query_pc *)hwquery; + uint64_t *results = buffer; + unsigned i, j; + + for (i = 0; i < query->num_counters; ++i) { + struct r600_pc_counter *counter = &query->counters[i]; + + for (j = 0; j < counter->qwords; ++j) { + uint32_t value = results[counter->base + j * counter->stride]; + result->batch[i].u64 += value; + } + } +} + +static struct r600_query_ops batch_query_ops = { + .destroy = r600_pc_query_destroy, + .begin = r600_query_hw_begin, + .end = r600_query_hw_end, + .get_result = r600_query_hw_get_result +}; + +static struct r600_query_hw_ops batch_query_hw_ops = { + .prepare_buffer = r600_pc_query_prepare_buffer, + .emit_start = r600_pc_query_emit_start, + .emit_stop = r600_pc_query_emit_stop, + .clear_result = r600_pc_query_clear_result, + .add_result = r600_pc_query_add_result, +}; + +static struct r600_pc_group *get_group_state(struct r600_common_screen *screen, + struct r600_query_pc *query, + struct r600_perfcounter_block *block, + unsigned sub_gid) +{ + struct r600_pc_group *group = query->groups; + + while (group) { + if (group->block == block && group->sub_gid == sub_gid) + return group; + group = group->next; + } + + group = CALLOC_STRUCT(r600_pc_group); + if (!group) + return NULL; + + group->block = block; + group->sub_gid = sub_gid; + + if (block->flags & R600_PC_BLOCK_SHADER) { + unsigned sub_gids = block->num_instances; + unsigned shader_id; + unsigned shaders; + unsigned query_shaders; + + if (block->flags & R600_PC_BLOCK_SE_GROUPS) + sub_gids = sub_gids * screen->info.max_se; + shader_id = sub_gid / sub_gids; + sub_gid = sub_gid % sub_gids; + + shaders = screen->perfcounters->shader_type_bits[shader_id]; + + query_shaders = query->shaders & ~R600_PC_SHADERS_WINDOWING; + if (query_shaders && query_shaders != shaders) { + fprintf(stderr, "r600_perfcounter: incompatible shader groups\n"); + FREE(group); + return NULL; + } + query->shaders = shaders; + } + + if (block->flags & R600_PC_BLOCK_SHADER_WINDOWED && !query->shaders) { + // A non-zero value in query->shaders ensures that the shader + // masking is reset unless the user explicitly requests one. + query->shaders = R600_PC_SHADERS_WINDOWING; + } + + if (block->flags & R600_PC_BLOCK_SE_GROUPS) { + group->se = sub_gid / block->num_instances; + sub_gid = sub_gid % block->num_instances; + } else { + group->se = -1; + } + + if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS) { + group->instance = sub_gid; + } else { + group->instance = -1; + } + + group->next = query->groups; + query->groups = group; + + return group; +} + +struct pipe_query *r600_create_batch_query(struct pipe_context *ctx, + unsigned num_queries, + unsigned *query_types) +{ + struct r600_common_screen *screen = + (struct r600_common_screen *)ctx->screen; + struct r600_perfcounters *pc = screen->perfcounters; + struct r600_perfcounter_block *block; + struct r600_pc_group *group; + struct r600_query_pc *query; + unsigned base_gid, sub_gid, sub_index; + unsigned i, j; + + if (!pc) + return NULL; + + query = CALLOC_STRUCT(r600_query_pc); + if (!query) + return NULL; + + query->b.b.ops = &batch_query_ops; + query->b.ops = &batch_query_hw_ops; + + query->num_counters = num_queries; + + /* Collect selectors per group */ + for (i = 0; i < num_queries; ++i) { + unsigned sub_gid; + + if (query_types[i] < R600_QUERY_FIRST_PERFCOUNTER) + goto error; + + block = lookup_counter(pc, query_types[i] - R600_QUERY_FIRST_PERFCOUNTER, + &base_gid, &sub_index); + if (!block) + goto error; + + sub_gid = sub_index / block->num_selectors; + sub_index = sub_index % block->num_selectors; + + group = get_group_state(screen, query, block, sub_gid); + if (!group) + goto error; + + if (group->num_counters >= block->num_counters) { + fprintf(stderr, + "perfcounter group %s: too many selected\n", + block->basename); + goto error; + } + group->selectors[group->num_counters] = sub_index; + ++group->num_counters; + } + + /* Compute result bases and CS size per group */ + query->b.num_cs_dw_begin = pc->num_start_cs_dwords; + query->b.num_cs_dw_end = pc->num_stop_cs_dwords; + + query->b.num_cs_dw_begin += pc->num_instance_cs_dwords; /* conservative */ + query->b.num_cs_dw_end += pc->num_instance_cs_dwords; + + i = 0; + for (group = query->groups; group; group = group->next) { + struct r600_perfcounter_block *block = group->block; + unsigned select_dw, read_dw; + unsigned instances = 1; + + if ((block->flags & R600_PC_BLOCK_SE) && group->se < 0) + instances = screen->info.max_se; + if (group->instance < 0) + instances *= block->num_instances; + + group->result_base = i; + query->b.result_size += sizeof(uint64_t) * instances * group->num_counters; + i += instances * group->num_counters; + + pc->get_size(block, group->num_counters, group->selectors, + &select_dw, &read_dw); + query->b.num_cs_dw_begin += select_dw; + query->b.num_cs_dw_end += instances * read_dw; + query->b.num_cs_dw_begin += pc->num_instance_cs_dwords; /* conservative */ + query->b.num_cs_dw_end += instances * pc->num_instance_cs_dwords; + } + + if (query->shaders) { + if (query->shaders == R600_PC_SHADERS_WINDOWING) + query->shaders = 0xffffffff; + query->b.num_cs_dw_begin += pc->num_shaders_cs_dwords; + } + + /* Map user-supplied query array to result indices */ + query->counters = CALLOC(num_queries, sizeof(*query->counters)); + for (i = 0; i < num_queries; ++i) { + struct r600_pc_counter *counter = &query->counters[i]; + struct r600_perfcounter_block *block; + + block = lookup_counter(pc, query_types[i] - R600_QUERY_FIRST_PERFCOUNTER, + &base_gid, &sub_index); + + sub_gid = sub_index / block->num_selectors; + sub_index = sub_index % block->num_selectors; + + group = get_group_state(screen, query, block, sub_gid); + assert(group != NULL); + + for (j = 0; j < group->num_counters; ++j) { + if (group->selectors[j] == sub_index) + break; + } + + counter->base = group->result_base + j; + counter->stride = group->num_counters; + + counter->qwords = 1; + if ((block->flags & R600_PC_BLOCK_SE) && group->se < 0) + counter->qwords = screen->info.max_se; + if (group->instance < 0) + counter->qwords *= block->num_instances; + } + + if (!r600_query_hw_init(screen, &query->b)) + goto error; + + return (struct pipe_query *)query; + +error: + r600_pc_query_destroy(screen, &query->b.b); + return NULL; +} + +static bool r600_init_block_names(struct r600_common_screen *screen, + struct r600_perfcounter_block *block) +{ + unsigned i, j, k; + unsigned groups_shader = 1, groups_se = 1, groups_instance = 1; + unsigned namelen; + char *groupname; + char *p; + + if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS) + groups_instance = block->num_instances; + if (block->flags & R600_PC_BLOCK_SE_GROUPS) + groups_se = screen->info.max_se; + if (block->flags & R600_PC_BLOCK_SHADER) + groups_shader = screen->perfcounters->num_shader_types; + + namelen = strlen(block->basename); + block->group_name_stride = namelen + 1; + if (block->flags & R600_PC_BLOCK_SHADER) + block->group_name_stride += 3; + if (block->flags & R600_PC_BLOCK_SE_GROUPS) { + assert(groups_se <= 10); + block->group_name_stride += 1; + + if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS) + block->group_name_stride += 1; + } + if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS) { + assert(groups_instance <= 100); + block->group_name_stride += 2; + } + + block->group_names = MALLOC(block->num_groups * block->group_name_stride); + if (!block->group_names) + return false; + + groupname = block->group_names; + for (i = 0; i < groups_shader; ++i) { + const char *shader_suffix = screen->perfcounters->shader_type_suffixes[i]; + unsigned shaderlen = strlen(shader_suffix); + for (j = 0; j < groups_se; ++j) { + for (k = 0; k < groups_instance; ++k) { + strcpy(groupname, block->basename); + p = groupname + namelen; + + if (block->flags & R600_PC_BLOCK_SHADER) { + strcpy(p, shader_suffix); + p += shaderlen; + } + + if (block->flags & R600_PC_BLOCK_SE_GROUPS) { + p += sprintf(p, "%d", j); + if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS) + *p++ = '_'; + } + + if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS) + p += sprintf(p, "%d", k); + + groupname += block->group_name_stride; + } + } + } + + assert(block->num_selectors <= 1000); + block->selector_name_stride = block->group_name_stride + 4; + block->selector_names = MALLOC(block->num_groups * block->num_selectors * + block->selector_name_stride); + if (!block->selector_names) + return false; + + groupname = block->group_names; + p = block->selector_names; + for (i = 0; i < block->num_groups; ++i) { + for (j = 0; j < block->num_selectors; ++j) { + sprintf(p, "%s_%03d", groupname, j); + p += block->selector_name_stride; + } + groupname += block->group_name_stride; + } + + return true; +} + +int r600_get_perfcounter_info(struct r600_common_screen *screen, + unsigned index, + struct pipe_driver_query_info *info) +{ + struct r600_perfcounters *pc = screen->perfcounters; + struct r600_perfcounter_block *block; + unsigned base_gid, sub; + + if (!pc) + return 0; + + if (!info) { + unsigned bid, num_queries = 0; + + for (bid = 0; bid < pc->num_blocks; ++bid) { + num_queries += pc->blocks[bid].num_selectors * + pc->blocks[bid].num_groups; + } + + return num_queries; + } + + block = lookup_counter(pc, index, &base_gid, &sub); + if (!block) + return 0; + + if (!block->selector_names) { + if (!r600_init_block_names(screen, block)) + return 0; + } + info->name = block->selector_names + sub * block->selector_name_stride; + info->query_type = R600_QUERY_FIRST_PERFCOUNTER + index; + info->max_value.u64 = 0; + info->type = PIPE_DRIVER_QUERY_TYPE_UINT64; + info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE; + info->group_id = base_gid + sub / block->num_selectors; + info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH; + if (sub > 0 && sub + 1 < block->num_selectors * block->num_groups) + info->flags |= PIPE_DRIVER_QUERY_FLAG_DONT_LIST; + return 1; +} + +int r600_get_perfcounter_group_info(struct r600_common_screen *screen, + unsigned index, + struct pipe_driver_query_group_info *info) +{ + struct r600_perfcounters *pc = screen->perfcounters; + struct r600_perfcounter_block *block; + + if (!pc) + return 0; + + if (!info) + return pc->num_groups; + + block = lookup_group(pc, &index); + if (!block) + return 0; + + if (!block->group_names) { + if (!r600_init_block_names(screen, block)) + return 0; + } + info->name = block->group_names + index * block->group_name_stride; + info->num_queries = block->num_selectors; + info->max_active_queries = block->num_counters; + return 1; +} + +void r600_perfcounters_destroy(struct r600_common_screen *rscreen) +{ + if (rscreen->perfcounters) + rscreen->perfcounters->cleanup(rscreen); +} + +bool r600_perfcounters_init(struct r600_perfcounters *pc, + unsigned num_blocks) +{ + pc->blocks = CALLOC(num_blocks, sizeof(struct r600_perfcounter_block)); + if (!pc->blocks) + return false; + + pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false); + pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false); + + return true; +} + +void r600_perfcounters_add_block(struct r600_common_screen *rscreen, + struct r600_perfcounters *pc, + const char *name, unsigned flags, + unsigned counters, unsigned selectors, + unsigned instances, void *data) +{ + struct r600_perfcounter_block *block = &pc->blocks[pc->num_blocks]; + + assert(counters <= R600_QUERY_MAX_COUNTERS); + + block->basename = name; + block->flags = flags; + block->num_counters = counters; + block->num_selectors = selectors; + block->num_instances = MAX2(instances, 1); + block->data = data; + + if (pc->separate_se && (block->flags & R600_PC_BLOCK_SE)) + block->flags |= R600_PC_BLOCK_SE_GROUPS; + if (pc->separate_instance && block->num_instances > 1) + block->flags |= R600_PC_BLOCK_INSTANCE_GROUPS; + + if (block->flags & R600_PC_BLOCK_INSTANCE_GROUPS) { + block->num_groups = block->num_instances; + } else { + block->num_groups = 1; + } + + if (block->flags & R600_PC_BLOCK_SE_GROUPS) + block->num_groups *= rscreen->info.max_se; + if (block->flags & R600_PC_BLOCK_SHADER) + block->num_groups *= pc->num_shader_types; + + ++pc->num_blocks; + pc->num_groups += block->num_groups; +} + +void r600_perfcounters_do_destroy(struct r600_perfcounters *pc) +{ + unsigned i; + + for (i = 0; i < pc->num_blocks; ++i) { + FREE(pc->blocks[i].group_names); + FREE(pc->blocks[i].selector_names); + } + FREE(pc->blocks); + FREE(pc); +} diff --git a/lib/mesa/src/gallium/drivers/r600/r600_pipe_common.c b/lib/mesa/src/gallium/drivers/r600/r600_pipe_common.c new file mode 100644 index 000000000..acad670d6 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/r600/r600_pipe_common.c @@ -0,0 +1,1433 @@ +/* + * Copyright 2013 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: Marek Olšák <maraeo@gmail.com> + * + */ + +#include "r600_pipe_common.h" +#include "r600_cs.h" +#include "tgsi/tgsi_parse.h" +#include "util/list.h" +#include "util/u_draw_quad.h" +#include "util/u_memory.h" +#include "util/u_format_s3tc.h" +#include "util/u_upload_mgr.h" +#include "os/os_time.h" +#include "vl/vl_decoder.h" +#include "vl/vl_video_buffer.h" +#include "radeon_video.h" +#include <inttypes.h> +#include <sys/utsname.h> + +#ifndef HAVE_LLVM +#define HAVE_LLVM 0 +#endif + +#if HAVE_LLVM +#include <llvm-c/TargetMachine.h> +#endif + +#ifndef MESA_LLVM_VERSION_PATCH +#define MESA_LLVM_VERSION_PATCH 0 +#endif + +struct r600_multi_fence { + struct pipe_reference reference; + struct pipe_fence_handle *gfx; + struct pipe_fence_handle *sdma; + + /* If the context wasn't flushed at fence creation, this is non-NULL. */ + struct { + struct r600_common_context *ctx; + unsigned ib_index; + } gfx_unflushed; +}; + +/* + * shader binary helpers. + */ +void radeon_shader_binary_init(struct ac_shader_binary *b) +{ + memset(b, 0, sizeof(*b)); +} + +void radeon_shader_binary_clean(struct ac_shader_binary *b) +{ + if (!b) + return; + FREE(b->code); + FREE(b->config); + FREE(b->rodata); + FREE(b->global_symbol_offsets); + FREE(b->relocs); + FREE(b->disasm_string); + FREE(b->llvm_ir_string); +} + +/* + * pipe_context + */ + +/** + * Write an EOP event. + * + * \param event EVENT_TYPE_* + * \param event_flags Optional cache flush flags (TC) + * \param data_sel 1 = fence, 3 = timestamp + * \param buf Buffer + * \param va GPU address + * \param old_value Previous fence value (for a bug workaround) + * \param new_value Fence value to write for this event. + */ +void r600_gfx_write_event_eop(struct r600_common_context *ctx, + unsigned event, unsigned event_flags, + unsigned data_sel, + struct r600_resource *buf, uint64_t va, + uint32_t new_fence, unsigned query_type) +{ + struct radeon_winsys_cs *cs = ctx->gfx.cs; + unsigned op = EVENT_TYPE(event) | + EVENT_INDEX(5) | + event_flags; + unsigned sel = EOP_DATA_SEL(data_sel); + + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); + radeon_emit(cs, op); + radeon_emit(cs, va); + radeon_emit(cs, ((va >> 32) & 0xffff) | sel); + radeon_emit(cs, new_fence); /* immediate data */ + radeon_emit(cs, 0); /* unused */ + + if (buf) + r600_emit_reloc(ctx, &ctx->gfx, buf, RADEON_USAGE_WRITE, + RADEON_PRIO_QUERY); +} + +unsigned r600_gfx_write_fence_dwords(struct r600_common_screen *screen) +{ + unsigned dwords = 6; + + if (!screen->info.has_virtual_memory) + dwords += 2; + + return dwords; +} + +void r600_gfx_wait_fence(struct r600_common_context *ctx, + uint64_t va, uint32_t ref, uint32_t mask) +{ + struct radeon_winsys_cs *cs = ctx->gfx.cs; + + radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); + radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, ref); /* reference value */ + radeon_emit(cs, mask); /* mask */ + radeon_emit(cs, 4); /* poll interval */ +} + +void r600_draw_rectangle(struct blitter_context *blitter, + void *vertex_elements_cso, + blitter_get_vs_func get_vs, + int x1, int y1, int x2, int y2, + float depth, unsigned num_instances, + enum blitter_attrib_type type, + const union blitter_attrib *attrib) +{ + struct r600_common_context *rctx = + (struct r600_common_context*)util_blitter_get_pipe(blitter); + struct pipe_viewport_state viewport; + struct pipe_resource *buf = NULL; + unsigned offset = 0; + float *vb; + + rctx->b.bind_vertex_elements_state(&rctx->b, vertex_elements_cso); + rctx->b.bind_vs_state(&rctx->b, get_vs(blitter)); + + /* Some operations (like color resolve on r6xx) don't work + * with the conventional primitive types. + * One that works is PT_RECTLIST, which we use here. */ + + /* setup viewport */ + viewport.scale[0] = 1.0f; + viewport.scale[1] = 1.0f; + viewport.scale[2] = 1.0f; + viewport.translate[0] = 0.0f; + viewport.translate[1] = 0.0f; + viewport.translate[2] = 0.0f; + rctx->b.set_viewport_states(&rctx->b, 0, 1, &viewport); + + /* Upload vertices. The hw rectangle has only 3 vertices, + * The 4th one is derived from the first 3. + * The vertex specification should match u_blitter's vertex element state. */ + u_upload_alloc(rctx->b.stream_uploader, 0, sizeof(float) * 24, + rctx->screen->info.tcc_cache_line_size, + &offset, &buf, (void**)&vb); + if (!buf) + return; + + vb[0] = x1; + vb[1] = y1; + vb[2] = depth; + vb[3] = 1; + + vb[8] = x1; + vb[9] = y2; + vb[10] = depth; + vb[11] = 1; + + vb[16] = x2; + vb[17] = y1; + vb[18] = depth; + vb[19] = 1; + + switch (type) { + case UTIL_BLITTER_ATTRIB_COLOR: + memcpy(vb+4, attrib->color, sizeof(float)*4); + memcpy(vb+12, attrib->color, sizeof(float)*4); + memcpy(vb+20, attrib->color, sizeof(float)*4); + break; + case UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW: + case UTIL_BLITTER_ATTRIB_TEXCOORD_XY: + vb[6] = vb[14] = vb[22] = attrib->texcoord.z; + vb[7] = vb[15] = vb[23] = attrib->texcoord.w; + /* fall through */ + vb[4] = attrib->texcoord.x1; + vb[5] = attrib->texcoord.y1; + vb[12] = attrib->texcoord.x1; + vb[13] = attrib->texcoord.y2; + vb[20] = attrib->texcoord.x2; + vb[21] = attrib->texcoord.y1; + break; + default:; /* Nothing to do. */ + } + + /* draw */ + struct pipe_vertex_buffer vbuffer = {}; + vbuffer.buffer.resource = buf; + vbuffer.stride = 2 * 4 * sizeof(float); /* vertex size */ + vbuffer.buffer_offset = offset; + + rctx->b.set_vertex_buffers(&rctx->b, blitter->vb_slot, 1, &vbuffer); + util_draw_arrays_instanced(&rctx->b, R600_PRIM_RECTANGLE_LIST, 0, 3, + 0, num_instances); + pipe_resource_reference(&buf, NULL); +} + +static void r600_dma_emit_wait_idle(struct r600_common_context *rctx) +{ + struct radeon_winsys_cs *cs = rctx->dma.cs; + + if (rctx->chip_class >= EVERGREEN) + radeon_emit(cs, 0xf0000000); /* NOP */ + else { + /* TODO: R600-R700 should use the FENCE packet. + * CS checker support is required. */ + } +} + +void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw, + struct r600_resource *dst, struct r600_resource *src) +{ + uint64_t vram = ctx->dma.cs->used_vram; + uint64_t gtt = ctx->dma.cs->used_gart; + + if (dst) { + vram += dst->vram_usage; + gtt += dst->gart_usage; + } + if (src) { + vram += src->vram_usage; + gtt += src->gart_usage; + } + + /* Flush the GFX IB if DMA depends on it. */ + if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) && + ((dst && + ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, dst->buf, + RADEON_USAGE_READWRITE)) || + (src && + ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, src->buf, + RADEON_USAGE_WRITE)))) + ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + + /* Flush if there's not enough space, or if the memory usage per IB + * is too large. + * + * IBs using too little memory are limited by the IB submission overhead. + * IBs using too much memory are limited by the kernel/TTM overhead. + * Too long IBs create CPU-GPU pipeline bubbles and add latency. + * + * This heuristic makes sure that DMA requests are executed + * very soon after the call is made and lowers memory usage. + * It improves texture upload performance by keeping the DMA + * engine busy while uploads are being submitted. + */ + num_dw++; /* for emit_wait_idle below */ + if (!ctx->ws->cs_check_space(ctx->dma.cs, num_dw) || + ctx->dma.cs->used_vram + ctx->dma.cs->used_gart > 64 * 1024 * 1024 || + !radeon_cs_memory_below_limit(ctx->screen, ctx->dma.cs, vram, gtt)) { + ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + assert((num_dw + ctx->dma.cs->current.cdw) <= ctx->dma.cs->current.max_dw); + } + + /* Wait for idle if either buffer has been used in the IB before to + * prevent read-after-write hazards. + */ + if ((dst && + ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, dst->buf, + RADEON_USAGE_READWRITE)) || + (src && + ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, src->buf, + RADEON_USAGE_WRITE))) + r600_dma_emit_wait_idle(ctx); + + /* If GPUVM is not supported, the CS checker needs 2 entries + * in the buffer list per packet, which has to be done manually. + */ + if (ctx->screen->info.has_virtual_memory) { + if (dst) + radeon_add_to_buffer_list(ctx, &ctx->dma, dst, + RADEON_USAGE_WRITE, + RADEON_PRIO_SDMA_BUFFER); + if (src) + radeon_add_to_buffer_list(ctx, &ctx->dma, src, + RADEON_USAGE_READ, + RADEON_PRIO_SDMA_BUFFER); + } + + /* this function is called before all DMA calls, so increment this. */ + ctx->num_dma_calls++; +} + +static void r600_memory_barrier(struct pipe_context *ctx, unsigned flags) +{ +} + +void r600_preflush_suspend_features(struct r600_common_context *ctx) +{ + /* suspend queries */ + if (!LIST_IS_EMPTY(&ctx->active_queries)) + r600_suspend_queries(ctx); + + ctx->streamout.suspended = false; + if (ctx->streamout.begin_emitted) { + r600_emit_streamout_end(ctx); + ctx->streamout.suspended = true; + } +} + +void r600_postflush_resume_features(struct r600_common_context *ctx) +{ + if (ctx->streamout.suspended) { + ctx->streamout.append_bitmask = ctx->streamout.enabled_mask; + r600_streamout_buffers_dirty(ctx); + } + + /* resume queries */ + if (!LIST_IS_EMPTY(&ctx->active_queries)) + r600_resume_queries(ctx); +} + +static void r600_add_fence_dependency(struct r600_common_context *rctx, + struct pipe_fence_handle *fence) +{ + struct radeon_winsys *ws = rctx->ws; + + if (rctx->dma.cs) + ws->cs_add_fence_dependency(rctx->dma.cs, fence); + ws->cs_add_fence_dependency(rctx->gfx.cs, fence); +} + +static void r600_fence_server_sync(struct pipe_context *ctx, + struct pipe_fence_handle *fence) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + struct r600_multi_fence *rfence = (struct r600_multi_fence *)fence; + + /* Only amdgpu needs to handle fence dependencies (for fence imports). + * radeon synchronizes all rings by default and will not implement + * fence imports. + */ + if (rctx->screen->info.drm_major == 2) + return; + + /* Only imported fences need to be handled by fence_server_sync, + * because the winsys handles synchronizations automatically for BOs + * within the process. + * + * Simply skip unflushed fences here, and the winsys will drop no-op + * dependencies (i.e. dependencies within the same ring). + */ + if (rfence->gfx_unflushed.ctx) + return; + + /* All unflushed commands will not start execution before + * this fence dependency is signalled. + * + * Should we flush the context to allow more GPU parallelism? + */ + if (rfence->sdma) + r600_add_fence_dependency(rctx, rfence->sdma); + if (rfence->gfx) + r600_add_fence_dependency(rctx, rfence->gfx); +} + +static void r600_flush_from_st(struct pipe_context *ctx, + struct pipe_fence_handle **fence, + unsigned flags) +{ + struct pipe_screen *screen = ctx->screen; + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + struct radeon_winsys *ws = rctx->ws; + struct pipe_fence_handle *gfx_fence = NULL; + struct pipe_fence_handle *sdma_fence = NULL; + bool deferred_fence = false; + unsigned rflags = RADEON_FLUSH_ASYNC; + + if (flags & PIPE_FLUSH_END_OF_FRAME) + rflags |= RADEON_FLUSH_END_OF_FRAME; + + /* DMA IBs are preambles to gfx IBs, therefore must be flushed first. */ + if (rctx->dma.cs) + rctx->dma.flush(rctx, rflags, fence ? &sdma_fence : NULL); + + if (!radeon_emitted(rctx->gfx.cs, rctx->initial_gfx_cs_size)) { + if (fence) + ws->fence_reference(&gfx_fence, rctx->last_gfx_fence); + if (!(flags & PIPE_FLUSH_DEFERRED)) + ws->cs_sync_flush(rctx->gfx.cs); + } else { + /* Instead of flushing, create a deferred fence. Constraints: + * - The state tracker must allow a deferred flush. + * - The state tracker must request a fence. + * Thread safety in fence_finish must be ensured by the state tracker. + */ + if (flags & PIPE_FLUSH_DEFERRED && fence) { + gfx_fence = rctx->ws->cs_get_next_fence(rctx->gfx.cs); + deferred_fence = true; + } else { + rctx->gfx.flush(rctx, rflags, fence ? &gfx_fence : NULL); + } + } + + /* Both engines can signal out of order, so we need to keep both fences. */ + if (fence) { + struct r600_multi_fence *multi_fence = + CALLOC_STRUCT(r600_multi_fence); + if (!multi_fence) { + ws->fence_reference(&sdma_fence, NULL); + ws->fence_reference(&gfx_fence, NULL); + goto finish; + } + + multi_fence->reference.count = 1; + /* If both fences are NULL, fence_finish will always return true. */ + multi_fence->gfx = gfx_fence; + multi_fence->sdma = sdma_fence; + + if (deferred_fence) { + multi_fence->gfx_unflushed.ctx = rctx; + multi_fence->gfx_unflushed.ib_index = rctx->num_gfx_cs_flushes; + } + + screen->fence_reference(screen, fence, NULL); + *fence = (struct pipe_fence_handle*)multi_fence; + } +finish: + if (!(flags & PIPE_FLUSH_DEFERRED)) { + if (rctx->dma.cs) + ws->cs_sync_flush(rctx->dma.cs); + ws->cs_sync_flush(rctx->gfx.cs); + } +} + +static void r600_flush_dma_ring(void *ctx, unsigned flags, + struct pipe_fence_handle **fence) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + struct radeon_winsys_cs *cs = rctx->dma.cs; + struct radeon_saved_cs saved; + bool check_vm = + (rctx->screen->debug_flags & DBG_CHECK_VM) && + rctx->check_vm_faults; + + if (!radeon_emitted(cs, 0)) { + if (fence) + rctx->ws->fence_reference(fence, rctx->last_sdma_fence); + return; + } + + if (check_vm) + radeon_save_cs(rctx->ws, cs, &saved, true); + + rctx->ws->cs_flush(cs, flags, &rctx->last_sdma_fence); + if (fence) + rctx->ws->fence_reference(fence, rctx->last_sdma_fence); + + if (check_vm) { + /* Use conservative timeout 800ms, after which we won't wait any + * longer and assume the GPU is hung. + */ + rctx->ws->fence_wait(rctx->ws, rctx->last_sdma_fence, 800*1000*1000); + + rctx->check_vm_faults(rctx, &saved, RING_DMA); + radeon_clear_saved_cs(&saved); + } +} + +/** + * Store a linearized copy of all chunks of \p cs together with the buffer + * list in \p saved. + */ +void radeon_save_cs(struct radeon_winsys *ws, struct radeon_winsys_cs *cs, + struct radeon_saved_cs *saved, bool get_buffer_list) +{ + uint32_t *buf; + unsigned i; + + /* Save the IB chunks. */ + saved->num_dw = cs->prev_dw + cs->current.cdw; + saved->ib = MALLOC(4 * saved->num_dw); + if (!saved->ib) + goto oom; + + buf = saved->ib; + for (i = 0; i < cs->num_prev; ++i) { + memcpy(buf, cs->prev[i].buf, cs->prev[i].cdw * 4); + buf += cs->prev[i].cdw; + } + memcpy(buf, cs->current.buf, cs->current.cdw * 4); + + if (!get_buffer_list) + return; + + /* Save the buffer list. */ + saved->bo_count = ws->cs_get_buffer_list(cs, NULL); + saved->bo_list = CALLOC(saved->bo_count, + sizeof(saved->bo_list[0])); + if (!saved->bo_list) { + FREE(saved->ib); + goto oom; + } + ws->cs_get_buffer_list(cs, saved->bo_list); + + return; + +oom: + fprintf(stderr, "%s: out of memory\n", __func__); + memset(saved, 0, sizeof(*saved)); +} + +void radeon_clear_saved_cs(struct radeon_saved_cs *saved) +{ + FREE(saved->ib); + FREE(saved->bo_list); + + memset(saved, 0, sizeof(*saved)); +} + +static enum pipe_reset_status r600_get_reset_status(struct pipe_context *ctx) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + unsigned latest = rctx->ws->query_value(rctx->ws, + RADEON_GPU_RESET_COUNTER); + + if (rctx->gpu_reset_counter == latest) + return PIPE_NO_RESET; + + rctx->gpu_reset_counter = latest; + return PIPE_UNKNOWN_CONTEXT_RESET; +} + +static void r600_set_debug_callback(struct pipe_context *ctx, + const struct pipe_debug_callback *cb) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + + if (cb) + rctx->debug = *cb; + else + memset(&rctx->debug, 0, sizeof(rctx->debug)); +} + +static void r600_set_device_reset_callback(struct pipe_context *ctx, + const struct pipe_device_reset_callback *cb) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + + if (cb) + rctx->device_reset_callback = *cb; + else + memset(&rctx->device_reset_callback, 0, + sizeof(rctx->device_reset_callback)); +} + +bool r600_check_device_reset(struct r600_common_context *rctx) +{ + enum pipe_reset_status status; + + if (!rctx->device_reset_callback.reset) + return false; + + if (!rctx->b.get_device_reset_status) + return false; + + status = rctx->b.get_device_reset_status(&rctx->b); + if (status == PIPE_NO_RESET) + return false; + + rctx->device_reset_callback.reset(rctx->device_reset_callback.data, status); + return true; +} + +static void r600_dma_clear_buffer_fallback(struct pipe_context *ctx, + struct pipe_resource *dst, + uint64_t offset, uint64_t size, + unsigned value) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + + rctx->clear_buffer(ctx, dst, offset, size, value, R600_COHERENCY_NONE); +} + +static bool r600_resource_commit(struct pipe_context *pctx, + struct pipe_resource *resource, + unsigned level, struct pipe_box *box, + bool commit) +{ + struct r600_common_context *ctx = (struct r600_common_context *)pctx; + struct r600_resource *res = r600_resource(resource); + + /* + * Since buffer commitment changes cannot be pipelined, we need to + * (a) flush any pending commands that refer to the buffer we're about + * to change, and + * (b) wait for threaded submit to finish, including those that were + * triggered by some other, earlier operation. + */ + if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) && + ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, + res->buf, RADEON_USAGE_READWRITE)) { + ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + } + if (radeon_emitted(ctx->dma.cs, 0) && + ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, + res->buf, RADEON_USAGE_READWRITE)) { + ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + } + + ctx->ws->cs_sync_flush(ctx->dma.cs); + ctx->ws->cs_sync_flush(ctx->gfx.cs); + + assert(resource->target == PIPE_BUFFER); + + return ctx->ws->buffer_commit(res->buf, box->x, box->width, commit); +} + +bool r600_common_context_init(struct r600_common_context *rctx, + struct r600_common_screen *rscreen, + unsigned context_flags) +{ + slab_create_child(&rctx->pool_transfers, &rscreen->pool_transfers); + slab_create_child(&rctx->pool_transfers_unsync, &rscreen->pool_transfers); + + rctx->screen = rscreen; + rctx->ws = rscreen->ws; + rctx->family = rscreen->family; + rctx->chip_class = rscreen->chip_class; + + rctx->b.invalidate_resource = r600_invalidate_resource; + rctx->b.resource_commit = r600_resource_commit; + rctx->b.transfer_map = u_transfer_map_vtbl; + rctx->b.transfer_flush_region = u_transfer_flush_region_vtbl; + rctx->b.transfer_unmap = u_transfer_unmap_vtbl; + rctx->b.texture_subdata = u_default_texture_subdata; + rctx->b.memory_barrier = r600_memory_barrier; + rctx->b.flush = r600_flush_from_st; + rctx->b.set_debug_callback = r600_set_debug_callback; + rctx->b.fence_server_sync = r600_fence_server_sync; + rctx->dma_clear_buffer = r600_dma_clear_buffer_fallback; + + /* evergreen_compute.c has a special codepath for global buffers. + * Everything else can use the direct path. + */ + if ((rscreen->chip_class == EVERGREEN || rscreen->chip_class == CAYMAN) && + (context_flags & PIPE_CONTEXT_COMPUTE_ONLY)) + rctx->b.buffer_subdata = u_default_buffer_subdata; + else + rctx->b.buffer_subdata = r600_buffer_subdata; + + if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 43) { + rctx->b.get_device_reset_status = r600_get_reset_status; + rctx->gpu_reset_counter = + rctx->ws->query_value(rctx->ws, + RADEON_GPU_RESET_COUNTER); + } + + rctx->b.set_device_reset_callback = r600_set_device_reset_callback; + + r600_init_context_texture_functions(rctx); + r600_init_viewport_functions(rctx); + r600_streamout_init(rctx); + r600_query_init(rctx); + cayman_init_msaa(&rctx->b); + + rctx->allocator_zeroed_memory = + u_suballocator_create(&rctx->b, rscreen->info.gart_page_size, + 0, PIPE_USAGE_DEFAULT, 0, true); + if (!rctx->allocator_zeroed_memory) + return false; + + rctx->b.stream_uploader = u_upload_create(&rctx->b, 1024 * 1024, + 0, PIPE_USAGE_STREAM); + if (!rctx->b.stream_uploader) + return false; + + rctx->b.const_uploader = u_upload_create(&rctx->b, 128 * 1024, + 0, PIPE_USAGE_DEFAULT); + if (!rctx->b.const_uploader) + return false; + + rctx->ctx = rctx->ws->ctx_create(rctx->ws); + if (!rctx->ctx) + return false; + + if (rscreen->info.num_sdma_rings && !(rscreen->debug_flags & DBG_NO_ASYNC_DMA)) { + rctx->dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA, + r600_flush_dma_ring, + rctx); + rctx->dma.flush = r600_flush_dma_ring; + } + + return true; +} + +void r600_common_context_cleanup(struct r600_common_context *rctx) +{ + if (rctx->query_result_shader) + rctx->b.delete_compute_state(&rctx->b, rctx->query_result_shader); + + if (rctx->gfx.cs) + rctx->ws->cs_destroy(rctx->gfx.cs); + if (rctx->dma.cs) + rctx->ws->cs_destroy(rctx->dma.cs); + if (rctx->ctx) + rctx->ws->ctx_destroy(rctx->ctx); + + if (rctx->b.stream_uploader) + u_upload_destroy(rctx->b.stream_uploader); + if (rctx->b.const_uploader) + u_upload_destroy(rctx->b.const_uploader); + + slab_destroy_child(&rctx->pool_transfers); + slab_destroy_child(&rctx->pool_transfers_unsync); + + if (rctx->allocator_zeroed_memory) { + u_suballocator_destroy(rctx->allocator_zeroed_memory); + } + rctx->ws->fence_reference(&rctx->last_gfx_fence, NULL); + rctx->ws->fence_reference(&rctx->last_sdma_fence, NULL); + r600_resource_reference(&rctx->eop_bug_scratch, NULL); +} + +/* + * pipe_screen + */ + +static const struct debug_named_value common_debug_options[] = { + /* logging */ + { "tex", DBG_TEX, "Print texture info" }, + { "nir", DBG_NIR, "Enable experimental NIR shaders" }, + { "compute", DBG_COMPUTE, "Print compute info" }, + { "vm", DBG_VM, "Print virtual addresses when creating resources" }, + { "info", DBG_INFO, "Print driver information" }, + + /* shaders */ + { "fs", DBG_FS, "Print fetch shaders" }, + { "vs", DBG_VS, "Print vertex shaders" }, + { "gs", DBG_GS, "Print geometry shaders" }, + { "ps", DBG_PS, "Print pixel shaders" }, + { "cs", DBG_CS, "Print compute shaders" }, + { "tcs", DBG_TCS, "Print tessellation control shaders" }, + { "tes", DBG_TES, "Print tessellation evaluation shaders" }, + { "noir", DBG_NO_IR, "Don't print the LLVM IR"}, + { "notgsi", DBG_NO_TGSI, "Don't print the TGSI"}, + { "noasm", DBG_NO_ASM, "Don't print disassembled shaders"}, + { "preoptir", DBG_PREOPT_IR, "Print the LLVM IR before initial optimizations" }, + { "checkir", DBG_CHECK_IR, "Enable additional sanity checks on shader IR" }, + { "nooptvariant", DBG_NO_OPT_VARIANT, "Disable compiling optimized shader variants." }, + + { "testdma", DBG_TEST_DMA, "Invoke SDMA tests and exit." }, + { "testvmfaultcp", DBG_TEST_VMFAULT_CP, "Invoke a CP VM fault test and exit." }, + { "testvmfaultsdma", DBG_TEST_VMFAULT_SDMA, "Invoke a SDMA VM fault test and exit." }, + { "testvmfaultshader", DBG_TEST_VMFAULT_SHADER, "Invoke a shader VM fault test and exit." }, + + /* features */ + { "nodma", DBG_NO_ASYNC_DMA, "Disable asynchronous DMA" }, + { "nohyperz", DBG_NO_HYPERZ, "Disable Hyper-Z" }, + /* GL uses the word INVALIDATE, gallium uses the word DISCARD */ + { "noinvalrange", DBG_NO_DISCARD_RANGE, "Disable handling of INVALIDATE_RANGE map flags" }, + { "no2d", DBG_NO_2D_TILING, "Disable 2D tiling" }, + { "notiling", DBG_NO_TILING, "Disable tiling" }, + { "switch_on_eop", DBG_SWITCH_ON_EOP, "Program WD/IA to switch on end-of-packet." }, + { "forcedma", DBG_FORCE_DMA, "Use asynchronous DMA for all operations when possible." }, + { "precompile", DBG_PRECOMPILE, "Compile one shader variant at shader creation." }, + { "nowc", DBG_NO_WC, "Disable GTT write combining" }, + { "check_vm", DBG_CHECK_VM, "Check VM faults and dump debug info." }, + { "unsafemath", DBG_UNSAFE_MATH, "Enable unsafe math shader optimizations" }, + + DEBUG_NAMED_VALUE_END /* must be last */ +}; + +static const char* r600_get_vendor(struct pipe_screen* pscreen) +{ + return "X.Org"; +} + +static const char* r600_get_device_vendor(struct pipe_screen* pscreen) +{ + return "AMD"; +} + +static const char *r600_get_marketing_name(struct radeon_winsys *ws) +{ + if (!ws->get_chip_name) + return NULL; + return ws->get_chip_name(ws); +} + +static const char *r600_get_family_name(const struct r600_common_screen *rscreen) +{ + switch (rscreen->info.family) { + case CHIP_R600: return "AMD R600"; + case CHIP_RV610: return "AMD RV610"; + case CHIP_RV630: return "AMD RV630"; + case CHIP_RV670: return "AMD RV670"; + case CHIP_RV620: return "AMD RV620"; + case CHIP_RV635: return "AMD RV635"; + case CHIP_RS780: return "AMD RS780"; + case CHIP_RS880: return "AMD RS880"; + case CHIP_RV770: return "AMD RV770"; + case CHIP_RV730: return "AMD RV730"; + case CHIP_RV710: return "AMD RV710"; + case CHIP_RV740: return "AMD RV740"; + case CHIP_CEDAR: return "AMD CEDAR"; + case CHIP_REDWOOD: return "AMD REDWOOD"; + case CHIP_JUNIPER: return "AMD JUNIPER"; + case CHIP_CYPRESS: return "AMD CYPRESS"; + case CHIP_HEMLOCK: return "AMD HEMLOCK"; + case CHIP_PALM: return "AMD PALM"; + case CHIP_SUMO: return "AMD SUMO"; + case CHIP_SUMO2: return "AMD SUMO2"; + case CHIP_BARTS: return "AMD BARTS"; + case CHIP_TURKS: return "AMD TURKS"; + case CHIP_CAICOS: return "AMD CAICOS"; + case CHIP_CAYMAN: return "AMD CAYMAN"; + case CHIP_ARUBA: return "AMD ARUBA"; + default: return "AMD unknown"; + } +} + +static void r600_disk_cache_create(struct r600_common_screen *rscreen) +{ + /* Don't use the cache if shader dumping is enabled. */ + if (rscreen->debug_flags & DBG_ALL_SHADERS) + return; + + uint32_t mesa_timestamp; + if (disk_cache_get_function_timestamp(r600_disk_cache_create, + &mesa_timestamp)) { + char *timestamp_str; + int res = -1; + + res = asprintf(×tamp_str, "%u",mesa_timestamp); + if (res != -1) { + /* These flags affect shader compilation. */ + uint64_t shader_debug_flags = + rscreen->debug_flags & + (DBG_FS_CORRECT_DERIVS_AFTER_KILL | + DBG_UNSAFE_MATH); + + rscreen->disk_shader_cache = + disk_cache_create(r600_get_family_name(rscreen), + timestamp_str, + shader_debug_flags); + free(timestamp_str); + } + } +} + +static struct disk_cache *r600_get_disk_shader_cache(struct pipe_screen *pscreen) +{ + struct r600_common_screen *rscreen = (struct r600_common_screen*)pscreen; + return rscreen->disk_shader_cache; +} + +static const char* r600_get_name(struct pipe_screen* pscreen) +{ + struct r600_common_screen *rscreen = (struct r600_common_screen*)pscreen; + + return rscreen->renderer_string; +} + +static float r600_get_paramf(struct pipe_screen* pscreen, + enum pipe_capf param) +{ + struct r600_common_screen *rscreen = (struct r600_common_screen *)pscreen; + + switch (param) { + case PIPE_CAPF_MAX_LINE_WIDTH: + case PIPE_CAPF_MAX_LINE_WIDTH_AA: + case PIPE_CAPF_MAX_POINT_WIDTH: + case PIPE_CAPF_MAX_POINT_WIDTH_AA: + if (rscreen->family >= CHIP_CEDAR) + return 16384.0f; + else + return 8192.0f; + case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY: + return 16.0f; + case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS: + return 16.0f; + case PIPE_CAPF_GUARD_BAND_LEFT: + case PIPE_CAPF_GUARD_BAND_TOP: + case PIPE_CAPF_GUARD_BAND_RIGHT: + case PIPE_CAPF_GUARD_BAND_BOTTOM: + return 0.0f; + } + return 0.0f; +} + +static int r600_get_video_param(struct pipe_screen *screen, + enum pipe_video_profile profile, + enum pipe_video_entrypoint entrypoint, + enum pipe_video_cap param) +{ + switch (param) { + case PIPE_VIDEO_CAP_SUPPORTED: + return vl_profile_supported(screen, profile, entrypoint); + case PIPE_VIDEO_CAP_NPOT_TEXTURES: + return 1; + case PIPE_VIDEO_CAP_MAX_WIDTH: + case PIPE_VIDEO_CAP_MAX_HEIGHT: + return vl_video_buffer_max_size(screen); + case PIPE_VIDEO_CAP_PREFERED_FORMAT: + return PIPE_FORMAT_NV12; + case PIPE_VIDEO_CAP_PREFERS_INTERLACED: + return false; + case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: + return false; + case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE: + return true; + case PIPE_VIDEO_CAP_MAX_LEVEL: + return vl_level_supported(screen, profile); + default: + return 0; + } +} + +const char *r600_get_llvm_processor_name(enum radeon_family family) +{ + switch (family) { + case CHIP_R600: + case CHIP_RV630: + case CHIP_RV635: + case CHIP_RV670: + return "r600"; + case CHIP_RV610: + case CHIP_RV620: + case CHIP_RS780: + case CHIP_RS880: + return "rs880"; + case CHIP_RV710: + return "rv710"; + case CHIP_RV730: + return "rv730"; + case CHIP_RV740: + case CHIP_RV770: + return "rv770"; + case CHIP_PALM: + case CHIP_CEDAR: + return "cedar"; + case CHIP_SUMO: + case CHIP_SUMO2: + return "sumo"; + case CHIP_REDWOOD: + return "redwood"; + case CHIP_JUNIPER: + return "juniper"; + case CHIP_HEMLOCK: + case CHIP_CYPRESS: + return "cypress"; + case CHIP_BARTS: + return "barts"; + case CHIP_TURKS: + return "turks"; + case CHIP_CAICOS: + return "caicos"; + case CHIP_CAYMAN: + case CHIP_ARUBA: + return "cayman"; + + default: + return ""; + } +} + +static unsigned get_max_threads_per_block(struct r600_common_screen *screen, + enum pipe_shader_ir ir_type) +{ + return 256; +} + +static int r600_get_compute_param(struct pipe_screen *screen, + enum pipe_shader_ir ir_type, + enum pipe_compute_cap param, + void *ret) +{ + struct r600_common_screen *rscreen = (struct r600_common_screen *)screen; + + //TODO: select these params by asic + switch (param) { + case PIPE_COMPUTE_CAP_IR_TARGET: { + const char *gpu; + const char *triple = "r600--"; + gpu = r600_get_llvm_processor_name(rscreen->family); + if (ret) { + sprintf(ret, "%s-%s", gpu, triple); + } + /* +2 for dash and terminating NIL byte */ + return (strlen(triple) + strlen(gpu) + 2) * sizeof(char); + } + case PIPE_COMPUTE_CAP_GRID_DIMENSION: + if (ret) { + uint64_t *grid_dimension = ret; + grid_dimension[0] = 3; + } + return 1 * sizeof(uint64_t); + + case PIPE_COMPUTE_CAP_MAX_GRID_SIZE: + if (ret) { + uint64_t *grid_size = ret; + grid_size[0] = 65535; + grid_size[1] = 65535; + grid_size[2] = 65535; + } + return 3 * sizeof(uint64_t) ; + + case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE: + if (ret) { + uint64_t *block_size = ret; + unsigned threads_per_block = get_max_threads_per_block(rscreen, ir_type); + block_size[0] = threads_per_block; + block_size[1] = threads_per_block; + block_size[2] = threads_per_block; + } + return 3 * sizeof(uint64_t); + + case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK: + if (ret) { + uint64_t *max_threads_per_block = ret; + *max_threads_per_block = get_max_threads_per_block(rscreen, ir_type); + } + return sizeof(uint64_t); + case PIPE_COMPUTE_CAP_ADDRESS_BITS: + if (ret) { + uint32_t *address_bits = ret; + address_bits[0] = 32; + } + return 1 * sizeof(uint32_t); + + case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE: + if (ret) { + uint64_t *max_global_size = ret; + uint64_t max_mem_alloc_size; + + r600_get_compute_param(screen, ir_type, + PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE, + &max_mem_alloc_size); + + /* In OpenCL, the MAX_MEM_ALLOC_SIZE must be at least + * 1/4 of the MAX_GLOBAL_SIZE. Since the + * MAX_MEM_ALLOC_SIZE is fixed for older kernels, + * make sure we never report more than + * 4 * MAX_MEM_ALLOC_SIZE. + */ + *max_global_size = MIN2(4 * max_mem_alloc_size, + MAX2(rscreen->info.gart_size, + rscreen->info.vram_size)); + } + return sizeof(uint64_t); + + case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE: + if (ret) { + uint64_t *max_local_size = ret; + /* Value reported by the closed source driver. */ + *max_local_size = 32768; + } + return sizeof(uint64_t); + + case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: + if (ret) { + uint64_t *max_input_size = ret; + /* Value reported by the closed source driver. */ + *max_input_size = 1024; + } + return sizeof(uint64_t); + + case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE: + if (ret) { + uint64_t *max_mem_alloc_size = ret; + + *max_mem_alloc_size = rscreen->info.max_alloc_size; + } + return sizeof(uint64_t); + + case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY: + if (ret) { + uint32_t *max_clock_frequency = ret; + *max_clock_frequency = rscreen->info.max_shader_clock; + } + return sizeof(uint32_t); + + case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS: + if (ret) { + uint32_t *max_compute_units = ret; + *max_compute_units = rscreen->info.num_good_compute_units; + } + return sizeof(uint32_t); + + case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED: + if (ret) { + uint32_t *images_supported = ret; + *images_supported = 0; + } + return sizeof(uint32_t); + case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: + break; /* unused */ + case PIPE_COMPUTE_CAP_SUBGROUP_SIZE: + if (ret) { + uint32_t *subgroup_size = ret; + *subgroup_size = r600_wavefront_size(rscreen->family); + } + return sizeof(uint32_t); + case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK: + if (ret) { + uint64_t *max_variable_threads_per_block = ret; + *max_variable_threads_per_block = 0; + } + return sizeof(uint64_t); + } + + fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param); + return 0; +} + +static uint64_t r600_get_timestamp(struct pipe_screen *screen) +{ + struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; + + return 1000000 * rscreen->ws->query_value(rscreen->ws, RADEON_TIMESTAMP) / + rscreen->info.clock_crystal_freq; +} + +static void r600_fence_reference(struct pipe_screen *screen, + struct pipe_fence_handle **dst, + struct pipe_fence_handle *src) +{ + struct radeon_winsys *ws = ((struct r600_common_screen*)screen)->ws; + struct r600_multi_fence **rdst = (struct r600_multi_fence **)dst; + struct r600_multi_fence *rsrc = (struct r600_multi_fence *)src; + + if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) { + ws->fence_reference(&(*rdst)->gfx, NULL); + ws->fence_reference(&(*rdst)->sdma, NULL); + FREE(*rdst); + } + *rdst = rsrc; +} + +static boolean r600_fence_finish(struct pipe_screen *screen, + struct pipe_context *ctx, + struct pipe_fence_handle *fence, + uint64_t timeout) +{ + struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws; + struct r600_multi_fence *rfence = (struct r600_multi_fence *)fence; + struct r600_common_context *rctx; + int64_t abs_timeout = os_time_get_absolute_timeout(timeout); + + ctx = threaded_context_unwrap_sync(ctx); + rctx = ctx ? (struct r600_common_context*)ctx : NULL; + + if (rfence->sdma) { + if (!rws->fence_wait(rws, rfence->sdma, timeout)) + return false; + + /* Recompute the timeout after waiting. */ + if (timeout && timeout != PIPE_TIMEOUT_INFINITE) { + int64_t time = os_time_get_nano(); + timeout = abs_timeout > time ? abs_timeout - time : 0; + } + } + + if (!rfence->gfx) + return true; + + /* Flush the gfx IB if it hasn't been flushed yet. */ + if (rctx && + rfence->gfx_unflushed.ctx == rctx && + rfence->gfx_unflushed.ib_index == rctx->num_gfx_cs_flushes) { + rctx->gfx.flush(rctx, timeout ? 0 : RADEON_FLUSH_ASYNC, NULL); + rfence->gfx_unflushed.ctx = NULL; + + if (!timeout) + return false; + + /* Recompute the timeout after all that. */ + if (timeout && timeout != PIPE_TIMEOUT_INFINITE) { + int64_t time = os_time_get_nano(); + timeout = abs_timeout > time ? abs_timeout - time : 0; + } + } + + return rws->fence_wait(rws, rfence->gfx, timeout); +} + +static void r600_query_memory_info(struct pipe_screen *screen, + struct pipe_memory_info *info) +{ + struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; + struct radeon_winsys *ws = rscreen->ws; + unsigned vram_usage, gtt_usage; + + info->total_device_memory = rscreen->info.vram_size / 1024; + info->total_staging_memory = rscreen->info.gart_size / 1024; + + /* The real TTM memory usage is somewhat random, because: + * + * 1) TTM delays freeing memory, because it can only free it after + * fences expire. + * + * 2) The memory usage can be really low if big VRAM evictions are + * taking place, but the real usage is well above the size of VRAM. + * + * Instead, return statistics of this process. + */ + vram_usage = ws->query_value(ws, RADEON_REQUESTED_VRAM_MEMORY) / 1024; + gtt_usage = ws->query_value(ws, RADEON_REQUESTED_GTT_MEMORY) / 1024; + + info->avail_device_memory = + vram_usage <= info->total_device_memory ? + info->total_device_memory - vram_usage : 0; + info->avail_staging_memory = + gtt_usage <= info->total_staging_memory ? + info->total_staging_memory - gtt_usage : 0; + + info->device_memory_evicted = + ws->query_value(ws, RADEON_NUM_BYTES_MOVED) / 1024; + + if (rscreen->info.drm_major == 3 && rscreen->info.drm_minor >= 4) + info->nr_device_memory_evictions = + ws->query_value(ws, RADEON_NUM_EVICTIONS); + else + /* Just return the number of evicted 64KB pages. */ + info->nr_device_memory_evictions = info->device_memory_evicted / 64; +} + +struct pipe_resource *r600_resource_create_common(struct pipe_screen *screen, + const struct pipe_resource *templ) +{ + if (templ->target == PIPE_BUFFER) { + return r600_buffer_create(screen, templ, 256); + } else { + return r600_texture_create(screen, templ); + } +} + +bool r600_common_screen_init(struct r600_common_screen *rscreen, + struct radeon_winsys *ws) +{ + char family_name[32] = {}, llvm_string[32] = {}, kernel_version[128] = {}; + struct utsname uname_data; + const char *chip_name; + + ws->query_info(ws, &rscreen->info); + rscreen->ws = ws; + + if ((chip_name = r600_get_marketing_name(ws))) + snprintf(family_name, sizeof(family_name), "%s / ", + r600_get_family_name(rscreen) + 4); + else + chip_name = r600_get_family_name(rscreen); + + if (uname(&uname_data) == 0) + snprintf(kernel_version, sizeof(kernel_version), + " / %s", uname_data.release); + + if (HAVE_LLVM > 0) { + snprintf(llvm_string, sizeof(llvm_string), + ", LLVM %i.%i.%i", (HAVE_LLVM >> 8) & 0xff, + HAVE_LLVM & 0xff, MESA_LLVM_VERSION_PATCH); + } + + snprintf(rscreen->renderer_string, sizeof(rscreen->renderer_string), + "%s (%sDRM %i.%i.%i%s%s)", + chip_name, family_name, rscreen->info.drm_major, + rscreen->info.drm_minor, rscreen->info.drm_patchlevel, + kernel_version, llvm_string); + + rscreen->b.get_name = r600_get_name; + rscreen->b.get_vendor = r600_get_vendor; + rscreen->b.get_device_vendor = r600_get_device_vendor; + rscreen->b.get_disk_shader_cache = r600_get_disk_shader_cache; + rscreen->b.get_compute_param = r600_get_compute_param; + rscreen->b.get_paramf = r600_get_paramf; + rscreen->b.get_timestamp = r600_get_timestamp; + rscreen->b.fence_finish = r600_fence_finish; + rscreen->b.fence_reference = r600_fence_reference; + rscreen->b.resource_destroy = u_resource_destroy_vtbl; + rscreen->b.resource_from_user_memory = r600_buffer_from_user_memory; + rscreen->b.query_memory_info = r600_query_memory_info; + + if (rscreen->info.has_hw_decode) { + rscreen->b.get_video_param = rvid_get_video_param; + rscreen->b.is_video_format_supported = rvid_is_format_supported; + } else { + rscreen->b.get_video_param = r600_get_video_param; + rscreen->b.is_video_format_supported = vl_video_buffer_is_format_supported; + } + + r600_init_screen_texture_functions(rscreen); + r600_init_screen_query_functions(rscreen); + + rscreen->family = rscreen->info.family; + rscreen->chip_class = rscreen->info.chip_class; + rscreen->debug_flags |= debug_get_flags_option("R600_DEBUG", common_debug_options, 0); + + r600_disk_cache_create(rscreen); + + slab_create_parent(&rscreen->pool_transfers, sizeof(struct r600_transfer), 64); + + rscreen->force_aniso = MIN2(16, debug_get_num_option("R600_TEX_ANISO", -1)); + if (rscreen->force_aniso >= 0) { + printf("radeon: Forcing anisotropy filter to %ix\n", + /* round down to a power of two */ + 1 << util_logbase2(rscreen->force_aniso)); + } + + (void) mtx_init(&rscreen->aux_context_lock, mtx_plain); + (void) mtx_init(&rscreen->gpu_load_mutex, mtx_plain); + + if (rscreen->debug_flags & DBG_INFO) { + printf("pci (domain:bus:dev.func): %04x:%02x:%02x.%x\n", + rscreen->info.pci_domain, rscreen->info.pci_bus, + rscreen->info.pci_dev, rscreen->info.pci_func); + printf("pci_id = 0x%x\n", rscreen->info.pci_id); + printf("family = %i (%s)\n", rscreen->info.family, + r600_get_family_name(rscreen)); + printf("chip_class = %i\n", rscreen->info.chip_class); + printf("pte_fragment_size = %u\n", rscreen->info.pte_fragment_size); + printf("gart_page_size = %u\n", rscreen->info.gart_page_size); + printf("gart_size = %i MB\n", (int)DIV_ROUND_UP(rscreen->info.gart_size, 1024*1024)); + printf("vram_size = %i MB\n", (int)DIV_ROUND_UP(rscreen->info.vram_size, 1024*1024)); + printf("vram_vis_size = %i MB\n", (int)DIV_ROUND_UP(rscreen->info.vram_vis_size, 1024*1024)); + printf("max_alloc_size = %i MB\n", + (int)DIV_ROUND_UP(rscreen->info.max_alloc_size, 1024*1024)); + printf("min_alloc_size = %u\n", rscreen->info.min_alloc_size); + printf("has_dedicated_vram = %u\n", rscreen->info.has_dedicated_vram); + printf("has_virtual_memory = %i\n", rscreen->info.has_virtual_memory); + printf("gfx_ib_pad_with_type2 = %i\n", rscreen->info.gfx_ib_pad_with_type2); + printf("has_hw_decode = %u\n", rscreen->info.has_hw_decode); + printf("num_sdma_rings = %i\n", rscreen->info.num_sdma_rings); + printf("num_compute_rings = %u\n", rscreen->info.num_compute_rings); + printf("uvd_fw_version = %u\n", rscreen->info.uvd_fw_version); + printf("vce_fw_version = %u\n", rscreen->info.vce_fw_version); + printf("me_fw_version = %i\n", rscreen->info.me_fw_version); + printf("pfp_fw_version = %i\n", rscreen->info.pfp_fw_version); + printf("ce_fw_version = %i\n", rscreen->info.ce_fw_version); + printf("vce_harvest_config = %i\n", rscreen->info.vce_harvest_config); + printf("clock_crystal_freq = %i\n", rscreen->info.clock_crystal_freq); + printf("tcc_cache_line_size = %u\n", rscreen->info.tcc_cache_line_size); + printf("drm = %i.%i.%i\n", rscreen->info.drm_major, + rscreen->info.drm_minor, rscreen->info.drm_patchlevel); + printf("has_userptr = %i\n", rscreen->info.has_userptr); + printf("has_syncobj = %u\n", rscreen->info.has_syncobj); + + printf("r600_max_quad_pipes = %i\n", rscreen->info.r600_max_quad_pipes); + printf("max_shader_clock = %i\n", rscreen->info.max_shader_clock); + printf("num_good_compute_units = %i\n", rscreen->info.num_good_compute_units); + printf("max_se = %i\n", rscreen->info.max_se); + printf("max_sh_per_se = %i\n", rscreen->info.max_sh_per_se); + + printf("r600_gb_backend_map = %i\n", rscreen->info.r600_gb_backend_map); + printf("r600_gb_backend_map_valid = %i\n", rscreen->info.r600_gb_backend_map_valid); + printf("r600_num_banks = %i\n", rscreen->info.r600_num_banks); + printf("num_render_backends = %i\n", rscreen->info.num_render_backends); + printf("num_tile_pipes = %i\n", rscreen->info.num_tile_pipes); + printf("pipe_interleave_bytes = %i\n", rscreen->info.pipe_interleave_bytes); + printf("enabled_rb_mask = 0x%x\n", rscreen->info.enabled_rb_mask); + printf("max_alignment = %u\n", (unsigned)rscreen->info.max_alignment); + } + return true; +} + +void r600_destroy_common_screen(struct r600_common_screen *rscreen) +{ + r600_perfcounters_destroy(rscreen); + r600_gpu_load_kill_thread(rscreen); + + mtx_destroy(&rscreen->gpu_load_mutex); + mtx_destroy(&rscreen->aux_context_lock); + rscreen->aux_context->destroy(rscreen->aux_context); + + slab_destroy_parent(&rscreen->pool_transfers); + + disk_cache_destroy(rscreen->disk_shader_cache); + rscreen->ws->destroy(rscreen->ws); + FREE(rscreen); +} + +bool r600_can_dump_shader(struct r600_common_screen *rscreen, + unsigned processor) +{ + return rscreen->debug_flags & (1 << processor); +} + +bool r600_extra_shader_checks(struct r600_common_screen *rscreen, unsigned processor) +{ + return (rscreen->debug_flags & DBG_CHECK_IR) || + r600_can_dump_shader(rscreen, processor); +} + +void r600_screen_clear_buffer(struct r600_common_screen *rscreen, struct pipe_resource *dst, + uint64_t offset, uint64_t size, unsigned value) +{ + struct r600_common_context *rctx = (struct r600_common_context*)rscreen->aux_context; + + mtx_lock(&rscreen->aux_context_lock); + rctx->dma_clear_buffer(&rctx->b, dst, offset, size, value); + rscreen->aux_context->flush(rscreen->aux_context, NULL, 0); + mtx_unlock(&rscreen->aux_context_lock); +} diff --git a/lib/mesa/src/gallium/drivers/r600/r600_pipe_common.h b/lib/mesa/src/gallium/drivers/r600/r600_pipe_common.h new file mode 100644 index 000000000..a6406cfdb --- /dev/null +++ b/lib/mesa/src/gallium/drivers/r600/r600_pipe_common.h @@ -0,0 +1,932 @@ +/* + * Copyright 2013 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: Marek Olšák <maraeo@gmail.com> + * + */ + +/** + * This file contains common screen and context structures and functions + * for r600g and radeonsi. + */ + +#ifndef R600_PIPE_COMMON_H +#define R600_PIPE_COMMON_H + +#include <stdio.h> + +#include "amd/common/ac_binary.h" + +#include "radeon/radeon_winsys.h" + +#include "util/disk_cache.h" +#include "util/u_blitter.h" +#include "util/list.h" +#include "util/u_range.h" +#include "util/slab.h" +#include "util/u_suballoc.h" +#include "util/u_transfer.h" +#include "util/u_threaded_context.h" + +struct u_log_context; + +#define ATI_VENDOR_ID 0x1002 + +#define R600_RESOURCE_FLAG_TRANSFER (PIPE_RESOURCE_FLAG_DRV_PRIV << 0) +#define R600_RESOURCE_FLAG_FLUSHED_DEPTH (PIPE_RESOURCE_FLAG_DRV_PRIV << 1) +#define R600_RESOURCE_FLAG_FORCE_TILING (PIPE_RESOURCE_FLAG_DRV_PRIV << 2) +#define R600_RESOURCE_FLAG_UNMAPPABLE (PIPE_RESOURCE_FLAG_DRV_PRIV << 4) + +#define R600_CONTEXT_STREAMOUT_FLUSH (1u << 0) +/* Pipeline & streamout query controls. */ +#define R600_CONTEXT_START_PIPELINE_STATS (1u << 1) +#define R600_CONTEXT_STOP_PIPELINE_STATS (1u << 2) +#define R600_CONTEXT_FLUSH_FOR_RENDER_COND (1u << 3) +#define R600_CONTEXT_PRIVATE_FLAG (1u << 4) + +/* special primitive types */ +#define R600_PRIM_RECTANGLE_LIST PIPE_PRIM_MAX + +#define R600_NOT_QUERY 0xffffffff + +/* Debug flags. */ +#define DBG_VS (1 << PIPE_SHADER_VERTEX) +#define DBG_PS (1 << PIPE_SHADER_FRAGMENT) +#define DBG_GS (1 << PIPE_SHADER_GEOMETRY) +#define DBG_TCS (1 << PIPE_SHADER_TESS_CTRL) +#define DBG_TES (1 << PIPE_SHADER_TESS_EVAL) +#define DBG_CS (1 << PIPE_SHADER_COMPUTE) +#define DBG_ALL_SHADERS (DBG_FS - 1) +#define DBG_FS (1 << 6) /* fetch shader */ +#define DBG_TEX (1 << 7) +#define DBG_NIR (1 << 8) +#define DBG_COMPUTE (1 << 9) +/* gap */ +#define DBG_VM (1 << 11) +#define DBG_NO_IR (1 << 12) +#define DBG_NO_TGSI (1 << 13) +#define DBG_NO_ASM (1 << 14) +#define DBG_PREOPT_IR (1 << 15) +#define DBG_CHECK_IR (1 << 16) +#define DBG_NO_OPT_VARIANT (1 << 17) +#define DBG_FS_CORRECT_DERIVS_AFTER_KILL (1 << 18) +/* gaps */ +#define DBG_TEST_DMA (1 << 20) +/* Bits 21-31 are reserved for the r600g driver. */ +/* features */ +#define DBG_NO_ASYNC_DMA (1ull << 32) +#define DBG_NO_HYPERZ (1ull << 33) +#define DBG_NO_DISCARD_RANGE (1ull << 34) +#define DBG_NO_2D_TILING (1ull << 35) +#define DBG_NO_TILING (1ull << 36) +#define DBG_SWITCH_ON_EOP (1ull << 37) +#define DBG_FORCE_DMA (1ull << 38) +#define DBG_PRECOMPILE (1ull << 39) +#define DBG_INFO (1ull << 40) +#define DBG_NO_WC (1ull << 41) +#define DBG_CHECK_VM (1ull << 42) +/* gap */ +#define DBG_UNSAFE_MATH (1ull << 49) +#define DBG_TEST_VMFAULT_CP (1ull << 51) +#define DBG_TEST_VMFAULT_SDMA (1ull << 52) +#define DBG_TEST_VMFAULT_SHADER (1ull << 53) + +#define R600_MAP_BUFFER_ALIGNMENT 64 +#define R600_MAX_VIEWPORTS 16 + +#define SI_MAX_VARIABLE_THREADS_PER_BLOCK 1024 + +enum r600_coherency { + R600_COHERENCY_NONE, /* no cache flushes needed */ + R600_COHERENCY_SHADER, + R600_COHERENCY_CB_META, +}; + +#ifdef PIPE_ARCH_BIG_ENDIAN +#define R600_BIG_ENDIAN 1 +#else +#define R600_BIG_ENDIAN 0 +#endif + +struct r600_common_context; +struct r600_perfcounters; +struct tgsi_shader_info; +struct r600_qbo_state; + +void radeon_shader_binary_init(struct ac_shader_binary *b); +void radeon_shader_binary_clean(struct ac_shader_binary *b); + +/* Only 32-bit buffer allocations are supported, gallium doesn't support more + * at the moment. + */ +struct r600_resource { + struct threaded_resource b; + + /* Winsys objects. */ + struct pb_buffer *buf; + uint64_t gpu_address; + /* Memory usage if the buffer placement is optimal. */ + uint64_t vram_usage; + uint64_t gart_usage; + + /* Resource properties. */ + uint64_t bo_size; + unsigned bo_alignment; + enum radeon_bo_domain domains; + enum radeon_bo_flag flags; + unsigned bind_history; + + /* The buffer range which is initialized (with a write transfer, + * streamout, DMA, or as a random access target). The rest of + * the buffer is considered invalid and can be mapped unsynchronized. + * + * This allows unsychronized mapping of a buffer range which hasn't + * been used yet. It's for applications which forget to use + * the unsynchronized map flag and expect the driver to figure it out. + */ + struct util_range valid_buffer_range; + + /* Whether the resource has been exported via resource_get_handle. */ + unsigned external_usage; /* PIPE_HANDLE_USAGE_* */ + + /* Whether this resource is referenced by bindless handles. */ + bool texture_handle_allocated; + bool image_handle_allocated; +}; + +struct r600_transfer { + struct threaded_transfer b; + struct r600_resource *staging; + unsigned offset; +}; + +struct r600_fmask_info { + uint64_t offset; + uint64_t size; + unsigned alignment; + unsigned pitch_in_pixels; + unsigned bank_height; + unsigned slice_tile_max; + unsigned tile_mode_index; + unsigned tile_swizzle; +}; + +struct r600_cmask_info { + uint64_t offset; + uint64_t size; + unsigned alignment; + unsigned slice_tile_max; + uint64_t base_address_reg; +}; + +struct r600_texture { + struct r600_resource resource; + + uint64_t size; + unsigned num_level0_transfers; + enum pipe_format db_render_format; + bool is_depth; + bool db_compatible; + bool can_sample_z; + bool can_sample_s; + unsigned dirty_level_mask; /* each bit says if that mipmap is compressed */ + unsigned stencil_dirty_level_mask; /* each bit says if that mipmap is compressed */ + struct r600_texture *flushed_depth_texture; + struct radeon_surf surface; + + /* Colorbuffer compression and fast clear. */ + struct r600_fmask_info fmask; + struct r600_cmask_info cmask; + struct r600_resource *cmask_buffer; + unsigned cb_color_info; /* fast clear enable bit */ + unsigned color_clear_value[2]; + unsigned last_msaa_resolve_target_micro_mode; + + /* Depth buffer compression and fast clear. */ + uint64_t htile_offset; + bool depth_cleared; /* if it was cleared at least once */ + float depth_clear_value; + bool stencil_cleared; /* if it was cleared at least once */ + uint8_t stencil_clear_value; + + bool non_disp_tiling; /* R600-Cayman only */ + + /* Counter that should be non-zero if the texture is bound to a + * framebuffer. Implemented in radeonsi only. + */ + uint32_t framebuffers_bound; +}; + +struct r600_surface { + struct pipe_surface base; + + /* These can vary with block-compressed textures. */ + unsigned width0; + unsigned height0; + + bool color_initialized; + bool depth_initialized; + + /* Misc. color flags. */ + bool alphatest_bypass; + bool export_16bpc; + bool color_is_int8; + bool color_is_int10; + + /* Color registers. */ + unsigned cb_color_info; + unsigned cb_color_base; + unsigned cb_color_view; + unsigned cb_color_size; /* R600 only */ + unsigned cb_color_dim; /* EG only */ + unsigned cb_color_pitch; /* EG and later */ + unsigned cb_color_slice; /* EG and later */ + unsigned cb_color_attrib; /* EG and later */ + unsigned cb_color_fmask; /* CB_COLORn_FMASK (EG and later) or CB_COLORn_FRAG (r600) */ + unsigned cb_color_fmask_slice; /* EG and later */ + unsigned cb_color_cmask; /* CB_COLORn_TILE (r600 only) */ + unsigned cb_color_mask; /* R600 only */ + struct r600_resource *cb_buffer_fmask; /* Used for FMASK relocations. R600 only */ + struct r600_resource *cb_buffer_cmask; /* Used for CMASK relocations. R600 only */ + + /* DB registers. */ + uint64_t db_depth_base; /* DB_Z_READ/WRITE_BASE (EG and later) or DB_DEPTH_BASE (r600) */ + uint64_t db_stencil_base; /* EG and later */ + uint64_t db_htile_data_base; + unsigned db_depth_info; /* R600 only, then SI and later */ + unsigned db_z_info; /* EG and later */ + unsigned db_depth_view; + unsigned db_depth_size; + unsigned db_depth_slice; /* EG and later */ + unsigned db_stencil_info; /* EG and later */ + unsigned db_prefetch_limit; /* R600 only */ + unsigned db_htile_surface; + unsigned db_preload_control; /* EG and later */ +}; + +struct r600_mmio_counter { + unsigned busy; + unsigned idle; +}; + +union r600_mmio_counters { + struct { + /* For global GPU load including SDMA. */ + struct r600_mmio_counter gpu; + + /* GRBM_STATUS */ + struct r600_mmio_counter spi; + struct r600_mmio_counter gui; + struct r600_mmio_counter ta; + struct r600_mmio_counter gds; + struct r600_mmio_counter vgt; + struct r600_mmio_counter ia; + struct r600_mmio_counter sx; + struct r600_mmio_counter wd; + struct r600_mmio_counter bci; + struct r600_mmio_counter sc; + struct r600_mmio_counter pa; + struct r600_mmio_counter db; + struct r600_mmio_counter cp; + struct r600_mmio_counter cb; + + /* SRBM_STATUS2 */ + struct r600_mmio_counter sdma; + + /* CP_STAT */ + struct r600_mmio_counter pfp; + struct r600_mmio_counter meq; + struct r600_mmio_counter me; + struct r600_mmio_counter surf_sync; + struct r600_mmio_counter cp_dma; + struct r600_mmio_counter scratch_ram; + } named; + unsigned array[0]; +}; + +struct r600_memory_object { + struct pipe_memory_object b; + struct pb_buffer *buf; + uint32_t stride; + uint32_t offset; +}; + +struct r600_common_screen { + struct pipe_screen b; + struct radeon_winsys *ws; + enum radeon_family family; + enum chip_class chip_class; + struct radeon_info info; + uint64_t debug_flags; + bool has_cp_dma; + bool has_streamout; + + struct disk_cache *disk_shader_cache; + + struct slab_parent_pool pool_transfers; + + /* Texture filter settings. */ + int force_aniso; /* -1 = disabled */ + + /* Auxiliary context. Mainly used to initialize resources. + * It must be locked prior to using and flushed before unlocking. */ + struct pipe_context *aux_context; + mtx_t aux_context_lock; + + /* This must be in the screen, because UE4 uses one context for + * compilation and another one for rendering. + */ + unsigned num_compilations; + /* Along with ST_DEBUG=precompile, this should show if applications + * are loading shaders on demand. This is a monotonic counter. + */ + unsigned num_shaders_created; + unsigned num_shader_cache_hits; + + /* GPU load thread. */ + mtx_t gpu_load_mutex; + thrd_t gpu_load_thread; + union r600_mmio_counters mmio_counters; + volatile unsigned gpu_load_stop_thread; /* bool */ + + char renderer_string[100]; + + /* Performance counters. */ + struct r600_perfcounters *perfcounters; + + /* If pipe_screen wants to recompute and re-emit the framebuffer, + * sampler, and image states of all contexts, it should atomically + * increment this. + * + * Each context will compare this with its own last known value of + * the counter before drawing and re-emit the states accordingly. + */ + unsigned dirty_tex_counter; + + /* Atomically increment this counter when an existing texture's + * metadata is enabled or disabled in a way that requires changing + * contexts' compressed texture binding masks. + */ + unsigned compressed_colortex_counter; + + struct { + /* Context flags to set so that all writes from earlier jobs + * in the CP are seen by L2 clients. + */ + unsigned cp_to_L2; + + /* Context flags to set so that all writes from earlier jobs + * that end in L2 are seen by CP. + */ + unsigned L2_to_cp; + + /* Context flags to set so that all writes from earlier + * compute jobs are seen by L2 clients. + */ + unsigned compute_to_L2; + } barrier_flags; + + void (*query_opaque_metadata)(struct r600_common_screen *rscreen, + struct r600_texture *rtex, + struct radeon_bo_metadata *md); + + void (*apply_opaque_metadata)(struct r600_common_screen *rscreen, + struct r600_texture *rtex, + struct radeon_bo_metadata *md); +}; + +/* This encapsulates a state or an operation which can emitted into the GPU + * command stream. */ +struct r600_atom { + void (*emit)(struct r600_common_context *ctx, struct r600_atom *state); + unsigned num_dw; + unsigned short id; +}; + +struct r600_so_target { + struct pipe_stream_output_target b; + + /* The buffer where BUFFER_FILLED_SIZE is stored. */ + struct r600_resource *buf_filled_size; + unsigned buf_filled_size_offset; + bool buf_filled_size_valid; + + unsigned stride_in_dw; +}; + +struct r600_streamout { + struct r600_atom begin_atom; + bool begin_emitted; + unsigned num_dw_for_end; + + unsigned enabled_mask; + unsigned num_targets; + struct r600_so_target *targets[PIPE_MAX_SO_BUFFERS]; + + unsigned append_bitmask; + bool suspended; + + /* External state which comes from the vertex shader, + * it must be set explicitly when binding a shader. */ + uint16_t *stride_in_dw; + unsigned enabled_stream_buffers_mask; /* stream0 buffers0-3 in 4 LSB */ + + /* The state of VGT_STRMOUT_BUFFER_(CONFIG|EN). */ + unsigned hw_enabled_mask; + + /* The state of VGT_STRMOUT_(CONFIG|EN). */ + struct r600_atom enable_atom; + bool streamout_enabled; + bool prims_gen_query_enabled; + int num_prims_gen_queries; +}; + +struct r600_signed_scissor { + int minx; + int miny; + int maxx; + int maxy; +}; + +struct r600_scissors { + struct r600_atom atom; + unsigned dirty_mask; + struct pipe_scissor_state states[R600_MAX_VIEWPORTS]; +}; + +struct r600_viewports { + struct r600_atom atom; + unsigned dirty_mask; + unsigned depth_range_dirty_mask; + struct pipe_viewport_state states[R600_MAX_VIEWPORTS]; + struct r600_signed_scissor as_scissor[R600_MAX_VIEWPORTS]; +}; + +struct r600_ring { + struct radeon_winsys_cs *cs; + void (*flush)(void *ctx, unsigned flags, + struct pipe_fence_handle **fence); +}; + +/* Saved CS data for debugging features. */ +struct radeon_saved_cs { + uint32_t *ib; + unsigned num_dw; + + struct radeon_bo_list_item *bo_list; + unsigned bo_count; +}; + +struct r600_common_context { + struct pipe_context b; /* base class */ + + struct r600_common_screen *screen; + struct radeon_winsys *ws; + struct radeon_winsys_ctx *ctx; + enum radeon_family family; + enum chip_class chip_class; + struct r600_ring gfx; + struct r600_ring dma; + struct pipe_fence_handle *last_gfx_fence; + struct pipe_fence_handle *last_sdma_fence; + struct r600_resource *eop_bug_scratch; + unsigned num_gfx_cs_flushes; + unsigned initial_gfx_cs_size; + unsigned gpu_reset_counter; + unsigned last_dirty_tex_counter; + unsigned last_compressed_colortex_counter; + unsigned last_num_draw_calls; + + struct threaded_context *tc; + struct u_suballocator *allocator_zeroed_memory; + struct slab_child_pool pool_transfers; + struct slab_child_pool pool_transfers_unsync; /* for threaded_context */ + + /* Current unaccounted memory usage. */ + uint64_t vram; + uint64_t gtt; + + /* States. */ + struct r600_streamout streamout; + struct r600_scissors scissors; + struct r600_viewports viewports; + bool scissor_enabled; + bool clip_halfz; + bool vs_writes_viewport_index; + bool vs_disables_clipping_viewport; + + /* Additional context states. */ + unsigned flags; /* flush flags */ + + /* Queries. */ + /* Maintain the list of active queries for pausing between IBs. */ + int num_occlusion_queries; + int num_perfect_occlusion_queries; + struct list_head active_queries; + unsigned num_cs_dw_queries_suspend; + /* Misc stats. */ + unsigned num_draw_calls; + unsigned num_decompress_calls; + unsigned num_mrt_draw_calls; + unsigned num_prim_restart_calls; + unsigned num_spill_draw_calls; + unsigned num_compute_calls; + unsigned num_spill_compute_calls; + unsigned num_dma_calls; + unsigned num_cp_dma_calls; + unsigned num_vs_flushes; + unsigned num_ps_flushes; + unsigned num_cs_flushes; + unsigned num_cb_cache_flushes; + unsigned num_db_cache_flushes; + unsigned num_L2_invalidates; + unsigned num_L2_writebacks; + unsigned num_resident_handles; + uint64_t num_alloc_tex_transfer_bytes; + + /* Render condition. */ + struct r600_atom render_cond_atom; + struct pipe_query *render_cond; + unsigned render_cond_mode; + bool render_cond_invert; + bool render_cond_force_off; /* for u_blitter */ + + /* MSAA sample locations. + * The first index is the sample index. + * The second index is the coordinate: X, Y. */ + float sample_locations_1x[1][2]; + float sample_locations_2x[2][2]; + float sample_locations_4x[4][2]; + float sample_locations_8x[8][2]; + float sample_locations_16x[16][2]; + + struct pipe_debug_callback debug; + struct pipe_device_reset_callback device_reset_callback; + struct u_log_context *log; + + void *query_result_shader; + + /* Copy one resource to another using async DMA. */ + void (*dma_copy)(struct pipe_context *ctx, + struct pipe_resource *dst, + unsigned dst_level, + unsigned dst_x, unsigned dst_y, unsigned dst_z, + struct pipe_resource *src, + unsigned src_level, + const struct pipe_box *src_box); + + void (*dma_clear_buffer)(struct pipe_context *ctx, struct pipe_resource *dst, + uint64_t offset, uint64_t size, unsigned value); + + void (*clear_buffer)(struct pipe_context *ctx, struct pipe_resource *dst, + uint64_t offset, uint64_t size, unsigned value, + enum r600_coherency coher); + + void (*blit_decompress_depth)(struct pipe_context *ctx, + struct r600_texture *texture, + struct r600_texture *staging, + unsigned first_level, unsigned last_level, + unsigned first_layer, unsigned last_layer, + unsigned first_sample, unsigned last_sample); + + /* Reallocate the buffer and update all resource bindings where + * the buffer is bound, including all resource descriptors. */ + void (*invalidate_buffer)(struct pipe_context *ctx, struct pipe_resource *buf); + + /* Update all resource bindings where the buffer is bound, including + * all resource descriptors. This is invalidate_buffer without + * the invalidation. */ + void (*rebind_buffer)(struct pipe_context *ctx, struct pipe_resource *buf, + uint64_t old_gpu_address); + + void (*save_qbo_state)(struct pipe_context *ctx, struct r600_qbo_state *st); + + /* This ensures there is enough space in the command stream. */ + void (*need_gfx_cs_space)(struct pipe_context *ctx, unsigned num_dw, + bool include_draw_vbo); + + void (*set_atom_dirty)(struct r600_common_context *ctx, + struct r600_atom *atom, bool dirty); + + void (*check_vm_faults)(struct r600_common_context *ctx, + struct radeon_saved_cs *saved, + enum ring_type ring); +}; + +/* r600_buffer_common.c */ +bool r600_rings_is_buffer_referenced(struct r600_common_context *ctx, + struct pb_buffer *buf, + enum radeon_bo_usage usage); +void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx, + struct r600_resource *resource, + unsigned usage); +void r600_buffer_subdata(struct pipe_context *ctx, + struct pipe_resource *buffer, + unsigned usage, unsigned offset, + unsigned size, const void *data); +void r600_init_resource_fields(struct r600_common_screen *rscreen, + struct r600_resource *res, + uint64_t size, unsigned alignment); +bool r600_alloc_resource(struct r600_common_screen *rscreen, + struct r600_resource *res); +struct pipe_resource *r600_buffer_create(struct pipe_screen *screen, + const struct pipe_resource *templ, + unsigned alignment); +struct pipe_resource * r600_aligned_buffer_create(struct pipe_screen *screen, + unsigned flags, + unsigned usage, + unsigned size, + unsigned alignment); +struct pipe_resource * +r600_buffer_from_user_memory(struct pipe_screen *screen, + const struct pipe_resource *templ, + void *user_memory); +void +r600_invalidate_resource(struct pipe_context *ctx, + struct pipe_resource *resource); +void r600_replace_buffer_storage(struct pipe_context *ctx, + struct pipe_resource *dst, + struct pipe_resource *src); + +/* r600_common_pipe.c */ +void r600_gfx_write_event_eop(struct r600_common_context *ctx, + unsigned event, unsigned event_flags, + unsigned data_sel, + struct r600_resource *buf, uint64_t va, + uint32_t new_fence, unsigned query_type); +unsigned r600_gfx_write_fence_dwords(struct r600_common_screen *screen); +void r600_gfx_wait_fence(struct r600_common_context *ctx, + uint64_t va, uint32_t ref, uint32_t mask); +void r600_draw_rectangle(struct blitter_context *blitter, + void *vertex_elements_cso, + blitter_get_vs_func get_vs, + int x1, int y1, int x2, int y2, + float depth, unsigned num_instances, + enum blitter_attrib_type type, + const union blitter_attrib *attrib); +bool r600_common_screen_init(struct r600_common_screen *rscreen, + struct radeon_winsys *ws); +void r600_destroy_common_screen(struct r600_common_screen *rscreen); +void r600_preflush_suspend_features(struct r600_common_context *ctx); +void r600_postflush_resume_features(struct r600_common_context *ctx); +bool r600_common_context_init(struct r600_common_context *rctx, + struct r600_common_screen *rscreen, + unsigned context_flags); +void r600_common_context_cleanup(struct r600_common_context *rctx); +bool r600_can_dump_shader(struct r600_common_screen *rscreen, + unsigned processor); +bool r600_extra_shader_checks(struct r600_common_screen *rscreen, + unsigned processor); +void r600_screen_clear_buffer(struct r600_common_screen *rscreen, struct pipe_resource *dst, + uint64_t offset, uint64_t size, unsigned value); +struct pipe_resource *r600_resource_create_common(struct pipe_screen *screen, + const struct pipe_resource *templ); +const char *r600_get_llvm_processor_name(enum radeon_family family); +void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw, + struct r600_resource *dst, struct r600_resource *src); +void radeon_save_cs(struct radeon_winsys *ws, struct radeon_winsys_cs *cs, + struct radeon_saved_cs *saved, bool get_buffer_list); +void radeon_clear_saved_cs(struct radeon_saved_cs *saved); +bool r600_check_device_reset(struct r600_common_context *rctx); + +/* r600_gpu_load.c */ +void r600_gpu_load_kill_thread(struct r600_common_screen *rscreen); +uint64_t r600_begin_counter(struct r600_common_screen *rscreen, unsigned type); +unsigned r600_end_counter(struct r600_common_screen *rscreen, unsigned type, + uint64_t begin); + +/* r600_perfcounters.c */ +void r600_perfcounters_destroy(struct r600_common_screen *rscreen); + +/* r600_query.c */ +void r600_init_screen_query_functions(struct r600_common_screen *rscreen); +void r600_query_init(struct r600_common_context *rctx); +void r600_suspend_queries(struct r600_common_context *ctx); +void r600_resume_queries(struct r600_common_context *ctx); +void r600_query_fix_enabled_rb_mask(struct r600_common_screen *rscreen); + +/* r600_streamout.c */ +void r600_streamout_buffers_dirty(struct r600_common_context *rctx); +void r600_set_streamout_targets(struct pipe_context *ctx, + unsigned num_targets, + struct pipe_stream_output_target **targets, + const unsigned *offset); +void r600_emit_streamout_end(struct r600_common_context *rctx); +void r600_update_prims_generated_query_state(struct r600_common_context *rctx, + unsigned type, int diff); +void r600_streamout_init(struct r600_common_context *rctx); + +/* r600_test_dma.c */ +void r600_test_dma(struct r600_common_screen *rscreen); + +/* r600_texture.c */ +bool r600_prepare_for_dma_blit(struct r600_common_context *rctx, + struct r600_texture *rdst, + unsigned dst_level, unsigned dstx, + unsigned dsty, unsigned dstz, + struct r600_texture *rsrc, + unsigned src_level, + const struct pipe_box *src_box); +void r600_texture_get_fmask_info(struct r600_common_screen *rscreen, + struct r600_texture *rtex, + unsigned nr_samples, + struct r600_fmask_info *out); +void r600_texture_get_cmask_info(struct r600_common_screen *rscreen, + struct r600_texture *rtex, + struct r600_cmask_info *out); +bool r600_init_flushed_depth_texture(struct pipe_context *ctx, + struct pipe_resource *texture, + struct r600_texture **staging); +void r600_print_texture_info(struct r600_common_screen *rscreen, + struct r600_texture *rtex, struct u_log_context *log); +struct pipe_resource *r600_texture_create(struct pipe_screen *screen, + const struct pipe_resource *templ); +struct pipe_surface *r600_create_surface_custom(struct pipe_context *pipe, + struct pipe_resource *texture, + const struct pipe_surface *templ, + unsigned width0, unsigned height0, + unsigned width, unsigned height); +unsigned r600_translate_colorswap(enum pipe_format format, bool do_endian_swap); +void evergreen_do_fast_color_clear(struct r600_common_context *rctx, + struct pipe_framebuffer_state *fb, + struct r600_atom *fb_state, + unsigned *buffers, ubyte *dirty_cbufs, + const union pipe_color_union *color); +void r600_init_screen_texture_functions(struct r600_common_screen *rscreen); +void r600_init_context_texture_functions(struct r600_common_context *rctx); + +/* r600_viewport.c */ +void evergreen_apply_scissor_bug_workaround(struct r600_common_context *rctx, + struct pipe_scissor_state *scissor); +void r600_viewport_set_rast_deps(struct r600_common_context *rctx, + bool scissor_enable, bool clip_halfz); +void r600_update_vs_writes_viewport_index(struct r600_common_context *rctx, + struct tgsi_shader_info *info); +void r600_init_viewport_functions(struct r600_common_context *rctx); + +/* cayman_msaa.c */ +extern const uint32_t eg_sample_locs_2x[4]; +extern const unsigned eg_max_dist_2x; +extern const uint32_t eg_sample_locs_4x[4]; +extern const unsigned eg_max_dist_4x; +void cayman_get_sample_position(struct pipe_context *ctx, unsigned sample_count, + unsigned sample_index, float *out_value); +void cayman_init_msaa(struct pipe_context *ctx); +void cayman_emit_msaa_sample_locs(struct radeon_winsys_cs *cs, int nr_samples); +void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples, + int ps_iter_samples, int overrast_samples, + unsigned sc_mode_cntl_1); + + +/* Inline helpers. */ + +static inline struct r600_resource *r600_resource(struct pipe_resource *r) +{ + return (struct r600_resource*)r; +} + +static inline void +r600_resource_reference(struct r600_resource **ptr, struct r600_resource *res) +{ + pipe_resource_reference((struct pipe_resource **)ptr, + (struct pipe_resource *)res); +} + +static inline void +r600_texture_reference(struct r600_texture **ptr, struct r600_texture *res) +{ + pipe_resource_reference((struct pipe_resource **)ptr, &res->resource.b.b); +} + +static inline void +r600_context_add_resource_size(struct pipe_context *ctx, struct pipe_resource *r) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + struct r600_resource *res = (struct r600_resource *)r; + + if (res) { + /* Add memory usage for need_gfx_cs_space */ + rctx->vram += res->vram_usage; + rctx->gtt += res->gart_usage; + } +} + +static inline bool r600_get_strmout_en(struct r600_common_context *rctx) +{ + return rctx->streamout.streamout_enabled || + rctx->streamout.prims_gen_query_enabled; +} + +#define SQ_TEX_XY_FILTER_POINT 0x00 +#define SQ_TEX_XY_FILTER_BILINEAR 0x01 +#define SQ_TEX_XY_FILTER_ANISO_POINT 0x02 +#define SQ_TEX_XY_FILTER_ANISO_BILINEAR 0x03 + +static inline unsigned eg_tex_filter(unsigned filter, unsigned max_aniso) +{ + if (filter == PIPE_TEX_FILTER_LINEAR) + return max_aniso > 1 ? SQ_TEX_XY_FILTER_ANISO_BILINEAR + : SQ_TEX_XY_FILTER_BILINEAR; + else + return max_aniso > 1 ? SQ_TEX_XY_FILTER_ANISO_POINT + : SQ_TEX_XY_FILTER_POINT; +} + +static inline unsigned r600_tex_aniso_filter(unsigned filter) +{ + if (filter < 2) + return 0; + if (filter < 4) + return 1; + if (filter < 8) + return 2; + if (filter < 16) + return 3; + return 4; +} + +static inline unsigned r600_wavefront_size(enum radeon_family family) +{ + switch (family) { + case CHIP_RV610: + case CHIP_RS780: + case CHIP_RV620: + case CHIP_RS880: + return 16; + case CHIP_RV630: + case CHIP_RV635: + case CHIP_RV730: + case CHIP_RV710: + case CHIP_PALM: + case CHIP_CEDAR: + return 32; + default: + return 64; + } +} + +static inline enum radeon_bo_priority +r600_get_sampler_view_priority(struct r600_resource *res) +{ + if (res->b.b.target == PIPE_BUFFER) + return RADEON_PRIO_SAMPLER_BUFFER; + + if (res->b.b.nr_samples > 1) + return RADEON_PRIO_SAMPLER_TEXTURE_MSAA; + + return RADEON_PRIO_SAMPLER_TEXTURE; +} + +static inline bool +r600_can_sample_zs(struct r600_texture *tex, bool stencil_sampler) +{ + return (stencil_sampler && tex->can_sample_s) || + (!stencil_sampler && tex->can_sample_z); +} + +static inline bool +r600_htile_enabled(struct r600_texture *tex, unsigned level) +{ + return tex->htile_offset && level == 0; +} + +#define COMPUTE_DBG(rscreen, fmt, args...) \ + do { \ + if ((rscreen->b.debug_flags & DBG_COMPUTE)) fprintf(stderr, fmt, ##args); \ + } while (0); + +#define R600_ERR(fmt, args...) \ + fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args) + +/* For MSAA sample positions. */ +#define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y) \ + (((s0x) & 0xf) | (((unsigned)(s0y) & 0xf) << 4) | \ + (((unsigned)(s1x) & 0xf) << 8) | (((unsigned)(s1y) & 0xf) << 12) | \ + (((unsigned)(s2x) & 0xf) << 16) | (((unsigned)(s2y) & 0xf) << 20) | \ + (((unsigned)(s3x) & 0xf) << 24) | (((unsigned)(s3y) & 0xf) << 28)) + +static inline int S_FIXED(float value, unsigned frac_bits) +{ + return value * (1 << frac_bits); +} + +#endif diff --git a/lib/mesa/src/gallium/drivers/r600/r600_query.c b/lib/mesa/src/gallium/drivers/r600/r600_query.c new file mode 100644 index 000000000..aa3e36f56 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/r600/r600_query.c @@ -0,0 +1,2126 @@ +/* + * Copyright 2010 Jerome Glisse <glisse@freedesktop.org> + * Copyright 2014 Marek Olšák <marek.olsak@amd.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "r600_query.h" +#include "r600_pipe.h" +#include "r600_cs.h" +#include "util/u_memory.h" +#include "util/u_upload_mgr.h" +#include "os/os_time.h" +#include "tgsi/tgsi_text.h" + +#define R600_MAX_STREAMS 4 + +struct r600_hw_query_params { + unsigned start_offset; + unsigned end_offset; + unsigned fence_offset; + unsigned pair_stride; + unsigned pair_count; +}; + +/* Queries without buffer handling or suspend/resume. */ +struct r600_query_sw { + struct r600_query b; + + uint64_t begin_result; + uint64_t end_result; + + uint64_t begin_time; + uint64_t end_time; + + /* Fence for GPU_FINISHED. */ + struct pipe_fence_handle *fence; +}; + +static void r600_query_sw_destroy(struct r600_common_screen *rscreen, + struct r600_query *rquery) +{ + struct r600_query_sw *query = (struct r600_query_sw *)rquery; + + rscreen->b.fence_reference(&rscreen->b, &query->fence, NULL); + FREE(query); +} + +static enum radeon_value_id winsys_id_from_type(unsigned type) +{ + switch (type) { + case R600_QUERY_REQUESTED_VRAM: return RADEON_REQUESTED_VRAM_MEMORY; + case R600_QUERY_REQUESTED_GTT: return RADEON_REQUESTED_GTT_MEMORY; + case R600_QUERY_MAPPED_VRAM: return RADEON_MAPPED_VRAM; + case R600_QUERY_MAPPED_GTT: return RADEON_MAPPED_GTT; + case R600_QUERY_BUFFER_WAIT_TIME: return RADEON_BUFFER_WAIT_TIME_NS; + case R600_QUERY_NUM_MAPPED_BUFFERS: return RADEON_NUM_MAPPED_BUFFERS; + case R600_QUERY_NUM_GFX_IBS: return RADEON_NUM_GFX_IBS; + case R600_QUERY_NUM_SDMA_IBS: return RADEON_NUM_SDMA_IBS; + case R600_QUERY_GFX_BO_LIST_SIZE: return RADEON_GFX_BO_LIST_COUNTER; + case R600_QUERY_NUM_BYTES_MOVED: return RADEON_NUM_BYTES_MOVED; + case R600_QUERY_NUM_EVICTIONS: return RADEON_NUM_EVICTIONS; + case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: return RADEON_NUM_VRAM_CPU_PAGE_FAULTS; + case R600_QUERY_VRAM_USAGE: return RADEON_VRAM_USAGE; + case R600_QUERY_VRAM_VIS_USAGE: return RADEON_VRAM_VIS_USAGE; + case R600_QUERY_GTT_USAGE: return RADEON_GTT_USAGE; + case R600_QUERY_GPU_TEMPERATURE: return RADEON_GPU_TEMPERATURE; + case R600_QUERY_CURRENT_GPU_SCLK: return RADEON_CURRENT_SCLK; + case R600_QUERY_CURRENT_GPU_MCLK: return RADEON_CURRENT_MCLK; + case R600_QUERY_CS_THREAD_BUSY: return RADEON_CS_THREAD_TIME; + default: unreachable("query type does not correspond to winsys id"); + } +} + +static bool r600_query_sw_begin(struct r600_common_context *rctx, + struct r600_query *rquery) +{ + struct r600_query_sw *query = (struct r600_query_sw *)rquery; + enum radeon_value_id ws_id; + + switch(query->b.type) { + case PIPE_QUERY_TIMESTAMP_DISJOINT: + case PIPE_QUERY_GPU_FINISHED: + break; + case R600_QUERY_DRAW_CALLS: + query->begin_result = rctx->num_draw_calls; + break; + case R600_QUERY_DECOMPRESS_CALLS: + query->begin_result = rctx->num_decompress_calls; + break; + case R600_QUERY_MRT_DRAW_CALLS: + query->begin_result = rctx->num_mrt_draw_calls; + break; + case R600_QUERY_PRIM_RESTART_CALLS: + query->begin_result = rctx->num_prim_restart_calls; + break; + case R600_QUERY_SPILL_DRAW_CALLS: + query->begin_result = rctx->num_spill_draw_calls; + break; + case R600_QUERY_COMPUTE_CALLS: + query->begin_result = rctx->num_compute_calls; + break; + case R600_QUERY_SPILL_COMPUTE_CALLS: + query->begin_result = rctx->num_spill_compute_calls; + break; + case R600_QUERY_DMA_CALLS: + query->begin_result = rctx->num_dma_calls; + break; + case R600_QUERY_CP_DMA_CALLS: + query->begin_result = rctx->num_cp_dma_calls; + break; + case R600_QUERY_NUM_VS_FLUSHES: + query->begin_result = rctx->num_vs_flushes; + break; + case R600_QUERY_NUM_PS_FLUSHES: + query->begin_result = rctx->num_ps_flushes; + break; + case R600_QUERY_NUM_CS_FLUSHES: + query->begin_result = rctx->num_cs_flushes; + break; + case R600_QUERY_NUM_CB_CACHE_FLUSHES: + query->begin_result = rctx->num_cb_cache_flushes; + break; + case R600_QUERY_NUM_DB_CACHE_FLUSHES: + query->begin_result = rctx->num_db_cache_flushes; + break; + case R600_QUERY_NUM_L2_INVALIDATES: + query->begin_result = rctx->num_L2_invalidates; + break; + case R600_QUERY_NUM_L2_WRITEBACKS: + query->begin_result = rctx->num_L2_writebacks; + break; + case R600_QUERY_NUM_RESIDENT_HANDLES: + query->begin_result = rctx->num_resident_handles; + break; + case R600_QUERY_TC_OFFLOADED_SLOTS: + query->begin_result = rctx->tc ? rctx->tc->num_offloaded_slots : 0; + break; + case R600_QUERY_TC_DIRECT_SLOTS: + query->begin_result = rctx->tc ? rctx->tc->num_direct_slots : 0; + break; + case R600_QUERY_TC_NUM_SYNCS: + query->begin_result = rctx->tc ? rctx->tc->num_syncs : 0; + break; + case R600_QUERY_REQUESTED_VRAM: + case R600_QUERY_REQUESTED_GTT: + case R600_QUERY_MAPPED_VRAM: + case R600_QUERY_MAPPED_GTT: + case R600_QUERY_VRAM_USAGE: + case R600_QUERY_VRAM_VIS_USAGE: + case R600_QUERY_GTT_USAGE: + case R600_QUERY_GPU_TEMPERATURE: + case R600_QUERY_CURRENT_GPU_SCLK: + case R600_QUERY_CURRENT_GPU_MCLK: + case R600_QUERY_NUM_MAPPED_BUFFERS: + query->begin_result = 0; + break; + case R600_QUERY_BUFFER_WAIT_TIME: + case R600_QUERY_NUM_GFX_IBS: + case R600_QUERY_NUM_SDMA_IBS: + case R600_QUERY_NUM_BYTES_MOVED: + case R600_QUERY_NUM_EVICTIONS: + case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: { + enum radeon_value_id ws_id = winsys_id_from_type(query->b.type); + query->begin_result = rctx->ws->query_value(rctx->ws, ws_id); + break; + } + case R600_QUERY_GFX_BO_LIST_SIZE: + ws_id = winsys_id_from_type(query->b.type); + query->begin_result = rctx->ws->query_value(rctx->ws, ws_id); + query->begin_time = rctx->ws->query_value(rctx->ws, + RADEON_NUM_GFX_IBS); + break; + case R600_QUERY_CS_THREAD_BUSY: + ws_id = winsys_id_from_type(query->b.type); + query->begin_result = rctx->ws->query_value(rctx->ws, ws_id); + query->begin_time = os_time_get_nano(); + break; + case R600_QUERY_GALLIUM_THREAD_BUSY: + query->begin_result = + rctx->tc ? util_queue_get_thread_time_nano(&rctx->tc->queue, 0) : 0; + query->begin_time = os_time_get_nano(); + break; + case R600_QUERY_GPU_LOAD: + case R600_QUERY_GPU_SHADERS_BUSY: + case R600_QUERY_GPU_TA_BUSY: + case R600_QUERY_GPU_GDS_BUSY: + case R600_QUERY_GPU_VGT_BUSY: + case R600_QUERY_GPU_IA_BUSY: + case R600_QUERY_GPU_SX_BUSY: + case R600_QUERY_GPU_WD_BUSY: + case R600_QUERY_GPU_BCI_BUSY: + case R600_QUERY_GPU_SC_BUSY: + case R600_QUERY_GPU_PA_BUSY: + case R600_QUERY_GPU_DB_BUSY: + case R600_QUERY_GPU_CP_BUSY: + case R600_QUERY_GPU_CB_BUSY: + case R600_QUERY_GPU_SDMA_BUSY: + case R600_QUERY_GPU_PFP_BUSY: + case R600_QUERY_GPU_MEQ_BUSY: + case R600_QUERY_GPU_ME_BUSY: + case R600_QUERY_GPU_SURF_SYNC_BUSY: + case R600_QUERY_GPU_CP_DMA_BUSY: + case R600_QUERY_GPU_SCRATCH_RAM_BUSY: + query->begin_result = r600_begin_counter(rctx->screen, + query->b.type); + break; + case R600_QUERY_NUM_COMPILATIONS: + query->begin_result = p_atomic_read(&rctx->screen->num_compilations); + break; + case R600_QUERY_NUM_SHADERS_CREATED: + query->begin_result = p_atomic_read(&rctx->screen->num_shaders_created); + break; + case R600_QUERY_NUM_SHADER_CACHE_HITS: + query->begin_result = + p_atomic_read(&rctx->screen->num_shader_cache_hits); + break; + case R600_QUERY_GPIN_ASIC_ID: + case R600_QUERY_GPIN_NUM_SIMD: + case R600_QUERY_GPIN_NUM_RB: + case R600_QUERY_GPIN_NUM_SPI: + case R600_QUERY_GPIN_NUM_SE: + break; + default: + unreachable("r600_query_sw_begin: bad query type"); + } + + return true; +} + +static bool r600_query_sw_end(struct r600_common_context *rctx, + struct r600_query *rquery) +{ + struct r600_query_sw *query = (struct r600_query_sw *)rquery; + enum radeon_value_id ws_id; + + switch(query->b.type) { + case PIPE_QUERY_TIMESTAMP_DISJOINT: + break; + case PIPE_QUERY_GPU_FINISHED: + rctx->b.flush(&rctx->b, &query->fence, PIPE_FLUSH_DEFERRED); + break; + case R600_QUERY_DRAW_CALLS: + query->end_result = rctx->num_draw_calls; + break; + case R600_QUERY_DECOMPRESS_CALLS: + query->end_result = rctx->num_decompress_calls; + break; + case R600_QUERY_MRT_DRAW_CALLS: + query->end_result = rctx->num_mrt_draw_calls; + break; + case R600_QUERY_PRIM_RESTART_CALLS: + query->end_result = rctx->num_prim_restart_calls; + break; + case R600_QUERY_SPILL_DRAW_CALLS: + query->end_result = rctx->num_spill_draw_calls; + break; + case R600_QUERY_COMPUTE_CALLS: + query->end_result = rctx->num_compute_calls; + break; + case R600_QUERY_SPILL_COMPUTE_CALLS: + query->end_result = rctx->num_spill_compute_calls; + break; + case R600_QUERY_DMA_CALLS: + query->end_result = rctx->num_dma_calls; + break; + case R600_QUERY_CP_DMA_CALLS: + query->end_result = rctx->num_cp_dma_calls; + break; + case R600_QUERY_NUM_VS_FLUSHES: + query->end_result = rctx->num_vs_flushes; + break; + case R600_QUERY_NUM_PS_FLUSHES: + query->end_result = rctx->num_ps_flushes; + break; + case R600_QUERY_NUM_CS_FLUSHES: + query->end_result = rctx->num_cs_flushes; + break; + case R600_QUERY_NUM_CB_CACHE_FLUSHES: + query->end_result = rctx->num_cb_cache_flushes; + break; + case R600_QUERY_NUM_DB_CACHE_FLUSHES: + query->end_result = rctx->num_db_cache_flushes; + break; + case R600_QUERY_NUM_L2_INVALIDATES: + query->end_result = rctx->num_L2_invalidates; + break; + case R600_QUERY_NUM_L2_WRITEBACKS: + query->end_result = rctx->num_L2_writebacks; + break; + case R600_QUERY_NUM_RESIDENT_HANDLES: + query->end_result = rctx->num_resident_handles; + break; + case R600_QUERY_TC_OFFLOADED_SLOTS: + query->end_result = rctx->tc ? rctx->tc->num_offloaded_slots : 0; + break; + case R600_QUERY_TC_DIRECT_SLOTS: + query->end_result = rctx->tc ? rctx->tc->num_direct_slots : 0; + break; + case R600_QUERY_TC_NUM_SYNCS: + query->end_result = rctx->tc ? rctx->tc->num_syncs : 0; + break; + case R600_QUERY_REQUESTED_VRAM: + case R600_QUERY_REQUESTED_GTT: + case R600_QUERY_MAPPED_VRAM: + case R600_QUERY_MAPPED_GTT: + case R600_QUERY_VRAM_USAGE: + case R600_QUERY_VRAM_VIS_USAGE: + case R600_QUERY_GTT_USAGE: + case R600_QUERY_GPU_TEMPERATURE: + case R600_QUERY_CURRENT_GPU_SCLK: + case R600_QUERY_CURRENT_GPU_MCLK: + case R600_QUERY_BUFFER_WAIT_TIME: + case R600_QUERY_NUM_MAPPED_BUFFERS: + case R600_QUERY_NUM_GFX_IBS: + case R600_QUERY_NUM_SDMA_IBS: + case R600_QUERY_NUM_BYTES_MOVED: + case R600_QUERY_NUM_EVICTIONS: + case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: { + enum radeon_value_id ws_id = winsys_id_from_type(query->b.type); + query->end_result = rctx->ws->query_value(rctx->ws, ws_id); + break; + } + case R600_QUERY_GFX_BO_LIST_SIZE: + ws_id = winsys_id_from_type(query->b.type); + query->end_result = rctx->ws->query_value(rctx->ws, ws_id); + query->end_time = rctx->ws->query_value(rctx->ws, + RADEON_NUM_GFX_IBS); + break; + case R600_QUERY_CS_THREAD_BUSY: + ws_id = winsys_id_from_type(query->b.type); + query->end_result = rctx->ws->query_value(rctx->ws, ws_id); + query->end_time = os_time_get_nano(); + break; + case R600_QUERY_GALLIUM_THREAD_BUSY: + query->end_result = + rctx->tc ? util_queue_get_thread_time_nano(&rctx->tc->queue, 0) : 0; + query->end_time = os_time_get_nano(); + break; + case R600_QUERY_GPU_LOAD: + case R600_QUERY_GPU_SHADERS_BUSY: + case R600_QUERY_GPU_TA_BUSY: + case R600_QUERY_GPU_GDS_BUSY: + case R600_QUERY_GPU_VGT_BUSY: + case R600_QUERY_GPU_IA_BUSY: + case R600_QUERY_GPU_SX_BUSY: + case R600_QUERY_GPU_WD_BUSY: + case R600_QUERY_GPU_BCI_BUSY: + case R600_QUERY_GPU_SC_BUSY: + case R600_QUERY_GPU_PA_BUSY: + case R600_QUERY_GPU_DB_BUSY: + case R600_QUERY_GPU_CP_BUSY: + case R600_QUERY_GPU_CB_BUSY: + case R600_QUERY_GPU_SDMA_BUSY: + case R600_QUERY_GPU_PFP_BUSY: + case R600_QUERY_GPU_MEQ_BUSY: + case R600_QUERY_GPU_ME_BUSY: + case R600_QUERY_GPU_SURF_SYNC_BUSY: + case R600_QUERY_GPU_CP_DMA_BUSY: + case R600_QUERY_GPU_SCRATCH_RAM_BUSY: + query->end_result = r600_end_counter(rctx->screen, + query->b.type, + query->begin_result); + query->begin_result = 0; + break; + case R600_QUERY_NUM_COMPILATIONS: + query->end_result = p_atomic_read(&rctx->screen->num_compilations); + break; + case R600_QUERY_NUM_SHADERS_CREATED: + query->end_result = p_atomic_read(&rctx->screen->num_shaders_created); + break; + case R600_QUERY_NUM_SHADER_CACHE_HITS: + query->end_result = + p_atomic_read(&rctx->screen->num_shader_cache_hits); + break; + case R600_QUERY_GPIN_ASIC_ID: + case R600_QUERY_GPIN_NUM_SIMD: + case R600_QUERY_GPIN_NUM_RB: + case R600_QUERY_GPIN_NUM_SPI: + case R600_QUERY_GPIN_NUM_SE: + break; + default: + unreachable("r600_query_sw_end: bad query type"); + } + + return true; +} + +static bool r600_query_sw_get_result(struct r600_common_context *rctx, + struct r600_query *rquery, + bool wait, + union pipe_query_result *result) +{ + struct r600_query_sw *query = (struct r600_query_sw *)rquery; + + switch (query->b.type) { + case PIPE_QUERY_TIMESTAMP_DISJOINT: + /* Convert from cycles per millisecond to cycles per second (Hz). */ + result->timestamp_disjoint.frequency = + (uint64_t)rctx->screen->info.clock_crystal_freq * 1000; + result->timestamp_disjoint.disjoint = false; + return true; + case PIPE_QUERY_GPU_FINISHED: { + struct pipe_screen *screen = rctx->b.screen; + struct pipe_context *ctx = rquery->b.flushed ? NULL : &rctx->b; + + result->b = screen->fence_finish(screen, ctx, query->fence, + wait ? PIPE_TIMEOUT_INFINITE : 0); + return result->b; + } + + case R600_QUERY_GFX_BO_LIST_SIZE: + result->u64 = (query->end_result - query->begin_result) / + (query->end_time - query->begin_time); + return true; + case R600_QUERY_CS_THREAD_BUSY: + case R600_QUERY_GALLIUM_THREAD_BUSY: + result->u64 = (query->end_result - query->begin_result) * 100 / + (query->end_time - query->begin_time); + return true; + case R600_QUERY_GPIN_ASIC_ID: + result->u32 = 0; + return true; + case R600_QUERY_GPIN_NUM_SIMD: + result->u32 = rctx->screen->info.num_good_compute_units; + return true; + case R600_QUERY_GPIN_NUM_RB: + result->u32 = rctx->screen->info.num_render_backends; + return true; + case R600_QUERY_GPIN_NUM_SPI: + result->u32 = 1; /* all supported chips have one SPI per SE */ + return true; + case R600_QUERY_GPIN_NUM_SE: + result->u32 = rctx->screen->info.max_se; + return true; + } + + result->u64 = query->end_result - query->begin_result; + + switch (query->b.type) { + case R600_QUERY_BUFFER_WAIT_TIME: + case R600_QUERY_GPU_TEMPERATURE: + result->u64 /= 1000; + break; + case R600_QUERY_CURRENT_GPU_SCLK: + case R600_QUERY_CURRENT_GPU_MCLK: + result->u64 *= 1000000; + break; + } + + return true; +} + + +static struct r600_query_ops sw_query_ops = { + .destroy = r600_query_sw_destroy, + .begin = r600_query_sw_begin, + .end = r600_query_sw_end, + .get_result = r600_query_sw_get_result, + .get_result_resource = NULL +}; + +static struct pipe_query *r600_query_sw_create(unsigned query_type) +{ + struct r600_query_sw *query; + + query = CALLOC_STRUCT(r600_query_sw); + if (!query) + return NULL; + + query->b.type = query_type; + query->b.ops = &sw_query_ops; + + return (struct pipe_query *)query; +} + +void r600_query_hw_destroy(struct r600_common_screen *rscreen, + struct r600_query *rquery) +{ + struct r600_query_hw *query = (struct r600_query_hw *)rquery; + struct r600_query_buffer *prev = query->buffer.previous; + + /* Release all query buffers. */ + while (prev) { + struct r600_query_buffer *qbuf = prev; + prev = prev->previous; + r600_resource_reference(&qbuf->buf, NULL); + FREE(qbuf); + } + + r600_resource_reference(&query->buffer.buf, NULL); + r600_resource_reference(&query->workaround_buf, NULL); + FREE(rquery); +} + +static struct r600_resource *r600_new_query_buffer(struct r600_common_screen *rscreen, + struct r600_query_hw *query) +{ + unsigned buf_size = MAX2(query->result_size, + rscreen->info.min_alloc_size); + + /* Queries are normally read by the CPU after + * being written by the gpu, hence staging is probably a good + * usage pattern. + */ + struct r600_resource *buf = (struct r600_resource*) + pipe_buffer_create(&rscreen->b, 0, + PIPE_USAGE_STAGING, buf_size); + if (!buf) + return NULL; + + if (!query->ops->prepare_buffer(rscreen, query, buf)) { + r600_resource_reference(&buf, NULL); + return NULL; + } + + return buf; +} + +static bool r600_query_hw_prepare_buffer(struct r600_common_screen *rscreen, + struct r600_query_hw *query, + struct r600_resource *buffer) +{ + /* Callers ensure that the buffer is currently unused by the GPU. */ + uint32_t *results = rscreen->ws->buffer_map(buffer->buf, NULL, + PIPE_TRANSFER_WRITE | + PIPE_TRANSFER_UNSYNCHRONIZED); + if (!results) + return false; + + memset(results, 0, buffer->b.b.width0); + + if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER || + query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE) { + unsigned max_rbs = rscreen->info.num_render_backends; + unsigned enabled_rb_mask = rscreen->info.enabled_rb_mask; + unsigned num_results; + unsigned i, j; + + /* Set top bits for unused backends. */ + num_results = buffer->b.b.width0 / query->result_size; + for (j = 0; j < num_results; j++) { + for (i = 0; i < max_rbs; i++) { + if (!(enabled_rb_mask & (1<<i))) { + results[(i * 4)+1] = 0x80000000; + results[(i * 4)+3] = 0x80000000; + } + } + results += 4 * max_rbs; + } + } + + return true; +} + +static void r600_query_hw_get_result_resource(struct r600_common_context *rctx, + struct r600_query *rquery, + bool wait, + enum pipe_query_value_type result_type, + int index, + struct pipe_resource *resource, + unsigned offset); + +static struct r600_query_ops query_hw_ops = { + .destroy = r600_query_hw_destroy, + .begin = r600_query_hw_begin, + .end = r600_query_hw_end, + .get_result = r600_query_hw_get_result, + .get_result_resource = r600_query_hw_get_result_resource, +}; + +static void r600_query_hw_do_emit_start(struct r600_common_context *ctx, + struct r600_query_hw *query, + struct r600_resource *buffer, + uint64_t va); +static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx, + struct r600_query_hw *query, + struct r600_resource *buffer, + uint64_t va); +static void r600_query_hw_add_result(struct r600_common_screen *rscreen, + struct r600_query_hw *, void *buffer, + union pipe_query_result *result); +static void r600_query_hw_clear_result(struct r600_query_hw *, + union pipe_query_result *); + +static struct r600_query_hw_ops query_hw_default_hw_ops = { + .prepare_buffer = r600_query_hw_prepare_buffer, + .emit_start = r600_query_hw_do_emit_start, + .emit_stop = r600_query_hw_do_emit_stop, + .clear_result = r600_query_hw_clear_result, + .add_result = r600_query_hw_add_result, +}; + +bool r600_query_hw_init(struct r600_common_screen *rscreen, + struct r600_query_hw *query) +{ + query->buffer.buf = r600_new_query_buffer(rscreen, query); + if (!query->buffer.buf) + return false; + + return true; +} + +static struct pipe_query *r600_query_hw_create(struct r600_common_screen *rscreen, + unsigned query_type, + unsigned index) +{ + struct r600_query_hw *query = CALLOC_STRUCT(r600_query_hw); + if (!query) + return NULL; + + query->b.type = query_type; + query->b.ops = &query_hw_ops; + query->ops = &query_hw_default_hw_ops; + + switch (query_type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + query->result_size = 16 * rscreen->info.num_render_backends; + query->result_size += 16; /* for the fence + alignment */ + query->num_cs_dw_begin = 6; + query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rscreen); + break; + case PIPE_QUERY_TIME_ELAPSED: + query->result_size = 24; + query->num_cs_dw_begin = 8; + query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rscreen); + break; + case PIPE_QUERY_TIMESTAMP: + query->result_size = 16; + query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rscreen); + query->flags = R600_QUERY_HW_FLAG_NO_START; + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + case PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_SO_STATISTICS: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + /* NumPrimitivesWritten, PrimitiveStorageNeeded. */ + query->result_size = 32; + query->num_cs_dw_begin = 6; + query->num_cs_dw_end = 6; + query->stream = index; + break; + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + /* NumPrimitivesWritten, PrimitiveStorageNeeded. */ + query->result_size = 32 * R600_MAX_STREAMS; + query->num_cs_dw_begin = 6 * R600_MAX_STREAMS; + query->num_cs_dw_end = 6 * R600_MAX_STREAMS; + break; + case PIPE_QUERY_PIPELINE_STATISTICS: + /* 11 values on EG, 8 on R600. */ + query->result_size = (rscreen->chip_class >= EVERGREEN ? 11 : 8) * 16; + query->result_size += 8; /* for the fence + alignment */ + query->num_cs_dw_begin = 6; + query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rscreen); + break; + default: + assert(0); + FREE(query); + return NULL; + } + + if (!r600_query_hw_init(rscreen, query)) { + FREE(query); + return NULL; + } + + return (struct pipe_query *)query; +} + +static void r600_update_occlusion_query_state(struct r600_common_context *rctx, + unsigned type, int diff) +{ + if (type == PIPE_QUERY_OCCLUSION_COUNTER || + type == PIPE_QUERY_OCCLUSION_PREDICATE) { + bool old_enable = rctx->num_occlusion_queries != 0; + bool old_perfect_enable = + rctx->num_perfect_occlusion_queries != 0; + bool enable, perfect_enable; + + rctx->num_occlusion_queries += diff; + assert(rctx->num_occlusion_queries >= 0); + + if (type == PIPE_QUERY_OCCLUSION_COUNTER) { + rctx->num_perfect_occlusion_queries += diff; + assert(rctx->num_perfect_occlusion_queries >= 0); + } + + enable = rctx->num_occlusion_queries != 0; + perfect_enable = rctx->num_perfect_occlusion_queries != 0; + + if (enable != old_enable || perfect_enable != old_perfect_enable) { + struct r600_context *ctx = (struct r600_context*)rctx; + r600_mark_atom_dirty(ctx, &ctx->db_misc_state.atom); + } + } +} + +static unsigned event_type_for_stream(unsigned stream) +{ + switch (stream) { + default: + case 0: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS; + case 1: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS1; + case 2: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS2; + case 3: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS3; + } +} + +static void emit_sample_streamout(struct radeon_winsys_cs *cs, uint64_t va, + unsigned stream) +{ + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(cs, EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); +} + +static void r600_query_hw_do_emit_start(struct r600_common_context *ctx, + struct r600_query_hw *query, + struct r600_resource *buffer, + uint64_t va) +{ + struct radeon_winsys_cs *cs = ctx->gfx.cs; + + switch (query->b.type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + case PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_SO_STATISTICS: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + emit_sample_streamout(cs, va, query->stream); + break; + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream) + emit_sample_streamout(cs, va + 32 * stream, stream); + break; + case PIPE_QUERY_TIME_ELAPSED: + /* Write the timestamp after the last draw is done. + * (bottom-of-pipe) + */ + r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS, + 0, EOP_DATA_SEL_TIMESTAMP, + NULL, va, 0, query->b.type); + break; + case PIPE_QUERY_PIPELINE_STATISTICS: + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + break; + default: + assert(0); + } + r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE, + RADEON_PRIO_QUERY); +} + +static void r600_query_hw_emit_start(struct r600_common_context *ctx, + struct r600_query_hw *query) +{ + uint64_t va; + + if (!query->buffer.buf) + return; // previous buffer allocation failure + + r600_update_occlusion_query_state(ctx, query->b.type, 1); + r600_update_prims_generated_query_state(ctx, query->b.type, 1); + + ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw_begin + query->num_cs_dw_end, + true); + + /* Get a new query buffer if needed. */ + if (query->buffer.results_end + query->result_size > query->buffer.buf->b.b.width0) { + struct r600_query_buffer *qbuf = MALLOC_STRUCT(r600_query_buffer); + *qbuf = query->buffer; + query->buffer.results_end = 0; + query->buffer.previous = qbuf; + query->buffer.buf = r600_new_query_buffer(ctx->screen, query); + if (!query->buffer.buf) + return; + } + + /* emit begin query */ + va = query->buffer.buf->gpu_address + query->buffer.results_end; + + query->ops->emit_start(ctx, query, query->buffer.buf, va); + + ctx->num_cs_dw_queries_suspend += query->num_cs_dw_end; +} + +static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx, + struct r600_query_hw *query, + struct r600_resource *buffer, + uint64_t va) +{ + struct radeon_winsys_cs *cs = ctx->gfx.cs; + uint64_t fence_va = 0; + + switch (query->b.type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + va += 8; + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + + fence_va = va + ctx->screen->info.num_render_backends * 16 - 8; + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + case PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_SO_STATISTICS: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + va += 16; + emit_sample_streamout(cs, va, query->stream); + break; + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + va += 16; + for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream) + emit_sample_streamout(cs, va + 32 * stream, stream); + break; + case PIPE_QUERY_TIME_ELAPSED: + va += 8; + /* fall through */ + case PIPE_QUERY_TIMESTAMP: + r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS, + 0, EOP_DATA_SEL_TIMESTAMP, NULL, va, + 0, query->b.type); + fence_va = va + 8; + break; + case PIPE_QUERY_PIPELINE_STATISTICS: { + unsigned sample_size = (query->result_size - 8) / 2; + + va += sample_size; + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + + fence_va = va + sample_size; + break; + } + default: + assert(0); + } + r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE, + RADEON_PRIO_QUERY); + + if (fence_va) + r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS, 0, + EOP_DATA_SEL_VALUE_32BIT, + query->buffer.buf, fence_va, 0x80000000, + query->b.type); +} + +static void r600_query_hw_emit_stop(struct r600_common_context *ctx, + struct r600_query_hw *query) +{ + uint64_t va; + + if (!query->buffer.buf) + return; // previous buffer allocation failure + + /* The queries which need begin already called this in begin_query. */ + if (query->flags & R600_QUERY_HW_FLAG_NO_START) { + ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw_end, false); + } + + /* emit end query */ + va = query->buffer.buf->gpu_address + query->buffer.results_end; + + query->ops->emit_stop(ctx, query, query->buffer.buf, va); + + query->buffer.results_end += query->result_size; + + if (!(query->flags & R600_QUERY_HW_FLAG_NO_START)) + ctx->num_cs_dw_queries_suspend -= query->num_cs_dw_end; + + r600_update_occlusion_query_state(ctx, query->b.type, -1); + r600_update_prims_generated_query_state(ctx, query->b.type, -1); +} + +static void emit_set_predicate(struct r600_common_context *ctx, + struct r600_resource *buf, uint64_t va, + uint32_t op) +{ + struct radeon_winsys_cs *cs = ctx->gfx.cs; + + radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0)); + radeon_emit(cs, va); + radeon_emit(cs, op | ((va >> 32) & 0xFF)); + r600_emit_reloc(ctx, &ctx->gfx, buf, RADEON_USAGE_READ, + RADEON_PRIO_QUERY); +} + +static void r600_emit_query_predication(struct r600_common_context *ctx, + struct r600_atom *atom) +{ + struct r600_query_hw *query = (struct r600_query_hw *)ctx->render_cond; + struct r600_query_buffer *qbuf; + uint32_t op; + bool flag_wait, invert; + + if (!query) + return; + + invert = ctx->render_cond_invert; + flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT || + ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT; + + if (query->workaround_buf) { + op = PRED_OP(PREDICATION_OP_BOOL64); + } else { + switch (query->b.type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + op = PRED_OP(PREDICATION_OP_ZPASS); + break; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + op = PRED_OP(PREDICATION_OP_PRIMCOUNT); + invert = !invert; + break; + default: + assert(0); + return; + } + } + + /* if true then invert, see GL_ARB_conditional_render_inverted */ + if (invert) + op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */ + else + op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */ + + /* Use the value written by compute shader as a workaround. Note that + * the wait flag does not apply in this predication mode. + * + * The shader outputs the result value to L2. Workarounds only affect VI + * and later, where the CP reads data from L2, so we don't need an + * additional flush. + */ + if (query->workaround_buf) { + uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset; + emit_set_predicate(ctx, query->workaround_buf, va, op); + return; + } + + op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW; + + /* emit predicate packets for all data blocks */ + for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { + unsigned results_base = 0; + uint64_t va_base = qbuf->buf->gpu_address; + + while (results_base < qbuf->results_end) { + uint64_t va = va_base + results_base; + + if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) { + for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream) { + emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op); + + /* set CONTINUE bit for all packets except the first */ + op |= PREDICATION_CONTINUE; + } + } else { + emit_set_predicate(ctx, qbuf->buf, va, op); + op |= PREDICATION_CONTINUE; + } + + results_base += query->result_size; + } + } +} + +static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index) +{ + struct r600_common_screen *rscreen = + (struct r600_common_screen *)ctx->screen; + + if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT || + query_type == PIPE_QUERY_GPU_FINISHED || + query_type >= PIPE_QUERY_DRIVER_SPECIFIC) + return r600_query_sw_create(query_type); + + return r600_query_hw_create(rscreen, query_type, index); +} + +static void r600_destroy_query(struct pipe_context *ctx, struct pipe_query *query) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + struct r600_query *rquery = (struct r600_query *)query; + + rquery->ops->destroy(rctx->screen, rquery); +} + +static boolean r600_begin_query(struct pipe_context *ctx, + struct pipe_query *query) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + struct r600_query *rquery = (struct r600_query *)query; + + return rquery->ops->begin(rctx, rquery); +} + +void r600_query_hw_reset_buffers(struct r600_common_context *rctx, + struct r600_query_hw *query) +{ + struct r600_query_buffer *prev = query->buffer.previous; + + /* Discard the old query buffers. */ + while (prev) { + struct r600_query_buffer *qbuf = prev; + prev = prev->previous; + r600_resource_reference(&qbuf->buf, NULL); + FREE(qbuf); + } + + query->buffer.results_end = 0; + query->buffer.previous = NULL; + + /* Obtain a new buffer if the current one can't be mapped without a stall. */ + if (r600_rings_is_buffer_referenced(rctx, query->buffer.buf->buf, RADEON_USAGE_READWRITE) || + !rctx->ws->buffer_wait(query->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) { + r600_resource_reference(&query->buffer.buf, NULL); + query->buffer.buf = r600_new_query_buffer(rctx->screen, query); + } else { + if (!query->ops->prepare_buffer(rctx->screen, query, query->buffer.buf)) + r600_resource_reference(&query->buffer.buf, NULL); + } +} + +bool r600_query_hw_begin(struct r600_common_context *rctx, + struct r600_query *rquery) +{ + struct r600_query_hw *query = (struct r600_query_hw *)rquery; + + if (query->flags & R600_QUERY_HW_FLAG_NO_START) { + assert(0); + return false; + } + + if (!(query->flags & R600_QUERY_HW_FLAG_BEGIN_RESUMES)) + r600_query_hw_reset_buffers(rctx, query); + + r600_resource_reference(&query->workaround_buf, NULL); + + r600_query_hw_emit_start(rctx, query); + if (!query->buffer.buf) + return false; + + LIST_ADDTAIL(&query->list, &rctx->active_queries); + return true; +} + +static bool r600_end_query(struct pipe_context *ctx, struct pipe_query *query) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + struct r600_query *rquery = (struct r600_query *)query; + + return rquery->ops->end(rctx, rquery); +} + +bool r600_query_hw_end(struct r600_common_context *rctx, + struct r600_query *rquery) +{ + struct r600_query_hw *query = (struct r600_query_hw *)rquery; + + if (query->flags & R600_QUERY_HW_FLAG_NO_START) + r600_query_hw_reset_buffers(rctx, query); + + r600_query_hw_emit_stop(rctx, query); + + if (!(query->flags & R600_QUERY_HW_FLAG_NO_START)) + LIST_DELINIT(&query->list); + + if (!query->buffer.buf) + return false; + + return true; +} + +static void r600_get_hw_query_params(struct r600_common_context *rctx, + struct r600_query_hw *rquery, int index, + struct r600_hw_query_params *params) +{ + unsigned max_rbs = rctx->screen->info.num_render_backends; + + params->pair_stride = 0; + params->pair_count = 1; + + switch (rquery->b.type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + params->start_offset = 0; + params->end_offset = 8; + params->fence_offset = max_rbs * 16; + params->pair_stride = 16; + params->pair_count = max_rbs; + break; + case PIPE_QUERY_TIME_ELAPSED: + params->start_offset = 0; + params->end_offset = 8; + params->fence_offset = 16; + break; + case PIPE_QUERY_TIMESTAMP: + params->start_offset = 0; + params->end_offset = 0; + params->fence_offset = 8; + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + params->start_offset = 8; + params->end_offset = 24; + params->fence_offset = params->end_offset + 4; + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: + params->start_offset = 0; + params->end_offset = 16; + params->fence_offset = params->end_offset + 4; + break; + case PIPE_QUERY_SO_STATISTICS: + params->start_offset = 8 - index * 8; + params->end_offset = 24 - index * 8; + params->fence_offset = params->end_offset + 4; + break; + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + params->pair_count = R600_MAX_STREAMS; + params->pair_stride = 32; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + params->start_offset = 0; + params->end_offset = 16; + + /* We can re-use the high dword of the last 64-bit value as a + * fence: it is initialized as 0, and the high bit is set by + * the write of the streamout stats event. + */ + params->fence_offset = rquery->result_size - 4; + break; + case PIPE_QUERY_PIPELINE_STATISTICS: + { + /* Offsets apply to EG+ */ + static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80}; + params->start_offset = offsets[index]; + params->end_offset = 88 + offsets[index]; + params->fence_offset = 2 * 88; + break; + } + default: + unreachable("r600_get_hw_query_params unsupported"); + } +} + +static unsigned r600_query_read_result(void *map, unsigned start_index, unsigned end_index, + bool test_status_bit) +{ + uint32_t *current_result = (uint32_t*)map; + uint64_t start, end; + + start = (uint64_t)current_result[start_index] | + (uint64_t)current_result[start_index+1] << 32; + end = (uint64_t)current_result[end_index] | + (uint64_t)current_result[end_index+1] << 32; + + if (!test_status_bit || + ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) { + return end - start; + } + return 0; +} + +static void r600_query_hw_add_result(struct r600_common_screen *rscreen, + struct r600_query_hw *query, + void *buffer, + union pipe_query_result *result) +{ + unsigned max_rbs = rscreen->info.num_render_backends; + + switch (query->b.type) { + case PIPE_QUERY_OCCLUSION_COUNTER: { + for (unsigned i = 0; i < max_rbs; ++i) { + unsigned results_base = i * 16; + result->u64 += + r600_query_read_result(buffer + results_base, 0, 2, true); + } + break; + } + case PIPE_QUERY_OCCLUSION_PREDICATE: { + for (unsigned i = 0; i < max_rbs; ++i) { + unsigned results_base = i * 16; + result->b = result->b || + r600_query_read_result(buffer + results_base, 0, 2, true) != 0; + } + break; + } + case PIPE_QUERY_TIME_ELAPSED: + result->u64 += r600_query_read_result(buffer, 0, 2, false); + break; + case PIPE_QUERY_TIMESTAMP: + result->u64 = *(uint64_t*)buffer; + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + /* SAMPLE_STREAMOUTSTATS stores this structure: + * { + * u64 NumPrimitivesWritten; + * u64 PrimitiveStorageNeeded; + * } + * We only need NumPrimitivesWritten here. */ + result->u64 += r600_query_read_result(buffer, 2, 6, true); + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: + /* Here we read PrimitiveStorageNeeded. */ + result->u64 += r600_query_read_result(buffer, 0, 4, true); + break; + case PIPE_QUERY_SO_STATISTICS: + result->so_statistics.num_primitives_written += + r600_query_read_result(buffer, 2, 6, true); + result->so_statistics.primitives_storage_needed += + r600_query_read_result(buffer, 0, 4, true); + break; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + result->b = result->b || + r600_query_read_result(buffer, 2, 6, true) != + r600_query_read_result(buffer, 0, 4, true); + break; + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream) { + result->b = result->b || + r600_query_read_result(buffer, 2, 6, true) != + r600_query_read_result(buffer, 0, 4, true); + buffer = (char *)buffer + 32; + } + break; + case PIPE_QUERY_PIPELINE_STATISTICS: + if (rscreen->chip_class >= EVERGREEN) { + result->pipeline_statistics.ps_invocations += + r600_query_read_result(buffer, 0, 22, false); + result->pipeline_statistics.c_primitives += + r600_query_read_result(buffer, 2, 24, false); + result->pipeline_statistics.c_invocations += + r600_query_read_result(buffer, 4, 26, false); + result->pipeline_statistics.vs_invocations += + r600_query_read_result(buffer, 6, 28, false); + result->pipeline_statistics.gs_invocations += + r600_query_read_result(buffer, 8, 30, false); + result->pipeline_statistics.gs_primitives += + r600_query_read_result(buffer, 10, 32, false); + result->pipeline_statistics.ia_primitives += + r600_query_read_result(buffer, 12, 34, false); + result->pipeline_statistics.ia_vertices += + r600_query_read_result(buffer, 14, 36, false); + result->pipeline_statistics.hs_invocations += + r600_query_read_result(buffer, 16, 38, false); + result->pipeline_statistics.ds_invocations += + r600_query_read_result(buffer, 18, 40, false); + result->pipeline_statistics.cs_invocations += + r600_query_read_result(buffer, 20, 42, false); + } else { + result->pipeline_statistics.ps_invocations += + r600_query_read_result(buffer, 0, 16, false); + result->pipeline_statistics.c_primitives += + r600_query_read_result(buffer, 2, 18, false); + result->pipeline_statistics.c_invocations += + r600_query_read_result(buffer, 4, 20, false); + result->pipeline_statistics.vs_invocations += + r600_query_read_result(buffer, 6, 22, false); + result->pipeline_statistics.gs_invocations += + r600_query_read_result(buffer, 8, 24, false); + result->pipeline_statistics.gs_primitives += + r600_query_read_result(buffer, 10, 26, false); + result->pipeline_statistics.ia_primitives += + r600_query_read_result(buffer, 12, 28, false); + result->pipeline_statistics.ia_vertices += + r600_query_read_result(buffer, 14, 30, false); + } +#if 0 /* for testing */ + printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, " + "DS=%llu, GS=%llu, GS prims=%llu, Clipper=%llu, " + "Clipper prims=%llu, PS=%llu, CS=%llu\n", + result->pipeline_statistics.ia_vertices, + result->pipeline_statistics.ia_primitives, + result->pipeline_statistics.vs_invocations, + result->pipeline_statistics.hs_invocations, + result->pipeline_statistics.ds_invocations, + result->pipeline_statistics.gs_invocations, + result->pipeline_statistics.gs_primitives, + result->pipeline_statistics.c_invocations, + result->pipeline_statistics.c_primitives, + result->pipeline_statistics.ps_invocations, + result->pipeline_statistics.cs_invocations); +#endif + break; + default: + assert(0); + } +} + +static boolean r600_get_query_result(struct pipe_context *ctx, + struct pipe_query *query, boolean wait, + union pipe_query_result *result) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + struct r600_query *rquery = (struct r600_query *)query; + + return rquery->ops->get_result(rctx, rquery, wait, result); +} + +static void r600_get_query_result_resource(struct pipe_context *ctx, + struct pipe_query *query, + boolean wait, + enum pipe_query_value_type result_type, + int index, + struct pipe_resource *resource, + unsigned offset) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + struct r600_query *rquery = (struct r600_query *)query; + + rquery->ops->get_result_resource(rctx, rquery, wait, result_type, index, + resource, offset); +} + +static void r600_query_hw_clear_result(struct r600_query_hw *query, + union pipe_query_result *result) +{ + util_query_clear_result(result, query->b.type); +} + +bool r600_query_hw_get_result(struct r600_common_context *rctx, + struct r600_query *rquery, + bool wait, union pipe_query_result *result) +{ + struct r600_common_screen *rscreen = rctx->screen; + struct r600_query_hw *query = (struct r600_query_hw *)rquery; + struct r600_query_buffer *qbuf; + + query->ops->clear_result(query, result); + + for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { + unsigned usage = PIPE_TRANSFER_READ | + (wait ? 0 : PIPE_TRANSFER_DONTBLOCK); + unsigned results_base = 0; + void *map; + + if (rquery->b.flushed) + map = rctx->ws->buffer_map(qbuf->buf->buf, NULL, usage); + else + map = r600_buffer_map_sync_with_rings(rctx, qbuf->buf, usage); + + if (!map) + return false; + + while (results_base != qbuf->results_end) { + query->ops->add_result(rscreen, query, map + results_base, + result); + results_base += query->result_size; + } + } + + /* Convert the time to expected units. */ + if (rquery->type == PIPE_QUERY_TIME_ELAPSED || + rquery->type == PIPE_QUERY_TIMESTAMP) { + result->u64 = (1000000 * result->u64) / rscreen->info.clock_crystal_freq; + } + return true; +} + +/* Create the compute shader that is used to collect the results. + * + * One compute grid with a single thread is launched for every query result + * buffer. The thread (optionally) reads a previous summary buffer, then + * accumulates data from the query result buffer, and writes the result either + * to a summary buffer to be consumed by the next grid invocation or to the + * user-supplied buffer. + * + * Data layout: + * + * CONST + * 0.x = end_offset + * 0.y = result_stride + * 0.z = result_count + * 0.w = bit field: + * 1: read previously accumulated values + * 2: write accumulated values for chaining + * 4: write result available + * 8: convert result to boolean (0/1) + * 16: only read one dword and use that as result + * 32: apply timestamp conversion + * 64: store full 64 bits result + * 128: store signed 32 bits result + * 256: SO_OVERFLOW mode: take the difference of two successive half-pairs + * 1.x = fence_offset + * 1.y = pair_stride + * 1.z = pair_count + * + * BUFFER[0] = query result buffer + * BUFFER[1] = previous summary buffer + * BUFFER[2] = next summary buffer or user-supplied buffer + */ +static void r600_create_query_result_shader(struct r600_common_context *rctx) +{ + /* TEMP[0].xy = accumulated result so far + * TEMP[0].z = result not available + * + * TEMP[1].x = current result index + * TEMP[1].y = current pair index + */ + static const char text_tmpl[] = + "COMP\n" + "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n" + "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" + "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" + "DCL BUFFER[0]\n" + "DCL BUFFER[1]\n" + "DCL BUFFER[2]\n" + "DCL CONST[0][0..1]\n" + "DCL TEMP[0..5]\n" + "IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n" + "IMM[1] UINT32 {1, 2, 4, 8}\n" + "IMM[2] UINT32 {16, 32, 64, 128}\n" + "IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */ + "IMM[4] UINT32 {256, 0, 0, 0}\n" + + "AND TEMP[5], CONST[0][0].wwww, IMM[2].xxxx\n" + "UIF TEMP[5]\n" + /* Check result availability. */ + "LOAD TEMP[1].x, BUFFER[0], CONST[0][1].xxxx\n" + "ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n" + "MOV TEMP[1], TEMP[0].zzzz\n" + "NOT TEMP[0].z, TEMP[0].zzzz\n" + + /* Load result if available. */ + "UIF TEMP[1]\n" + "LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n" + "ENDIF\n" + "ELSE\n" + /* Load previously accumulated result if requested. */ + "MOV TEMP[0], IMM[0].xxxx\n" + "AND TEMP[4], CONST[0][0].wwww, IMM[1].xxxx\n" + "UIF TEMP[4]\n" + "LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n" + "ENDIF\n" + + "MOV TEMP[1].x, IMM[0].xxxx\n" + "BGNLOOP\n" + /* Break if accumulated result so far is not available. */ + "UIF TEMP[0].zzzz\n" + "BRK\n" + "ENDIF\n" + + /* Break if result_index >= result_count. */ + "USGE TEMP[5], TEMP[1].xxxx, CONST[0][0].zzzz\n" + "UIF TEMP[5]\n" + "BRK\n" + "ENDIF\n" + + /* Load fence and check result availability */ + "UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy, CONST[0][1].xxxx\n" + "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n" + "ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n" + "NOT TEMP[0].z, TEMP[0].zzzz\n" + "UIF TEMP[0].zzzz\n" + "BRK\n" + "ENDIF\n" + + "MOV TEMP[1].y, IMM[0].xxxx\n" + "BGNLOOP\n" + /* Load start and end. */ + "UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy\n" + "UMAD TEMP[5].x, TEMP[1].yyyy, CONST[0][1].yyyy, TEMP[5].xxxx\n" + "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n" + + "UADD TEMP[5].y, TEMP[5].xxxx, CONST[0][0].xxxx\n" + "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n" + + "U64ADD TEMP[4].xy, TEMP[3], -TEMP[2]\n" + + "AND TEMP[5].z, CONST[0][0].wwww, IMM[4].xxxx\n" + "UIF TEMP[5].zzzz\n" + /* Load second start/end half-pair and + * take the difference + */ + "UADD TEMP[5].xy, TEMP[5], IMM[1].wwww\n" + "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n" + "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n" + + "U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n" + "U64ADD TEMP[4].xy, TEMP[4], -TEMP[3]\n" + "ENDIF\n" + + "U64ADD TEMP[0].xy, TEMP[0], TEMP[4]\n" + + /* Increment pair index */ + "UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n" + "USGE TEMP[5], TEMP[1].yyyy, CONST[0][1].zzzz\n" + "UIF TEMP[5]\n" + "BRK\n" + "ENDIF\n" + "ENDLOOP\n" + + /* Increment result index */ + "UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n" + "ENDLOOP\n" + "ENDIF\n" + + "AND TEMP[4], CONST[0][0].wwww, IMM[1].yyyy\n" + "UIF TEMP[4]\n" + /* Store accumulated data for chaining. */ + "STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n" + "ELSE\n" + "AND TEMP[4], CONST[0][0].wwww, IMM[1].zzzz\n" + "UIF TEMP[4]\n" + /* Store result availability. */ + "NOT TEMP[0].z, TEMP[0]\n" + "AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n" + "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n" + + "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n" + "UIF TEMP[4]\n" + "STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n" + "ENDIF\n" + "ELSE\n" + /* Store result if it is available. */ + "NOT TEMP[4], TEMP[0].zzzz\n" + "UIF TEMP[4]\n" + /* Apply timestamp conversion */ + "AND TEMP[4], CONST[0][0].wwww, IMM[2].yyyy\n" + "UIF TEMP[4]\n" + "U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n" + "U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n" + "ENDIF\n" + + /* Convert to boolean */ + "AND TEMP[4], CONST[0][0].wwww, IMM[1].wwww\n" + "UIF TEMP[4]\n" + "U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[4].zwzw\n" + "AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n" + "MOV TEMP[0].y, IMM[0].xxxx\n" + "ENDIF\n" + + "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n" + "UIF TEMP[4]\n" + "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n" + "ELSE\n" + /* Clamping */ + "UIF TEMP[0].yyyy\n" + "MOV TEMP[0].x, IMM[0].wwww\n" + "ENDIF\n" + + "AND TEMP[4], CONST[0][0].wwww, IMM[2].wwww\n" + "UIF TEMP[4]\n" + "UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n" + "ENDIF\n" + + "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n" + "ENDIF\n" + "ENDIF\n" + "ENDIF\n" + "ENDIF\n" + + "END\n"; + + char text[sizeof(text_tmpl) + 32]; + struct tgsi_token tokens[1024]; + struct pipe_compute_state state = {}; + + /* Hard code the frequency into the shader so that the backend can + * use the full range of optimizations for divide-by-constant. + */ + snprintf(text, sizeof(text), text_tmpl, + rctx->screen->info.clock_crystal_freq); + + if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { + assert(false); + return; + } + + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = tokens; + + rctx->query_result_shader = rctx->b.create_compute_state(&rctx->b, &state); +} + +static void r600_restore_qbo_state(struct r600_common_context *rctx, + struct r600_qbo_state *st) +{ + rctx->b.bind_compute_state(&rctx->b, st->saved_compute); + + rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &st->saved_const0); + pipe_resource_reference(&st->saved_const0.buffer, NULL); + + rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo); + for (unsigned i = 0; i < 3; ++i) + pipe_resource_reference(&st->saved_ssbo[i].buffer, NULL); +} + +static void r600_query_hw_get_result_resource(struct r600_common_context *rctx, + struct r600_query *rquery, + bool wait, + enum pipe_query_value_type result_type, + int index, + struct pipe_resource *resource, + unsigned offset) +{ + struct r600_query_hw *query = (struct r600_query_hw *)rquery; + struct r600_query_buffer *qbuf; + struct r600_query_buffer *qbuf_prev; + struct pipe_resource *tmp_buffer = NULL; + unsigned tmp_buffer_offset = 0; + struct r600_qbo_state saved_state = {}; + struct pipe_grid_info grid = {}; + struct pipe_constant_buffer constant_buffer = {}; + struct pipe_shader_buffer ssbo[3]; + struct r600_hw_query_params params; + struct { + uint32_t end_offset; + uint32_t result_stride; + uint32_t result_count; + uint32_t config; + uint32_t fence_offset; + uint32_t pair_stride; + uint32_t pair_count; + } consts; + + if (!rctx->query_result_shader) { + r600_create_query_result_shader(rctx); + if (!rctx->query_result_shader) + return; + } + + if (query->buffer.previous) { + u_suballocator_alloc(rctx->allocator_zeroed_memory, 16, 16, + &tmp_buffer_offset, &tmp_buffer); + if (!tmp_buffer) + return; + } + + rctx->save_qbo_state(&rctx->b, &saved_state); + + r600_get_hw_query_params(rctx, query, index >= 0 ? index : 0, ¶ms); + consts.end_offset = params.end_offset - params.start_offset; + consts.fence_offset = params.fence_offset - params.start_offset; + consts.result_stride = query->result_size; + consts.pair_stride = params.pair_stride; + consts.pair_count = params.pair_count; + + constant_buffer.buffer_size = sizeof(consts); + constant_buffer.user_buffer = &consts; + + ssbo[1].buffer = tmp_buffer; + ssbo[1].buffer_offset = tmp_buffer_offset; + ssbo[1].buffer_size = 16; + + ssbo[2] = ssbo[1]; + + rctx->b.bind_compute_state(&rctx->b, rctx->query_result_shader); + + grid.block[0] = 1; + grid.block[1] = 1; + grid.block[2] = 1; + grid.grid[0] = 1; + grid.grid[1] = 1; + grid.grid[2] = 1; + + consts.config = 0; + if (index < 0) + consts.config |= 4; + if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE) + consts.config |= 8; + else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || + query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) + consts.config |= 8 | 256; + else if (query->b.type == PIPE_QUERY_TIMESTAMP || + query->b.type == PIPE_QUERY_TIME_ELAPSED) + consts.config |= 32; + + switch (result_type) { + case PIPE_QUERY_TYPE_U64: + case PIPE_QUERY_TYPE_I64: + consts.config |= 64; + break; + case PIPE_QUERY_TYPE_I32: + consts.config |= 128; + break; + case PIPE_QUERY_TYPE_U32: + break; + } + + rctx->flags |= rctx->screen->barrier_flags.cp_to_L2; + + for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) { + if (query->b.type != PIPE_QUERY_TIMESTAMP) { + qbuf_prev = qbuf->previous; + consts.result_count = qbuf->results_end / query->result_size; + consts.config &= ~3; + if (qbuf != &query->buffer) + consts.config |= 1; + if (qbuf->previous) + consts.config |= 2; + } else { + /* Only read the last timestamp. */ + qbuf_prev = NULL; + consts.result_count = 0; + consts.config |= 16; + params.start_offset += qbuf->results_end - query->result_size; + } + + rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer); + + ssbo[0].buffer = &qbuf->buf->b.b; + ssbo[0].buffer_offset = params.start_offset; + ssbo[0].buffer_size = qbuf->results_end - params.start_offset; + + if (!qbuf->previous) { + ssbo[2].buffer = resource; + ssbo[2].buffer_offset = offset; + ssbo[2].buffer_size = 8; + + } + + rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo); + + if (wait && qbuf == &query->buffer) { + uint64_t va; + + /* Wait for result availability. Wait only for readiness + * of the last entry, since the fence writes should be + * serialized in the CP. + */ + va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size; + va += params.fence_offset; + + r600_gfx_wait_fence(rctx, va, 0x80000000, 0x80000000); + } + + rctx->b.launch_grid(&rctx->b, &grid); + rctx->flags |= rctx->screen->barrier_flags.compute_to_L2; + } + + r600_restore_qbo_state(rctx, &saved_state); + pipe_resource_reference(&tmp_buffer, NULL); +} + +static void r600_render_condition(struct pipe_context *ctx, + struct pipe_query *query, + boolean condition, + enum pipe_render_cond_flag mode) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + struct r600_query_hw *rquery = (struct r600_query_hw *)query; + struct r600_query_buffer *qbuf; + struct r600_atom *atom = &rctx->render_cond_atom; + + /* Compute the size of SET_PREDICATION packets. */ + atom->num_dw = 0; + if (query) { + for (qbuf = &rquery->buffer; qbuf; qbuf = qbuf->previous) + atom->num_dw += (qbuf->results_end / rquery->result_size) * 5; + + if (rquery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) + atom->num_dw *= R600_MAX_STREAMS; + } + + rctx->render_cond = query; + rctx->render_cond_invert = condition; + rctx->render_cond_mode = mode; + + rctx->set_atom_dirty(rctx, atom, query != NULL); +} + +void r600_suspend_queries(struct r600_common_context *ctx) +{ + struct r600_query_hw *query; + + LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) { + r600_query_hw_emit_stop(ctx, query); + } + assert(ctx->num_cs_dw_queries_suspend == 0); +} + +static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context *ctx, + struct list_head *query_list) +{ + struct r600_query_hw *query; + unsigned num_dw = 0; + + LIST_FOR_EACH_ENTRY(query, query_list, list) { + /* begin + end */ + num_dw += query->num_cs_dw_begin + query->num_cs_dw_end; + + /* Workaround for the fact that + * num_cs_dw_nontimer_queries_suspend is incremented for every + * resumed query, which raises the bar in need_cs_space for + * queries about to be resumed. + */ + num_dw += query->num_cs_dw_end; + } + /* primitives generated query */ + num_dw += ctx->streamout.enable_atom.num_dw; + /* guess for ZPASS enable or PERFECT_ZPASS_COUNT enable updates */ + num_dw += 13; + + return num_dw; +} + +void r600_resume_queries(struct r600_common_context *ctx) +{ + struct r600_query_hw *query; + unsigned num_cs_dw = r600_queries_num_cs_dw_for_resuming(ctx, &ctx->active_queries); + + assert(ctx->num_cs_dw_queries_suspend == 0); + + /* Check CS space here. Resuming must not be interrupted by flushes. */ + ctx->need_gfx_cs_space(&ctx->b, num_cs_dw, true); + + LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) { + r600_query_hw_emit_start(ctx, query); + } +} + +/* Fix radeon_info::enabled_rb_mask for R600, R700, EVERGREEN, NI. */ +void r600_query_fix_enabled_rb_mask(struct r600_common_screen *rscreen) +{ + struct r600_common_context *ctx = + (struct r600_common_context*)rscreen->aux_context; + struct radeon_winsys_cs *cs = ctx->gfx.cs; + struct r600_resource *buffer; + uint32_t *results; + unsigned i, mask = 0; + unsigned max_rbs = ctx->screen->info.num_render_backends; + + assert(rscreen->chip_class <= CAYMAN); + + /* if backend_map query is supported by the kernel */ + if (rscreen->info.r600_gb_backend_map_valid) { + unsigned num_tile_pipes = rscreen->info.num_tile_pipes; + unsigned backend_map = rscreen->info.r600_gb_backend_map; + unsigned item_width, item_mask; + + if (ctx->chip_class >= EVERGREEN) { + item_width = 4; + item_mask = 0x7; + } else { + item_width = 2; + item_mask = 0x3; + } + + while (num_tile_pipes--) { + i = backend_map & item_mask; + mask |= (1<<i); + backend_map >>= item_width; + } + if (mask != 0) { + rscreen->info.enabled_rb_mask = mask; + return; + } + } + + /* otherwise backup path for older kernels */ + + /* create buffer for event data */ + buffer = (struct r600_resource*) + pipe_buffer_create(ctx->b.screen, 0, + PIPE_USAGE_STAGING, max_rbs * 16); + if (!buffer) + return; + + /* initialize buffer with zeroes */ + results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_TRANSFER_WRITE); + if (results) { + memset(results, 0, max_rbs * 4 * 4); + + /* emit EVENT_WRITE for ZPASS_DONE */ + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1)); + radeon_emit(cs, buffer->gpu_address); + radeon_emit(cs, buffer->gpu_address >> 32); + + r600_emit_reloc(ctx, &ctx->gfx, buffer, + RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); + + /* analyze results */ + results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_TRANSFER_READ); + if (results) { + for(i = 0; i < max_rbs; i++) { + /* at least highest bit will be set if backend is used */ + if (results[i*4 + 1]) + mask |= (1<<i); + } + } + } + + r600_resource_reference(&buffer, NULL); + + if (mask) + rscreen->info.enabled_rb_mask = mask; +} + +#define XFULL(name_, query_type_, type_, result_type_, group_id_) \ + { \ + .name = name_, \ + .query_type = R600_QUERY_##query_type_, \ + .type = PIPE_DRIVER_QUERY_TYPE_##type_, \ + .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, \ + .group_id = group_id_ \ + } + +#define X(name_, query_type_, type_, result_type_) \ + XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0) + +#define XG(group_, name_, query_type_, type_, result_type_) \ + XFULL(name_, query_type_, type_, result_type_, R600_QUERY_GROUP_##group_) + +static struct pipe_driver_query_info r600_driver_query_list[] = { + X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE), + X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE), + X("num-shader-cache-hits", NUM_SHADER_CACHE_HITS, UINT64, CUMULATIVE), + X("draw-calls", DRAW_CALLS, UINT64, AVERAGE), + X("decompress-calls", DECOMPRESS_CALLS, UINT64, AVERAGE), + X("MRT-draw-calls", MRT_DRAW_CALLS, UINT64, AVERAGE), + X("prim-restart-calls", PRIM_RESTART_CALLS, UINT64, AVERAGE), + X("spill-draw-calls", SPILL_DRAW_CALLS, UINT64, AVERAGE), + X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE), + X("spill-compute-calls", SPILL_COMPUTE_CALLS, UINT64, AVERAGE), + X("dma-calls", DMA_CALLS, UINT64, AVERAGE), + X("cp-dma-calls", CP_DMA_CALLS, UINT64, AVERAGE), + X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE), + X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE), + X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE), + X("num-CB-cache-flushes", NUM_CB_CACHE_FLUSHES, UINT64, AVERAGE), + X("num-DB-cache-flushes", NUM_DB_CACHE_FLUSHES, UINT64, AVERAGE), + X("num-L2-invalidates", NUM_L2_INVALIDATES, UINT64, AVERAGE), + X("num-L2-writebacks", NUM_L2_WRITEBACKS, UINT64, AVERAGE), + X("num-resident-handles", NUM_RESIDENT_HANDLES, UINT64, AVERAGE), + X("tc-offloaded-slots", TC_OFFLOADED_SLOTS, UINT64, AVERAGE), + X("tc-direct-slots", TC_DIRECT_SLOTS, UINT64, AVERAGE), + X("tc-num-syncs", TC_NUM_SYNCS, UINT64, AVERAGE), + X("CS-thread-busy", CS_THREAD_BUSY, UINT64, AVERAGE), + X("gallium-thread-busy", GALLIUM_THREAD_BUSY, UINT64, AVERAGE), + X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE), + X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE), + X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE), + X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE), + X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE), + X("num-mapped-buffers", NUM_MAPPED_BUFFERS, UINT64, AVERAGE), + X("num-GFX-IBs", NUM_GFX_IBS, UINT64, AVERAGE), + X("num-SDMA-IBs", NUM_SDMA_IBS, UINT64, AVERAGE), + X("GFX-BO-list-size", GFX_BO_LIST_SIZE, UINT64, AVERAGE), + X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE), + X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE), + X("VRAM-CPU-page-faults", NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE), + X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE), + X("VRAM-vis-usage", VRAM_VIS_USAGE, BYTES, AVERAGE), + X("GTT-usage", GTT_USAGE, BYTES, AVERAGE), + + /* GPIN queries are for the benefit of old versions of GPUPerfStudio, + * which use it as a fallback path to detect the GPU type. + * + * Note: The names of these queries are significant for GPUPerfStudio + * (and possibly their order as well). */ + XG(GPIN, "GPIN_000", GPIN_ASIC_ID, UINT, AVERAGE), + XG(GPIN, "GPIN_001", GPIN_NUM_SIMD, UINT, AVERAGE), + XG(GPIN, "GPIN_002", GPIN_NUM_RB, UINT, AVERAGE), + XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE), + XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE), + + X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE), + X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE), + X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE), + + /* The following queries must be at the end of the list because their + * availability is adjusted dynamically based on the DRM version. */ + X("GPU-load", GPU_LOAD, UINT64, AVERAGE), + X("GPU-shaders-busy", GPU_SHADERS_BUSY, UINT64, AVERAGE), + X("GPU-ta-busy", GPU_TA_BUSY, UINT64, AVERAGE), + X("GPU-gds-busy", GPU_GDS_BUSY, UINT64, AVERAGE), + X("GPU-vgt-busy", GPU_VGT_BUSY, UINT64, AVERAGE), + X("GPU-ia-busy", GPU_IA_BUSY, UINT64, AVERAGE), + X("GPU-sx-busy", GPU_SX_BUSY, UINT64, AVERAGE), + X("GPU-wd-busy", GPU_WD_BUSY, UINT64, AVERAGE), + X("GPU-bci-busy", GPU_BCI_BUSY, UINT64, AVERAGE), + X("GPU-sc-busy", GPU_SC_BUSY, UINT64, AVERAGE), + X("GPU-pa-busy", GPU_PA_BUSY, UINT64, AVERAGE), + X("GPU-db-busy", GPU_DB_BUSY, UINT64, AVERAGE), + X("GPU-cp-busy", GPU_CP_BUSY, UINT64, AVERAGE), + X("GPU-cb-busy", GPU_CB_BUSY, UINT64, AVERAGE), + X("GPU-sdma-busy", GPU_SDMA_BUSY, UINT64, AVERAGE), + X("GPU-pfp-busy", GPU_PFP_BUSY, UINT64, AVERAGE), + X("GPU-meq-busy", GPU_MEQ_BUSY, UINT64, AVERAGE), + X("GPU-me-busy", GPU_ME_BUSY, UINT64, AVERAGE), + X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE), + X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE), + X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE), +}; + +#undef X +#undef XG +#undef XFULL + +static unsigned r600_get_num_queries(struct r600_common_screen *rscreen) +{ + if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42) + return ARRAY_SIZE(r600_driver_query_list); + else + return ARRAY_SIZE(r600_driver_query_list) - 25; +} + +static int r600_get_driver_query_info(struct pipe_screen *screen, + unsigned index, + struct pipe_driver_query_info *info) +{ + struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; + unsigned num_queries = r600_get_num_queries(rscreen); + + if (!info) { + unsigned num_perfcounters = + r600_get_perfcounter_info(rscreen, 0, NULL); + + return num_queries + num_perfcounters; + } + + if (index >= num_queries) + return r600_get_perfcounter_info(rscreen, index - num_queries, info); + + *info = r600_driver_query_list[index]; + + switch (info->query_type) { + case R600_QUERY_REQUESTED_VRAM: + case R600_QUERY_VRAM_USAGE: + case R600_QUERY_MAPPED_VRAM: + info->max_value.u64 = rscreen->info.vram_size; + break; + case R600_QUERY_REQUESTED_GTT: + case R600_QUERY_GTT_USAGE: + case R600_QUERY_MAPPED_GTT: + info->max_value.u64 = rscreen->info.gart_size; + break; + case R600_QUERY_GPU_TEMPERATURE: + info->max_value.u64 = 125; + break; + case R600_QUERY_VRAM_VIS_USAGE: + info->max_value.u64 = rscreen->info.vram_vis_size; + break; + } + + if (info->group_id != ~(unsigned)0 && rscreen->perfcounters) + info->group_id += rscreen->perfcounters->num_groups; + + return 1; +} + +/* Note: Unfortunately, GPUPerfStudio hardcodes the order of hardware + * performance counter groups, so be careful when changing this and related + * functions. + */ +static int r600_get_driver_query_group_info(struct pipe_screen *screen, + unsigned index, + struct pipe_driver_query_group_info *info) +{ + struct r600_common_screen *rscreen = (struct r600_common_screen *)screen; + unsigned num_pc_groups = 0; + + if (rscreen->perfcounters) + num_pc_groups = rscreen->perfcounters->num_groups; + + if (!info) + return num_pc_groups + R600_NUM_SW_QUERY_GROUPS; + + if (index < num_pc_groups) + return r600_get_perfcounter_group_info(rscreen, index, info); + + index -= num_pc_groups; + if (index >= R600_NUM_SW_QUERY_GROUPS) + return 0; + + info->name = "GPIN"; + info->max_active_queries = 5; + info->num_queries = 5; + return 1; +} + +void r600_query_init(struct r600_common_context *rctx) +{ + rctx->b.create_query = r600_create_query; + rctx->b.create_batch_query = r600_create_batch_query; + rctx->b.destroy_query = r600_destroy_query; + rctx->b.begin_query = r600_begin_query; + rctx->b.end_query = r600_end_query; + rctx->b.get_query_result = r600_get_query_result; + rctx->b.get_query_result_resource = r600_get_query_result_resource; + rctx->render_cond_atom.emit = r600_emit_query_predication; + + if (((struct r600_common_screen*)rctx->b.screen)->info.num_render_backends > 0) + rctx->b.render_condition = r600_render_condition; + + LIST_INITHEAD(&rctx->active_queries); +} + +void r600_init_screen_query_functions(struct r600_common_screen *rscreen) +{ + rscreen->b.get_driver_query_info = r600_get_driver_query_info; + rscreen->b.get_driver_query_group_info = r600_get_driver_query_group_info; +} diff --git a/lib/mesa/src/gallium/drivers/r600/r600_query.h b/lib/mesa/src/gallium/drivers/r600/r600_query.h new file mode 100644 index 000000000..1a3c6839e --- /dev/null +++ b/lib/mesa/src/gallium/drivers/r600/r600_query.h @@ -0,0 +1,326 @@ +/* + * Copyright 2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Nicolai Hähnle <nicolai.haehnle@amd.com> + * + */ + +#ifndef R600_QUERY_H +#define R600_QUERY_H + +#include "util/u_threaded_context.h" + +struct pipe_context; +struct pipe_query; +struct pipe_resource; + +struct r600_common_context; +struct r600_common_screen; +struct r600_query; +struct r600_query_hw; +struct r600_resource; + +enum { + R600_QUERY_DRAW_CALLS = PIPE_QUERY_DRIVER_SPECIFIC, + R600_QUERY_DECOMPRESS_CALLS, + R600_QUERY_MRT_DRAW_CALLS, + R600_QUERY_PRIM_RESTART_CALLS, + R600_QUERY_SPILL_DRAW_CALLS, + R600_QUERY_COMPUTE_CALLS, + R600_QUERY_SPILL_COMPUTE_CALLS, + R600_QUERY_DMA_CALLS, + R600_QUERY_CP_DMA_CALLS, + R600_QUERY_NUM_VS_FLUSHES, + R600_QUERY_NUM_PS_FLUSHES, + R600_QUERY_NUM_CS_FLUSHES, + R600_QUERY_NUM_CB_CACHE_FLUSHES, + R600_QUERY_NUM_DB_CACHE_FLUSHES, + R600_QUERY_NUM_L2_INVALIDATES, + R600_QUERY_NUM_L2_WRITEBACKS, + R600_QUERY_NUM_RESIDENT_HANDLES, + R600_QUERY_TC_OFFLOADED_SLOTS, + R600_QUERY_TC_DIRECT_SLOTS, + R600_QUERY_TC_NUM_SYNCS, + R600_QUERY_CS_THREAD_BUSY, + R600_QUERY_GALLIUM_THREAD_BUSY, + R600_QUERY_REQUESTED_VRAM, + R600_QUERY_REQUESTED_GTT, + R600_QUERY_MAPPED_VRAM, + R600_QUERY_MAPPED_GTT, + R600_QUERY_BUFFER_WAIT_TIME, + R600_QUERY_NUM_MAPPED_BUFFERS, + R600_QUERY_NUM_GFX_IBS, + R600_QUERY_NUM_SDMA_IBS, + R600_QUERY_GFX_BO_LIST_SIZE, + R600_QUERY_NUM_BYTES_MOVED, + R600_QUERY_NUM_EVICTIONS, + R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS, + R600_QUERY_VRAM_USAGE, + R600_QUERY_VRAM_VIS_USAGE, + R600_QUERY_GTT_USAGE, + R600_QUERY_GPU_TEMPERATURE, + R600_QUERY_CURRENT_GPU_SCLK, + R600_QUERY_CURRENT_GPU_MCLK, + R600_QUERY_GPU_LOAD, + R600_QUERY_GPU_SHADERS_BUSY, + R600_QUERY_GPU_TA_BUSY, + R600_QUERY_GPU_GDS_BUSY, + R600_QUERY_GPU_VGT_BUSY, + R600_QUERY_GPU_IA_BUSY, + R600_QUERY_GPU_SX_BUSY, + R600_QUERY_GPU_WD_BUSY, + R600_QUERY_GPU_BCI_BUSY, + R600_QUERY_GPU_SC_BUSY, + R600_QUERY_GPU_PA_BUSY, + R600_QUERY_GPU_DB_BUSY, + R600_QUERY_GPU_CP_BUSY, + R600_QUERY_GPU_CB_BUSY, + R600_QUERY_GPU_SDMA_BUSY, + R600_QUERY_GPU_PFP_BUSY, + R600_QUERY_GPU_MEQ_BUSY, + R600_QUERY_GPU_ME_BUSY, + R600_QUERY_GPU_SURF_SYNC_BUSY, + R600_QUERY_GPU_CP_DMA_BUSY, + R600_QUERY_GPU_SCRATCH_RAM_BUSY, + R600_QUERY_NUM_COMPILATIONS, + R600_QUERY_NUM_SHADERS_CREATED, + R600_QUERY_NUM_SHADER_CACHE_HITS, + R600_QUERY_GPIN_ASIC_ID, + R600_QUERY_GPIN_NUM_SIMD, + R600_QUERY_GPIN_NUM_RB, + R600_QUERY_GPIN_NUM_SPI, + R600_QUERY_GPIN_NUM_SE, + + R600_QUERY_FIRST_PERFCOUNTER = PIPE_QUERY_DRIVER_SPECIFIC + 100, +}; + +enum { + R600_QUERY_GROUP_GPIN = 0, + R600_NUM_SW_QUERY_GROUPS +}; + +struct r600_query_ops { + void (*destroy)(struct r600_common_screen *, struct r600_query *); + bool (*begin)(struct r600_common_context *, struct r600_query *); + bool (*end)(struct r600_common_context *, struct r600_query *); + bool (*get_result)(struct r600_common_context *, + struct r600_query *, bool wait, + union pipe_query_result *result); + void (*get_result_resource)(struct r600_common_context *, + struct r600_query *, bool wait, + enum pipe_query_value_type result_type, + int index, + struct pipe_resource *resource, + unsigned offset); +}; + +struct r600_query { + struct threaded_query b; + struct r600_query_ops *ops; + + /* The type of query */ + unsigned type; +}; + +enum { + R600_QUERY_HW_FLAG_NO_START = (1 << 0), + /* gap */ + /* whether begin_query doesn't clear the result */ + R600_QUERY_HW_FLAG_BEGIN_RESUMES = (1 << 2), +}; + +struct r600_query_hw_ops { + bool (*prepare_buffer)(struct r600_common_screen *, + struct r600_query_hw *, + struct r600_resource *); + void (*emit_start)(struct r600_common_context *, + struct r600_query_hw *, + struct r600_resource *buffer, uint64_t va); + void (*emit_stop)(struct r600_common_context *, + struct r600_query_hw *, + struct r600_resource *buffer, uint64_t va); + void (*clear_result)(struct r600_query_hw *, union pipe_query_result *); + void (*add_result)(struct r600_common_screen *screen, + struct r600_query_hw *, void *buffer, + union pipe_query_result *result); +}; + +struct r600_query_buffer { + /* The buffer where query results are stored. */ + struct r600_resource *buf; + /* Offset of the next free result after current query data */ + unsigned results_end; + /* If a query buffer is full, a new buffer is created and the old one + * is put in here. When we calculate the result, we sum up the samples + * from all buffers. */ + struct r600_query_buffer *previous; +}; + +struct r600_query_hw { + struct r600_query b; + struct r600_query_hw_ops *ops; + unsigned flags; + + /* The query buffer and how many results are in it. */ + struct r600_query_buffer buffer; + /* Size of the result in memory for both begin_query and end_query, + * this can be one or two numbers, or it could even be a size of a structure. */ + unsigned result_size; + /* The number of dwords for begin_query or end_query. */ + unsigned num_cs_dw_begin; + unsigned num_cs_dw_end; + /* Linked list of queries */ + struct list_head list; + /* For transform feedback: which stream the query is for */ + unsigned stream; + + /* Workaround via compute shader */ + struct r600_resource *workaround_buf; + unsigned workaround_offset; +}; + +bool r600_query_hw_init(struct r600_common_screen *rscreen, + struct r600_query_hw *query); +void r600_query_hw_destroy(struct r600_common_screen *rscreen, + struct r600_query *rquery); +bool r600_query_hw_begin(struct r600_common_context *rctx, + struct r600_query *rquery); +bool r600_query_hw_end(struct r600_common_context *rctx, + struct r600_query *rquery); +bool r600_query_hw_get_result(struct r600_common_context *rctx, + struct r600_query *rquery, + bool wait, + union pipe_query_result *result); + +/* Performance counters */ +enum { + /* This block is part of the shader engine */ + R600_PC_BLOCK_SE = (1 << 0), + + /* Expose per-instance groups instead of summing all instances (within + * an SE). */ + R600_PC_BLOCK_INSTANCE_GROUPS = (1 << 1), + + /* Expose per-SE groups instead of summing instances across SEs. */ + R600_PC_BLOCK_SE_GROUPS = (1 << 2), + + /* Shader block */ + R600_PC_BLOCK_SHADER = (1 << 3), + + /* Non-shader block with perfcounters windowed by shaders. */ + R600_PC_BLOCK_SHADER_WINDOWED = (1 << 4), +}; + +/* Describes a hardware block with performance counters. Multiple instances of + * each block, possibly per-SE, may exist on the chip. Depending on the block + * and on the user's configuration, we either + * (a) expose every instance as a performance counter group, + * (b) expose a single performance counter group that reports the sum over all + * instances, or + * (c) expose one performance counter group per instance, but summed over all + * shader engines. + */ +struct r600_perfcounter_block { + const char *basename; + unsigned flags; + unsigned num_counters; + unsigned num_selectors; + unsigned num_instances; + + unsigned num_groups; + char *group_names; + unsigned group_name_stride; + + char *selector_names; + unsigned selector_name_stride; + + void *data; +}; + +struct r600_perfcounters { + unsigned num_groups; + unsigned num_blocks; + struct r600_perfcounter_block *blocks; + + unsigned num_start_cs_dwords; + unsigned num_stop_cs_dwords; + unsigned num_instance_cs_dwords; + unsigned num_shaders_cs_dwords; + + unsigned num_shader_types; + const char * const *shader_type_suffixes; + const unsigned *shader_type_bits; + + void (*get_size)(struct r600_perfcounter_block *, + unsigned count, unsigned *selectors, + unsigned *num_select_dw, unsigned *num_read_dw); + + void (*emit_instance)(struct r600_common_context *, + int se, int instance); + void (*emit_shaders)(struct r600_common_context *, unsigned shaders); + void (*emit_select)(struct r600_common_context *, + struct r600_perfcounter_block *, + unsigned count, unsigned *selectors); + void (*emit_start)(struct r600_common_context *, + struct r600_resource *buffer, uint64_t va); + void (*emit_stop)(struct r600_common_context *, + struct r600_resource *buffer, uint64_t va); + void (*emit_read)(struct r600_common_context *, + struct r600_perfcounter_block *, + unsigned count, unsigned *selectors, + struct r600_resource *buffer, uint64_t va); + + void (*cleanup)(struct r600_common_screen *); + + bool separate_se; + bool separate_instance; +}; + +struct pipe_query *r600_create_batch_query(struct pipe_context *ctx, + unsigned num_queries, + unsigned *query_types); + +int r600_get_perfcounter_info(struct r600_common_screen *, + unsigned index, + struct pipe_driver_query_info *info); +int r600_get_perfcounter_group_info(struct r600_common_screen *, + unsigned index, + struct pipe_driver_query_group_info *info); + +bool r600_perfcounters_init(struct r600_perfcounters *, unsigned num_blocks); +void r600_perfcounters_add_block(struct r600_common_screen *, + struct r600_perfcounters *, + const char *name, unsigned flags, + unsigned counters, unsigned selectors, + unsigned instances, void *data); +void r600_perfcounters_do_destroy(struct r600_perfcounters *); +void r600_query_hw_reset_buffers(struct r600_common_context *rctx, + struct r600_query_hw *query); + +struct r600_qbo_state { + void *saved_compute; + struct pipe_constant_buffer saved_const0; + struct pipe_shader_buffer saved_ssbo[3]; +}; + +#endif /* R600_QUERY_H */ diff --git a/lib/mesa/src/gallium/drivers/r600/r600_streamout.c b/lib/mesa/src/gallium/drivers/r600/r600_streamout.c new file mode 100644 index 000000000..78334066c --- /dev/null +++ b/lib/mesa/src/gallium/drivers/r600/r600_streamout.c @@ -0,0 +1,365 @@ +/* + * Copyright 2013 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: Marek Olšák <maraeo@gmail.com> + * + */ + +#include "r600_pipe_common.h" +#include "r600_cs.h" + +#include "util/u_memory.h" +#include "evergreend.h" + +#define R_008490_CP_STRMOUT_CNTL 0x008490 +#define R_028AB0_VGT_STRMOUT_EN 0x028AB0 +#define R_028B20_VGT_STRMOUT_BUFFER_EN 0x028B20 + +static void r600_set_streamout_enable(struct r600_common_context *rctx, bool enable); + +static struct pipe_stream_output_target * +r600_create_so_target(struct pipe_context *ctx, + struct pipe_resource *buffer, + unsigned buffer_offset, + unsigned buffer_size) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + struct r600_so_target *t; + struct r600_resource *rbuffer = (struct r600_resource*)buffer; + + t = CALLOC_STRUCT(r600_so_target); + if (!t) { + return NULL; + } + + u_suballocator_alloc(rctx->allocator_zeroed_memory, 4, 4, + &t->buf_filled_size_offset, + (struct pipe_resource**)&t->buf_filled_size); + if (!t->buf_filled_size) { + FREE(t); + return NULL; + } + + t->b.reference.count = 1; + t->b.context = ctx; + pipe_resource_reference(&t->b.buffer, buffer); + t->b.buffer_offset = buffer_offset; + t->b.buffer_size = buffer_size; + + util_range_add(&rbuffer->valid_buffer_range, buffer_offset, + buffer_offset + buffer_size); + return &t->b; +} + +static void r600_so_target_destroy(struct pipe_context *ctx, + struct pipe_stream_output_target *target) +{ + struct r600_so_target *t = (struct r600_so_target*)target; + pipe_resource_reference(&t->b.buffer, NULL); + r600_resource_reference(&t->buf_filled_size, NULL); + FREE(t); +} + +void r600_streamout_buffers_dirty(struct r600_common_context *rctx) +{ + struct r600_atom *begin = &rctx->streamout.begin_atom; + unsigned num_bufs = util_bitcount(rctx->streamout.enabled_mask); + unsigned num_bufs_appended = util_bitcount(rctx->streamout.enabled_mask & + rctx->streamout.append_bitmask); + + if (!num_bufs) + return; + + rctx->streamout.num_dw_for_end = + 12 + /* flush_vgt_streamout */ + num_bufs * 11; /* STRMOUT_BUFFER_UPDATE, BUFFER_SIZE */ + + begin->num_dw = 12; /* flush_vgt_streamout */ + + begin->num_dw += num_bufs * 7; /* SET_CONTEXT_REG */ + + if (rctx->family >= CHIP_RS780 && rctx->family <= CHIP_RV740) + begin->num_dw += num_bufs * 5; /* STRMOUT_BASE_UPDATE */ + + begin->num_dw += + num_bufs_appended * 8 + /* STRMOUT_BUFFER_UPDATE */ + (num_bufs - num_bufs_appended) * 6 + /* STRMOUT_BUFFER_UPDATE */ + (rctx->family > CHIP_R600 && rctx->family < CHIP_RS780 ? 2 : 0); /* SURFACE_BASE_UPDATE */ + + rctx->set_atom_dirty(rctx, begin, true); + + r600_set_streamout_enable(rctx, true); +} + +void r600_set_streamout_targets(struct pipe_context *ctx, + unsigned num_targets, + struct pipe_stream_output_target **targets, + const unsigned *offsets) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + unsigned i; + unsigned enabled_mask = 0, append_bitmask = 0; + + /* Stop streamout. */ + if (rctx->streamout.num_targets && rctx->streamout.begin_emitted) { + r600_emit_streamout_end(rctx); + } + + /* Set the new targets. */ + for (i = 0; i < num_targets; i++) { + pipe_so_target_reference((struct pipe_stream_output_target**)&rctx->streamout.targets[i], targets[i]); + if (!targets[i]) + continue; + + r600_context_add_resource_size(ctx, targets[i]->buffer); + enabled_mask |= 1 << i; + if (offsets[i] == ((unsigned)-1)) + append_bitmask |= 1 << i; + } + for (; i < rctx->streamout.num_targets; i++) { + pipe_so_target_reference((struct pipe_stream_output_target**)&rctx->streamout.targets[i], NULL); + } + + rctx->streamout.enabled_mask = enabled_mask; + + rctx->streamout.num_targets = num_targets; + rctx->streamout.append_bitmask = append_bitmask; + + if (num_targets) { + r600_streamout_buffers_dirty(rctx); + } else { + rctx->set_atom_dirty(rctx, &rctx->streamout.begin_atom, false); + r600_set_streamout_enable(rctx, false); + } +} + +static void r600_flush_vgt_streamout(struct r600_common_context *rctx) +{ + struct radeon_winsys_cs *cs = rctx->gfx.cs; + unsigned reg_strmout_cntl; + + /* The register is at different places on different ASICs. */ + if (rctx->chip_class >= EVERGREEN) { + reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL; + } else { + reg_strmout_cntl = R_008490_CP_STRMOUT_CNTL; + } + + radeon_set_config_reg(cs, reg_strmout_cntl, 0); + + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0)); + + radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); + radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */ + radeon_emit(cs, reg_strmout_cntl >> 2); /* register */ + radeon_emit(cs, 0); + radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */ + radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */ + radeon_emit(cs, 4); /* poll interval */ +} + +static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r600_atom *atom) +{ + struct radeon_winsys_cs *cs = rctx->gfx.cs; + struct r600_so_target **t = rctx->streamout.targets; + uint16_t *stride_in_dw = rctx->streamout.stride_in_dw; + unsigned i, update_flags = 0; + + r600_flush_vgt_streamout(rctx); + + for (i = 0; i < rctx->streamout.num_targets; i++) { + if (!t[i]) + continue; + + t[i]->stride_in_dw = stride_in_dw[i]; + + uint64_t va = r600_resource(t[i]->b.buffer)->gpu_address; + + update_flags |= SURFACE_BASE_UPDATE_STRMOUT(i); + + radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 3); + radeon_emit(cs, (t[i]->b.buffer_offset + + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */ + radeon_emit(cs, stride_in_dw[i]); /* VTX_STRIDE (in DW) */ + radeon_emit(cs, va >> 8); /* BUFFER_BASE */ + + r600_emit_reloc(rctx, &rctx->gfx, r600_resource(t[i]->b.buffer), + RADEON_USAGE_WRITE, RADEON_PRIO_SHADER_RW_BUFFER); + + /* R7xx requires this packet after updating BUFFER_BASE. + * Without this, R7xx locks up. */ + if (rctx->family >= CHIP_RS780 && rctx->family <= CHIP_RV740) { + radeon_emit(cs, PKT3(PKT3_STRMOUT_BASE_UPDATE, 1, 0)); + radeon_emit(cs, i); + radeon_emit(cs, va >> 8); + + r600_emit_reloc(rctx, &rctx->gfx, r600_resource(t[i]->b.buffer), + RADEON_USAGE_WRITE, RADEON_PRIO_SHADER_RW_BUFFER); + } + + if (rctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) { + uint64_t va = t[i]->buf_filled_size->gpu_address + + t[i]->buf_filled_size_offset; + + /* Append. */ + radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); + radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | + STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */ + radeon_emit(cs, 0); /* unused */ + radeon_emit(cs, 0); /* unused */ + radeon_emit(cs, va); /* src address lo */ + radeon_emit(cs, va >> 32); /* src address hi */ + + r600_emit_reloc(rctx, &rctx->gfx, t[i]->buf_filled_size, + RADEON_USAGE_READ, RADEON_PRIO_SO_FILLED_SIZE); + } else { + /* Start from the beginning. */ + radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); + radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | + STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */ + radeon_emit(cs, 0); /* unused */ + radeon_emit(cs, 0); /* unused */ + radeon_emit(cs, t[i]->b.buffer_offset >> 2); /* buffer offset in DW */ + radeon_emit(cs, 0); /* unused */ + } + } + + if (rctx->family > CHIP_R600 && rctx->family < CHIP_RV770) { + radeon_emit(cs, PKT3(PKT3_SURFACE_BASE_UPDATE, 0, 0)); + radeon_emit(cs, update_flags); + } + rctx->streamout.begin_emitted = true; +} + +void r600_emit_streamout_end(struct r600_common_context *rctx) +{ + struct radeon_winsys_cs *cs = rctx->gfx.cs; + struct r600_so_target **t = rctx->streamout.targets; + unsigned i; + uint64_t va; + + r600_flush_vgt_streamout(rctx); + + for (i = 0; i < rctx->streamout.num_targets; i++) { + if (!t[i]) + continue; + + va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; + radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); + radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | + STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) | + STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */ + radeon_emit(cs, va); /* dst address lo */ + radeon_emit(cs, va >> 32); /* dst address hi */ + radeon_emit(cs, 0); /* unused */ + radeon_emit(cs, 0); /* unused */ + + r600_emit_reloc(rctx, &rctx->gfx, t[i]->buf_filled_size, + RADEON_USAGE_WRITE, RADEON_PRIO_SO_FILLED_SIZE); + + /* Zero the buffer size. The counters (primitives generated, + * primitives emitted) may be enabled even if there is not + * buffer bound. This ensures that the primitives-emitted query + * won't increment. */ + radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0); + + t[i]->buf_filled_size_valid = true; + } + + rctx->streamout.begin_emitted = false; + rctx->flags |= R600_CONTEXT_STREAMOUT_FLUSH; +} + +/* STREAMOUT CONFIG DERIVED STATE + * + * Streamout must be enabled for the PRIMITIVES_GENERATED query to work. + * The buffer mask is an independent state, so no writes occur if there + * are no buffers bound. + */ + +static void r600_emit_streamout_enable(struct r600_common_context *rctx, + struct r600_atom *atom) +{ + unsigned strmout_config_reg = R_028AB0_VGT_STRMOUT_EN; + unsigned strmout_config_val = S_028B94_STREAMOUT_0_EN(r600_get_strmout_en(rctx)); + unsigned strmout_buffer_reg = R_028B20_VGT_STRMOUT_BUFFER_EN; + unsigned strmout_buffer_val = rctx->streamout.hw_enabled_mask & + rctx->streamout.enabled_stream_buffers_mask; + + if (rctx->chip_class >= EVERGREEN) { + strmout_buffer_reg = R_028B98_VGT_STRMOUT_BUFFER_CONFIG; + + strmout_config_reg = R_028B94_VGT_STRMOUT_CONFIG; + strmout_config_val |= + S_028B94_STREAMOUT_1_EN(r600_get_strmout_en(rctx)) | + S_028B94_STREAMOUT_2_EN(r600_get_strmout_en(rctx)) | + S_028B94_STREAMOUT_3_EN(r600_get_strmout_en(rctx)); + } + radeon_set_context_reg(rctx->gfx.cs, strmout_buffer_reg, strmout_buffer_val); + radeon_set_context_reg(rctx->gfx.cs, strmout_config_reg, strmout_config_val); +} + +static void r600_set_streamout_enable(struct r600_common_context *rctx, bool enable) +{ + bool old_strmout_en = r600_get_strmout_en(rctx); + unsigned old_hw_enabled_mask = rctx->streamout.hw_enabled_mask; + + rctx->streamout.streamout_enabled = enable; + + rctx->streamout.hw_enabled_mask = rctx->streamout.enabled_mask | + (rctx->streamout.enabled_mask << 4) | + (rctx->streamout.enabled_mask << 8) | + (rctx->streamout.enabled_mask << 12); + + if ((old_strmout_en != r600_get_strmout_en(rctx)) || + (old_hw_enabled_mask != rctx->streamout.hw_enabled_mask)) { + rctx->set_atom_dirty(rctx, &rctx->streamout.enable_atom, true); + } +} + +void r600_update_prims_generated_query_state(struct r600_common_context *rctx, + unsigned type, int diff) +{ + if (type == PIPE_QUERY_PRIMITIVES_GENERATED) { + bool old_strmout_en = r600_get_strmout_en(rctx); + + rctx->streamout.num_prims_gen_queries += diff; + assert(rctx->streamout.num_prims_gen_queries >= 0); + + rctx->streamout.prims_gen_query_enabled = + rctx->streamout.num_prims_gen_queries != 0; + + if (old_strmout_en != r600_get_strmout_en(rctx)) { + rctx->set_atom_dirty(rctx, &rctx->streamout.enable_atom, true); + } + } +} + +void r600_streamout_init(struct r600_common_context *rctx) +{ + rctx->b.create_stream_output_target = r600_create_so_target; + rctx->b.stream_output_target_destroy = r600_so_target_destroy; + rctx->streamout.begin_atom.emit = r600_emit_streamout_begin; + rctx->streamout.enable_atom.emit = r600_emit_streamout_enable; + rctx->streamout.enable_atom.num_dw = 6; +} diff --git a/lib/mesa/src/gallium/drivers/r600/r600_test_dma.c b/lib/mesa/src/gallium/drivers/r600/r600_test_dma.c new file mode 100644 index 000000000..9e1ff9e5f --- /dev/null +++ b/lib/mesa/src/gallium/drivers/r600/r600_test_dma.c @@ -0,0 +1,398 @@ +/* + * Copyright 2016 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +/* This file implements randomized SDMA texture blit tests. */ + +#include "r600_pipe_common.h" +#include "util/u_surface.h" +#include "util/rand_xor.h" + +static uint64_t seed_xorshift128plus[2]; + +#define RAND_NUM_SIZE 8 + +/* The GPU blits are emulated on the CPU using these CPU textures. */ + +struct cpu_texture { + uint8_t *ptr; + uint64_t size; + uint64_t layer_stride; + unsigned stride; +}; + +static void alloc_cpu_texture(struct cpu_texture *tex, + struct pipe_resource *templ, int bpp) +{ + tex->stride = align(templ->width0 * bpp, RAND_NUM_SIZE); + tex->layer_stride = (uint64_t)tex->stride * templ->height0; + tex->size = tex->layer_stride * templ->array_size; + tex->ptr = malloc(tex->size); + assert(tex->ptr); +} + +static void set_random_pixels(struct pipe_context *ctx, + struct pipe_resource *tex, + struct cpu_texture *cpu) +{ + struct pipe_transfer *t; + uint8_t *map; + int x,y,z; + + map = pipe_transfer_map_3d(ctx, tex, 0, PIPE_TRANSFER_WRITE, + 0, 0, 0, tex->width0, tex->height0, + tex->array_size, &t); + assert(map); + + for (z = 0; z < tex->array_size; z++) { + for (y = 0; y < tex->height0; y++) { + uint64_t *ptr = (uint64_t*) + (map + t->layer_stride*z + t->stride*y); + uint64_t *ptr_cpu = (uint64_t*) + (cpu->ptr + cpu->layer_stride*z + cpu->stride*y); + unsigned size = cpu->stride / RAND_NUM_SIZE; + + assert(t->stride % RAND_NUM_SIZE == 0); + assert(cpu->stride % RAND_NUM_SIZE == 0); + + for (x = 0; x < size; x++) { + *ptr++ = *ptr_cpu++ = + rand_xorshift128plus(seed_xorshift128plus); + } + } + } + + pipe_transfer_unmap(ctx, t); +} + +static bool compare_textures(struct pipe_context *ctx, + struct pipe_resource *tex, + struct cpu_texture *cpu, int bpp) +{ + struct pipe_transfer *t; + uint8_t *map; + int y,z; + bool pass = true; + + map = pipe_transfer_map_3d(ctx, tex, 0, PIPE_TRANSFER_READ, + 0, 0, 0, tex->width0, tex->height0, + tex->array_size, &t); + assert(map); + + for (z = 0; z < tex->array_size; z++) { + for (y = 0; y < tex->height0; y++) { + uint8_t *ptr = map + t->layer_stride*z + t->stride*y; + uint8_t *cpu_ptr = cpu->ptr + + cpu->layer_stride*z + cpu->stride*y; + + if (memcmp(ptr, cpu_ptr, tex->width0 * bpp)) { + pass = false; + goto done; + } + } + } +done: + pipe_transfer_unmap(ctx, t); + return pass; +} + +static enum pipe_format get_format_from_bpp(int bpp) +{ + switch (bpp) { + case 1: + return PIPE_FORMAT_R8_UINT; + case 2: + return PIPE_FORMAT_R16_UINT; + case 4: + return PIPE_FORMAT_R32_UINT; + case 8: + return PIPE_FORMAT_R32G32_UINT; + case 16: + return PIPE_FORMAT_R32G32B32A32_UINT; + default: + assert(0); + return PIPE_FORMAT_NONE; + } +} + +static const char *array_mode_to_string(struct r600_common_screen *rscreen, + struct radeon_surf *surf) +{ + if (rscreen->chip_class >= GFX9) { + /* TODO */ + return " UNKNOWN"; + } else { + switch (surf->u.legacy.level[0].mode) { + case RADEON_SURF_MODE_LINEAR_ALIGNED: + return "LINEAR_ALIGNED"; + case RADEON_SURF_MODE_1D: + return "1D_TILED_THIN1"; + case RADEON_SURF_MODE_2D: + return "2D_TILED_THIN1"; + default: + assert(0); + return " UNKNOWN"; + } + } +} + +static unsigned generate_max_tex_side(unsigned max_tex_side) +{ + switch (rand() % 4) { + case 0: + /* Try to hit large sizes in 1/4 of the cases. */ + return max_tex_side; + case 1: + /* Try to hit 1D tiling in 1/4 of the cases. */ + return 128; + default: + /* Try to hit common sizes in 2/4 of the cases. */ + return 2048; + } +} + +void r600_test_dma(struct r600_common_screen *rscreen) +{ + struct pipe_screen *screen = &rscreen->b; + struct pipe_context *ctx = screen->context_create(screen, NULL, 0); + struct r600_common_context *rctx = (struct r600_common_context*)ctx; + uint64_t max_alloc_size; + unsigned i, iterations, num_partial_copies, max_levels, max_tex_side; + unsigned num_pass = 0, num_fail = 0; + + max_levels = screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_2D_LEVELS); + max_tex_side = 1 << (max_levels - 1); + + /* Max 128 MB allowed for both textures. */ + max_alloc_size = 128 * 1024 * 1024; + + /* the seed for random test parameters */ + srand(0x9b47d95b); + /* the seed for random pixel data */ + s_rand_xorshift128plus(seed_xorshift128plus, false); + + iterations = 1000000000; /* just kill it when you are bored */ + num_partial_copies = 30; + + /* These parameters are randomly generated per test: + * - whether to do one whole-surface copy or N partial copies per test + * - which tiling modes to use (LINEAR_ALIGNED, 1D, 2D) + * - which texture dimensions to use + * - whether to use VRAM (all tiling modes) and GTT (staging, linear + * only) allocations + * - random initial pixels in src + * - generate random subrectangle copies for partial blits + */ + for (i = 0; i < iterations; i++) { + struct pipe_resource tsrc = {}, tdst = {}, *src, *dst; + struct r600_texture *rdst; + struct r600_texture *rsrc; + struct cpu_texture src_cpu, dst_cpu; + unsigned bpp, max_width, max_height, max_depth, j, num; + unsigned gfx_blits = 0, dma_blits = 0, max_tex_side_gen; + unsigned max_tex_layers; + bool pass; + bool do_partial_copies = rand() & 1; + + /* generate a random test case */ + tsrc.target = tdst.target = PIPE_TEXTURE_2D_ARRAY; + tsrc.depth0 = tdst.depth0 = 1; + + bpp = 1 << (rand() % 5); + tsrc.format = tdst.format = get_format_from_bpp(bpp); + + max_tex_side_gen = generate_max_tex_side(max_tex_side); + max_tex_layers = rand() % 4 ? 1 : 5; + + tsrc.width0 = (rand() % max_tex_side_gen) + 1; + tsrc.height0 = (rand() % max_tex_side_gen) + 1; + tsrc.array_size = (rand() % max_tex_layers) + 1; + + /* Have a 1/4 chance of getting power-of-two dimensions. */ + if (rand() % 4 == 0) { + tsrc.width0 = util_next_power_of_two(tsrc.width0); + tsrc.height0 = util_next_power_of_two(tsrc.height0); + } + + if (!do_partial_copies) { + /* whole-surface copies only, same dimensions */ + tdst = tsrc; + } else { + max_tex_side_gen = generate_max_tex_side(max_tex_side); + max_tex_layers = rand() % 4 ? 1 : 5; + + /* many partial copies, dimensions can be different */ + tdst.width0 = (rand() % max_tex_side_gen) + 1; + tdst.height0 = (rand() % max_tex_side_gen) + 1; + tdst.array_size = (rand() % max_tex_layers) + 1; + + /* Have a 1/4 chance of getting power-of-two dimensions. */ + if (rand() % 4 == 0) { + tdst.width0 = util_next_power_of_two(tdst.width0); + tdst.height0 = util_next_power_of_two(tdst.height0); + } + } + + /* check texture sizes */ + if ((uint64_t)tsrc.width0 * tsrc.height0 * tsrc.array_size * bpp + + (uint64_t)tdst.width0 * tdst.height0 * tdst.array_size * bpp > + max_alloc_size) { + /* too large, try again */ + i--; + continue; + } + + /* VRAM + the tiling mode depends on dimensions (3/4 of cases), + * or GTT + linear only (1/4 of cases) + */ + tsrc.usage = rand() % 4 ? PIPE_USAGE_DEFAULT : PIPE_USAGE_STAGING; + tdst.usage = rand() % 4 ? PIPE_USAGE_DEFAULT : PIPE_USAGE_STAGING; + + /* Allocate textures (both the GPU and CPU copies). + * The CPU will emulate what the GPU should be doing. + */ + src = screen->resource_create(screen, &tsrc); + dst = screen->resource_create(screen, &tdst); + assert(src); + assert(dst); + rdst = (struct r600_texture*)dst; + rsrc = (struct r600_texture*)src; + alloc_cpu_texture(&src_cpu, &tsrc, bpp); + alloc_cpu_texture(&dst_cpu, &tdst, bpp); + + printf("%4u: dst = (%5u x %5u x %u, %s), " + " src = (%5u x %5u x %u, %s), bpp = %2u, ", + i, tdst.width0, tdst.height0, tdst.array_size, + array_mode_to_string(rscreen, &rdst->surface), + tsrc.width0, tsrc.height0, tsrc.array_size, + array_mode_to_string(rscreen, &rsrc->surface), bpp); + fflush(stdout); + + /* set src pixels */ + set_random_pixels(ctx, src, &src_cpu); + + /* clear dst pixels */ + rctx->clear_buffer(ctx, dst, 0, rdst->surface.surf_size, 0, true); + memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size); + + /* preparation */ + max_width = MIN2(tsrc.width0, tdst.width0); + max_height = MIN2(tsrc.height0, tdst.height0); + max_depth = MIN2(tsrc.array_size, tdst.array_size); + + num = do_partial_copies ? num_partial_copies : 1; + for (j = 0; j < num; j++) { + int width, height, depth; + int srcx, srcy, srcz, dstx, dsty, dstz; + struct pipe_box box; + unsigned old_num_draw_calls = rctx->num_draw_calls; + unsigned old_num_dma_calls = rctx->num_dma_calls; + + if (!do_partial_copies) { + /* copy whole src to dst */ + width = max_width; + height = max_height; + depth = max_depth; + + srcx = srcy = srcz = dstx = dsty = dstz = 0; + } else { + /* random sub-rectangle copies from src to dst */ + depth = (rand() % max_depth) + 1; + srcz = rand() % (tsrc.array_size - depth + 1); + dstz = rand() % (tdst.array_size - depth + 1); + + /* special code path to hit the tiled partial copies */ + if (!rsrc->surface.is_linear && + !rdst->surface.is_linear && + rand() & 1) { + if (max_width < 8 || max_height < 8) + continue; + width = ((rand() % (max_width / 8)) + 1) * 8; + height = ((rand() % (max_height / 8)) + 1) * 8; + + srcx = rand() % (tsrc.width0 - width + 1) & ~0x7; + srcy = rand() % (tsrc.height0 - height + 1) & ~0x7; + + dstx = rand() % (tdst.width0 - width + 1) & ~0x7; + dsty = rand() % (tdst.height0 - height + 1) & ~0x7; + } else { + /* just make sure that it doesn't divide by zero */ + assert(max_width > 0 && max_height > 0); + + width = (rand() % max_width) + 1; + height = (rand() % max_height) + 1; + + srcx = rand() % (tsrc.width0 - width + 1); + srcy = rand() % (tsrc.height0 - height + 1); + + dstx = rand() % (tdst.width0 - width + 1); + dsty = rand() % (tdst.height0 - height + 1); + } + + /* special code path to hit out-of-bounds reads in L2T */ + if (rsrc->surface.is_linear && + !rdst->surface.is_linear && + rand() % 4 == 0) { + srcx = 0; + srcy = 0; + srcz = 0; + } + } + + /* GPU copy */ + u_box_3d(srcx, srcy, srcz, width, height, depth, &box); + rctx->dma_copy(ctx, dst, 0, dstx, dsty, dstz, src, 0, &box); + + /* See which engine was used. */ + gfx_blits += rctx->num_draw_calls > old_num_draw_calls; + dma_blits += rctx->num_dma_calls > old_num_dma_calls; + + /* CPU copy */ + util_copy_box(dst_cpu.ptr, tdst.format, dst_cpu.stride, + dst_cpu.layer_stride, + dstx, dsty, dstz, width, height, depth, + src_cpu.ptr, src_cpu.stride, + src_cpu.layer_stride, + srcx, srcy, srcz); + } + + pass = compare_textures(ctx, dst, &dst_cpu, bpp); + if (pass) + num_pass++; + else + num_fail++; + + printf("BLITs: GFX = %2u, DMA = %2u, %s [%u/%u]\n", + gfx_blits, dma_blits, pass ? "pass" : "fail", + num_pass, num_pass+num_fail); + + /* cleanup */ + pipe_resource_reference(&src, NULL); + pipe_resource_reference(&dst, NULL); + free(src_cpu.ptr); + free(dst_cpu.ptr); + } + + ctx->destroy(ctx); + exit(0); +} diff --git a/lib/mesa/src/gallium/drivers/r600/r600_texture.c b/lib/mesa/src/gallium/drivers/r600/r600_texture.c new file mode 100644 index 000000000..b84111449 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/r600/r600_texture.c @@ -0,0 +1,1953 @@ +/* + * Copyright 2010 Jerome Glisse <glisse@freedesktop.org> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Jerome Glisse + * Corbin Simpson + */ +#include "r600_pipe_common.h" +#include "r600_cs.h" +#include "r600_query.h" +#include "util/u_format.h" +#include "util/u_log.h" +#include "util/u_memory.h" +#include "util/u_pack_color.h" +#include "util/u_surface.h" +#include "os/os_time.h" +#include <errno.h> +#include <inttypes.h> + +static void r600_texture_discard_cmask(struct r600_common_screen *rscreen, + struct r600_texture *rtex); +static enum radeon_surf_mode +r600_choose_tiling(struct r600_common_screen *rscreen, + const struct pipe_resource *templ); + + +bool r600_prepare_for_dma_blit(struct r600_common_context *rctx, + struct r600_texture *rdst, + unsigned dst_level, unsigned dstx, + unsigned dsty, unsigned dstz, + struct r600_texture *rsrc, + unsigned src_level, + const struct pipe_box *src_box) +{ + if (!rctx->dma.cs) + return false; + + if (rdst->surface.bpe != rsrc->surface.bpe) + return false; + + /* MSAA: Blits don't exist in the real world. */ + if (rsrc->resource.b.b.nr_samples > 1 || + rdst->resource.b.b.nr_samples > 1) + return false; + + /* Depth-stencil surfaces: + * When dst is linear, the DB->CB copy preserves HTILE. + * When dst is tiled, the 3D path must be used to update HTILE. + */ + if (rsrc->is_depth || rdst->is_depth) + return false; + + /* CMASK as: + * src: Both texture and SDMA paths need decompression. Use SDMA. + * dst: If overwriting the whole texture, discard CMASK and use + * SDMA. Otherwise, use the 3D path. + */ + if (rdst->cmask.size && rdst->dirty_level_mask & (1 << dst_level)) { + /* The CMASK clear is only enabled for the first level. */ + assert(dst_level == 0); + if (!util_texrange_covers_whole_level(&rdst->resource.b.b, dst_level, + dstx, dsty, dstz, src_box->width, + src_box->height, src_box->depth)) + return false; + + r600_texture_discard_cmask(rctx->screen, rdst); + } + + /* All requirements are met. Prepare textures for SDMA. */ + if (rsrc->cmask.size && rsrc->dirty_level_mask & (1 << src_level)) + rctx->b.flush_resource(&rctx->b, &rsrc->resource.b.b); + + assert(!(rsrc->dirty_level_mask & (1 << src_level))); + assert(!(rdst->dirty_level_mask & (1 << dst_level))); + + return true; +} + +/* Same as resource_copy_region, except that both upsampling and downsampling are allowed. */ +static void r600_copy_region_with_blit(struct pipe_context *pipe, + struct pipe_resource *dst, + unsigned dst_level, + unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *src, + unsigned src_level, + const struct pipe_box *src_box) +{ + struct pipe_blit_info blit; + + memset(&blit, 0, sizeof(blit)); + blit.src.resource = src; + blit.src.format = src->format; + blit.src.level = src_level; + blit.src.box = *src_box; + blit.dst.resource = dst; + blit.dst.format = dst->format; + blit.dst.level = dst_level; + blit.dst.box.x = dstx; + blit.dst.box.y = dsty; + blit.dst.box.z = dstz; + blit.dst.box.width = src_box->width; + blit.dst.box.height = src_box->height; + blit.dst.box.depth = src_box->depth; + blit.mask = util_format_get_mask(src->format) & + util_format_get_mask(dst->format); + blit.filter = PIPE_TEX_FILTER_NEAREST; + + if (blit.mask) { + pipe->blit(pipe, &blit); + } +} + +/* Copy from a full GPU texture to a transfer's staging one. */ +static void r600_copy_to_staging_texture(struct pipe_context *ctx, struct r600_transfer *rtransfer) +{ + struct r600_common_context *rctx = (struct r600_common_context*)ctx; + struct pipe_transfer *transfer = (struct pipe_transfer*)rtransfer; + struct pipe_resource *dst = &rtransfer->staging->b.b; + struct pipe_resource *src = transfer->resource; + + if (src->nr_samples > 1) { + r600_copy_region_with_blit(ctx, dst, 0, 0, 0, 0, + src, transfer->level, &transfer->box); + return; + } + + rctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, transfer->level, + &transfer->box); +} + +/* Copy from a transfer's staging texture to a full GPU one. */ +static void r600_copy_from_staging_texture(struct pipe_context *ctx, struct r600_transfer *rtransfer) +{ + struct r600_common_context *rctx = (struct r600_common_context*)ctx; + struct pipe_transfer *transfer = (struct pipe_transfer*)rtransfer; + struct pipe_resource *dst = transfer->resource; + struct pipe_resource *src = &rtransfer->staging->b.b; + struct pipe_box sbox; + + u_box_3d(0, 0, 0, transfer->box.width, transfer->box.height, transfer->box.depth, &sbox); + + if (dst->nr_samples > 1) { + r600_copy_region_with_blit(ctx, dst, transfer->level, + transfer->box.x, transfer->box.y, transfer->box.z, + src, 0, &sbox); + return; + } + + rctx->dma_copy(ctx, dst, transfer->level, + transfer->box.x, transfer->box.y, transfer->box.z, + src, 0, &sbox); +} + +static unsigned r600_texture_get_offset(struct r600_common_screen *rscreen, + struct r600_texture *rtex, unsigned level, + const struct pipe_box *box, + unsigned *stride, + unsigned *layer_stride) +{ + *stride = rtex->surface.u.legacy.level[level].nblk_x * + rtex->surface.bpe; + *layer_stride = rtex->surface.u.legacy.level[level].slice_size; + + if (!box) + return rtex->surface.u.legacy.level[level].offset; + + /* Each texture is an array of mipmap levels. Each level is + * an array of slices. */ + return rtex->surface.u.legacy.level[level].offset + + box->z * rtex->surface.u.legacy.level[level].slice_size + + (box->y / rtex->surface.blk_h * + rtex->surface.u.legacy.level[level].nblk_x + + box->x / rtex->surface.blk_w) * rtex->surface.bpe; +} + +static int r600_init_surface(struct r600_common_screen *rscreen, + struct radeon_surf *surface, + const struct pipe_resource *ptex, + enum radeon_surf_mode array_mode, + unsigned pitch_in_bytes_override, + unsigned offset, + bool is_imported, + bool is_scanout, + bool is_flushed_depth) +{ + const struct util_format_description *desc = + util_format_description(ptex->format); + bool is_depth, is_stencil; + int r; + unsigned i, bpe, flags = 0; + + is_depth = util_format_has_depth(desc); + is_stencil = util_format_has_stencil(desc); + + if (rscreen->chip_class >= EVERGREEN && !is_flushed_depth && + ptex->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) { + bpe = 4; /* stencil is allocated separately on evergreen */ + } else { + bpe = util_format_get_blocksize(ptex->format); + assert(util_is_power_of_two(bpe)); + } + + if (!is_flushed_depth && is_depth) { + flags |= RADEON_SURF_ZBUFFER; + + if (is_stencil) + flags |= RADEON_SURF_SBUFFER; + } + + if (ptex->bind & PIPE_BIND_SCANOUT || is_scanout) { + /* This should catch bugs in gallium users setting incorrect flags. */ + assert(ptex->nr_samples <= 1 && + ptex->array_size == 1 && + ptex->depth0 == 1 && + ptex->last_level == 0 && + !(flags & RADEON_SURF_Z_OR_SBUFFER)); + + flags |= RADEON_SURF_SCANOUT; + } + + if (ptex->bind & PIPE_BIND_SHARED) + flags |= RADEON_SURF_SHAREABLE; + if (is_imported) + flags |= RADEON_SURF_IMPORTED | RADEON_SURF_SHAREABLE; + if (!(ptex->flags & R600_RESOURCE_FLAG_FORCE_TILING)) + flags |= RADEON_SURF_OPTIMIZE_FOR_SPACE; + + r = rscreen->ws->surface_init(rscreen->ws, ptex, flags, bpe, + array_mode, surface); + if (r) { + return r; + } + + if (pitch_in_bytes_override && + pitch_in_bytes_override != surface->u.legacy.level[0].nblk_x * bpe) { + /* old ddx on evergreen over estimate alignment for 1d, only 1 level + * for those + */ + surface->u.legacy.level[0].nblk_x = pitch_in_bytes_override / bpe; + surface->u.legacy.level[0].slice_size = pitch_in_bytes_override * + surface->u.legacy.level[0].nblk_y; + } + + if (offset) { + for (i = 0; i < ARRAY_SIZE(surface->u.legacy.level); ++i) + surface->u.legacy.level[i].offset += offset; + } + + return 0; +} + +static void r600_texture_init_metadata(struct r600_common_screen *rscreen, + struct r600_texture *rtex, + struct radeon_bo_metadata *metadata) +{ + struct radeon_surf *surface = &rtex->surface; + + memset(metadata, 0, sizeof(*metadata)); + + metadata->u.legacy.microtile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_1D ? + RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR; + metadata->u.legacy.macrotile = surface->u.legacy.level[0].mode >= RADEON_SURF_MODE_2D ? + RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR; + metadata->u.legacy.pipe_config = surface->u.legacy.pipe_config; + metadata->u.legacy.bankw = surface->u.legacy.bankw; + metadata->u.legacy.bankh = surface->u.legacy.bankh; + metadata->u.legacy.tile_split = surface->u.legacy.tile_split; + metadata->u.legacy.mtilea = surface->u.legacy.mtilea; + metadata->u.legacy.num_banks = surface->u.legacy.num_banks; + metadata->u.legacy.stride = surface->u.legacy.level[0].nblk_x * surface->bpe; + metadata->u.legacy.scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0; +} + +static void r600_surface_import_metadata(struct r600_common_screen *rscreen, + struct radeon_surf *surf, + struct radeon_bo_metadata *metadata, + enum radeon_surf_mode *array_mode, + bool *is_scanout) +{ + surf->u.legacy.pipe_config = metadata->u.legacy.pipe_config; + surf->u.legacy.bankw = metadata->u.legacy.bankw; + surf->u.legacy.bankh = metadata->u.legacy.bankh; + surf->u.legacy.tile_split = metadata->u.legacy.tile_split; + surf->u.legacy.mtilea = metadata->u.legacy.mtilea; + surf->u.legacy.num_banks = metadata->u.legacy.num_banks; + + if (metadata->u.legacy.macrotile == RADEON_LAYOUT_TILED) + *array_mode = RADEON_SURF_MODE_2D; + else if (metadata->u.legacy.microtile == RADEON_LAYOUT_TILED) + *array_mode = RADEON_SURF_MODE_1D; + else + *array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED; + + *is_scanout = metadata->u.legacy.scanout; +} + +static void r600_eliminate_fast_color_clear(struct r600_common_context *rctx, + struct r600_texture *rtex) +{ + struct r600_common_screen *rscreen = rctx->screen; + struct pipe_context *ctx = &rctx->b; + + if (ctx == rscreen->aux_context) + mtx_lock(&rscreen->aux_context_lock); + + ctx->flush_resource(ctx, &rtex->resource.b.b); + ctx->flush(ctx, NULL, 0); + + if (ctx == rscreen->aux_context) + mtx_unlock(&rscreen->aux_context_lock); +} + +static void r600_texture_discard_cmask(struct r600_common_screen *rscreen, + struct r600_texture *rtex) +{ + if (!rtex->cmask.size) + return; + + assert(rtex->resource.b.b.nr_samples <= 1); + + /* Disable CMASK. */ + memset(&rtex->cmask, 0, sizeof(rtex->cmask)); + rtex->cmask.base_address_reg = rtex->resource.gpu_address >> 8; + rtex->dirty_level_mask = 0; + + rtex->cb_color_info &= ~EG_S_028C70_FAST_CLEAR(1); + + if (rtex->cmask_buffer != &rtex->resource) + r600_resource_reference(&rtex->cmask_buffer, NULL); + + /* Notify all contexts about the change. */ + p_atomic_inc(&rscreen->dirty_tex_counter); + p_atomic_inc(&rscreen->compressed_colortex_counter); +} + +static void r600_reallocate_texture_inplace(struct r600_common_context *rctx, + struct r600_texture *rtex, + unsigned new_bind_flag, + bool invalidate_storage) +{ + struct pipe_screen *screen = rctx->b.screen; + struct r600_texture *new_tex; + struct pipe_resource templ = rtex->resource.b.b; + unsigned i; + + templ.bind |= new_bind_flag; + + /* r600g doesn't react to dirty_tex_descriptor_counter */ + if (rctx->chip_class < SI) + return; + + if (rtex->resource.b.is_shared) + return; + + if (new_bind_flag == PIPE_BIND_LINEAR) { + if (rtex->surface.is_linear) + return; + + /* This fails with MSAA, depth, and compressed textures. */ + if (r600_choose_tiling(rctx->screen, &templ) != + RADEON_SURF_MODE_LINEAR_ALIGNED) + return; + } + + new_tex = (struct r600_texture*)screen->resource_create(screen, &templ); + if (!new_tex) + return; + + /* Copy the pixels to the new texture. */ + if (!invalidate_storage) { + for (i = 0; i <= templ.last_level; i++) { + struct pipe_box box; + + u_box_3d(0, 0, 0, + u_minify(templ.width0, i), u_minify(templ.height0, i), + util_max_layer(&templ, i) + 1, &box); + + rctx->dma_copy(&rctx->b, &new_tex->resource.b.b, i, 0, 0, 0, + &rtex->resource.b.b, i, &box); + } + } + + if (new_bind_flag == PIPE_BIND_LINEAR) { + r600_texture_discard_cmask(rctx->screen, rtex); + } + + /* Replace the structure fields of rtex. */ + rtex->resource.b.b.bind = templ.bind; + pb_reference(&rtex->resource.buf, new_tex->resource.buf); + rtex->resource.gpu_address = new_tex->resource.gpu_address; + rtex->resource.vram_usage = new_tex->resource.vram_usage; + rtex->resource.gart_usage = new_tex->resource.gart_usage; + rtex->resource.bo_size = new_tex->resource.bo_size; + rtex->resource.bo_alignment = new_tex->resource.bo_alignment; + rtex->resource.domains = new_tex->resource.domains; + rtex->resource.flags = new_tex->resource.flags; + rtex->size = new_tex->size; + rtex->db_render_format = new_tex->db_render_format; + rtex->db_compatible = new_tex->db_compatible; + rtex->can_sample_z = new_tex->can_sample_z; + rtex->can_sample_s = new_tex->can_sample_s; + rtex->surface = new_tex->surface; + rtex->fmask = new_tex->fmask; + rtex->cmask = new_tex->cmask; + rtex->cb_color_info = new_tex->cb_color_info; + rtex->last_msaa_resolve_target_micro_mode = new_tex->last_msaa_resolve_target_micro_mode; + rtex->htile_offset = new_tex->htile_offset; + rtex->depth_cleared = new_tex->depth_cleared; + rtex->stencil_cleared = new_tex->stencil_cleared; + rtex->non_disp_tiling = new_tex->non_disp_tiling; + rtex->framebuffers_bound = new_tex->framebuffers_bound; + + if (new_bind_flag == PIPE_BIND_LINEAR) { + assert(!rtex->htile_offset); + assert(!rtex->cmask.size); + assert(!rtex->fmask.size); + assert(!rtex->is_depth); + } + + r600_texture_reference(&new_tex, NULL); + + p_atomic_inc(&rctx->screen->dirty_tex_counter); +} + +static boolean r600_texture_get_handle(struct pipe_screen* screen, + struct pipe_context *ctx, + struct pipe_resource *resource, + struct winsys_handle *whandle, + unsigned usage) +{ + struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; + struct r600_common_context *rctx; + struct r600_resource *res = (struct r600_resource*)resource; + struct r600_texture *rtex = (struct r600_texture*)resource; + struct radeon_bo_metadata metadata; + bool update_metadata = false; + unsigned stride, offset, slice_size; + + ctx = threaded_context_unwrap_sync(ctx); + rctx = (struct r600_common_context*)(ctx ? ctx : rscreen->aux_context); + + if (resource->target != PIPE_BUFFER) { + /* This is not supported now, but it might be required for OpenCL + * interop in the future. + */ + if (resource->nr_samples > 1 || rtex->is_depth) + return false; + + /* Move a suballocated texture into a non-suballocated allocation. */ + if (rscreen->ws->buffer_is_suballocated(res->buf) || + rtex->surface.tile_swizzle) { + assert(!res->b.is_shared); + r600_reallocate_texture_inplace(rctx, rtex, + PIPE_BIND_SHARED, false); + rctx->b.flush(&rctx->b, NULL, 0); + assert(res->b.b.bind & PIPE_BIND_SHARED); + assert(res->flags & RADEON_FLAG_NO_SUBALLOC); + assert(rtex->surface.tile_swizzle == 0); + } + + if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) && + rtex->cmask.size) { + /* Eliminate fast clear (CMASK) */ + r600_eliminate_fast_color_clear(rctx, rtex); + + /* Disable CMASK if flush_resource isn't going + * to be called. + */ + if (rtex->cmask.size) + r600_texture_discard_cmask(rscreen, rtex); + } + + /* Set metadata. */ + if (!res->b.is_shared || update_metadata) { + r600_texture_init_metadata(rscreen, rtex, &metadata); + if (rscreen->query_opaque_metadata) + rscreen->query_opaque_metadata(rscreen, rtex, + &metadata); + + rscreen->ws->buffer_set_metadata(res->buf, &metadata); + } + + offset = rtex->surface.u.legacy.level[0].offset; + stride = rtex->surface.u.legacy.level[0].nblk_x * + rtex->surface.bpe; + slice_size = rtex->surface.u.legacy.level[0].slice_size; + } else { + /* Move a suballocated buffer into a non-suballocated allocation. */ + if (rscreen->ws->buffer_is_suballocated(res->buf)) { + assert(!res->b.is_shared); + + /* Allocate a new buffer with PIPE_BIND_SHARED. */ + struct pipe_resource templ = res->b.b; + templ.bind |= PIPE_BIND_SHARED; + + struct pipe_resource *newb = + screen->resource_create(screen, &templ); + if (!newb) + return false; + + /* Copy the old buffer contents to the new one. */ + struct pipe_box box; + u_box_1d(0, newb->width0, &box); + rctx->b.resource_copy_region(&rctx->b, newb, 0, 0, 0, 0, + &res->b.b, 0, &box); + /* Move the new buffer storage to the old pipe_resource. */ + r600_replace_buffer_storage(&rctx->b, &res->b.b, newb); + pipe_resource_reference(&newb, NULL); + + assert(res->b.b.bind & PIPE_BIND_SHARED); + assert(res->flags & RADEON_FLAG_NO_SUBALLOC); + } + + /* Buffers */ + offset = 0; + stride = 0; + slice_size = 0; + } + + if (res->b.is_shared) { + /* USAGE_EXPLICIT_FLUSH must be cleared if at least one user + * doesn't set it. + */ + res->external_usage |= usage & ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH; + if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH)) + res->external_usage &= ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH; + } else { + res->b.is_shared = true; + res->external_usage = usage; + } + + return rscreen->ws->buffer_get_handle(res->buf, stride, offset, + slice_size, whandle); +} + +static void r600_texture_destroy(struct pipe_screen *screen, + struct pipe_resource *ptex) +{ + struct r600_texture *rtex = (struct r600_texture*)ptex; + struct r600_resource *resource = &rtex->resource; + + r600_texture_reference(&rtex->flushed_depth_texture, NULL); + + if (rtex->cmask_buffer != &rtex->resource) { + r600_resource_reference(&rtex->cmask_buffer, NULL); + } + pb_reference(&resource->buf, NULL); + FREE(rtex); +} + +static const struct u_resource_vtbl r600_texture_vtbl; + +/* The number of samples can be specified independently of the texture. */ +void r600_texture_get_fmask_info(struct r600_common_screen *rscreen, + struct r600_texture *rtex, + unsigned nr_samples, + struct r600_fmask_info *out) +{ + /* FMASK is allocated like an ordinary texture. */ + struct pipe_resource templ = rtex->resource.b.b; + struct radeon_surf fmask = {}; + unsigned flags, bpe; + + memset(out, 0, sizeof(*out)); + + templ.nr_samples = 1; + flags = rtex->surface.flags | RADEON_SURF_FMASK; + + /* Use the same parameters and tile mode. */ + fmask.u.legacy.bankw = rtex->surface.u.legacy.bankw; + fmask.u.legacy.bankh = rtex->surface.u.legacy.bankh; + fmask.u.legacy.mtilea = rtex->surface.u.legacy.mtilea; + fmask.u.legacy.tile_split = rtex->surface.u.legacy.tile_split; + + if (nr_samples <= 4) + fmask.u.legacy.bankh = 4; + + switch (nr_samples) { + case 2: + case 4: + bpe = 1; + break; + case 8: + bpe = 4; + break; + default: + R600_ERR("Invalid sample count for FMASK allocation.\n"); + return; + } + + /* Overallocate FMASK on R600-R700 to fix colorbuffer corruption. + * This can be fixed by writing a separate FMASK allocator specifically + * for R600-R700 asics. */ + if (rscreen->chip_class <= R700) { + bpe *= 2; + } + + if (rscreen->ws->surface_init(rscreen->ws, &templ, flags, bpe, + RADEON_SURF_MODE_2D, &fmask)) { + R600_ERR("Got error in surface_init while allocating FMASK.\n"); + return; + } + + assert(fmask.u.legacy.level[0].mode == RADEON_SURF_MODE_2D); + + out->slice_tile_max = (fmask.u.legacy.level[0].nblk_x * fmask.u.legacy.level[0].nblk_y) / 64; + if (out->slice_tile_max) + out->slice_tile_max -= 1; + + out->tile_mode_index = fmask.u.legacy.tiling_index[0]; + out->pitch_in_pixels = fmask.u.legacy.level[0].nblk_x; + out->bank_height = fmask.u.legacy.bankh; + out->tile_swizzle = fmask.tile_swizzle; + out->alignment = MAX2(256, fmask.surf_alignment); + out->size = fmask.surf_size; +} + +static void r600_texture_allocate_fmask(struct r600_common_screen *rscreen, + struct r600_texture *rtex) +{ + r600_texture_get_fmask_info(rscreen, rtex, + rtex->resource.b.b.nr_samples, &rtex->fmask); + + rtex->fmask.offset = align64(rtex->size, rtex->fmask.alignment); + rtex->size = rtex->fmask.offset + rtex->fmask.size; +} + +void r600_texture_get_cmask_info(struct r600_common_screen *rscreen, + struct r600_texture *rtex, + struct r600_cmask_info *out) +{ + unsigned cmask_tile_width = 8; + unsigned cmask_tile_height = 8; + unsigned cmask_tile_elements = cmask_tile_width * cmask_tile_height; + unsigned element_bits = 4; + unsigned cmask_cache_bits = 1024; + unsigned num_pipes = rscreen->info.num_tile_pipes; + unsigned pipe_interleave_bytes = rscreen->info.pipe_interleave_bytes; + + unsigned elements_per_macro_tile = (cmask_cache_bits / element_bits) * num_pipes; + unsigned pixels_per_macro_tile = elements_per_macro_tile * cmask_tile_elements; + unsigned sqrt_pixels_per_macro_tile = sqrt(pixels_per_macro_tile); + unsigned macro_tile_width = util_next_power_of_two(sqrt_pixels_per_macro_tile); + unsigned macro_tile_height = pixels_per_macro_tile / macro_tile_width; + + unsigned pitch_elements = align(rtex->resource.b.b.width0, macro_tile_width); + unsigned height = align(rtex->resource.b.b.height0, macro_tile_height); + + unsigned base_align = num_pipes * pipe_interleave_bytes; + unsigned slice_bytes = + ((pitch_elements * height * element_bits + 7) / 8) / cmask_tile_elements; + + assert(macro_tile_width % 128 == 0); + assert(macro_tile_height % 128 == 0); + + out->slice_tile_max = ((pitch_elements * height) / (128*128)) - 1; + out->alignment = MAX2(256, base_align); + out->size = (util_max_layer(&rtex->resource.b.b, 0) + 1) * + align(slice_bytes, base_align); +} + +static void r600_texture_allocate_cmask(struct r600_common_screen *rscreen, + struct r600_texture *rtex) +{ + r600_texture_get_cmask_info(rscreen, rtex, &rtex->cmask); + + rtex->cmask.offset = align64(rtex->size, rtex->cmask.alignment); + rtex->size = rtex->cmask.offset + rtex->cmask.size; + + rtex->cb_color_info |= EG_S_028C70_FAST_CLEAR(1); +} + +static void r600_texture_alloc_cmask_separate(struct r600_common_screen *rscreen, + struct r600_texture *rtex) +{ + if (rtex->cmask_buffer) + return; + + assert(rtex->cmask.size == 0); + + r600_texture_get_cmask_info(rscreen, rtex, &rtex->cmask); + + rtex->cmask_buffer = (struct r600_resource *) + r600_aligned_buffer_create(&rscreen->b, + R600_RESOURCE_FLAG_UNMAPPABLE, + PIPE_USAGE_DEFAULT, + rtex->cmask.size, + rtex->cmask.alignment); + if (rtex->cmask_buffer == NULL) { + rtex->cmask.size = 0; + return; + } + + /* update colorbuffer state bits */ + rtex->cmask.base_address_reg = rtex->cmask_buffer->gpu_address >> 8; + + rtex->cb_color_info |= EG_S_028C70_FAST_CLEAR(1); + + p_atomic_inc(&rscreen->compressed_colortex_counter); +} + +static void r600_texture_get_htile_size(struct r600_common_screen *rscreen, + struct r600_texture *rtex) +{ + unsigned cl_width, cl_height, width, height; + unsigned slice_elements, slice_bytes, pipe_interleave_bytes, base_align; + unsigned num_pipes = rscreen->info.num_tile_pipes; + + rtex->surface.htile_size = 0; + + if (rscreen->chip_class <= EVERGREEN && + rscreen->info.drm_major == 2 && rscreen->info.drm_minor < 26) + return; + + /* HW bug on R6xx. */ + if (rscreen->chip_class == R600 && + (rtex->resource.b.b.width0 > 7680 || + rtex->resource.b.b.height0 > 7680)) + return; + + switch (num_pipes) { + case 1: + cl_width = 32; + cl_height = 16; + break; + case 2: + cl_width = 32; + cl_height = 32; + break; + case 4: + cl_width = 64; + cl_height = 32; + break; + case 8: + cl_width = 64; + cl_height = 64; + break; + case 16: + cl_width = 128; + cl_height = 64; + break; + default: + assert(0); + return; + } + + width = align(rtex->resource.b.b.width0, cl_width * 8); + height = align(rtex->resource.b.b.height0, cl_height * 8); + + slice_elements = (width * height) / (8 * 8); + slice_bytes = slice_elements * 4; + + pipe_interleave_bytes = rscreen->info.pipe_interleave_bytes; + base_align = num_pipes * pipe_interleave_bytes; + + rtex->surface.htile_alignment = base_align; + rtex->surface.htile_size = + (util_max_layer(&rtex->resource.b.b, 0) + 1) * + align(slice_bytes, base_align); +} + +static void r600_texture_allocate_htile(struct r600_common_screen *rscreen, + struct r600_texture *rtex) +{ + r600_texture_get_htile_size(rscreen, rtex); + + if (!rtex->surface.htile_size) + return; + + rtex->htile_offset = align(rtex->size, rtex->surface.htile_alignment); + rtex->size = rtex->htile_offset + rtex->surface.htile_size; +} + +void r600_print_texture_info(struct r600_common_screen *rscreen, + struct r600_texture *rtex, struct u_log_context *log) +{ + int i; + + /* Common parameters. */ + u_log_printf(log, " Info: npix_x=%u, npix_y=%u, npix_z=%u, blk_w=%u, " + "blk_h=%u, array_size=%u, last_level=%u, " + "bpe=%u, nsamples=%u, flags=0x%x, %s\n", + rtex->resource.b.b.width0, rtex->resource.b.b.height0, + rtex->resource.b.b.depth0, rtex->surface.blk_w, + rtex->surface.blk_h, + rtex->resource.b.b.array_size, rtex->resource.b.b.last_level, + rtex->surface.bpe, rtex->resource.b.b.nr_samples, + rtex->surface.flags, util_format_short_name(rtex->resource.b.b.format)); + + u_log_printf(log, " Layout: size=%"PRIu64", alignment=%u, bankw=%u, " + "bankh=%u, nbanks=%u, mtilea=%u, tilesplit=%u, pipeconfig=%u, scanout=%u\n", + rtex->surface.surf_size, rtex->surface.surf_alignment, rtex->surface.u.legacy.bankw, + rtex->surface.u.legacy.bankh, rtex->surface.u.legacy.num_banks, rtex->surface.u.legacy.mtilea, + rtex->surface.u.legacy.tile_split, rtex->surface.u.legacy.pipe_config, + (rtex->surface.flags & RADEON_SURF_SCANOUT) != 0); + + if (rtex->fmask.size) + u_log_printf(log, " FMask: offset=%"PRIu64", size=%"PRIu64", alignment=%u, pitch_in_pixels=%u, " + "bankh=%u, slice_tile_max=%u, tile_mode_index=%u\n", + rtex->fmask.offset, rtex->fmask.size, rtex->fmask.alignment, + rtex->fmask.pitch_in_pixels, rtex->fmask.bank_height, + rtex->fmask.slice_tile_max, rtex->fmask.tile_mode_index); + + if (rtex->cmask.size) + u_log_printf(log, " CMask: offset=%"PRIu64", size=%"PRIu64", alignment=%u, " + "slice_tile_max=%u\n", + rtex->cmask.offset, rtex->cmask.size, rtex->cmask.alignment, + rtex->cmask.slice_tile_max); + + if (rtex->htile_offset) + u_log_printf(log, " HTile: offset=%"PRIu64", size=%"PRIu64", " + "alignment=%u\n", + rtex->htile_offset, rtex->surface.htile_size, + rtex->surface.htile_alignment); + + for (i = 0; i <= rtex->resource.b.b.last_level; i++) + u_log_printf(log, " Level[%i]: offset=%"PRIu64", slice_size=%"PRIu64", " + "npix_x=%u, npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, " + "mode=%u, tiling_index = %u\n", + i, rtex->surface.u.legacy.level[i].offset, + rtex->surface.u.legacy.level[i].slice_size, + u_minify(rtex->resource.b.b.width0, i), + u_minify(rtex->resource.b.b.height0, i), + u_minify(rtex->resource.b.b.depth0, i), + rtex->surface.u.legacy.level[i].nblk_x, + rtex->surface.u.legacy.level[i].nblk_y, + rtex->surface.u.legacy.level[i].mode, + rtex->surface.u.legacy.tiling_index[i]); + + if (rtex->surface.has_stencil) { + u_log_printf(log, " StencilLayout: tilesplit=%u\n", + rtex->surface.u.legacy.stencil_tile_split); + for (i = 0; i <= rtex->resource.b.b.last_level; i++) { + u_log_printf(log, " StencilLevel[%i]: offset=%"PRIu64", " + "slice_size=%"PRIu64", npix_x=%u, " + "npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, " + "mode=%u, tiling_index = %u\n", + i, rtex->surface.u.legacy.stencil_level[i].offset, + rtex->surface.u.legacy.stencil_level[i].slice_size, + u_minify(rtex->resource.b.b.width0, i), + u_minify(rtex->resource.b.b.height0, i), + u_minify(rtex->resource.b.b.depth0, i), + rtex->surface.u.legacy.stencil_level[i].nblk_x, + rtex->surface.u.legacy.stencil_level[i].nblk_y, + rtex->surface.u.legacy.stencil_level[i].mode, + rtex->surface.u.legacy.stencil_tiling_index[i]); + } + } +} + +/* Common processing for r600_texture_create and r600_texture_from_handle */ +static struct r600_texture * +r600_texture_create_object(struct pipe_screen *screen, + const struct pipe_resource *base, + struct pb_buffer *buf, + struct radeon_surf *surface) +{ + struct r600_texture *rtex; + struct r600_resource *resource; + struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; + + rtex = CALLOC_STRUCT(r600_texture); + if (!rtex) + return NULL; + + resource = &rtex->resource; + resource->b.b = *base; + resource->b.b.next = NULL; + resource->b.vtbl = &r600_texture_vtbl; + pipe_reference_init(&resource->b.b.reference, 1); + resource->b.b.screen = screen; + + /* don't include stencil-only formats which we don't support for rendering */ + rtex->is_depth = util_format_has_depth(util_format_description(rtex->resource.b.b.format)); + + rtex->surface = *surface; + rtex->size = rtex->surface.surf_size; + rtex->db_render_format = base->format; + + /* Tiled depth textures utilize the non-displayable tile order. + * This must be done after r600_setup_surface. + * Applies to R600-Cayman. */ + rtex->non_disp_tiling = rtex->is_depth && rtex->surface.u.legacy.level[0].mode >= RADEON_SURF_MODE_1D; + /* Applies to GCN. */ + rtex->last_msaa_resolve_target_micro_mode = rtex->surface.micro_tile_mode; + + if (rtex->is_depth) { + if (base->flags & (R600_RESOURCE_FLAG_TRANSFER | + R600_RESOURCE_FLAG_FLUSHED_DEPTH) || + rscreen->chip_class >= EVERGREEN) { + rtex->can_sample_z = !rtex->surface.u.legacy.depth_adjusted; + rtex->can_sample_s = !rtex->surface.u.legacy.stencil_adjusted; + } else { + if (rtex->resource.b.b.nr_samples <= 1 && + (rtex->resource.b.b.format == PIPE_FORMAT_Z16_UNORM || + rtex->resource.b.b.format == PIPE_FORMAT_Z32_FLOAT)) + rtex->can_sample_z = true; + } + + if (!(base->flags & (R600_RESOURCE_FLAG_TRANSFER | + R600_RESOURCE_FLAG_FLUSHED_DEPTH))) { + rtex->db_compatible = true; + + if (!(rscreen->debug_flags & DBG_NO_HYPERZ)) + r600_texture_allocate_htile(rscreen, rtex); + } + } else { + if (base->nr_samples > 1) { + if (!buf) { + r600_texture_allocate_fmask(rscreen, rtex); + r600_texture_allocate_cmask(rscreen, rtex); + rtex->cmask_buffer = &rtex->resource; + } + if (!rtex->fmask.size || !rtex->cmask.size) { + FREE(rtex); + return NULL; + } + } + } + + /* Now create the backing buffer. */ + if (!buf) { + r600_init_resource_fields(rscreen, resource, rtex->size, + rtex->surface.surf_alignment); + + /* Displayable surfaces are not suballocated. */ + if (resource->b.b.bind & PIPE_BIND_SCANOUT) + resource->flags |= RADEON_FLAG_NO_SUBALLOC; + + if (!r600_alloc_resource(rscreen, resource)) { + FREE(rtex); + return NULL; + } + } else { + resource->buf = buf; + resource->gpu_address = rscreen->ws->buffer_get_virtual_address(resource->buf); + resource->bo_size = buf->size; + resource->bo_alignment = buf->alignment; + resource->domains = rscreen->ws->buffer_get_initial_domain(resource->buf); + if (resource->domains & RADEON_DOMAIN_VRAM) + resource->vram_usage = buf->size; + else if (resource->domains & RADEON_DOMAIN_GTT) + resource->gart_usage = buf->size; + } + + if (rtex->cmask.size) { + /* Initialize the cmask to 0xCC (= compressed state). */ + r600_screen_clear_buffer(rscreen, &rtex->cmask_buffer->b.b, + rtex->cmask.offset, rtex->cmask.size, + 0xCCCCCCCC); + } + if (rtex->htile_offset) { + uint32_t clear_value = 0; + + r600_screen_clear_buffer(rscreen, &rtex->resource.b.b, + rtex->htile_offset, + rtex->surface.htile_size, + clear_value); + } + + /* Initialize the CMASK base register value. */ + rtex->cmask.base_address_reg = + (rtex->resource.gpu_address + rtex->cmask.offset) >> 8; + + if (rscreen->debug_flags & DBG_VM) { + fprintf(stderr, "VM start=0x%"PRIX64" end=0x%"PRIX64" | Texture %ix%ix%i, %i levels, %i samples, %s\n", + rtex->resource.gpu_address, + rtex->resource.gpu_address + rtex->resource.buf->size, + base->width0, base->height0, util_max_layer(base, 0)+1, base->last_level+1, + base->nr_samples ? base->nr_samples : 1, util_format_short_name(base->format)); + } + + if (rscreen->debug_flags & DBG_TEX) { + puts("Texture:"); + struct u_log_context log; + u_log_context_init(&log); + r600_print_texture_info(rscreen, rtex, &log); + u_log_new_page_print(&log, stdout); + fflush(stdout); + u_log_context_destroy(&log); + } + + return rtex; +} + +static enum radeon_surf_mode +r600_choose_tiling(struct r600_common_screen *rscreen, + const struct pipe_resource *templ) +{ + const struct util_format_description *desc = util_format_description(templ->format); + bool force_tiling = templ->flags & R600_RESOURCE_FLAG_FORCE_TILING; + bool is_depth_stencil = util_format_is_depth_or_stencil(templ->format) && + !(templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH); + + /* MSAA resources must be 2D tiled. */ + if (templ->nr_samples > 1) + return RADEON_SURF_MODE_2D; + + /* Transfer resources should be linear. */ + if (templ->flags & R600_RESOURCE_FLAG_TRANSFER) + return RADEON_SURF_MODE_LINEAR_ALIGNED; + + /* r600g: force tiling on TEXTURE_2D and TEXTURE_3D compute resources. */ + if (rscreen->chip_class >= R600 && rscreen->chip_class <= CAYMAN && + (templ->bind & PIPE_BIND_COMPUTE_RESOURCE) && + (templ->target == PIPE_TEXTURE_2D || + templ->target == PIPE_TEXTURE_3D)) + force_tiling = true; + + /* Handle common candidates for the linear mode. + * Compressed textures and DB surfaces must always be tiled. + */ + if (!force_tiling && + !is_depth_stencil && + !util_format_is_compressed(templ->format)) { + if (rscreen->debug_flags & DBG_NO_TILING) + return RADEON_SURF_MODE_LINEAR_ALIGNED; + + /* Tiling doesn't work with the 422 (SUBSAMPLED) formats on R600+. */ + if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) + return RADEON_SURF_MODE_LINEAR_ALIGNED; + + if (templ->bind & PIPE_BIND_LINEAR) + return RADEON_SURF_MODE_LINEAR_ALIGNED; + + /* Textures with a very small height are recommended to be linear. */ + if (templ->target == PIPE_TEXTURE_1D || + templ->target == PIPE_TEXTURE_1D_ARRAY || + /* Only very thin and long 2D textures should benefit from + * linear_aligned. */ + (templ->width0 > 8 && templ->height0 <= 2)) + return RADEON_SURF_MODE_LINEAR_ALIGNED; + + /* Textures likely to be mapped often. */ + if (templ->usage == PIPE_USAGE_STAGING || + templ->usage == PIPE_USAGE_STREAM) + return RADEON_SURF_MODE_LINEAR_ALIGNED; + } + + /* Make small textures 1D tiled. */ + if (templ->width0 <= 16 || templ->height0 <= 16 || + (rscreen->debug_flags & DBG_NO_2D_TILING)) + return RADEON_SURF_MODE_1D; + + /* The allocator will switch to 1D if needed. */ + return RADEON_SURF_MODE_2D; +} + +struct pipe_resource *r600_texture_create(struct pipe_screen *screen, + const struct pipe_resource *templ) +{ + struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; + struct radeon_surf surface = {0}; + bool is_flushed_depth = templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH; + int r; + + r = r600_init_surface(rscreen, &surface, templ, + r600_choose_tiling(rscreen, templ), 0, 0, + false, false, is_flushed_depth); + if (r) { + return NULL; + } + + return (struct pipe_resource *) + r600_texture_create_object(screen, templ, NULL, &surface); +} + +static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen, + const struct pipe_resource *templ, + struct winsys_handle *whandle, + unsigned usage) +{ + struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; + struct pb_buffer *buf = NULL; + unsigned stride = 0, offset = 0; + enum radeon_surf_mode array_mode; + struct radeon_surf surface = {}; + int r; + struct radeon_bo_metadata metadata = {}; + struct r600_texture *rtex; + bool is_scanout; + + /* Support only 2D textures without mipmaps */ + if ((templ->target != PIPE_TEXTURE_2D && templ->target != PIPE_TEXTURE_RECT) || + templ->depth0 != 1 || templ->last_level != 0) + return NULL; + + buf = rscreen->ws->buffer_from_handle(rscreen->ws, whandle, &stride, &offset); + if (!buf) + return NULL; + + rscreen->ws->buffer_get_metadata(buf, &metadata); + r600_surface_import_metadata(rscreen, &surface, &metadata, + &array_mode, &is_scanout); + + r = r600_init_surface(rscreen, &surface, templ, array_mode, stride, + offset, true, is_scanout, false); + if (r) { + return NULL; + } + + rtex = r600_texture_create_object(screen, templ, buf, &surface); + if (!rtex) + return NULL; + + rtex->resource.b.is_shared = true; + rtex->resource.external_usage = usage; + + if (rscreen->apply_opaque_metadata) + rscreen->apply_opaque_metadata(rscreen, rtex, &metadata); + + assert(rtex->surface.tile_swizzle == 0); + return &rtex->resource.b.b; +} + +bool r600_init_flushed_depth_texture(struct pipe_context *ctx, + struct pipe_resource *texture, + struct r600_texture **staging) +{ + struct r600_texture *rtex = (struct r600_texture*)texture; + struct pipe_resource resource; + struct r600_texture **flushed_depth_texture = staging ? + staging : &rtex->flushed_depth_texture; + enum pipe_format pipe_format = texture->format; + + if (!staging) { + if (rtex->flushed_depth_texture) + return true; /* it's ready */ + + if (!rtex->can_sample_z && rtex->can_sample_s) { + switch (pipe_format) { + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + /* Save memory by not allocating the S plane. */ + pipe_format = PIPE_FORMAT_Z32_FLOAT; + break; + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + /* Save memory bandwidth by not copying the + * stencil part during flush. + * + * This potentially increases memory bandwidth + * if an application uses both Z and S texturing + * simultaneously (a flushed Z24S8 texture + * would be stored compactly), but how often + * does that really happen? + */ + pipe_format = PIPE_FORMAT_Z24X8_UNORM; + break; + default:; + } + } else if (!rtex->can_sample_s && rtex->can_sample_z) { + assert(util_format_has_stencil(util_format_description(pipe_format))); + + /* DB->CB copies to an 8bpp surface don't work. */ + pipe_format = PIPE_FORMAT_X24S8_UINT; + } + } + + memset(&resource, 0, sizeof(resource)); + resource.target = texture->target; + resource.format = pipe_format; + resource.width0 = texture->width0; + resource.height0 = texture->height0; + resource.depth0 = texture->depth0; + resource.array_size = texture->array_size; + resource.last_level = texture->last_level; + resource.nr_samples = texture->nr_samples; + resource.usage = staging ? PIPE_USAGE_STAGING : PIPE_USAGE_DEFAULT; + resource.bind = texture->bind & ~PIPE_BIND_DEPTH_STENCIL; + resource.flags = texture->flags | R600_RESOURCE_FLAG_FLUSHED_DEPTH; + + if (staging) + resource.flags |= R600_RESOURCE_FLAG_TRANSFER; + + *flushed_depth_texture = (struct r600_texture *)ctx->screen->resource_create(ctx->screen, &resource); + if (*flushed_depth_texture == NULL) { + R600_ERR("failed to create temporary texture to hold flushed depth\n"); + return false; + } + + (*flushed_depth_texture)->non_disp_tiling = false; + return true; +} + +/** + * Initialize the pipe_resource descriptor to be of the same size as the box, + * which is supposed to hold a subregion of the texture "orig" at the given + * mipmap level. + */ +static void r600_init_temp_resource_from_box(struct pipe_resource *res, + struct pipe_resource *orig, + const struct pipe_box *box, + unsigned level, unsigned flags) +{ + memset(res, 0, sizeof(*res)); + res->format = orig->format; + res->width0 = box->width; + res->height0 = box->height; + res->depth0 = 1; + res->array_size = 1; + res->usage = flags & R600_RESOURCE_FLAG_TRANSFER ? PIPE_USAGE_STAGING : PIPE_USAGE_DEFAULT; + res->flags = flags; + + /* We must set the correct texture target and dimensions for a 3D box. */ + if (box->depth > 1 && util_max_layer(orig, level) > 0) { + res->target = PIPE_TEXTURE_2D_ARRAY; + res->array_size = box->depth; + } else { + res->target = PIPE_TEXTURE_2D; + } +} + +static bool r600_can_invalidate_texture(struct r600_common_screen *rscreen, + struct r600_texture *rtex, + unsigned transfer_usage, + const struct pipe_box *box) +{ + /* r600g doesn't react to dirty_tex_descriptor_counter */ + return rscreen->chip_class >= SI && + !rtex->resource.b.is_shared && + !(transfer_usage & PIPE_TRANSFER_READ) && + rtex->resource.b.b.last_level == 0 && + util_texrange_covers_whole_level(&rtex->resource.b.b, 0, + box->x, box->y, box->z, + box->width, box->height, + box->depth); +} + +static void r600_texture_invalidate_storage(struct r600_common_context *rctx, + struct r600_texture *rtex) +{ + struct r600_common_screen *rscreen = rctx->screen; + + /* There is no point in discarding depth and tiled buffers. */ + assert(!rtex->is_depth); + assert(rtex->surface.is_linear); + + /* Reallocate the buffer in the same pipe_resource. */ + r600_alloc_resource(rscreen, &rtex->resource); + + /* Initialize the CMASK base address (needed even without CMASK). */ + rtex->cmask.base_address_reg = + (rtex->resource.gpu_address + rtex->cmask.offset) >> 8; + + p_atomic_inc(&rscreen->dirty_tex_counter); + + rctx->num_alloc_tex_transfer_bytes += rtex->size; +} + +static void *r600_texture_transfer_map(struct pipe_context *ctx, + struct pipe_resource *texture, + unsigned level, + unsigned usage, + const struct pipe_box *box, + struct pipe_transfer **ptransfer) +{ + struct r600_common_context *rctx = (struct r600_common_context*)ctx; + struct r600_texture *rtex = (struct r600_texture*)texture; + struct r600_transfer *trans; + struct r600_resource *buf; + unsigned offset = 0; + char *map; + bool use_staging_texture = false; + + assert(!(texture->flags & R600_RESOURCE_FLAG_TRANSFER)); + assert(box->width && box->height && box->depth); + + /* Depth textures use staging unconditionally. */ + if (!rtex->is_depth) { + /* Degrade the tile mode if we get too many transfers on APUs. + * On dGPUs, the staging texture is always faster. + * Only count uploads that are at least 4x4 pixels large. + */ + if (!rctx->screen->info.has_dedicated_vram && + level == 0 && + box->width >= 4 && box->height >= 4 && + p_atomic_inc_return(&rtex->num_level0_transfers) == 10) { + bool can_invalidate = + r600_can_invalidate_texture(rctx->screen, rtex, + usage, box); + + r600_reallocate_texture_inplace(rctx, rtex, + PIPE_BIND_LINEAR, + can_invalidate); + } + + /* Tiled textures need to be converted into a linear texture for CPU + * access. The staging texture is always linear and is placed in GART. + * + * Reading from VRAM or GTT WC is slow, always use the staging + * texture in this case. + * + * Use the staging texture for uploads if the underlying BO + * is busy. + */ + if (!rtex->surface.is_linear) + use_staging_texture = true; + else if (usage & PIPE_TRANSFER_READ) + use_staging_texture = + rtex->resource.domains & RADEON_DOMAIN_VRAM || + rtex->resource.flags & RADEON_FLAG_GTT_WC; + /* Write & linear only: */ + else if (r600_rings_is_buffer_referenced(rctx, rtex->resource.buf, + RADEON_USAGE_READWRITE) || + !rctx->ws->buffer_wait(rtex->resource.buf, 0, + RADEON_USAGE_READWRITE)) { + /* It's busy. */ + if (r600_can_invalidate_texture(rctx->screen, rtex, + usage, box)) + r600_texture_invalidate_storage(rctx, rtex); + else + use_staging_texture = true; + } + } + + trans = CALLOC_STRUCT(r600_transfer); + if (!trans) + return NULL; + pipe_resource_reference(&trans->b.b.resource, texture); + trans->b.b.level = level; + trans->b.b.usage = usage; + trans->b.b.box = *box; + + if (rtex->is_depth) { + struct r600_texture *staging_depth; + + if (rtex->resource.b.b.nr_samples > 1) { + /* MSAA depth buffers need to be converted to single sample buffers. + * + * Mapping MSAA depth buffers can occur if ReadPixels is called + * with a multisample GLX visual. + * + * First downsample the depth buffer to a temporary texture, + * then decompress the temporary one to staging. + * + * Only the region being mapped is transfered. + */ + struct pipe_resource resource; + + r600_init_temp_resource_from_box(&resource, texture, box, level, 0); + + if (!r600_init_flushed_depth_texture(ctx, &resource, &staging_depth)) { + R600_ERR("failed to create temporary texture to hold untiled copy\n"); + FREE(trans); + return NULL; + } + + if (usage & PIPE_TRANSFER_READ) { + struct pipe_resource *temp = ctx->screen->resource_create(ctx->screen, &resource); + if (!temp) { + R600_ERR("failed to create a temporary depth texture\n"); + FREE(trans); + return NULL; + } + + r600_copy_region_with_blit(ctx, temp, 0, 0, 0, 0, texture, level, box); + rctx->blit_decompress_depth(ctx, (struct r600_texture*)temp, staging_depth, + 0, 0, 0, box->depth, 0, 0); + pipe_resource_reference(&temp, NULL); + } + + /* Just get the strides. */ + r600_texture_get_offset(rctx->screen, staging_depth, level, NULL, + &trans->b.b.stride, + &trans->b.b.layer_stride); + } else { + /* XXX: only readback the rectangle which is being mapped? */ + /* XXX: when discard is true, no need to read back from depth texture */ + if (!r600_init_flushed_depth_texture(ctx, texture, &staging_depth)) { + R600_ERR("failed to create temporary texture to hold untiled copy\n"); + FREE(trans); + return NULL; + } + + rctx->blit_decompress_depth(ctx, rtex, staging_depth, + level, level, + box->z, box->z + box->depth - 1, + 0, 0); + + offset = r600_texture_get_offset(rctx->screen, staging_depth, + level, box, + &trans->b.b.stride, + &trans->b.b.layer_stride); + } + + trans->staging = (struct r600_resource*)staging_depth; + buf = trans->staging; + } else if (use_staging_texture) { + struct pipe_resource resource; + struct r600_texture *staging; + + r600_init_temp_resource_from_box(&resource, texture, box, level, + R600_RESOURCE_FLAG_TRANSFER); + resource.usage = (usage & PIPE_TRANSFER_READ) ? + PIPE_USAGE_STAGING : PIPE_USAGE_STREAM; + + /* Create the temporary texture. */ + staging = (struct r600_texture*)ctx->screen->resource_create(ctx->screen, &resource); + if (!staging) { + R600_ERR("failed to create temporary texture to hold untiled copy\n"); + FREE(trans); + return NULL; + } + trans->staging = &staging->resource; + + /* Just get the strides. */ + r600_texture_get_offset(rctx->screen, staging, 0, NULL, + &trans->b.b.stride, + &trans->b.b.layer_stride); + + if (usage & PIPE_TRANSFER_READ) + r600_copy_to_staging_texture(ctx, trans); + else + usage |= PIPE_TRANSFER_UNSYNCHRONIZED; + + buf = trans->staging; + } else { + /* the resource is mapped directly */ + offset = r600_texture_get_offset(rctx->screen, rtex, level, box, + &trans->b.b.stride, + &trans->b.b.layer_stride); + buf = &rtex->resource; + } + + if (!(map = r600_buffer_map_sync_with_rings(rctx, buf, usage))) { + r600_resource_reference(&trans->staging, NULL); + FREE(trans); + return NULL; + } + + *ptransfer = &trans->b.b; + return map + offset; +} + +static void r600_texture_transfer_unmap(struct pipe_context *ctx, + struct pipe_transfer* transfer) +{ + struct r600_common_context *rctx = (struct r600_common_context*)ctx; + struct r600_transfer *rtransfer = (struct r600_transfer*)transfer; + struct pipe_resource *texture = transfer->resource; + struct r600_texture *rtex = (struct r600_texture*)texture; + + if ((transfer->usage & PIPE_TRANSFER_WRITE) && rtransfer->staging) { + if (rtex->is_depth && rtex->resource.b.b.nr_samples <= 1) { + ctx->resource_copy_region(ctx, texture, transfer->level, + transfer->box.x, transfer->box.y, transfer->box.z, + &rtransfer->staging->b.b, transfer->level, + &transfer->box); + } else { + r600_copy_from_staging_texture(ctx, rtransfer); + } + } + + if (rtransfer->staging) { + rctx->num_alloc_tex_transfer_bytes += rtransfer->staging->buf->size; + r600_resource_reference(&rtransfer->staging, NULL); + } + + /* Heuristic for {upload, draw, upload, draw, ..}: + * + * Flush the gfx IB if we've allocated too much texture storage. + * + * The idea is that we don't want to build IBs that use too much + * memory and put pressure on the kernel memory manager and we also + * want to make temporary and invalidated buffers go idle ASAP to + * decrease the total memory usage or make them reusable. The memory + * usage will be slightly higher than given here because of the buffer + * cache in the winsys. + * + * The result is that the kernel memory manager is never a bottleneck. + */ + if (rctx->num_alloc_tex_transfer_bytes > rctx->screen->info.gart_size / 4) { + rctx->gfx.flush(rctx, RADEON_FLUSH_ASYNC, NULL); + rctx->num_alloc_tex_transfer_bytes = 0; + } + + pipe_resource_reference(&transfer->resource, NULL); + FREE(transfer); +} + +static const struct u_resource_vtbl r600_texture_vtbl = +{ + NULL, /* get_handle */ + r600_texture_destroy, /* resource_destroy */ + r600_texture_transfer_map, /* transfer_map */ + u_default_transfer_flush_region, /* transfer_flush_region */ + r600_texture_transfer_unmap, /* transfer_unmap */ +}; + +struct pipe_surface *r600_create_surface_custom(struct pipe_context *pipe, + struct pipe_resource *texture, + const struct pipe_surface *templ, + unsigned width0, unsigned height0, + unsigned width, unsigned height) +{ + struct r600_surface *surface = CALLOC_STRUCT(r600_surface); + + if (!surface) + return NULL; + + assert(templ->u.tex.first_layer <= util_max_layer(texture, templ->u.tex.level)); + assert(templ->u.tex.last_layer <= util_max_layer(texture, templ->u.tex.level)); + + pipe_reference_init(&surface->base.reference, 1); + pipe_resource_reference(&surface->base.texture, texture); + surface->base.context = pipe; + surface->base.format = templ->format; + surface->base.width = width; + surface->base.height = height; + surface->base.u = templ->u; + + surface->width0 = width0; + surface->height0 = height0; + + return &surface->base; +} + +static struct pipe_surface *r600_create_surface(struct pipe_context *pipe, + struct pipe_resource *tex, + const struct pipe_surface *templ) +{ + unsigned level = templ->u.tex.level; + unsigned width = u_minify(tex->width0, level); + unsigned height = u_minify(tex->height0, level); + unsigned width0 = tex->width0; + unsigned height0 = tex->height0; + + if (tex->target != PIPE_BUFFER && templ->format != tex->format) { + const struct util_format_description *tex_desc + = util_format_description(tex->format); + const struct util_format_description *templ_desc + = util_format_description(templ->format); + + assert(tex_desc->block.bits == templ_desc->block.bits); + + /* Adjust size of surface if and only if the block width or + * height is changed. */ + if (tex_desc->block.width != templ_desc->block.width || + tex_desc->block.height != templ_desc->block.height) { + unsigned nblks_x = util_format_get_nblocksx(tex->format, width); + unsigned nblks_y = util_format_get_nblocksy(tex->format, height); + + width = nblks_x * templ_desc->block.width; + height = nblks_y * templ_desc->block.height; + + width0 = util_format_get_nblocksx(tex->format, width0); + height0 = util_format_get_nblocksy(tex->format, height0); + } + } + + return r600_create_surface_custom(pipe, tex, templ, + width0, height0, + width, height); +} + +static void r600_surface_destroy(struct pipe_context *pipe, + struct pipe_surface *surface) +{ + struct r600_surface *surf = (struct r600_surface*)surface; + r600_resource_reference(&surf->cb_buffer_fmask, NULL); + r600_resource_reference(&surf->cb_buffer_cmask, NULL); + pipe_resource_reference(&surface->texture, NULL); + FREE(surface); +} + +static void r600_clear_texture(struct pipe_context *pipe, + struct pipe_resource *tex, + unsigned level, + const struct pipe_box *box, + const void *data) +{ + struct pipe_screen *screen = pipe->screen; + struct r600_texture *rtex = (struct r600_texture*)tex; + struct pipe_surface tmpl = {{0}}; + struct pipe_surface *sf; + const struct util_format_description *desc = + util_format_description(tex->format); + + tmpl.format = tex->format; + tmpl.u.tex.first_layer = box->z; + tmpl.u.tex.last_layer = box->z + box->depth - 1; + tmpl.u.tex.level = level; + sf = pipe->create_surface(pipe, tex, &tmpl); + if (!sf) + return; + + if (rtex->is_depth) { + unsigned clear; + float depth; + uint8_t stencil = 0; + + /* Depth is always present. */ + clear = PIPE_CLEAR_DEPTH; + desc->unpack_z_float(&depth, 0, data, 0, 1, 1); + + if (rtex->surface.has_stencil) { + clear |= PIPE_CLEAR_STENCIL; + desc->unpack_s_8uint(&stencil, 0, data, 0, 1, 1); + } + + pipe->clear_depth_stencil(pipe, sf, clear, depth, stencil, + box->x, box->y, + box->width, box->height, false); + } else { + union pipe_color_union color; + + /* pipe_color_union requires the full vec4 representation. */ + if (util_format_is_pure_uint(tex->format)) + desc->unpack_rgba_uint(color.ui, 0, data, 0, 1, 1); + else if (util_format_is_pure_sint(tex->format)) + desc->unpack_rgba_sint(color.i, 0, data, 0, 1, 1); + else + desc->unpack_rgba_float(color.f, 0, data, 0, 1, 1); + + if (screen->is_format_supported(screen, tex->format, + tex->target, 0, + PIPE_BIND_RENDER_TARGET)) { + pipe->clear_render_target(pipe, sf, &color, + box->x, box->y, + box->width, box->height, false); + } else { + /* Software fallback - just for R9G9B9E5_FLOAT */ + util_clear_render_target(pipe, sf, &color, + box->x, box->y, + box->width, box->height); + } + } + pipe_surface_reference(&sf, NULL); +} + +unsigned r600_translate_colorswap(enum pipe_format format, bool do_endian_swap) +{ + const struct util_format_description *desc = util_format_description(format); + +#define HAS_SWIZZLE(chan,swz) (desc->swizzle[chan] == PIPE_SWIZZLE_##swz) + + if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */ + return V_0280A0_SWAP_STD; + + if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) + return ~0U; + + switch (desc->nr_channels) { + case 1: + if (HAS_SWIZZLE(0,X)) + return V_0280A0_SWAP_STD; /* X___ */ + else if (HAS_SWIZZLE(3,X)) + return V_0280A0_SWAP_ALT_REV; /* ___X */ + break; + case 2: + if ((HAS_SWIZZLE(0,X) && HAS_SWIZZLE(1,Y)) || + (HAS_SWIZZLE(0,X) && HAS_SWIZZLE(1,NONE)) || + (HAS_SWIZZLE(0,NONE) && HAS_SWIZZLE(1,Y))) + return V_0280A0_SWAP_STD; /* XY__ */ + else if ((HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(1,X)) || + (HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(1,NONE)) || + (HAS_SWIZZLE(0,NONE) && HAS_SWIZZLE(1,X))) + /* YX__ */ + return (do_endian_swap ? V_0280A0_SWAP_STD : V_0280A0_SWAP_STD_REV); + else if (HAS_SWIZZLE(0,X) && HAS_SWIZZLE(3,Y)) + return V_0280A0_SWAP_ALT; /* X__Y */ + else if (HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(3,X)) + return V_0280A0_SWAP_ALT_REV; /* Y__X */ + break; + case 3: + if (HAS_SWIZZLE(0,X)) + return (do_endian_swap ? V_0280A0_SWAP_STD_REV : V_0280A0_SWAP_STD); + else if (HAS_SWIZZLE(0,Z)) + return V_0280A0_SWAP_STD_REV; /* ZYX */ + break; + case 4: + /* check the middle channels, the 1st and 4th channel can be NONE */ + if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,Z)) { + return V_0280A0_SWAP_STD; /* XYZW */ + } else if (HAS_SWIZZLE(1,Z) && HAS_SWIZZLE(2,Y)) { + return V_0280A0_SWAP_STD_REV; /* WZYX */ + } else if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,X)) { + return V_0280A0_SWAP_ALT; /* ZYXW */ + } else if (HAS_SWIZZLE(1,Z) && HAS_SWIZZLE(2,W)) { + /* YZWX */ + if (desc->is_array) + return V_0280A0_SWAP_ALT_REV; + else + return (do_endian_swap ? V_0280A0_SWAP_ALT : V_0280A0_SWAP_ALT_REV); + } + break; + } + return ~0U; +} + +/* FAST COLOR CLEAR */ + +static void evergreen_set_clear_color(struct r600_texture *rtex, + enum pipe_format surface_format, + const union pipe_color_union *color) +{ + union util_color uc; + + memset(&uc, 0, sizeof(uc)); + + if (rtex->surface.bpe == 16) { + /* DCC fast clear only: + * CLEAR_WORD0 = R = G = B + * CLEAR_WORD1 = A + */ + assert(color->ui[0] == color->ui[1] && + color->ui[0] == color->ui[2]); + uc.ui[0] = color->ui[0]; + uc.ui[1] = color->ui[3]; + } else if (util_format_is_pure_uint(surface_format)) { + util_format_write_4ui(surface_format, color->ui, 0, &uc, 0, 0, 0, 1, 1); + } else if (util_format_is_pure_sint(surface_format)) { + util_format_write_4i(surface_format, color->i, 0, &uc, 0, 0, 0, 1, 1); + } else { + util_pack_color(color->f, surface_format, &uc); + } + + memcpy(rtex->color_clear_value, &uc, 2 * sizeof(uint32_t)); +} + +void evergreen_do_fast_color_clear(struct r600_common_context *rctx, + struct pipe_framebuffer_state *fb, + struct r600_atom *fb_state, + unsigned *buffers, ubyte *dirty_cbufs, + const union pipe_color_union *color) +{ + int i; + + /* This function is broken in BE, so just disable this path for now */ +#ifdef PIPE_ARCH_BIG_ENDIAN + return; +#endif + + if (rctx->render_cond) + return; + + for (i = 0; i < fb->nr_cbufs; i++) { + struct r600_texture *tex; + unsigned clear_bit = PIPE_CLEAR_COLOR0 << i; + + if (!fb->cbufs[i]) + continue; + + /* if this colorbuffer is not being cleared */ + if (!(*buffers & clear_bit)) + continue; + + tex = (struct r600_texture *)fb->cbufs[i]->texture; + + /* the clear is allowed if all layers are bound */ + if (fb->cbufs[i]->u.tex.first_layer != 0 || + fb->cbufs[i]->u.tex.last_layer != util_max_layer(&tex->resource.b.b, 0)) { + continue; + } + + /* cannot clear mipmapped textures */ + if (fb->cbufs[i]->texture->last_level != 0) { + continue; + } + + /* only supported on tiled surfaces */ + if (tex->surface.is_linear) { + continue; + } + + /* shared textures can't use fast clear without an explicit flush, + * because there is no way to communicate the clear color among + * all clients + */ + if (tex->resource.b.is_shared && + !(tex->resource.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH)) + continue; + + { + /* 128-bit formats are unusupported */ + if (tex->surface.bpe > 8) { + continue; + } + + /* ensure CMASK is enabled */ + r600_texture_alloc_cmask_separate(rctx->screen, tex); + if (tex->cmask.size == 0) { + continue; + } + + /* Do the fast clear. */ + rctx->clear_buffer(&rctx->b, &tex->cmask_buffer->b.b, + tex->cmask.offset, tex->cmask.size, 0, + R600_COHERENCY_CB_META); + + bool need_compressed_update = !tex->dirty_level_mask; + + tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level; + + if (need_compressed_update) + p_atomic_inc(&rctx->screen->compressed_colortex_counter); + } + + evergreen_set_clear_color(tex, fb->cbufs[i]->format, color); + + if (dirty_cbufs) + *dirty_cbufs |= 1 << i; + rctx->set_atom_dirty(rctx, fb_state, true); + *buffers &= ~clear_bit; + } +} + +static struct pipe_memory_object * +r600_memobj_from_handle(struct pipe_screen *screen, + struct winsys_handle *whandle, + bool dedicated) +{ + struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; + struct r600_memory_object *memobj = CALLOC_STRUCT(r600_memory_object); + struct pb_buffer *buf = NULL; + uint32_t stride, offset; + + if (!memobj) + return NULL; + + buf = rscreen->ws->buffer_from_handle(rscreen->ws, whandle, + &stride, &offset); + if (!buf) { + free(memobj); + return NULL; + } + + memobj->b.dedicated = dedicated; + memobj->buf = buf; + memobj->stride = stride; + memobj->offset = offset; + + return (struct pipe_memory_object *)memobj; + +} + +static void +r600_memobj_destroy(struct pipe_screen *screen, + struct pipe_memory_object *_memobj) +{ + struct r600_memory_object *memobj = (struct r600_memory_object *)_memobj; + + pb_reference(&memobj->buf, NULL); + free(memobj); +} + +static struct pipe_resource * +r600_texture_from_memobj(struct pipe_screen *screen, + const struct pipe_resource *templ, + struct pipe_memory_object *_memobj, + uint64_t offset) +{ + int r; + struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; + struct r600_memory_object *memobj = (struct r600_memory_object *)_memobj; + struct r600_texture *rtex; + struct radeon_surf surface = {}; + struct radeon_bo_metadata metadata = {}; + enum radeon_surf_mode array_mode; + bool is_scanout; + struct pb_buffer *buf = NULL; + + if (memobj->b.dedicated) { + rscreen->ws->buffer_get_metadata(memobj->buf, &metadata); + r600_surface_import_metadata(rscreen, &surface, &metadata, + &array_mode, &is_scanout); + } else { + /** + * The bo metadata is unset for un-dedicated images. So we fall + * back to linear. See answer to question 5 of the + * VK_KHX_external_memory spec for some details. + * + * It is possible that this case isn't going to work if the + * surface pitch isn't correctly aligned by default. + * + * In order to support it correctly we require multi-image + * metadata to be syncrhonized between radv and radeonsi. The + * semantics of associating multiple image metadata to a memory + * object on the vulkan export side are not concretely defined + * either. + * + * All the use cases we are aware of at the moment for memory + * objects use dedicated allocations. So lets keep the initial + * implementation simple. + * + * A possible alternative is to attempt to reconstruct the + * tiling information when the TexParameter TEXTURE_TILING_EXT + * is set. + */ + array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED; + is_scanout = false; + + } + + r = r600_init_surface(rscreen, &surface, templ, + array_mode, memobj->stride, + offset, true, is_scanout, + false); + if (r) + return NULL; + + rtex = r600_texture_create_object(screen, templ, memobj->buf, &surface); + if (!rtex) + return NULL; + + /* r600_texture_create_object doesn't increment refcount of + * memobj->buf, so increment it here. + */ + pb_reference(&buf, memobj->buf); + + rtex->resource.b.is_shared = true; + rtex->resource.external_usage = PIPE_HANDLE_USAGE_READ_WRITE; + + if (rscreen->apply_opaque_metadata) + rscreen->apply_opaque_metadata(rscreen, rtex, &metadata); + + return &rtex->resource.b.b; +} + +void r600_init_screen_texture_functions(struct r600_common_screen *rscreen) +{ + rscreen->b.resource_from_handle = r600_texture_from_handle; + rscreen->b.resource_get_handle = r600_texture_get_handle; + rscreen->b.resource_from_memobj = r600_texture_from_memobj; + rscreen->b.memobj_create_from_handle = r600_memobj_from_handle; + rscreen->b.memobj_destroy = r600_memobj_destroy; +} + +void r600_init_context_texture_functions(struct r600_common_context *rctx) +{ + rctx->b.create_surface = r600_create_surface; + rctx->b.surface_destroy = r600_surface_destroy; + rctx->b.clear_texture = r600_clear_texture; +} diff --git a/lib/mesa/src/gallium/drivers/r600/r600_viewport.c b/lib/mesa/src/gallium/drivers/r600/r600_viewport.c new file mode 100644 index 000000000..0797f932f --- /dev/null +++ b/lib/mesa/src/gallium/drivers/r600/r600_viewport.c @@ -0,0 +1,456 @@ +/* + * Copyright 2012 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "r600_cs.h" +#include "util/u_viewport.h" +#include "tgsi/tgsi_scan.h" + +#define R600_R_028C0C_PA_CL_GB_VERT_CLIP_ADJ 0x028C0C +#define CM_R_028BE8_PA_CL_GB_VERT_CLIP_ADJ 0x28be8 +#define R_02843C_PA_CL_VPORT_XSCALE 0x02843C + +#define R_028250_PA_SC_VPORT_SCISSOR_0_TL 0x028250 +#define S_028250_TL_X(x) (((unsigned)(x) & 0x7FFF) << 0) +#define G_028250_TL_X(x) (((x) >> 0) & 0x7FFF) +#define C_028250_TL_X 0xFFFF8000 +#define S_028250_TL_Y(x) (((unsigned)(x) & 0x7FFF) << 16) +#define G_028250_TL_Y(x) (((x) >> 16) & 0x7FFF) +#define C_028250_TL_Y 0x8000FFFF +#define S_028250_WINDOW_OFFSET_DISABLE(x) (((unsigned)(x) & 0x1) << 31) +#define G_028250_WINDOW_OFFSET_DISABLE(x) (((x) >> 31) & 0x1) +#define C_028250_WINDOW_OFFSET_DISABLE 0x7FFFFFFF +#define S_028254_BR_X(x) (((unsigned)(x) & 0x7FFF) << 0) +#define G_028254_BR_X(x) (((x) >> 0) & 0x7FFF) +#define C_028254_BR_X 0xFFFF8000 +#define S_028254_BR_Y(x) (((unsigned)(x) & 0x7FFF) << 16) +#define G_028254_BR_Y(x) (((x) >> 16) & 0x7FFF) +#define C_028254_BR_Y 0x8000FFFF +#define R_0282D0_PA_SC_VPORT_ZMIN_0 0x0282D0 +#define R_0282D4_PA_SC_VPORT_ZMAX_0 0x0282D4 + +#define GET_MAX_SCISSOR(rctx) (rctx->chip_class >= EVERGREEN ? 16384 : 8192) + +static void r600_set_scissor_states(struct pipe_context *ctx, + unsigned start_slot, + unsigned num_scissors, + const struct pipe_scissor_state *state) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + int i; + + for (i = 0; i < num_scissors; i++) + rctx->scissors.states[start_slot + i] = state[i]; + + if (!rctx->scissor_enabled) + return; + + rctx->scissors.dirty_mask |= ((1 << num_scissors) - 1) << start_slot; + rctx->set_atom_dirty(rctx, &rctx->scissors.atom, true); +} + +/* Since the guard band disables clipping, we have to clip per-pixel + * using a scissor. + */ +static void r600_get_scissor_from_viewport(struct r600_common_context *rctx, + const struct pipe_viewport_state *vp, + struct r600_signed_scissor *scissor) +{ + float tmp, minx, miny, maxx, maxy; + + /* Convert (-1, -1) and (1, 1) from clip space into window space. */ + minx = -vp->scale[0] + vp->translate[0]; + miny = -vp->scale[1] + vp->translate[1]; + maxx = vp->scale[0] + vp->translate[0]; + maxy = vp->scale[1] + vp->translate[1]; + + /* r600_draw_rectangle sets this. Disable the scissor. */ + if (minx == -1 && miny == -1 && maxx == 1 && maxy == 1) { + scissor->minx = scissor->miny = 0; + scissor->maxx = scissor->maxy = GET_MAX_SCISSOR(rctx); + return; + } + + /* Handle inverted viewports. */ + if (minx > maxx) { + tmp = minx; + minx = maxx; + maxx = tmp; + } + if (miny > maxy) { + tmp = miny; + miny = maxy; + maxy = tmp; + } + + /* Convert to integer and round up the max bounds. */ + scissor->minx = minx; + scissor->miny = miny; + scissor->maxx = ceilf(maxx); + scissor->maxy = ceilf(maxy); +} + +static void r600_clamp_scissor(struct r600_common_context *rctx, + struct pipe_scissor_state *out, + struct r600_signed_scissor *scissor) +{ + unsigned max_scissor = GET_MAX_SCISSOR(rctx); + out->minx = CLAMP(scissor->minx, 0, max_scissor); + out->miny = CLAMP(scissor->miny, 0, max_scissor); + out->maxx = CLAMP(scissor->maxx, 0, max_scissor); + out->maxy = CLAMP(scissor->maxy, 0, max_scissor); +} + +static void r600_clip_scissor(struct pipe_scissor_state *out, + struct pipe_scissor_state *clip) +{ + out->minx = MAX2(out->minx, clip->minx); + out->miny = MAX2(out->miny, clip->miny); + out->maxx = MIN2(out->maxx, clip->maxx); + out->maxy = MIN2(out->maxy, clip->maxy); +} + +static void r600_scissor_make_union(struct r600_signed_scissor *out, + struct r600_signed_scissor *in) +{ + out->minx = MIN2(out->minx, in->minx); + out->miny = MIN2(out->miny, in->miny); + out->maxx = MAX2(out->maxx, in->maxx); + out->maxy = MAX2(out->maxy, in->maxy); +} + +void evergreen_apply_scissor_bug_workaround(struct r600_common_context *rctx, + struct pipe_scissor_state *scissor) +{ + if (rctx->chip_class == EVERGREEN || rctx->chip_class == CAYMAN) { + if (scissor->maxx == 0) + scissor->minx = 1; + if (scissor->maxy == 0) + scissor->miny = 1; + + if (rctx->chip_class == CAYMAN && + scissor->maxx == 1 && scissor->maxy == 1) + scissor->maxx = 2; + } +} + +static void r600_emit_one_scissor(struct r600_common_context *rctx, + struct radeon_winsys_cs *cs, + struct r600_signed_scissor *vp_scissor, + struct pipe_scissor_state *scissor) +{ + struct pipe_scissor_state final; + + if (rctx->vs_disables_clipping_viewport) { + final.minx = final.miny = 0; + final.maxx = final.maxy = GET_MAX_SCISSOR(rctx); + } else { + r600_clamp_scissor(rctx, &final, vp_scissor); + } + + if (scissor) + r600_clip_scissor(&final, scissor); + + evergreen_apply_scissor_bug_workaround(rctx, &final); + + radeon_emit(cs, S_028250_TL_X(final.minx) | + S_028250_TL_Y(final.miny) | + S_028250_WINDOW_OFFSET_DISABLE(1)); + radeon_emit(cs, S_028254_BR_X(final.maxx) | + S_028254_BR_Y(final.maxy)); +} + +/* the range is [-MAX, MAX] */ +#define GET_MAX_VIEWPORT_RANGE(rctx) (rctx->chip_class >= EVERGREEN ? 32768 : 16384) + +static void r600_emit_guardband(struct r600_common_context *rctx, + struct r600_signed_scissor *vp_as_scissor) +{ + struct radeon_winsys_cs *cs = rctx->gfx.cs; + struct pipe_viewport_state vp; + float left, top, right, bottom, max_range, guardband_x, guardband_y; + + /* Reconstruct the viewport transformation from the scissor. */ + vp.translate[0] = (vp_as_scissor->minx + vp_as_scissor->maxx) / 2.0; + vp.translate[1] = (vp_as_scissor->miny + vp_as_scissor->maxy) / 2.0; + vp.scale[0] = vp_as_scissor->maxx - vp.translate[0]; + vp.scale[1] = vp_as_scissor->maxy - vp.translate[1]; + + /* Treat a 0x0 viewport as 1x1 to prevent division by zero. */ + if (vp_as_scissor->minx == vp_as_scissor->maxx) + vp.scale[0] = 0.5; + if (vp_as_scissor->miny == vp_as_scissor->maxy) + vp.scale[1] = 0.5; + + /* Find the biggest guard band that is inside the supported viewport + * range. The guard band is specified as a horizontal and vertical + * distance from (0,0) in clip space. + * + * This is done by applying the inverse viewport transformation + * on the viewport limits to get those limits in clip space. + * + * Use a limit one pixel smaller to allow for some precision error. + */ + max_range = GET_MAX_VIEWPORT_RANGE(rctx) - 1; + left = (-max_range - vp.translate[0]) / vp.scale[0]; + right = ( max_range - vp.translate[0]) / vp.scale[0]; + top = (-max_range - vp.translate[1]) / vp.scale[1]; + bottom = ( max_range - vp.translate[1]) / vp.scale[1]; + + assert(left <= -1 && top <= -1 && right >= 1 && bottom >= 1); + + guardband_x = MIN2(-left, right); + guardband_y = MIN2(-top, bottom); + + /* If any of the GB registers is updated, all of them must be updated. */ + if (rctx->chip_class >= CAYMAN) + radeon_set_context_reg_seq(cs, CM_R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, 4); + else + radeon_set_context_reg_seq(cs, R600_R_028C0C_PA_CL_GB_VERT_CLIP_ADJ, 4); + + radeon_emit(cs, fui(guardband_y)); /* R_028BE8_PA_CL_GB_VERT_CLIP_ADJ */ + radeon_emit(cs, fui(1.0)); /* R_028BEC_PA_CL_GB_VERT_DISC_ADJ */ + radeon_emit(cs, fui(guardband_x)); /* R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ */ + radeon_emit(cs, fui(1.0)); /* R_028BF4_PA_CL_GB_HORZ_DISC_ADJ */ +} + +static void r600_emit_scissors(struct r600_common_context *rctx, struct r600_atom *atom) +{ + struct radeon_winsys_cs *cs = rctx->gfx.cs; + struct pipe_scissor_state *states = rctx->scissors.states; + unsigned mask = rctx->scissors.dirty_mask; + bool scissor_enabled = rctx->scissor_enabled; + struct r600_signed_scissor max_vp_scissor; + int i; + + /* The simple case: Only 1 viewport is active. */ + if (!rctx->vs_writes_viewport_index) { + struct r600_signed_scissor *vp = &rctx->viewports.as_scissor[0]; + + if (!(mask & 1)) + return; + + radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, 2); + r600_emit_one_scissor(rctx, cs, vp, scissor_enabled ? &states[0] : NULL); + r600_emit_guardband(rctx, vp); + rctx->scissors.dirty_mask &= ~1; /* clear one bit */ + return; + } + + /* Shaders can draw to any viewport. Make a union of all viewports. */ + max_vp_scissor = rctx->viewports.as_scissor[0]; + for (i = 1; i < R600_MAX_VIEWPORTS; i++) + r600_scissor_make_union(&max_vp_scissor, + &rctx->viewports.as_scissor[i]); + + while (mask) { + int start, count, i; + + u_bit_scan_consecutive_range(&mask, &start, &count); + + radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL + + start * 4 * 2, count * 2); + for (i = start; i < start+count; i++) { + r600_emit_one_scissor(rctx, cs, &rctx->viewports.as_scissor[i], + scissor_enabled ? &states[i] : NULL); + } + } + r600_emit_guardband(rctx, &max_vp_scissor); + rctx->scissors.dirty_mask = 0; +} + +static void r600_set_viewport_states(struct pipe_context *ctx, + unsigned start_slot, + unsigned num_viewports, + const struct pipe_viewport_state *state) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + unsigned mask; + int i; + + for (i = 0; i < num_viewports; i++) { + unsigned index = start_slot + i; + + rctx->viewports.states[index] = state[i]; + r600_get_scissor_from_viewport(rctx, &state[i], + &rctx->viewports.as_scissor[index]); + } + + mask = ((1 << num_viewports) - 1) << start_slot; + rctx->viewports.dirty_mask |= mask; + rctx->viewports.depth_range_dirty_mask |= mask; + rctx->scissors.dirty_mask |= mask; + rctx->set_atom_dirty(rctx, &rctx->viewports.atom, true); + rctx->set_atom_dirty(rctx, &rctx->scissors.atom, true); +} + +static void r600_emit_one_viewport(struct r600_common_context *rctx, + struct pipe_viewport_state *state) +{ + struct radeon_winsys_cs *cs = rctx->gfx.cs; + + radeon_emit(cs, fui(state->scale[0])); + radeon_emit(cs, fui(state->translate[0])); + radeon_emit(cs, fui(state->scale[1])); + radeon_emit(cs, fui(state->translate[1])); + radeon_emit(cs, fui(state->scale[2])); + radeon_emit(cs, fui(state->translate[2])); +} + +static void r600_emit_viewports(struct r600_common_context *rctx) +{ + struct radeon_winsys_cs *cs = rctx->gfx.cs; + struct pipe_viewport_state *states = rctx->viewports.states; + unsigned mask = rctx->viewports.dirty_mask; + + /* The simple case: Only 1 viewport is active. */ + if (!rctx->vs_writes_viewport_index) { + if (!(mask & 1)) + return; + + radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE, 6); + r600_emit_one_viewport(rctx, &states[0]); + rctx->viewports.dirty_mask &= ~1; /* clear one bit */ + return; + } + + while (mask) { + int start, count, i; + + u_bit_scan_consecutive_range(&mask, &start, &count); + + radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE + + start * 4 * 6, count * 6); + for (i = start; i < start+count; i++) + r600_emit_one_viewport(rctx, &states[i]); + } + rctx->viewports.dirty_mask = 0; +} + +static void r600_emit_depth_ranges(struct r600_common_context *rctx) +{ + struct radeon_winsys_cs *cs = rctx->gfx.cs; + struct pipe_viewport_state *states = rctx->viewports.states; + unsigned mask = rctx->viewports.depth_range_dirty_mask; + float zmin, zmax; + + /* The simple case: Only 1 viewport is active. */ + if (!rctx->vs_writes_viewport_index) { + if (!(mask & 1)) + return; + + util_viewport_zmin_zmax(&states[0], rctx->clip_halfz, &zmin, &zmax); + + radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0, 2); + radeon_emit(cs, fui(zmin)); + radeon_emit(cs, fui(zmax)); + rctx->viewports.depth_range_dirty_mask &= ~1; /* clear one bit */ + return; + } + + while (mask) { + int start, count, i; + + u_bit_scan_consecutive_range(&mask, &start, &count); + + radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0 + + start * 4 * 2, count * 2); + for (i = start; i < start+count; i++) { + util_viewport_zmin_zmax(&states[i], rctx->clip_halfz, &zmin, &zmax); + radeon_emit(cs, fui(zmin)); + radeon_emit(cs, fui(zmax)); + } + } + rctx->viewports.depth_range_dirty_mask = 0; +} + +static void r600_emit_viewport_states(struct r600_common_context *rctx, + struct r600_atom *atom) +{ + r600_emit_viewports(rctx); + r600_emit_depth_ranges(rctx); +} + +/* Set viewport dependencies on pipe_rasterizer_state. */ +void r600_viewport_set_rast_deps(struct r600_common_context *rctx, + bool scissor_enable, bool clip_halfz) +{ + if (rctx->scissor_enabled != scissor_enable) { + rctx->scissor_enabled = scissor_enable; + rctx->scissors.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1; + rctx->set_atom_dirty(rctx, &rctx->scissors.atom, true); + } + if (rctx->clip_halfz != clip_halfz) { + rctx->clip_halfz = clip_halfz; + rctx->viewports.depth_range_dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1; + rctx->set_atom_dirty(rctx, &rctx->viewports.atom, true); + } +} + +/** + * Normally, we only emit 1 viewport and 1 scissor if no shader is using + * the VIEWPORT_INDEX output, and emitting the other viewports and scissors + * is delayed. When a shader with VIEWPORT_INDEX appears, this should be + * called to emit the rest. + */ +void r600_update_vs_writes_viewport_index(struct r600_common_context *rctx, + struct tgsi_shader_info *info) +{ + bool vs_window_space; + + if (!info) + return; + + /* When the VS disables clipping and viewport transformation. */ + vs_window_space = + info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; + + if (rctx->vs_disables_clipping_viewport != vs_window_space) { + rctx->vs_disables_clipping_viewport = vs_window_space; + rctx->scissors.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1; + rctx->set_atom_dirty(rctx, &rctx->scissors.atom, true); + } + + /* Viewport index handling. */ + rctx->vs_writes_viewport_index = info->writes_viewport_index; + if (!rctx->vs_writes_viewport_index) + return; + + if (rctx->scissors.dirty_mask) + rctx->set_atom_dirty(rctx, &rctx->scissors.atom, true); + + if (rctx->viewports.dirty_mask || + rctx->viewports.depth_range_dirty_mask) + rctx->set_atom_dirty(rctx, &rctx->viewports.atom, true); +} + +void r600_init_viewport_functions(struct r600_common_context *rctx) +{ + rctx->scissors.atom.emit = r600_emit_scissors; + rctx->viewports.atom.emit = r600_emit_viewport_states; + + rctx->scissors.atom.num_dw = (2 + 16 * 2) + 6; + rctx->viewports.atom.num_dw = 2 + 16 * 6; + + rctx->b.set_scissor_states = r600_set_scissor_states; + rctx->b.set_viewport_states = r600_set_viewport_states; +} diff --git a/lib/mesa/src/gallium/drivers/r600/r600d_common.h b/lib/mesa/src/gallium/drivers/r600/r600d_common.h new file mode 100644 index 000000000..ed1d46076 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/r600/r600d_common.h @@ -0,0 +1,135 @@ +/* + * Copyright 2013 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: Marek Olšák <maraeo@gmail.com> + */ + +#ifndef R600D_COMMON_H +#define R600D_COMMON_H + +#define R600_CONFIG_REG_OFFSET 0x08000 +#define R600_CONTEXT_REG_OFFSET 0x28000 +#define SI_SH_REG_OFFSET 0x0000B000 +#define SI_SH_REG_END 0x0000C000 +#define CIK_UCONFIG_REG_OFFSET 0x00030000 +#define CIK_UCONFIG_REG_END 0x00038000 + +#define PKT_TYPE_S(x) (((unsigned)(x) & 0x3) << 30) +#define PKT_COUNT_S(x) (((unsigned)(x) & 0x3FFF) << 16) +#define PKT3_IT_OPCODE_S(x) (((unsigned)(x) & 0xFF) << 8) +#define PKT3_PREDICATE(x) (((x) >> 0) & 0x1) +#define PKT3(op, count, predicate) (PKT_TYPE_S(3) | PKT_COUNT_S(count) | PKT3_IT_OPCODE_S(op) | PKT3_PREDICATE(predicate)) + +#define PKT3_NOP 0x10 +#define PKT3_SET_PREDICATION 0x20 +#define PKT3_STRMOUT_BUFFER_UPDATE 0x34 +#define STRMOUT_STORE_BUFFER_FILLED_SIZE 1 +#define STRMOUT_OFFSET_SOURCE(x) (((unsigned)(x) & 0x3) << 1) +#define STRMOUT_OFFSET_FROM_PACKET 0 +#define STRMOUT_OFFSET_FROM_VGT_FILLED_SIZE 1 +#define STRMOUT_OFFSET_FROM_MEM 2 +#define STRMOUT_OFFSET_NONE 3 +#define STRMOUT_SELECT_BUFFER(x) (((unsigned)(x) & 0x3) << 8) +#define PKT3_WAIT_REG_MEM 0x3C +#define WAIT_REG_MEM_EQUAL 3 +#define WAIT_REG_MEM_MEM_SPACE(x) (((unsigned)(x) & 0x3) << 4) +#define PKT3_COPY_DATA 0x40 +#define COPY_DATA_SRC_SEL(x) ((x) & 0xf) +#define COPY_DATA_REG 0 +#define COPY_DATA_MEM 1 +#define COPY_DATA_PERF 4 +#define COPY_DATA_IMM 5 +#define COPY_DATA_TIMESTAMP 9 +#define COPY_DATA_DST_SEL(x) (((unsigned)(x) & 0xf) << 8) +#define COPY_DATA_MEM_ASYNC 5 +#define COPY_DATA_COUNT_SEL (1 << 16) +#define COPY_DATA_WR_CONFIRM (1 << 20) +#define PKT3_EVENT_WRITE 0x46 +#define PKT3_EVENT_WRITE_EOP 0x47 +#define EOP_INT_SEL(x) ((x) << 24) +#define EOP_INT_SEL_NONE 0 +#define EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM 3 +#define EOP_DATA_SEL(x) ((x) << 29) +#define EOP_DATA_SEL_DISCARD 0 +#define EOP_DATA_SEL_VALUE_32BIT 1 +#define EOP_DATA_SEL_VALUE_64BIT 2 +#define EOP_DATA_SEL_TIMESTAMP 3 +#define PKT3_RELEASE_MEM 0x49 /* GFX9+ */ +#define PKT3_SET_CONFIG_REG 0x68 +#define PKT3_SET_CONTEXT_REG 0x69 +#define PKT3_STRMOUT_BASE_UPDATE 0x72 /* r700 only */ +#define PKT3_SURFACE_BASE_UPDATE 0x73 /* r600 only */ +#define SURFACE_BASE_UPDATE_DEPTH (1 << 0) +#define SURFACE_BASE_UPDATE_COLOR(x) (2 << (x)) +#define SURFACE_BASE_UPDATE_COLOR_NUM(x) (((1 << x) - 1) << 1) +#define SURFACE_BASE_UPDATE_STRMOUT(x) (0x200 << (x)) +#define PKT3_SET_SH_REG 0x76 /* SI and later */ +#define PKT3_SET_UCONFIG_REG 0x79 /* CIK and later */ + +#define EVENT_TYPE_SAMPLE_STREAMOUTSTATS1 0x1 /* EG and later */ +#define EVENT_TYPE_SAMPLE_STREAMOUTSTATS2 0x2 /* EG and later */ +#define EVENT_TYPE_SAMPLE_STREAMOUTSTATS3 0x3 /* EG and later */ +#define EVENT_TYPE_PS_PARTIAL_FLUSH 0x10 +#define EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT 0x14 +#define EVENT_TYPE_ZPASS_DONE 0x15 +#define EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT 0x16 +#define EVENT_TYPE_PERFCOUNTER_START 0x17 +#define EVENT_TYPE_PERFCOUNTER_STOP 0x18 +#define EVENT_TYPE_PIPELINESTAT_START 25 +#define EVENT_TYPE_PIPELINESTAT_STOP 26 +#define EVENT_TYPE_PERFCOUNTER_SAMPLE 0x1B +#define EVENT_TYPE_SAMPLE_PIPELINESTAT 30 +#define EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH 0x1f +#define EVENT_TYPE_SAMPLE_STREAMOUTSTATS 0x20 +#define EVENT_TYPE_BOTTOM_OF_PIPE_TS 40 +#define EVENT_TYPE_FLUSH_AND_INV_DB_META 0x2c /* supported on r700+ */ +#define EVENT_TYPE_FLUSH_AND_INV_CB_META 46 /* supported on r700+ */ +#define EVENT_TYPE(x) ((x) << 0) +#define EVENT_INDEX(x) ((x) << 8) + /* 0 - any non-TS event + * 1 - ZPASS_DONE + * 2 - SAMPLE_PIPELINESTAT + * 3 - SAMPLE_STREAMOUTSTAT* + * 4 - *S_PARTIAL_FLUSH + * 5 - TS events + */ + +#define PREDICATION_OP_CLEAR 0x0 +#define PREDICATION_OP_ZPASS 0x1 +#define PREDICATION_OP_PRIMCOUNT 0x2 +#define PREDICATION_OP_BOOL64 0x3 +#define PRED_OP(x) ((x) << 16) +#define PREDICATION_CONTINUE (1 << 31) +#define PREDICATION_HINT_WAIT (0 << 12) +#define PREDICATION_HINT_NOWAIT_DRAW (1 << 12) +#define PREDICATION_DRAW_NOT_VISIBLE (0 << 8) +#define PREDICATION_DRAW_VISIBLE (1 << 8) + +#define V_0280A0_SWAP_STD 0x00000000 +#define V_0280A0_SWAP_ALT 0x00000001 +#define V_0280A0_SWAP_STD_REV 0x00000002 +#define V_0280A0_SWAP_ALT_REV 0x00000003 + +#define EG_S_028C70_FAST_CLEAR(x) (((unsigned)(x) & 0x1) << 17) +#define SI_S_028C70_FAST_CLEAR(x) (((unsigned)(x) & 0x1) << 13) + +#endif diff --git a/lib/mesa/src/gallium/drivers/r600/radeon_uvd.c b/lib/mesa/src/gallium/drivers/r600/radeon_uvd.c new file mode 100644 index 000000000..b0551d7e1 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/r600/radeon_uvd.c @@ -0,0 +1,1492 @@ +/************************************************************************** + * + * Copyright 2011 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* + * Authors: + * Christian König <christian.koenig@amd.com> + * + */ + +#include <sys/types.h> +#include <assert.h> +#include <errno.h> +#include <unistd.h> +#include <stdio.h> + +#include "pipe/p_video_codec.h" + +#include "util/u_memory.h" +#include "util/u_video.h" + +#include "vl/vl_defines.h" +#include "vl/vl_mpeg12_decoder.h" + +#include "r600_pipe_common.h" +#include "radeon_video.h" +#include "radeon_uvd.h" + +#define NUM_BUFFERS 4 + +#define NUM_MPEG2_REFS 6 +#define NUM_H264_REFS 17 +#define NUM_VC1_REFS 5 + +#define FB_BUFFER_OFFSET 0x1000 +#define FB_BUFFER_SIZE 2048 +#define FB_BUFFER_SIZE_TONGA (2048 * 64) +#define IT_SCALING_TABLE_SIZE 992 +#define UVD_SESSION_CONTEXT_SIZE (128 * 1024) + +/* UVD decoder representation */ +struct ruvd_decoder { + struct pipe_video_codec base; + + ruvd_set_dtb set_dtb; + + unsigned stream_handle; + unsigned stream_type; + unsigned frame_number; + + struct pipe_screen *screen; + struct radeon_winsys* ws; + struct radeon_winsys_cs* cs; + + unsigned cur_buffer; + + struct rvid_buffer msg_fb_it_buffers[NUM_BUFFERS]; + struct ruvd_msg *msg; + uint32_t *fb; + unsigned fb_size; + uint8_t *it; + + struct rvid_buffer bs_buffers[NUM_BUFFERS]; + void* bs_ptr; + unsigned bs_size; + + struct rvid_buffer dpb; + bool use_legacy; + struct rvid_buffer ctx; + struct rvid_buffer sessionctx; + struct { + unsigned data0; + unsigned data1; + unsigned cmd; + unsigned cntl; + } reg; +}; + +/* flush IB to the hardware */ +static int flush(struct ruvd_decoder *dec, unsigned flags) +{ + return dec->ws->cs_flush(dec->cs, flags, NULL); +} + +/* add a new set register command to the IB */ +static void set_reg(struct ruvd_decoder *dec, unsigned reg, uint32_t val) +{ + radeon_emit(dec->cs, RUVD_PKT0(reg >> 2, 0)); + radeon_emit(dec->cs, val); +} + +/* send a command to the VCPU through the GPCOM registers */ +static void send_cmd(struct ruvd_decoder *dec, unsigned cmd, + struct pb_buffer* buf, uint32_t off, + enum radeon_bo_usage usage, enum radeon_bo_domain domain) +{ + int reloc_idx; + + reloc_idx = dec->ws->cs_add_buffer(dec->cs, buf, usage | RADEON_USAGE_SYNCHRONIZED, + domain, + RADEON_PRIO_UVD); + if (!dec->use_legacy) { + uint64_t addr; + addr = dec->ws->buffer_get_virtual_address(buf); + addr = addr + off; + set_reg(dec, dec->reg.data0, addr); + set_reg(dec, dec->reg.data1, addr >> 32); + } else { + off += dec->ws->buffer_get_reloc_offset(buf); + set_reg(dec, RUVD_GPCOM_VCPU_DATA0, off); + set_reg(dec, RUVD_GPCOM_VCPU_DATA1, reloc_idx * 4); + } + set_reg(dec, dec->reg.cmd, cmd << 1); +} + +/* do the codec needs an IT buffer ?*/ +static bool have_it(struct ruvd_decoder *dec) +{ + return dec->stream_type == RUVD_CODEC_H264_PERF || + dec->stream_type == RUVD_CODEC_H265; +} + +/* map the next available message/feedback/itscaling buffer */ +static void map_msg_fb_it_buf(struct ruvd_decoder *dec) +{ + struct rvid_buffer* buf; + uint8_t *ptr; + + /* grab the current message/feedback buffer */ + buf = &dec->msg_fb_it_buffers[dec->cur_buffer]; + + /* and map it for CPU access */ + ptr = dec->ws->buffer_map(buf->res->buf, dec->cs, PIPE_TRANSFER_WRITE); + + /* calc buffer offsets */ + dec->msg = (struct ruvd_msg *)ptr; + memset(dec->msg, 0, sizeof(*dec->msg)); + + dec->fb = (uint32_t *)(ptr + FB_BUFFER_OFFSET); + if (have_it(dec)) + dec->it = (uint8_t *)(ptr + FB_BUFFER_OFFSET + dec->fb_size); +} + +/* unmap and send a message command to the VCPU */ +static void send_msg_buf(struct ruvd_decoder *dec) +{ + struct rvid_buffer* buf; + + /* ignore the request if message/feedback buffer isn't mapped */ + if (!dec->msg || !dec->fb) + return; + + /* grab the current message buffer */ + buf = &dec->msg_fb_it_buffers[dec->cur_buffer]; + + /* unmap the buffer */ + dec->ws->buffer_unmap(buf->res->buf); + dec->msg = NULL; + dec->fb = NULL; + dec->it = NULL; + + + if (dec->sessionctx.res) + send_cmd(dec, RUVD_CMD_SESSION_CONTEXT_BUFFER, + dec->sessionctx.res->buf, 0, RADEON_USAGE_READWRITE, + RADEON_DOMAIN_VRAM); + + /* and send it to the hardware */ + send_cmd(dec, RUVD_CMD_MSG_BUFFER, buf->res->buf, 0, + RADEON_USAGE_READ, RADEON_DOMAIN_GTT); +} + +/* cycle to the next set of buffers */ +static void next_buffer(struct ruvd_decoder *dec) +{ + ++dec->cur_buffer; + dec->cur_buffer %= NUM_BUFFERS; +} + +/* convert the profile into something UVD understands */ +static uint32_t profile2stream_type(struct ruvd_decoder *dec, unsigned family) +{ + switch (u_reduce_video_profile(dec->base.profile)) { + case PIPE_VIDEO_FORMAT_MPEG4_AVC: + return RUVD_CODEC_H264; + + case PIPE_VIDEO_FORMAT_VC1: + return RUVD_CODEC_VC1; + + case PIPE_VIDEO_FORMAT_MPEG12: + return RUVD_CODEC_MPEG2; + + case PIPE_VIDEO_FORMAT_MPEG4: + return RUVD_CODEC_MPEG4; + + case PIPE_VIDEO_FORMAT_HEVC: + return RUVD_CODEC_H265; + + case PIPE_VIDEO_FORMAT_JPEG: + return RUVD_CODEC_MJPEG; + + default: + assert(0); + return 0; + } +} + +static unsigned calc_ctx_size_h265_main(struct ruvd_decoder *dec) +{ + unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH); + unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT); + + unsigned max_references = dec->base.max_references + 1; + + if (dec->base.width * dec->base.height >= 4096*2000) + max_references = MAX2(max_references, 8); + else + max_references = MAX2(max_references, 17); + + width = align (width, 16); + height = align (height, 16); + return ((width + 255) / 16) * ((height + 255) / 16) * 16 * max_references + 52 * 1024; +} + +static unsigned calc_ctx_size_h265_main10(struct ruvd_decoder *dec, struct pipe_h265_picture_desc *pic) +{ + unsigned block_size, log2_ctb_size, width_in_ctb, height_in_ctb, num_16x16_block_per_ctb; + unsigned context_buffer_size_per_ctb_row, cm_buffer_size, max_mb_address, db_left_tile_pxl_size; + unsigned db_left_tile_ctx_size = 4096 / 16 * (32 + 16 * 4); + + unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH); + unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT); + unsigned coeff_10bit = (pic->pps->sps->bit_depth_luma_minus8 || pic->pps->sps->bit_depth_chroma_minus8) ? 2 : 1; + + unsigned max_references = dec->base.max_references + 1; + + if (dec->base.width * dec->base.height >= 4096*2000) + max_references = MAX2(max_references, 8); + else + max_references = MAX2(max_references, 17); + + block_size = (1 << (pic->pps->sps->log2_min_luma_coding_block_size_minus3 + 3)); + log2_ctb_size = block_size + pic->pps->sps->log2_diff_max_min_luma_coding_block_size; + + width_in_ctb = (width + ((1 << log2_ctb_size) - 1)) >> log2_ctb_size; + height_in_ctb = (height + ((1 << log2_ctb_size) - 1)) >> log2_ctb_size; + + num_16x16_block_per_ctb = ((1 << log2_ctb_size) >> 4) * ((1 << log2_ctb_size) >> 4); + context_buffer_size_per_ctb_row = align(width_in_ctb * num_16x16_block_per_ctb * 16, 256); + max_mb_address = (unsigned) ceil(height * 8 / 2048.0); + + cm_buffer_size = max_references * context_buffer_size_per_ctb_row * height_in_ctb; + db_left_tile_pxl_size = coeff_10bit * (max_mb_address * 2 * 2048 + 1024); + + return cm_buffer_size + db_left_tile_ctx_size + db_left_tile_pxl_size; +} + +static unsigned get_db_pitch_alignment(struct ruvd_decoder *dec) +{ + return 16; +} + +/* calculate size of reference picture buffer */ +static unsigned calc_dpb_size(struct ruvd_decoder *dec) +{ + unsigned width_in_mb, height_in_mb, image_size, dpb_size; + + // always align them to MB size for dpb calculation + unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH); + unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT); + + // always one more for currently decoded picture + unsigned max_references = dec->base.max_references + 1; + + // aligned size of a single frame + image_size = align(width, get_db_pitch_alignment(dec)) * height; + image_size += image_size / 2; + image_size = align(image_size, 1024); + + // picture width & height in 16 pixel units + width_in_mb = width / VL_MACROBLOCK_WIDTH; + height_in_mb = align(height / VL_MACROBLOCK_HEIGHT, 2); + + switch (u_reduce_video_profile(dec->base.profile)) { + case PIPE_VIDEO_FORMAT_MPEG4_AVC: { + if (!dec->use_legacy) { + unsigned fs_in_mb = width_in_mb * height_in_mb; + unsigned alignment = 64, num_dpb_buffer; + + if (dec->stream_type == RUVD_CODEC_H264_PERF) + alignment = 256; + switch(dec->base.level) { + case 30: + num_dpb_buffer = 8100 / fs_in_mb; + break; + case 31: + num_dpb_buffer = 18000 / fs_in_mb; + break; + case 32: + num_dpb_buffer = 20480 / fs_in_mb; + break; + case 41: + num_dpb_buffer = 32768 / fs_in_mb; + break; + case 42: + num_dpb_buffer = 34816 / fs_in_mb; + break; + case 50: + num_dpb_buffer = 110400 / fs_in_mb; + break; + case 51: + num_dpb_buffer = 184320 / fs_in_mb; + break; + default: + num_dpb_buffer = 184320 / fs_in_mb; + break; + } + num_dpb_buffer++; + max_references = MAX2(MIN2(NUM_H264_REFS, num_dpb_buffer), max_references); + dpb_size = image_size * max_references; + if ((dec->stream_type != RUVD_CODEC_H264_PERF)) { + dpb_size += max_references * align(width_in_mb * height_in_mb * 192, alignment); + dpb_size += align(width_in_mb * height_in_mb * 32, alignment); + } + } else { + // the firmware seems to allways assume a minimum of ref frames + max_references = MAX2(NUM_H264_REFS, max_references); + // reference picture buffer + dpb_size = image_size * max_references; + if ((dec->stream_type != RUVD_CODEC_H264_PERF)) { + // macroblock context buffer + dpb_size += width_in_mb * height_in_mb * max_references * 192; + // IT surface buffer + dpb_size += width_in_mb * height_in_mb * 32; + } + } + break; + } + + case PIPE_VIDEO_FORMAT_HEVC: + if (dec->base.width * dec->base.height >= 4096*2000) + max_references = MAX2(max_references, 8); + else + max_references = MAX2(max_references, 17); + + width = align (width, 16); + height = align (height, 16); + if (dec->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) + dpb_size = align((align(width, get_db_pitch_alignment(dec)) * height * 9) / 4, 256) * max_references; + else + dpb_size = align((align(width, get_db_pitch_alignment(dec)) * height * 3) / 2, 256) * max_references; + break; + + case PIPE_VIDEO_FORMAT_VC1: + // the firmware seems to allways assume a minimum of ref frames + max_references = MAX2(NUM_VC1_REFS, max_references); + + // reference picture buffer + dpb_size = image_size * max_references; + + // CONTEXT_BUFFER + dpb_size += width_in_mb * height_in_mb * 128; + + // IT surface buffer + dpb_size += width_in_mb * 64; + + // DB surface buffer + dpb_size += width_in_mb * 128; + + // BP + dpb_size += align(MAX2(width_in_mb, height_in_mb) * 7 * 16, 64); + break; + + case PIPE_VIDEO_FORMAT_MPEG12: + // reference picture buffer, must be big enough for all frames + dpb_size = image_size * NUM_MPEG2_REFS; + break; + + case PIPE_VIDEO_FORMAT_MPEG4: + // reference picture buffer + dpb_size = image_size * max_references; + + // CM + dpb_size += width_in_mb * height_in_mb * 64; + + // IT surface buffer + dpb_size += align(width_in_mb * height_in_mb * 32, 64); + + dpb_size = MAX2(dpb_size, 30 * 1024 * 1024); + break; + + case PIPE_VIDEO_FORMAT_JPEG: + dpb_size = 0; + break; + + default: + // something is missing here + assert(0); + + // at least use a sane default value + dpb_size = 32 * 1024 * 1024; + break; + } + return dpb_size; +} + +/* free associated data in the video buffer callback */ +static void ruvd_destroy_associated_data(void *data) +{ + /* NOOP, since we only use an intptr */ +} + +/* get h264 specific message bits */ +static struct ruvd_h264 get_h264_msg(struct ruvd_decoder *dec, struct pipe_h264_picture_desc *pic) +{ + struct ruvd_h264 result; + + memset(&result, 0, sizeof(result)); + switch (pic->base.profile) { + case PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE: + case PIPE_VIDEO_PROFILE_MPEG4_AVC_CONSTRAINED_BASELINE: + result.profile = RUVD_H264_PROFILE_BASELINE; + break; + + case PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN: + result.profile = RUVD_H264_PROFILE_MAIN; + break; + + case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH: + result.profile = RUVD_H264_PROFILE_HIGH; + break; + + default: + assert(0); + break; + } + + result.level = dec->base.level; + + result.sps_info_flags = 0; + result.sps_info_flags |= pic->pps->sps->direct_8x8_inference_flag << 0; + result.sps_info_flags |= pic->pps->sps->mb_adaptive_frame_field_flag << 1; + result.sps_info_flags |= pic->pps->sps->frame_mbs_only_flag << 2; + result.sps_info_flags |= pic->pps->sps->delta_pic_order_always_zero_flag << 3; + + result.bit_depth_luma_minus8 = pic->pps->sps->bit_depth_luma_minus8; + result.bit_depth_chroma_minus8 = pic->pps->sps->bit_depth_chroma_minus8; + result.log2_max_frame_num_minus4 = pic->pps->sps->log2_max_frame_num_minus4; + result.pic_order_cnt_type = pic->pps->sps->pic_order_cnt_type; + result.log2_max_pic_order_cnt_lsb_minus4 = pic->pps->sps->log2_max_pic_order_cnt_lsb_minus4; + + switch (dec->base.chroma_format) { + case PIPE_VIDEO_CHROMA_FORMAT_NONE: + /* TODO: assert? */ + break; + case PIPE_VIDEO_CHROMA_FORMAT_400: + result.chroma_format = 0; + break; + case PIPE_VIDEO_CHROMA_FORMAT_420: + result.chroma_format = 1; + break; + case PIPE_VIDEO_CHROMA_FORMAT_422: + result.chroma_format = 2; + break; + case PIPE_VIDEO_CHROMA_FORMAT_444: + result.chroma_format = 3; + break; + } + + result.pps_info_flags = 0; + result.pps_info_flags |= pic->pps->transform_8x8_mode_flag << 0; + result.pps_info_flags |= pic->pps->redundant_pic_cnt_present_flag << 1; + result.pps_info_flags |= pic->pps->constrained_intra_pred_flag << 2; + result.pps_info_flags |= pic->pps->deblocking_filter_control_present_flag << 3; + result.pps_info_flags |= pic->pps->weighted_bipred_idc << 4; + result.pps_info_flags |= pic->pps->weighted_pred_flag << 6; + result.pps_info_flags |= pic->pps->bottom_field_pic_order_in_frame_present_flag << 7; + result.pps_info_flags |= pic->pps->entropy_coding_mode_flag << 8; + + result.num_slice_groups_minus1 = pic->pps->num_slice_groups_minus1; + result.slice_group_map_type = pic->pps->slice_group_map_type; + result.slice_group_change_rate_minus1 = pic->pps->slice_group_change_rate_minus1; + result.pic_init_qp_minus26 = pic->pps->pic_init_qp_minus26; + result.chroma_qp_index_offset = pic->pps->chroma_qp_index_offset; + result.second_chroma_qp_index_offset = pic->pps->second_chroma_qp_index_offset; + + memcpy(result.scaling_list_4x4, pic->pps->ScalingList4x4, 6*16); + memcpy(result.scaling_list_8x8, pic->pps->ScalingList8x8, 2*64); + + if (dec->stream_type == RUVD_CODEC_H264_PERF) { + memcpy(dec->it, result.scaling_list_4x4, 6*16); + memcpy((dec->it + 96), result.scaling_list_8x8, 2*64); + } + + result.num_ref_frames = pic->num_ref_frames; + + result.num_ref_idx_l0_active_minus1 = pic->num_ref_idx_l0_active_minus1; + result.num_ref_idx_l1_active_minus1 = pic->num_ref_idx_l1_active_minus1; + + result.frame_num = pic->frame_num; + memcpy(result.frame_num_list, pic->frame_num_list, 4*16); + result.curr_field_order_cnt_list[0] = pic->field_order_cnt[0]; + result.curr_field_order_cnt_list[1] = pic->field_order_cnt[1]; + memcpy(result.field_order_cnt_list, pic->field_order_cnt_list, 4*16*2); + + result.decoded_pic_idx = pic->frame_num; + + return result; +} + +/* get h265 specific message bits */ +static struct ruvd_h265 get_h265_msg(struct ruvd_decoder *dec, struct pipe_video_buffer *target, + struct pipe_h265_picture_desc *pic) +{ + struct ruvd_h265 result; + unsigned i; + + memset(&result, 0, sizeof(result)); + + result.sps_info_flags = 0; + result.sps_info_flags |= pic->pps->sps->scaling_list_enabled_flag << 0; + result.sps_info_flags |= pic->pps->sps->amp_enabled_flag << 1; + result.sps_info_flags |= pic->pps->sps->sample_adaptive_offset_enabled_flag << 2; + result.sps_info_flags |= pic->pps->sps->pcm_enabled_flag << 3; + result.sps_info_flags |= pic->pps->sps->pcm_loop_filter_disabled_flag << 4; + result.sps_info_flags |= pic->pps->sps->long_term_ref_pics_present_flag << 5; + result.sps_info_flags |= pic->pps->sps->sps_temporal_mvp_enabled_flag << 6; + result.sps_info_flags |= pic->pps->sps->strong_intra_smoothing_enabled_flag << 7; + result.sps_info_flags |= pic->pps->sps->separate_colour_plane_flag << 8; + if (pic->UseRefPicList == true) + result.sps_info_flags |= 1 << 10; + + result.chroma_format = pic->pps->sps->chroma_format_idc; + result.bit_depth_luma_minus8 = pic->pps->sps->bit_depth_luma_minus8; + result.bit_depth_chroma_minus8 = pic->pps->sps->bit_depth_chroma_minus8; + result.log2_max_pic_order_cnt_lsb_minus4 = pic->pps->sps->log2_max_pic_order_cnt_lsb_minus4; + result.sps_max_dec_pic_buffering_minus1 = pic->pps->sps->sps_max_dec_pic_buffering_minus1; + result.log2_min_luma_coding_block_size_minus3 = pic->pps->sps->log2_min_luma_coding_block_size_minus3; + result.log2_diff_max_min_luma_coding_block_size = pic->pps->sps->log2_diff_max_min_luma_coding_block_size; + result.log2_min_transform_block_size_minus2 = pic->pps->sps->log2_min_transform_block_size_minus2; + result.log2_diff_max_min_transform_block_size = pic->pps->sps->log2_diff_max_min_transform_block_size; + result.max_transform_hierarchy_depth_inter = pic->pps->sps->max_transform_hierarchy_depth_inter; + result.max_transform_hierarchy_depth_intra = pic->pps->sps->max_transform_hierarchy_depth_intra; + result.pcm_sample_bit_depth_luma_minus1 = pic->pps->sps->pcm_sample_bit_depth_luma_minus1; + result.pcm_sample_bit_depth_chroma_minus1 = pic->pps->sps->pcm_sample_bit_depth_chroma_minus1; + result.log2_min_pcm_luma_coding_block_size_minus3 = pic->pps->sps->log2_min_pcm_luma_coding_block_size_minus3; + result.log2_diff_max_min_pcm_luma_coding_block_size = pic->pps->sps->log2_diff_max_min_pcm_luma_coding_block_size; + result.num_short_term_ref_pic_sets = pic->pps->sps->num_short_term_ref_pic_sets; + + result.pps_info_flags = 0; + result.pps_info_flags |= pic->pps->dependent_slice_segments_enabled_flag << 0; + result.pps_info_flags |= pic->pps->output_flag_present_flag << 1; + result.pps_info_flags |= pic->pps->sign_data_hiding_enabled_flag << 2; + result.pps_info_flags |= pic->pps->cabac_init_present_flag << 3; + result.pps_info_flags |= pic->pps->constrained_intra_pred_flag << 4; + result.pps_info_flags |= pic->pps->transform_skip_enabled_flag << 5; + result.pps_info_flags |= pic->pps->cu_qp_delta_enabled_flag << 6; + result.pps_info_flags |= pic->pps->pps_slice_chroma_qp_offsets_present_flag << 7; + result.pps_info_flags |= pic->pps->weighted_pred_flag << 8; + result.pps_info_flags |= pic->pps->weighted_bipred_flag << 9; + result.pps_info_flags |= pic->pps->transquant_bypass_enabled_flag << 10; + result.pps_info_flags |= pic->pps->tiles_enabled_flag << 11; + result.pps_info_flags |= pic->pps->entropy_coding_sync_enabled_flag << 12; + result.pps_info_flags |= pic->pps->uniform_spacing_flag << 13; + result.pps_info_flags |= pic->pps->loop_filter_across_tiles_enabled_flag << 14; + result.pps_info_flags |= pic->pps->pps_loop_filter_across_slices_enabled_flag << 15; + result.pps_info_flags |= pic->pps->deblocking_filter_override_enabled_flag << 16; + result.pps_info_flags |= pic->pps->pps_deblocking_filter_disabled_flag << 17; + result.pps_info_flags |= pic->pps->lists_modification_present_flag << 18; + result.pps_info_flags |= pic->pps->slice_segment_header_extension_present_flag << 19; + //result.pps_info_flags |= pic->pps->deblocking_filter_control_present_flag; ??? + + result.num_extra_slice_header_bits = pic->pps->num_extra_slice_header_bits; + result.num_long_term_ref_pic_sps = pic->pps->sps->num_long_term_ref_pics_sps; + result.num_ref_idx_l0_default_active_minus1 = pic->pps->num_ref_idx_l0_default_active_minus1; + result.num_ref_idx_l1_default_active_minus1 = pic->pps->num_ref_idx_l1_default_active_minus1; + result.pps_cb_qp_offset = pic->pps->pps_cb_qp_offset; + result.pps_cr_qp_offset = pic->pps->pps_cr_qp_offset; + result.pps_beta_offset_div2 = pic->pps->pps_beta_offset_div2; + result.pps_tc_offset_div2 = pic->pps->pps_tc_offset_div2; + result.diff_cu_qp_delta_depth = pic->pps->diff_cu_qp_delta_depth; + result.num_tile_columns_minus1 = pic->pps->num_tile_columns_minus1; + result.num_tile_rows_minus1 = pic->pps->num_tile_rows_minus1; + result.log2_parallel_merge_level_minus2 = pic->pps->log2_parallel_merge_level_minus2; + result.init_qp_minus26 = pic->pps->init_qp_minus26; + + for (i = 0; i < 19; ++i) + result.column_width_minus1[i] = pic->pps->column_width_minus1[i]; + + for (i = 0; i < 21; ++i) + result.row_height_minus1[i] = pic->pps->row_height_minus1[i]; + + result.num_delta_pocs_ref_rps_idx = pic->NumDeltaPocsOfRefRpsIdx; + result.curr_idx = pic->CurrPicOrderCntVal; + result.curr_poc = pic->CurrPicOrderCntVal; + + vl_video_buffer_set_associated_data(target, &dec->base, + (void *)(uintptr_t)pic->CurrPicOrderCntVal, + &ruvd_destroy_associated_data); + + for (i = 0; i < 16; ++i) { + struct pipe_video_buffer *ref = pic->ref[i]; + uintptr_t ref_pic = 0; + + result.poc_list[i] = pic->PicOrderCntVal[i]; + + if (ref) + ref_pic = (uintptr_t)vl_video_buffer_get_associated_data(ref, &dec->base); + else + ref_pic = 0x7F; + result.ref_pic_list[i] = ref_pic; + } + + for (i = 0; i < 8; ++i) { + result.ref_pic_set_st_curr_before[i] = 0xFF; + result.ref_pic_set_st_curr_after[i] = 0xFF; + result.ref_pic_set_lt_curr[i] = 0xFF; + } + + for (i = 0; i < pic->NumPocStCurrBefore; ++i) + result.ref_pic_set_st_curr_before[i] = pic->RefPicSetStCurrBefore[i]; + + for (i = 0; i < pic->NumPocStCurrAfter; ++i) + result.ref_pic_set_st_curr_after[i] = pic->RefPicSetStCurrAfter[i]; + + for (i = 0; i < pic->NumPocLtCurr; ++i) + result.ref_pic_set_lt_curr[i] = pic->RefPicSetLtCurr[i]; + + for (i = 0; i < 6; ++i) + result.ucScalingListDCCoefSizeID2[i] = pic->pps->sps->ScalingListDCCoeff16x16[i]; + + for (i = 0; i < 2; ++i) + result.ucScalingListDCCoefSizeID3[i] = pic->pps->sps->ScalingListDCCoeff32x32[i]; + + memcpy(dec->it, pic->pps->sps->ScalingList4x4, 6 * 16); + memcpy(dec->it + 96, pic->pps->sps->ScalingList8x8, 6 * 64); + memcpy(dec->it + 480, pic->pps->sps->ScalingList16x16, 6 * 64); + memcpy(dec->it + 864, pic->pps->sps->ScalingList32x32, 2 * 64); + + for (i = 0 ; i < 2 ; i++) { + for (int j = 0 ; j < 15 ; j++) + result.direct_reflist[i][j] = pic->RefPicList[i][j]; + } + + if (pic->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) { + if (target->buffer_format == PIPE_FORMAT_P016) { + result.p010_mode = 1; + result.msb_mode = 1; + } else { + result.luma_10to8 = 5; + result.chroma_10to8 = 5; + result.sclr_luma10to8 = 4; + result.sclr_chroma10to8 = 4; + } + } + + /* TODO + result.highestTid; + result.isNonRef; + + IDRPicFlag; + RAPPicFlag; + NumPocTotalCurr; + NumShortTermPictureSliceHeaderBits; + NumLongTermPictureSliceHeaderBits; + + IsLongTerm[16]; + */ + + return result; +} + +/* get vc1 specific message bits */ +static struct ruvd_vc1 get_vc1_msg(struct pipe_vc1_picture_desc *pic) +{ + struct ruvd_vc1 result; + + memset(&result, 0, sizeof(result)); + + switch(pic->base.profile) { + case PIPE_VIDEO_PROFILE_VC1_SIMPLE: + result.profile = RUVD_VC1_PROFILE_SIMPLE; + result.level = 1; + break; + + case PIPE_VIDEO_PROFILE_VC1_MAIN: + result.profile = RUVD_VC1_PROFILE_MAIN; + result.level = 2; + break; + + case PIPE_VIDEO_PROFILE_VC1_ADVANCED: + result.profile = RUVD_VC1_PROFILE_ADVANCED; + result.level = 4; + break; + + default: + assert(0); + } + + /* fields common for all profiles */ + result.sps_info_flags |= pic->postprocflag << 7; + result.sps_info_flags |= pic->pulldown << 6; + result.sps_info_flags |= pic->interlace << 5; + result.sps_info_flags |= pic->tfcntrflag << 4; + result.sps_info_flags |= pic->finterpflag << 3; + result.sps_info_flags |= pic->psf << 1; + + result.pps_info_flags |= pic->range_mapy_flag << 31; + result.pps_info_flags |= pic->range_mapy << 28; + result.pps_info_flags |= pic->range_mapuv_flag << 27; + result.pps_info_flags |= pic->range_mapuv << 24; + result.pps_info_flags |= pic->multires << 21; + result.pps_info_flags |= pic->maxbframes << 16; + result.pps_info_flags |= pic->overlap << 11; + result.pps_info_flags |= pic->quantizer << 9; + result.pps_info_flags |= pic->panscan_flag << 7; + result.pps_info_flags |= pic->refdist_flag << 6; + result.pps_info_flags |= pic->vstransform << 0; + + /* some fields only apply to main/advanced profile */ + if (pic->base.profile != PIPE_VIDEO_PROFILE_VC1_SIMPLE) { + result.pps_info_flags |= pic->syncmarker << 20; + result.pps_info_flags |= pic->rangered << 19; + result.pps_info_flags |= pic->loopfilter << 5; + result.pps_info_flags |= pic->fastuvmc << 4; + result.pps_info_flags |= pic->extended_mv << 3; + result.pps_info_flags |= pic->extended_dmv << 8; + result.pps_info_flags |= pic->dquant << 1; + } + + result.chroma_format = 1; + +#if 0 +//(((unsigned int)(pPicParams->advance.reserved1)) << SPS_INFO_VC1_RESERVED_SHIFT) +uint32_t slice_count +uint8_t picture_type +uint8_t frame_coding_mode +uint8_t deblockEnable +uint8_t pquant +#endif + + return result; +} + +/* extract the frame number from a referenced video buffer */ +static uint32_t get_ref_pic_idx(struct ruvd_decoder *dec, struct pipe_video_buffer *ref) +{ + uint32_t min = MAX2(dec->frame_number, NUM_MPEG2_REFS) - NUM_MPEG2_REFS; + uint32_t max = MAX2(dec->frame_number, 1) - 1; + uintptr_t frame; + + /* seems to be the most sane fallback */ + if (!ref) + return max; + + /* get the frame number from the associated data */ + frame = (uintptr_t)vl_video_buffer_get_associated_data(ref, &dec->base); + + /* limit the frame number to a valid range */ + return MAX2(MIN2(frame, max), min); +} + +/* get mpeg2 specific msg bits */ +static struct ruvd_mpeg2 get_mpeg2_msg(struct ruvd_decoder *dec, + struct pipe_mpeg12_picture_desc *pic) +{ + const int *zscan = pic->alternate_scan ? vl_zscan_alternate : vl_zscan_normal; + struct ruvd_mpeg2 result; + unsigned i; + + memset(&result, 0, sizeof(result)); + result.decoded_pic_idx = dec->frame_number; + for (i = 0; i < 2; ++i) + result.ref_pic_idx[i] = get_ref_pic_idx(dec, pic->ref[i]); + + result.load_intra_quantiser_matrix = 1; + result.load_nonintra_quantiser_matrix = 1; + + for (i = 0; i < 64; ++i) { + result.intra_quantiser_matrix[i] = pic->intra_matrix[zscan[i]]; + result.nonintra_quantiser_matrix[i] = pic->non_intra_matrix[zscan[i]]; + } + + result.profile_and_level_indication = 0; + result.chroma_format = 0x1; + + result.picture_coding_type = pic->picture_coding_type; + result.f_code[0][0] = pic->f_code[0][0] + 1; + result.f_code[0][1] = pic->f_code[0][1] + 1; + result.f_code[1][0] = pic->f_code[1][0] + 1; + result.f_code[1][1] = pic->f_code[1][1] + 1; + result.intra_dc_precision = pic->intra_dc_precision; + result.pic_structure = pic->picture_structure; + result.top_field_first = pic->top_field_first; + result.frame_pred_frame_dct = pic->frame_pred_frame_dct; + result.concealment_motion_vectors = pic->concealment_motion_vectors; + result.q_scale_type = pic->q_scale_type; + result.intra_vlc_format = pic->intra_vlc_format; + result.alternate_scan = pic->alternate_scan; + + return result; +} + +/* get mpeg4 specific msg bits */ +static struct ruvd_mpeg4 get_mpeg4_msg(struct ruvd_decoder *dec, + struct pipe_mpeg4_picture_desc *pic) +{ + struct ruvd_mpeg4 result; + unsigned i; + + memset(&result, 0, sizeof(result)); + result.decoded_pic_idx = dec->frame_number; + for (i = 0; i < 2; ++i) + result.ref_pic_idx[i] = get_ref_pic_idx(dec, pic->ref[i]); + + result.variant_type = 0; + result.profile_and_level_indication = 0xF0; // ASP Level0 + + result.video_object_layer_verid = 0x5; // advanced simple + result.video_object_layer_shape = 0x0; // rectangular + + result.video_object_layer_width = dec->base.width; + result.video_object_layer_height = dec->base.height; + + result.vop_time_increment_resolution = pic->vop_time_increment_resolution; + + result.flags |= pic->short_video_header << 0; + //result.flags |= obmc_disable << 1; + result.flags |= pic->interlaced << 2; + result.flags |= 1 << 3; // load_intra_quant_mat + result.flags |= 1 << 4; // load_nonintra_quant_mat + result.flags |= pic->quarter_sample << 5; + result.flags |= 1 << 6; // complexity_estimation_disable + result.flags |= pic->resync_marker_disable << 7; + //result.flags |= data_partitioned << 8; + //result.flags |= reversible_vlc << 9; + result.flags |= 0 << 10; // newpred_enable + result.flags |= 0 << 11; // reduced_resolution_vop_enable + //result.flags |= scalability << 12; + //result.flags |= is_object_layer_identifier << 13; + //result.flags |= fixed_vop_rate << 14; + //result.flags |= newpred_segment_type << 15; + + result.quant_type = pic->quant_type; + + for (i = 0; i < 64; ++i) { + result.intra_quant_mat[i] = pic->intra_matrix[vl_zscan_normal[i]]; + result.nonintra_quant_mat[i] = pic->non_intra_matrix[vl_zscan_normal[i]]; + } + + /* + int32_t trd [2] + int32_t trb [2] + uint8_t vop_coding_type + uint8_t vop_fcode_forward + uint8_t vop_fcode_backward + uint8_t rounding_control + uint8_t alternate_vertical_scan_flag + uint8_t top_field_first + */ + + return result; +} + +static void get_mjpeg_slice_header(struct ruvd_decoder *dec, struct pipe_mjpeg_picture_desc *pic) +{ + int size = 0, saved_size, len_pos, i; + uint16_t *bs; + uint8_t *buf = dec->bs_ptr; + + /* SOI */ + buf[size++] = 0xff; + buf[size++] = 0xd8; + + /* DQT */ + buf[size++] = 0xff; + buf[size++] = 0xdb; + + len_pos = size++; + size++; + + for (i = 0; i < 4; ++i) { + if (pic->quantization_table.load_quantiser_table[i] == 0) + continue; + + buf[size++] = i; + memcpy((buf + size), &pic->quantization_table.quantiser_table[i], 64); + size += 64; + } + + bs = (uint16_t*)&buf[len_pos]; + *bs = util_bswap16(size - 4); + + saved_size = size; + + /* DHT */ + buf[size++] = 0xff; + buf[size++] = 0xc4; + + len_pos = size++; + size++; + + for (i = 0; i < 2; ++i) { + if (pic->huffman_table.load_huffman_table[i] == 0) + continue; + + buf[size++] = 0x00 | i; + memcpy((buf + size), &pic->huffman_table.table[i].num_dc_codes, 16); + size += 16; + memcpy((buf + size), &pic->huffman_table.table[i].dc_values, 12); + size += 12; + } + + for (i = 0; i < 2; ++i) { + if (pic->huffman_table.load_huffman_table[i] == 0) + continue; + + buf[size++] = 0x10 | i; + memcpy((buf + size), &pic->huffman_table.table[i].num_ac_codes, 16); + size += 16; + memcpy((buf + size), &pic->huffman_table.table[i].ac_values, 162); + size += 162; + } + + bs = (uint16_t*)&buf[len_pos]; + *bs = util_bswap16(size - saved_size - 2); + + saved_size = size; + + /* DRI */ + if (pic->slice_parameter.restart_interval) { + buf[size++] = 0xff; + buf[size++] = 0xdd; + buf[size++] = 0x00; + buf[size++] = 0x04; + bs = (uint16_t*)&buf[size++]; + *bs = util_bswap16(pic->slice_parameter.restart_interval); + saved_size = ++size; + } + + /* SOF */ + buf[size++] = 0xff; + buf[size++] = 0xc0; + + len_pos = size++; + size++; + + buf[size++] = 0x08; + + bs = (uint16_t*)&buf[size++]; + *bs = util_bswap16(pic->picture_parameter.picture_height); + size++; + + bs = (uint16_t*)&buf[size++]; + *bs = util_bswap16(pic->picture_parameter.picture_width); + size++; + + buf[size++] = pic->picture_parameter.num_components; + + for (i = 0; i < pic->picture_parameter.num_components; ++i) { + buf[size++] = pic->picture_parameter.components[i].component_id; + buf[size++] = pic->picture_parameter.components[i].h_sampling_factor << 4 | + pic->picture_parameter.components[i].v_sampling_factor; + buf[size++] = pic->picture_parameter.components[i].quantiser_table_selector; + } + + bs = (uint16_t*)&buf[len_pos]; + *bs = util_bswap16(size - saved_size - 2); + + saved_size = size; + + /* SOS */ + buf[size++] = 0xff; + buf[size++] = 0xda; + + len_pos = size++; + size++; + + buf[size++] = pic->slice_parameter.num_components; + + for (i = 0; i < pic->slice_parameter.num_components; ++i) { + buf[size++] = pic->slice_parameter.components[i].component_selector; + buf[size++] = pic->slice_parameter.components[i].dc_table_selector << 4 | + pic->slice_parameter.components[i].ac_table_selector; + } + + buf[size++] = 0x00; + buf[size++] = 0x3f; + buf[size++] = 0x00; + + bs = (uint16_t*)&buf[len_pos]; + *bs = util_bswap16(size - saved_size - 2); + + dec->bs_ptr += size; + dec->bs_size += size; +} + +/** + * destroy this video decoder + */ +static void ruvd_destroy(struct pipe_video_codec *decoder) +{ + struct ruvd_decoder *dec = (struct ruvd_decoder*)decoder; + unsigned i; + + assert(decoder); + + map_msg_fb_it_buf(dec); + dec->msg->size = sizeof(*dec->msg); + dec->msg->msg_type = RUVD_MSG_DESTROY; + dec->msg->stream_handle = dec->stream_handle; + send_msg_buf(dec); + + flush(dec, 0); + + dec->ws->cs_destroy(dec->cs); + + for (i = 0; i < NUM_BUFFERS; ++i) { + rvid_destroy_buffer(&dec->msg_fb_it_buffers[i]); + rvid_destroy_buffer(&dec->bs_buffers[i]); + } + + rvid_destroy_buffer(&dec->dpb); + rvid_destroy_buffer(&dec->ctx); + rvid_destroy_buffer(&dec->sessionctx); + + FREE(dec); +} + +/** + * start decoding of a new frame + */ +static void ruvd_begin_frame(struct pipe_video_codec *decoder, + struct pipe_video_buffer *target, + struct pipe_picture_desc *picture) +{ + struct ruvd_decoder *dec = (struct ruvd_decoder*)decoder; + uintptr_t frame; + + assert(decoder); + + frame = ++dec->frame_number; + vl_video_buffer_set_associated_data(target, decoder, (void *)frame, + &ruvd_destroy_associated_data); + + dec->bs_size = 0; + dec->bs_ptr = dec->ws->buffer_map( + dec->bs_buffers[dec->cur_buffer].res->buf, + dec->cs, PIPE_TRANSFER_WRITE); +} + +/** + * decode a macroblock + */ +static void ruvd_decode_macroblock(struct pipe_video_codec *decoder, + struct pipe_video_buffer *target, + struct pipe_picture_desc *picture, + const struct pipe_macroblock *macroblocks, + unsigned num_macroblocks) +{ + /* not supported (yet) */ + assert(0); +} + +/** + * decode a bitstream + */ +static void ruvd_decode_bitstream(struct pipe_video_codec *decoder, + struct pipe_video_buffer *target, + struct pipe_picture_desc *picture, + unsigned num_buffers, + const void * const *buffers, + const unsigned *sizes) +{ + struct ruvd_decoder *dec = (struct ruvd_decoder*)decoder; + enum pipe_video_format format = u_reduce_video_profile(picture->profile); + unsigned i; + + assert(decoder); + + if (!dec->bs_ptr) + return; + + if (format == PIPE_VIDEO_FORMAT_JPEG) + get_mjpeg_slice_header(dec, (struct pipe_mjpeg_picture_desc*)picture); + + for (i = 0; i < num_buffers; ++i) { + struct rvid_buffer *buf = &dec->bs_buffers[dec->cur_buffer]; + unsigned new_size = dec->bs_size + sizes[i]; + + if (format == PIPE_VIDEO_FORMAT_JPEG) + new_size += 2; /* save for EOI */ + + if (new_size > buf->res->buf->size) { + dec->ws->buffer_unmap(buf->res->buf); + if (!rvid_resize_buffer(dec->screen, dec->cs, buf, new_size)) { + RVID_ERR("Can't resize bitstream buffer!"); + return; + } + + dec->bs_ptr = dec->ws->buffer_map(buf->res->buf, dec->cs, + PIPE_TRANSFER_WRITE); + if (!dec->bs_ptr) + return; + + dec->bs_ptr += dec->bs_size; + } + + memcpy(dec->bs_ptr, buffers[i], sizes[i]); + dec->bs_size += sizes[i]; + dec->bs_ptr += sizes[i]; + } + + if (format == PIPE_VIDEO_FORMAT_JPEG) { + ((uint8_t *)dec->bs_ptr)[0] = 0xff; /* EOI */ + ((uint8_t *)dec->bs_ptr)[1] = 0xd9; + dec->bs_size += 2; + dec->bs_ptr += 2; + } +} + +/** + * end decoding of the current frame + */ +static void ruvd_end_frame(struct pipe_video_codec *decoder, + struct pipe_video_buffer *target, + struct pipe_picture_desc *picture) +{ + struct ruvd_decoder *dec = (struct ruvd_decoder*)decoder; + struct pb_buffer *dt; + struct rvid_buffer *msg_fb_it_buf, *bs_buf; + unsigned bs_size; + + assert(decoder); + + if (!dec->bs_ptr) + return; + + msg_fb_it_buf = &dec->msg_fb_it_buffers[dec->cur_buffer]; + bs_buf = &dec->bs_buffers[dec->cur_buffer]; + + bs_size = align(dec->bs_size, 128); + memset(dec->bs_ptr, 0, bs_size - dec->bs_size); + dec->ws->buffer_unmap(bs_buf->res->buf); + + map_msg_fb_it_buf(dec); + dec->msg->size = sizeof(*dec->msg); + dec->msg->msg_type = RUVD_MSG_DECODE; + dec->msg->stream_handle = dec->stream_handle; + dec->msg->status_report_feedback_number = dec->frame_number; + + dec->msg->body.decode.stream_type = dec->stream_type; + dec->msg->body.decode.decode_flags = 0x1; + dec->msg->body.decode.width_in_samples = dec->base.width; + dec->msg->body.decode.height_in_samples = dec->base.height; + + if ((picture->profile == PIPE_VIDEO_PROFILE_VC1_SIMPLE) || + (picture->profile == PIPE_VIDEO_PROFILE_VC1_MAIN)) { + dec->msg->body.decode.width_in_samples = align(dec->msg->body.decode.width_in_samples, 16) / 16; + dec->msg->body.decode.height_in_samples = align(dec->msg->body.decode.height_in_samples, 16) / 16; + } + + if (dec->dpb.res) + dec->msg->body.decode.dpb_size = dec->dpb.res->buf->size; + dec->msg->body.decode.bsd_size = bs_size; + dec->msg->body.decode.db_pitch = align(dec->base.width, get_db_pitch_alignment(dec)); + + dt = dec->set_dtb(dec->msg, (struct vl_video_buffer *)target); + + switch (u_reduce_video_profile(picture->profile)) { + case PIPE_VIDEO_FORMAT_MPEG4_AVC: + dec->msg->body.decode.codec.h264 = get_h264_msg(dec, (struct pipe_h264_picture_desc*)picture); + break; + + case PIPE_VIDEO_FORMAT_HEVC: + dec->msg->body.decode.codec.h265 = get_h265_msg(dec, target, (struct pipe_h265_picture_desc*)picture); + if (dec->ctx.res == NULL) { + unsigned ctx_size; + if (dec->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) + ctx_size = calc_ctx_size_h265_main10(dec, (struct pipe_h265_picture_desc*)picture); + else + ctx_size = calc_ctx_size_h265_main(dec); + if (!rvid_create_buffer(dec->screen, &dec->ctx, ctx_size, PIPE_USAGE_DEFAULT)) { + RVID_ERR("Can't allocated context buffer.\n"); + } + rvid_clear_buffer(decoder->context, &dec->ctx); + } + + if (dec->ctx.res) + dec->msg->body.decode.dpb_reserved = dec->ctx.res->buf->size; + break; + + case PIPE_VIDEO_FORMAT_VC1: + dec->msg->body.decode.codec.vc1 = get_vc1_msg((struct pipe_vc1_picture_desc*)picture); + break; + + case PIPE_VIDEO_FORMAT_MPEG12: + dec->msg->body.decode.codec.mpeg2 = get_mpeg2_msg(dec, (struct pipe_mpeg12_picture_desc*)picture); + break; + + case PIPE_VIDEO_FORMAT_MPEG4: + dec->msg->body.decode.codec.mpeg4 = get_mpeg4_msg(dec, (struct pipe_mpeg4_picture_desc*)picture); + break; + + case PIPE_VIDEO_FORMAT_JPEG: + break; + + default: + assert(0); + return; + } + + dec->msg->body.decode.db_surf_tile_config = dec->msg->body.decode.dt_surf_tile_config; + dec->msg->body.decode.extension_support = 0x1; + + /* set at least the feedback buffer size */ + dec->fb[0] = dec->fb_size; + + send_msg_buf(dec); + + if (dec->dpb.res) + send_cmd(dec, RUVD_CMD_DPB_BUFFER, dec->dpb.res->buf, 0, + RADEON_USAGE_READWRITE, RADEON_DOMAIN_VRAM); + + if (dec->ctx.res) + send_cmd(dec, RUVD_CMD_CONTEXT_BUFFER, dec->ctx.res->buf, 0, + RADEON_USAGE_READWRITE, RADEON_DOMAIN_VRAM); + send_cmd(dec, RUVD_CMD_BITSTREAM_BUFFER, bs_buf->res->buf, + 0, RADEON_USAGE_READ, RADEON_DOMAIN_GTT); + send_cmd(dec, RUVD_CMD_DECODING_TARGET_BUFFER, dt, 0, + RADEON_USAGE_WRITE, RADEON_DOMAIN_VRAM); + send_cmd(dec, RUVD_CMD_FEEDBACK_BUFFER, msg_fb_it_buf->res->buf, + FB_BUFFER_OFFSET, RADEON_USAGE_WRITE, RADEON_DOMAIN_GTT); + if (have_it(dec)) + send_cmd(dec, RUVD_CMD_ITSCALING_TABLE_BUFFER, msg_fb_it_buf->res->buf, + FB_BUFFER_OFFSET + dec->fb_size, RADEON_USAGE_READ, RADEON_DOMAIN_GTT); + set_reg(dec, dec->reg.cntl, 1); + + flush(dec, RADEON_FLUSH_ASYNC); + next_buffer(dec); +} + +/** + * flush any outstanding command buffers to the hardware + */ +static void ruvd_flush(struct pipe_video_codec *decoder) +{ +} + +/** + * create and UVD decoder + */ +struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, + const struct pipe_video_codec *templ, + ruvd_set_dtb set_dtb) +{ + struct radeon_winsys* ws = ((struct r600_common_context *)context)->ws; + struct r600_common_context *rctx = (struct r600_common_context*)context; + unsigned dpb_size; + unsigned width = templ->width, height = templ->height; + unsigned bs_buf_size; + struct radeon_info info; + struct ruvd_decoder *dec; + int r, i; + + ws->query_info(ws, &info); + + switch(u_reduce_video_profile(templ->profile)) { + case PIPE_VIDEO_FORMAT_MPEG12: + if (templ->entrypoint > PIPE_VIDEO_ENTRYPOINT_BITSTREAM || info.family < CHIP_PALM) + return vl_create_mpeg12_decoder(context, templ); + + /* fall through */ + case PIPE_VIDEO_FORMAT_MPEG4: + width = align(width, VL_MACROBLOCK_WIDTH); + height = align(height, VL_MACROBLOCK_HEIGHT); + break; + case PIPE_VIDEO_FORMAT_MPEG4_AVC: + width = align(width, VL_MACROBLOCK_WIDTH); + height = align(height, VL_MACROBLOCK_HEIGHT); + break; + + default: + break; + } + + + dec = CALLOC_STRUCT(ruvd_decoder); + + if (!dec) + return NULL; + + if (info.drm_major < 3) + dec->use_legacy = true; + + dec->base = *templ; + dec->base.context = context; + dec->base.width = width; + dec->base.height = height; + + dec->base.destroy = ruvd_destroy; + dec->base.begin_frame = ruvd_begin_frame; + dec->base.decode_macroblock = ruvd_decode_macroblock; + dec->base.decode_bitstream = ruvd_decode_bitstream; + dec->base.end_frame = ruvd_end_frame; + dec->base.flush = ruvd_flush; + + dec->stream_type = profile2stream_type(dec, info.family); + dec->set_dtb = set_dtb; + dec->stream_handle = rvid_alloc_stream_handle(); + dec->screen = context->screen; + dec->ws = ws; + dec->cs = ws->cs_create(rctx->ctx, RING_UVD, NULL, NULL); + if (!dec->cs) { + RVID_ERR("Can't get command submission context.\n"); + goto error; + } + + dec->fb_size = FB_BUFFER_SIZE; + bs_buf_size = width * height * (512 / (16 * 16)); + for (i = 0; i < NUM_BUFFERS; ++i) { + unsigned msg_fb_it_size = FB_BUFFER_OFFSET + dec->fb_size; + STATIC_ASSERT(sizeof(struct ruvd_msg) <= FB_BUFFER_OFFSET); + if (have_it(dec)) + msg_fb_it_size += IT_SCALING_TABLE_SIZE; + if (!rvid_create_buffer(dec->screen, &dec->msg_fb_it_buffers[i], + msg_fb_it_size, PIPE_USAGE_STAGING)) { + RVID_ERR("Can't allocated message buffers.\n"); + goto error; + } + + if (!rvid_create_buffer(dec->screen, &dec->bs_buffers[i], + bs_buf_size, PIPE_USAGE_STAGING)) { + RVID_ERR("Can't allocated bitstream buffers.\n"); + goto error; + } + + rvid_clear_buffer(context, &dec->msg_fb_it_buffers[i]); + rvid_clear_buffer(context, &dec->bs_buffers[i]); + } + + dpb_size = calc_dpb_size(dec); + if (dpb_size) { + if (!rvid_create_buffer(dec->screen, &dec->dpb, dpb_size, PIPE_USAGE_DEFAULT)) { + RVID_ERR("Can't allocated dpb.\n"); + goto error; + } + rvid_clear_buffer(context, &dec->dpb); + } + + dec->reg.data0 = RUVD_GPCOM_VCPU_DATA0; + dec->reg.data1 = RUVD_GPCOM_VCPU_DATA1; + dec->reg.cmd = RUVD_GPCOM_VCPU_CMD; + dec->reg.cntl = RUVD_ENGINE_CNTL; + + map_msg_fb_it_buf(dec); + dec->msg->size = sizeof(*dec->msg); + dec->msg->msg_type = RUVD_MSG_CREATE; + dec->msg->stream_handle = dec->stream_handle; + dec->msg->body.create.stream_type = dec->stream_type; + dec->msg->body.create.width_in_samples = dec->base.width; + dec->msg->body.create.height_in_samples = dec->base.height; + dec->msg->body.create.dpb_size = dpb_size; + send_msg_buf(dec); + r = flush(dec, 0); + if (r) + goto error; + + next_buffer(dec); + + return &dec->base; + +error: + if (dec->cs) dec->ws->cs_destroy(dec->cs); + + for (i = 0; i < NUM_BUFFERS; ++i) { + rvid_destroy_buffer(&dec->msg_fb_it_buffers[i]); + rvid_destroy_buffer(&dec->bs_buffers[i]); + } + + rvid_destroy_buffer(&dec->dpb); + rvid_destroy_buffer(&dec->ctx); + rvid_destroy_buffer(&dec->sessionctx); + + FREE(dec); + + return NULL; +} + +/* calculate top/bottom offset */ +static unsigned texture_offset(struct radeon_surf *surface, unsigned layer) +{ + return surface->u.legacy.level[0].offset + + layer * surface->u.legacy.level[0].slice_size; +} + +/* hw encode the aspect of macro tiles */ +static unsigned macro_tile_aspect(unsigned macro_tile_aspect) +{ + switch (macro_tile_aspect) { + default: + case 1: macro_tile_aspect = 0; break; + case 2: macro_tile_aspect = 1; break; + case 4: macro_tile_aspect = 2; break; + case 8: macro_tile_aspect = 3; break; + } + return macro_tile_aspect; +} + +/* hw encode the bank width and height */ +static unsigned bank_wh(unsigned bankwh) +{ + switch (bankwh) { + default: + case 1: bankwh = 0; break; + case 2: bankwh = 1; break; + case 4: bankwh = 2; break; + case 8: bankwh = 3; break; + } + return bankwh; +} + +/** + * fill decoding target field from the luma and chroma surfaces + */ +void ruvd_set_dt_surfaces(struct ruvd_msg *msg, struct radeon_surf *luma, + struct radeon_surf *chroma) +{ + msg->body.decode.dt_pitch = luma->u.legacy.level[0].nblk_x * luma->blk_w; + switch (luma->u.legacy.level[0].mode) { + case RADEON_SURF_MODE_LINEAR_ALIGNED: + msg->body.decode.dt_tiling_mode = RUVD_TILE_LINEAR; + msg->body.decode.dt_array_mode = RUVD_ARRAY_MODE_LINEAR; + break; + case RADEON_SURF_MODE_1D: + msg->body.decode.dt_tiling_mode = RUVD_TILE_8X8; + msg->body.decode.dt_array_mode = RUVD_ARRAY_MODE_1D_THIN; + break; + case RADEON_SURF_MODE_2D: + msg->body.decode.dt_tiling_mode = RUVD_TILE_8X8; + msg->body.decode.dt_array_mode = RUVD_ARRAY_MODE_2D_THIN; + break; + default: + assert(0); + break; + } + + msg->body.decode.dt_luma_top_offset = texture_offset(luma, 0); + if (chroma) + msg->body.decode.dt_chroma_top_offset = texture_offset(chroma, 0); + if (msg->body.decode.dt_field_mode) { + msg->body.decode.dt_luma_bottom_offset = texture_offset(luma, 1); + if (chroma) + msg->body.decode.dt_chroma_bottom_offset = texture_offset(chroma, 1); + } else { + msg->body.decode.dt_luma_bottom_offset = msg->body.decode.dt_luma_top_offset; + msg->body.decode.dt_chroma_bottom_offset = msg->body.decode.dt_chroma_top_offset; + } + + if (chroma) { + assert(luma->u.legacy.bankw == chroma->u.legacy.bankw); + assert(luma->u.legacy.bankh == chroma->u.legacy.bankh); + assert(luma->u.legacy.mtilea == chroma->u.legacy.mtilea); + } + + msg->body.decode.dt_surf_tile_config |= RUVD_BANK_WIDTH(bank_wh(luma->u.legacy.bankw)); + msg->body.decode.dt_surf_tile_config |= RUVD_BANK_HEIGHT(bank_wh(luma->u.legacy.bankh)); + msg->body.decode.dt_surf_tile_config |= RUVD_MACRO_TILE_ASPECT_RATIO(macro_tile_aspect(luma->u.legacy.mtilea)); +} diff --git a/lib/mesa/src/gallium/drivers/r600/radeon_uvd.h b/lib/mesa/src/gallium/drivers/r600/radeon_uvd.h new file mode 100644 index 000000000..c371b1441 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/r600/radeon_uvd.h @@ -0,0 +1,442 @@ +/************************************************************************** + * + * Copyright 2011 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* + * Authors: + * Christian König <christian.koenig@amd.com> + * + */ + +#ifndef RADEON_UVD_H +#define RADEON_UVD_H + +#include "radeon/radeon_winsys.h" +#include "vl/vl_video_buffer.h" + +/* UVD uses PM4 packet type 0 and 2 */ +#define RUVD_PKT_TYPE_S(x) (((unsigned)(x) & 0x3) << 30) +#define RUVD_PKT_TYPE_G(x) (((x) >> 30) & 0x3) +#define RUVD_PKT_TYPE_C 0x3FFFFFFF +#define RUVD_PKT_COUNT_S(x) (((unsigned)(x) & 0x3FFF) << 16) +#define RUVD_PKT_COUNT_G(x) (((x) >> 16) & 0x3FFF) +#define RUVD_PKT_COUNT_C 0xC000FFFF +#define RUVD_PKT0_BASE_INDEX_S(x) (((unsigned)(x) & 0xFFFF) << 0) +#define RUVD_PKT0_BASE_INDEX_G(x) (((x) >> 0) & 0xFFFF) +#define RUVD_PKT0_BASE_INDEX_C 0xFFFF0000 +#define RUVD_PKT0(index, count) (RUVD_PKT_TYPE_S(0) | RUVD_PKT0_BASE_INDEX_S(index) | RUVD_PKT_COUNT_S(count)) +#define RUVD_PKT2() (RUVD_PKT_TYPE_S(2)) + +/* registers involved with UVD */ +#define RUVD_GPCOM_VCPU_CMD 0xEF0C +#define RUVD_GPCOM_VCPU_DATA0 0xEF10 +#define RUVD_GPCOM_VCPU_DATA1 0xEF14 +#define RUVD_ENGINE_CNTL 0xEF18 + +#define RUVD_GPCOM_VCPU_CMD_SOC15 0x2070c +#define RUVD_GPCOM_VCPU_DATA0_SOC15 0x20710 +#define RUVD_GPCOM_VCPU_DATA1_SOC15 0x20714 +#define RUVD_ENGINE_CNTL_SOC15 0x20718 + +/* UVD commands to VCPU */ +#define RUVD_CMD_MSG_BUFFER 0x00000000 +#define RUVD_CMD_DPB_BUFFER 0x00000001 +#define RUVD_CMD_DECODING_TARGET_BUFFER 0x00000002 +#define RUVD_CMD_FEEDBACK_BUFFER 0x00000003 +#define RUVD_CMD_SESSION_CONTEXT_BUFFER 0x00000005 +#define RUVD_CMD_BITSTREAM_BUFFER 0x00000100 +#define RUVD_CMD_ITSCALING_TABLE_BUFFER 0x00000204 +#define RUVD_CMD_CONTEXT_BUFFER 0x00000206 + +/* UVD message types */ +#define RUVD_MSG_CREATE 0 +#define RUVD_MSG_DECODE 1 +#define RUVD_MSG_DESTROY 2 + +/* UVD stream types */ +#define RUVD_CODEC_H264 0x00000000 +#define RUVD_CODEC_VC1 0x00000001 +#define RUVD_CODEC_MPEG2 0x00000003 +#define RUVD_CODEC_MPEG4 0x00000004 +#define RUVD_CODEC_H264_PERF 0x00000007 +#define RUVD_CODEC_MJPEG 0x00000008 +#define RUVD_CODEC_H265 0x00000010 + +/* UVD decode target buffer tiling mode */ +#define RUVD_TILE_LINEAR 0x00000000 +#define RUVD_TILE_8X4 0x00000001 +#define RUVD_TILE_8X8 0x00000002 +#define RUVD_TILE_32AS8 0x00000003 + +/* UVD decode target buffer array mode */ +#define RUVD_ARRAY_MODE_LINEAR 0x00000000 +#define RUVD_ARRAY_MODE_MACRO_LINEAR_MICRO_TILED 0x00000001 +#define RUVD_ARRAY_MODE_1D_THIN 0x00000002 +#define RUVD_ARRAY_MODE_2D_THIN 0x00000004 +#define RUVD_ARRAY_MODE_MACRO_TILED_MICRO_LINEAR 0x00000004 +#define RUVD_ARRAY_MODE_MACRO_TILED_MICRO_TILED 0x00000005 + +/* UVD tile config */ +#define RUVD_BANK_WIDTH(x) ((x) << 0) +#define RUVD_BANK_HEIGHT(x) ((x) << 3) +#define RUVD_MACRO_TILE_ASPECT_RATIO(x) ((x) << 6) +#define RUVD_NUM_BANKS(x) ((x) << 9) + +/* H.264 profile definitions */ +#define RUVD_H264_PROFILE_BASELINE 0x00000000 +#define RUVD_H264_PROFILE_MAIN 0x00000001 +#define RUVD_H264_PROFILE_HIGH 0x00000002 +#define RUVD_H264_PROFILE_STEREO_HIGH 0x00000003 +#define RUVD_H264_PROFILE_MVC 0x00000004 + +/* VC-1 profile definitions */ +#define RUVD_VC1_PROFILE_SIMPLE 0x00000000 +#define RUVD_VC1_PROFILE_MAIN 0x00000001 +#define RUVD_VC1_PROFILE_ADVANCED 0x00000002 + +struct ruvd_mvc_element { + uint16_t viewOrderIndex; + uint16_t viewId; + uint16_t numOfAnchorRefsInL0; + uint16_t viewIdOfAnchorRefsInL0[15]; + uint16_t numOfAnchorRefsInL1; + uint16_t viewIdOfAnchorRefsInL1[15]; + uint16_t numOfNonAnchorRefsInL0; + uint16_t viewIdOfNonAnchorRefsInL0[15]; + uint16_t numOfNonAnchorRefsInL1; + uint16_t viewIdOfNonAnchorRefsInL1[15]; +}; + +struct ruvd_h264 { + uint32_t profile; + uint32_t level; + + uint32_t sps_info_flags; + uint32_t pps_info_flags; + uint8_t chroma_format; + uint8_t bit_depth_luma_minus8; + uint8_t bit_depth_chroma_minus8; + uint8_t log2_max_frame_num_minus4; + + uint8_t pic_order_cnt_type; + uint8_t log2_max_pic_order_cnt_lsb_minus4; + uint8_t num_ref_frames; + uint8_t reserved_8bit; + + int8_t pic_init_qp_minus26; + int8_t pic_init_qs_minus26; + int8_t chroma_qp_index_offset; + int8_t second_chroma_qp_index_offset; + + uint8_t num_slice_groups_minus1; + uint8_t slice_group_map_type; + uint8_t num_ref_idx_l0_active_minus1; + uint8_t num_ref_idx_l1_active_minus1; + + uint16_t slice_group_change_rate_minus1; + uint16_t reserved_16bit_1; + + uint8_t scaling_list_4x4[6][16]; + uint8_t scaling_list_8x8[2][64]; + + uint32_t frame_num; + uint32_t frame_num_list[16]; + int32_t curr_field_order_cnt_list[2]; + int32_t field_order_cnt_list[16][2]; + + uint32_t decoded_pic_idx; + + uint32_t curr_pic_ref_frame_num; + + uint8_t ref_frame_list[16]; + + uint32_t reserved[122]; + + struct { + uint32_t numViews; + uint32_t viewId0; + struct ruvd_mvc_element mvcElements[1]; + } mvc; +}; + +struct ruvd_h265 { + uint32_t sps_info_flags; + uint32_t pps_info_flags; + + uint8_t chroma_format; + uint8_t bit_depth_luma_minus8; + uint8_t bit_depth_chroma_minus8; + uint8_t log2_max_pic_order_cnt_lsb_minus4; + + uint8_t sps_max_dec_pic_buffering_minus1; + uint8_t log2_min_luma_coding_block_size_minus3; + uint8_t log2_diff_max_min_luma_coding_block_size; + uint8_t log2_min_transform_block_size_minus2; + + uint8_t log2_diff_max_min_transform_block_size; + uint8_t max_transform_hierarchy_depth_inter; + uint8_t max_transform_hierarchy_depth_intra; + uint8_t pcm_sample_bit_depth_luma_minus1; + + uint8_t pcm_sample_bit_depth_chroma_minus1; + uint8_t log2_min_pcm_luma_coding_block_size_minus3; + uint8_t log2_diff_max_min_pcm_luma_coding_block_size; + uint8_t num_extra_slice_header_bits; + + uint8_t num_short_term_ref_pic_sets; + uint8_t num_long_term_ref_pic_sps; + uint8_t num_ref_idx_l0_default_active_minus1; + uint8_t num_ref_idx_l1_default_active_minus1; + + int8_t pps_cb_qp_offset; + int8_t pps_cr_qp_offset; + int8_t pps_beta_offset_div2; + int8_t pps_tc_offset_div2; + + uint8_t diff_cu_qp_delta_depth; + uint8_t num_tile_columns_minus1; + uint8_t num_tile_rows_minus1; + uint8_t log2_parallel_merge_level_minus2; + + uint16_t column_width_minus1[19]; + uint16_t row_height_minus1[21]; + + int8_t init_qp_minus26; + uint8_t num_delta_pocs_ref_rps_idx; + uint8_t curr_idx; + uint8_t reserved1; + int32_t curr_poc; + uint8_t ref_pic_list[16]; + int32_t poc_list[16]; + uint8_t ref_pic_set_st_curr_before[8]; + uint8_t ref_pic_set_st_curr_after[8]; + uint8_t ref_pic_set_lt_curr[8]; + + uint8_t ucScalingListDCCoefSizeID2[6]; + uint8_t ucScalingListDCCoefSizeID3[2]; + + uint8_t highestTid; + uint8_t isNonRef; + + uint8_t p010_mode; + uint8_t msb_mode; + uint8_t luma_10to8; + uint8_t chroma_10to8; + uint8_t sclr_luma10to8; + uint8_t sclr_chroma10to8; + + uint8_t direct_reflist[2][15]; +}; + +struct ruvd_vc1 { + uint32_t profile; + uint32_t level; + uint32_t sps_info_flags; + uint32_t pps_info_flags; + uint32_t pic_structure; + uint32_t chroma_format; +}; + +struct ruvd_mpeg2 { + uint32_t decoded_pic_idx; + uint32_t ref_pic_idx[2]; + + uint8_t load_intra_quantiser_matrix; + uint8_t load_nonintra_quantiser_matrix; + uint8_t reserved_quantiser_alignement[2]; + uint8_t intra_quantiser_matrix[64]; + uint8_t nonintra_quantiser_matrix[64]; + + uint8_t profile_and_level_indication; + uint8_t chroma_format; + + uint8_t picture_coding_type; + + uint8_t reserved_1; + + uint8_t f_code[2][2]; + uint8_t intra_dc_precision; + uint8_t pic_structure; + uint8_t top_field_first; + uint8_t frame_pred_frame_dct; + uint8_t concealment_motion_vectors; + uint8_t q_scale_type; + uint8_t intra_vlc_format; + uint8_t alternate_scan; +}; + +struct ruvd_mpeg4 +{ + uint32_t decoded_pic_idx; + uint32_t ref_pic_idx[2]; + + uint32_t variant_type; + uint8_t profile_and_level_indication; + + uint8_t video_object_layer_verid; + uint8_t video_object_layer_shape; + + uint8_t reserved_1; + + uint16_t video_object_layer_width; + uint16_t video_object_layer_height; + + uint16_t vop_time_increment_resolution; + + uint16_t reserved_2; + + uint32_t flags; + + uint8_t quant_type; + + uint8_t reserved_3[3]; + + uint8_t intra_quant_mat[64]; + uint8_t nonintra_quant_mat[64]; + + struct { + uint8_t sprite_enable; + + uint8_t reserved_4[3]; + + uint16_t sprite_width; + uint16_t sprite_height; + int16_t sprite_left_coordinate; + int16_t sprite_top_coordinate; + + uint8_t no_of_sprite_warping_points; + uint8_t sprite_warping_accuracy; + uint8_t sprite_brightness_change; + uint8_t low_latency_sprite_enable; + } sprite_config; + + struct { + uint32_t flags; + uint8_t vol_mode; + uint8_t reserved_5[3]; + } divx_311_config; +}; + +/* message between driver and hardware */ +struct ruvd_msg { + + uint32_t size; + uint32_t msg_type; + uint32_t stream_handle; + uint32_t status_report_feedback_number; + + union { + struct { + uint32_t stream_type; + uint32_t session_flags; + uint32_t asic_id; + uint32_t width_in_samples; + uint32_t height_in_samples; + uint32_t dpb_buffer; + uint32_t dpb_size; + uint32_t dpb_model; + uint32_t version_info; + } create; + + struct { + uint32_t stream_type; + uint32_t decode_flags; + uint32_t width_in_samples; + uint32_t height_in_samples; + + uint32_t dpb_buffer; + uint32_t dpb_size; + uint32_t dpb_model; + uint32_t dpb_reserved; + + uint32_t db_offset_alignment; + uint32_t db_pitch; + uint32_t db_tiling_mode; + uint32_t db_array_mode; + uint32_t db_field_mode; + uint32_t db_surf_tile_config; + uint32_t db_aligned_height; + uint32_t db_reserved; + + uint32_t use_addr_macro; + + uint32_t bsd_buffer; + uint32_t bsd_size; + + uint32_t pic_param_buffer; + uint32_t pic_param_size; + uint32_t mb_cntl_buffer; + uint32_t mb_cntl_size; + + uint32_t dt_buffer; + uint32_t dt_pitch; + uint32_t dt_tiling_mode; + uint32_t dt_array_mode; + uint32_t dt_field_mode; + uint32_t dt_luma_top_offset; + uint32_t dt_luma_bottom_offset; + uint32_t dt_chroma_top_offset; + uint32_t dt_chroma_bottom_offset; + uint32_t dt_surf_tile_config; + uint32_t dt_uv_surf_tile_config; + // re-use dt_wa_chroma_top_offset as dt_ext_info for UV pitch in stoney + uint32_t dt_wa_chroma_top_offset; + uint32_t dt_wa_chroma_bottom_offset; + + uint32_t reserved[16]; + + union { + struct ruvd_h264 h264; + struct ruvd_h265 h265; + struct ruvd_vc1 vc1; + struct ruvd_mpeg2 mpeg2; + struct ruvd_mpeg4 mpeg4; + + uint32_t info[768]; + } codec; + + uint8_t extension_support; + uint8_t reserved_8bit_1; + uint8_t reserved_8bit_2; + uint8_t reserved_8bit_3; + uint32_t extension_reserved[64]; + } decode; + } body; +}; + +/* driver dependent callback */ +typedef struct pb_buffer* (*ruvd_set_dtb) +(struct ruvd_msg* msg, struct vl_video_buffer *vb); + +/* create an UVD decode */ +struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, + const struct pipe_video_codec *templat, + ruvd_set_dtb set_dtb); + +/* fill decoding target field from the luma and chroma surfaces */ +void ruvd_set_dt_surfaces(struct ruvd_msg *msg, struct radeon_surf *luma, + struct radeon_surf *chroma); +#endif diff --git a/lib/mesa/src/gallium/drivers/r600/radeon_vce.c b/lib/mesa/src/gallium/drivers/r600/radeon_vce.c new file mode 100644 index 000000000..16a0127f3 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/r600/radeon_vce.c @@ -0,0 +1,533 @@ +/************************************************************************** + * + * Copyright 2013 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* + * Authors: + * Christian König <christian.koenig@amd.com> + * + */ + +#include <stdio.h> + +#include "pipe/p_video_codec.h" + +#include "util/u_video.h" +#include "util/u_memory.h" + +#include "vl/vl_video_buffer.h" + +#include "r600_pipe_common.h" +#include "radeon_video.h" +#include "radeon_vce.h" + +#define FW_40_2_2 ((40 << 24) | (2 << 16) | (2 << 8)) +#define FW_50_0_1 ((50 << 24) | (0 << 16) | (1 << 8)) +#define FW_50_1_2 ((50 << 24) | (1 << 16) | (2 << 8)) +#define FW_50_10_2 ((50 << 24) | (10 << 16) | (2 << 8)) +#define FW_50_17_3 ((50 << 24) | (17 << 16) | (3 << 8)) +#define FW_52_0_3 ((52 << 24) | (0 << 16) | (3 << 8)) +#define FW_52_4_3 ((52 << 24) | (4 << 16) | (3 << 8)) +#define FW_52_8_3 ((52 << 24) | (8 << 16) | (3 << 8)) +#define FW_53 (53 << 24) + +/** + * flush commands to the hardware + */ +static void flush(struct rvce_encoder *enc) +{ + enc->ws->cs_flush(enc->cs, RADEON_FLUSH_ASYNC, NULL); + enc->task_info_idx = 0; + enc->bs_idx = 0; +} + +#if 0 +static void dump_feedback(struct rvce_encoder *enc, struct rvid_buffer *fb) +{ + uint32_t *ptr = enc->ws->buffer_map(fb->res->buf, enc->cs, PIPE_TRANSFER_READ_WRITE); + unsigned i = 0; + fprintf(stderr, "\n"); + fprintf(stderr, "encStatus:\t\t\t%08x\n", ptr[i++]); + fprintf(stderr, "encHasBitstream:\t\t%08x\n", ptr[i++]); + fprintf(stderr, "encHasAudioBitstream:\t\t%08x\n", ptr[i++]); + fprintf(stderr, "encBitstreamOffset:\t\t%08x\n", ptr[i++]); + fprintf(stderr, "encBitstreamSize:\t\t%08x\n", ptr[i++]); + fprintf(stderr, "encAudioBitstreamOffset:\t%08x\n", ptr[i++]); + fprintf(stderr, "encAudioBitstreamSize:\t\t%08x\n", ptr[i++]); + fprintf(stderr, "encExtrabytes:\t\t\t%08x\n", ptr[i++]); + fprintf(stderr, "encAudioExtrabytes:\t\t%08x\n", ptr[i++]); + fprintf(stderr, "videoTimeStamp:\t\t\t%08x\n", ptr[i++]); + fprintf(stderr, "audioTimeStamp:\t\t\t%08x\n", ptr[i++]); + fprintf(stderr, "videoOutputType:\t\t%08x\n", ptr[i++]); + fprintf(stderr, "attributeFlags:\t\t\t%08x\n", ptr[i++]); + fprintf(stderr, "seiPrivatePackageOffset:\t%08x\n", ptr[i++]); + fprintf(stderr, "seiPrivatePackageSize:\t\t%08x\n", ptr[i++]); + fprintf(stderr, "\n"); + enc->ws->buffer_unmap(fb->res->buf); +} +#endif + +/** + * reset the CPB handling + */ +static void reset_cpb(struct rvce_encoder *enc) +{ + unsigned i; + + LIST_INITHEAD(&enc->cpb_slots); + for (i = 0; i < enc->cpb_num; ++i) { + struct rvce_cpb_slot *slot = &enc->cpb_array[i]; + slot->index = i; + slot->picture_type = PIPE_H264_ENC_PICTURE_TYPE_SKIP; + slot->frame_num = 0; + slot->pic_order_cnt = 0; + LIST_ADDTAIL(&slot->list, &enc->cpb_slots); + } +} + +/** + * sort l0 and l1 to the top of the list + */ +static void sort_cpb(struct rvce_encoder *enc) +{ + struct rvce_cpb_slot *i, *l0 = NULL, *l1 = NULL; + + LIST_FOR_EACH_ENTRY(i, &enc->cpb_slots, list) { + if (i->frame_num == enc->pic.ref_idx_l0) + l0 = i; + + if (i->frame_num == enc->pic.ref_idx_l1) + l1 = i; + + if (enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_P && l0) + break; + + if (enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_B && + l0 && l1) + break; + } + + if (l1) { + LIST_DEL(&l1->list); + LIST_ADD(&l1->list, &enc->cpb_slots); + } + + if (l0) { + LIST_DEL(&l0->list); + LIST_ADD(&l0->list, &enc->cpb_slots); + } +} + +/** + * get number of cpbs based on dpb + */ +static unsigned get_cpb_num(struct rvce_encoder *enc) +{ + unsigned w = align(enc->base.width, 16) / 16; + unsigned h = align(enc->base.height, 16) / 16; + unsigned dpb; + + switch (enc->base.level) { + case 10: + dpb = 396; + break; + case 11: + dpb = 900; + break; + case 12: + case 13: + case 20: + dpb = 2376; + break; + case 21: + dpb = 4752; + break; + case 22: + case 30: + dpb = 8100; + break; + case 31: + dpb = 18000; + break; + case 32: + dpb = 20480; + break; + case 40: + case 41: + dpb = 32768; + break; + case 42: + dpb = 34816; + break; + case 50: + dpb = 110400; + break; + default: + case 51: + case 52: + dpb = 184320; + break; + } + + return MIN2(dpb / (w * h), 16); +} + +/** + * Get the slot for the currently encoded frame + */ +struct rvce_cpb_slot *current_slot(struct rvce_encoder *enc) +{ + return LIST_ENTRY(struct rvce_cpb_slot, enc->cpb_slots.prev, list); +} + +/** + * Get the slot for L0 + */ +struct rvce_cpb_slot *l0_slot(struct rvce_encoder *enc) +{ + return LIST_ENTRY(struct rvce_cpb_slot, enc->cpb_slots.next, list); +} + +/** + * Get the slot for L1 + */ +struct rvce_cpb_slot *l1_slot(struct rvce_encoder *enc) +{ + return LIST_ENTRY(struct rvce_cpb_slot, enc->cpb_slots.next->next, list); +} + +/** + * Calculate the offsets into the CPB + */ +void rvce_frame_offset(struct rvce_encoder *enc, struct rvce_cpb_slot *slot, + signed *luma_offset, signed *chroma_offset) +{ + unsigned pitch, vpitch, fsize; + + pitch = align(enc->luma->u.legacy.level[0].nblk_x * enc->luma->bpe, 128); + vpitch = align(enc->luma->u.legacy.level[0].nblk_y, 16); + fsize = pitch * (vpitch + vpitch / 2); + + *luma_offset = slot->index * fsize; + *chroma_offset = *luma_offset + pitch * vpitch; +} + +/** + * destroy this video encoder + */ +static void rvce_destroy(struct pipe_video_codec *encoder) +{ + struct rvce_encoder *enc = (struct rvce_encoder*)encoder; + if (enc->stream_handle) { + struct rvid_buffer fb; + rvid_create_buffer(enc->screen, &fb, 512, PIPE_USAGE_STAGING); + enc->fb = &fb; + enc->session(enc); + enc->feedback(enc); + enc->destroy(enc); + flush(enc); + rvid_destroy_buffer(&fb); + } + rvid_destroy_buffer(&enc->cpb); + enc->ws->cs_destroy(enc->cs); + FREE(enc->cpb_array); + FREE(enc); +} + +static void rvce_begin_frame(struct pipe_video_codec *encoder, + struct pipe_video_buffer *source, + struct pipe_picture_desc *picture) +{ + struct rvce_encoder *enc = (struct rvce_encoder*)encoder; + struct vl_video_buffer *vid_buf = (struct vl_video_buffer *)source; + struct pipe_h264_enc_picture_desc *pic = (struct pipe_h264_enc_picture_desc *)picture; + + bool need_rate_control = + enc->pic.rate_ctrl.rate_ctrl_method != pic->rate_ctrl.rate_ctrl_method || + enc->pic.quant_i_frames != pic->quant_i_frames || + enc->pic.quant_p_frames != pic->quant_p_frames || + enc->pic.quant_b_frames != pic->quant_b_frames; + + enc->pic = *pic; + get_pic_param(enc, pic); + + enc->get_buffer(vid_buf->resources[0], &enc->handle, &enc->luma); + enc->get_buffer(vid_buf->resources[1], NULL, &enc->chroma); + + if (pic->picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR) + reset_cpb(enc); + else if (pic->picture_type == PIPE_H264_ENC_PICTURE_TYPE_P || + pic->picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) + sort_cpb(enc); + + if (!enc->stream_handle) { + struct rvid_buffer fb; + enc->stream_handle = rvid_alloc_stream_handle(); + rvid_create_buffer(enc->screen, &fb, 512, PIPE_USAGE_STAGING); + enc->fb = &fb; + enc->session(enc); + enc->create(enc); + enc->config(enc); + enc->feedback(enc); + flush(enc); + //dump_feedback(enc, &fb); + rvid_destroy_buffer(&fb); + need_rate_control = false; + } + + if (need_rate_control) { + enc->session(enc); + enc->config(enc); + flush(enc); + } +} + +static void rvce_encode_bitstream(struct pipe_video_codec *encoder, + struct pipe_video_buffer *source, + struct pipe_resource *destination, + void **fb) +{ + struct rvce_encoder *enc = (struct rvce_encoder*)encoder; + enc->get_buffer(destination, &enc->bs_handle, NULL); + enc->bs_size = destination->width0; + + *fb = enc->fb = CALLOC_STRUCT(rvid_buffer); + if (!rvid_create_buffer(enc->screen, enc->fb, 512, PIPE_USAGE_STAGING)) { + RVID_ERR("Can't create feedback buffer.\n"); + return; + } + if (!radeon_emitted(enc->cs, 0)) + enc->session(enc); + enc->encode(enc); + enc->feedback(enc); +} + +static void rvce_end_frame(struct pipe_video_codec *encoder, + struct pipe_video_buffer *source, + struct pipe_picture_desc *picture) +{ + struct rvce_encoder *enc = (struct rvce_encoder*)encoder; + struct rvce_cpb_slot *slot = LIST_ENTRY( + struct rvce_cpb_slot, enc->cpb_slots.prev, list); + + if (!enc->dual_inst || enc->bs_idx > 1) + flush(enc); + + /* update the CPB backtrack with the just encoded frame */ + slot->picture_type = enc->pic.picture_type; + slot->frame_num = enc->pic.frame_num; + slot->pic_order_cnt = enc->pic.pic_order_cnt; + if (!enc->pic.not_referenced) { + LIST_DEL(&slot->list); + LIST_ADD(&slot->list, &enc->cpb_slots); + } +} + +static void rvce_get_feedback(struct pipe_video_codec *encoder, + void *feedback, unsigned *size) +{ + struct rvce_encoder *enc = (struct rvce_encoder*)encoder; + struct rvid_buffer *fb = feedback; + + if (size) { + uint32_t *ptr = enc->ws->buffer_map(fb->res->buf, enc->cs, PIPE_TRANSFER_READ_WRITE); + + if (ptr[1]) { + *size = ptr[4] - ptr[9]; + } else { + *size = 0; + } + + enc->ws->buffer_unmap(fb->res->buf); + } + //dump_feedback(enc, fb); + rvid_destroy_buffer(fb); + FREE(fb); +} + +/** + * flush any outstanding command buffers to the hardware + */ +static void rvce_flush(struct pipe_video_codec *encoder) +{ + struct rvce_encoder *enc = (struct rvce_encoder*)encoder; + + flush(enc); +} + +static void rvce_cs_flush(void *ctx, unsigned flags, + struct pipe_fence_handle **fence) +{ + // just ignored +} + +struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, + const struct pipe_video_codec *templ, + struct radeon_winsys* ws, + rvce_get_buffer get_buffer) +{ + struct r600_common_screen *rscreen = (struct r600_common_screen *)context->screen; + struct r600_common_context *rctx = (struct r600_common_context*)context; + struct rvce_encoder *enc; + struct pipe_video_buffer *tmp_buf, templat = {}; + struct radeon_surf *tmp_surf; + unsigned cpb_size; + + if (!rscreen->info.vce_fw_version) { + RVID_ERR("Kernel doesn't supports VCE!\n"); + return NULL; + + } else if (!rvce_is_fw_version_supported(rscreen)) { + RVID_ERR("Unsupported VCE fw version loaded!\n"); + return NULL; + } + + enc = CALLOC_STRUCT(rvce_encoder); + if (!enc) + return NULL; + + if (rscreen->info.drm_major == 3) + enc->use_vm = true; + if ((rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42) || + rscreen->info.drm_major == 3) + enc->use_vui = true; + + enc->base = *templ; + enc->base.context = context; + + enc->base.destroy = rvce_destroy; + enc->base.begin_frame = rvce_begin_frame; + enc->base.encode_bitstream = rvce_encode_bitstream; + enc->base.end_frame = rvce_end_frame; + enc->base.flush = rvce_flush; + enc->base.get_feedback = rvce_get_feedback; + enc->get_buffer = get_buffer; + + enc->screen = context->screen; + enc->ws = ws; + enc->cs = ws->cs_create(rctx->ctx, RING_VCE, rvce_cs_flush, enc); + if (!enc->cs) { + RVID_ERR("Can't get command submission context.\n"); + goto error; + } + + templat.buffer_format = PIPE_FORMAT_NV12; + templat.chroma_format = PIPE_VIDEO_CHROMA_FORMAT_420; + templat.width = enc->base.width; + templat.height = enc->base.height; + templat.interlaced = false; + if (!(tmp_buf = context->create_video_buffer(context, &templat))) { + RVID_ERR("Can't create video buffer.\n"); + goto error; + } + + enc->cpb_num = get_cpb_num(enc); + if (!enc->cpb_num) + goto error; + + get_buffer(((struct vl_video_buffer *)tmp_buf)->resources[0], NULL, &tmp_surf); + + cpb_size = align(tmp_surf->u.legacy.level[0].nblk_x * tmp_surf->bpe, 128) * + align(tmp_surf->u.legacy.level[0].nblk_y, 32); + + cpb_size = cpb_size * 3 / 2; + cpb_size = cpb_size * enc->cpb_num; + if (enc->dual_pipe) + cpb_size += RVCE_MAX_AUX_BUFFER_NUM * + RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE * 2; + tmp_buf->destroy(tmp_buf); + if (!rvid_create_buffer(enc->screen, &enc->cpb, cpb_size, PIPE_USAGE_DEFAULT)) { + RVID_ERR("Can't create CPB buffer.\n"); + goto error; + } + + enc->cpb_array = CALLOC(enc->cpb_num, sizeof(struct rvce_cpb_slot)); + if (!enc->cpb_array) + goto error; + + reset_cpb(enc); + + goto error; + + return &enc->base; + +error: + if (enc->cs) + enc->ws->cs_destroy(enc->cs); + + rvid_destroy_buffer(&enc->cpb); + + FREE(enc->cpb_array); + FREE(enc); + return NULL; +} + +/** + * check if kernel has the right fw version loaded + */ +bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen) +{ + switch (rscreen->info.vce_fw_version) { + case FW_40_2_2: + case FW_50_0_1: + case FW_50_1_2: + case FW_50_10_2: + case FW_50_17_3: + case FW_52_0_3: + case FW_52_4_3: + case FW_52_8_3: + return true; + default: + if ((rscreen->info.vce_fw_version & (0xff << 24)) == FW_53) + return true; + else + return false; + } +} + +/** + * Add the buffer as relocation to the current command submission + */ +void rvce_add_buffer(struct rvce_encoder *enc, struct pb_buffer *buf, + enum radeon_bo_usage usage, enum radeon_bo_domain domain, + signed offset) +{ + int reloc_idx; + + reloc_idx = enc->ws->cs_add_buffer(enc->cs, buf, usage | RADEON_USAGE_SYNCHRONIZED, + domain, RADEON_PRIO_VCE); + if (enc->use_vm) { + uint64_t addr; + addr = enc->ws->buffer_get_virtual_address(buf); + addr = addr + offset; + RVCE_CS(addr >> 32); + RVCE_CS(addr); + } else { + offset += enc->ws->buffer_get_reloc_offset(buf); + RVCE_CS(reloc_idx * 4); + RVCE_CS(offset); + } +} diff --git a/lib/mesa/src/gallium/drivers/r600/radeon_vce.h b/lib/mesa/src/gallium/drivers/r600/radeon_vce.h new file mode 100644 index 000000000..f79e65c9a --- /dev/null +++ b/lib/mesa/src/gallium/drivers/r600/radeon_vce.h @@ -0,0 +1,462 @@ +/************************************************************************** + * + * Copyright 2013 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* + * Authors: + * Christian König <christian.koenig@amd.com> + * + */ + +#ifndef RADEON_VCE_H +#define RADEON_VCE_H + +#include "util/list.h" + +#define RVCE_CS(value) (enc->cs->current.buf[enc->cs->current.cdw++] = (value)) +#define RVCE_BEGIN(cmd) { \ + uint32_t *begin = &enc->cs->current.buf[enc->cs->current.cdw++]; \ + RVCE_CS(cmd) +#define RVCE_READ(buf, domain, off) rvce_add_buffer(enc, (buf), RADEON_USAGE_READ, (domain), (off)) +#define RVCE_WRITE(buf, domain, off) rvce_add_buffer(enc, (buf), RADEON_USAGE_WRITE, (domain), (off)) +#define RVCE_READWRITE(buf, domain, off) rvce_add_buffer(enc, (buf), RADEON_USAGE_READWRITE, (domain), (off)) +#define RVCE_END() *begin = (&enc->cs->current.buf[enc->cs->current.cdw] - begin) * 4; } + +#define RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE (4096 * 16 * 2.5) +#define RVCE_MAX_AUX_BUFFER_NUM 4 + +struct r600_common_screen; + +/* driver dependent callback */ +typedef void (*rvce_get_buffer)(struct pipe_resource *resource, + struct pb_buffer **handle, + struct radeon_surf **surface); + +/* Coded picture buffer slot */ +struct rvce_cpb_slot { + struct list_head list; + + unsigned index; + enum pipe_h264_enc_picture_type picture_type; + unsigned frame_num; + unsigned pic_order_cnt; +}; + +struct rvce_rate_control { + uint32_t rc_method; + uint32_t target_bitrate; + uint32_t peak_bitrate; + uint32_t frame_rate_num; + uint32_t gop_size; + uint32_t quant_i_frames; + uint32_t quant_p_frames; + uint32_t quant_b_frames; + uint32_t vbv_buffer_size; + uint32_t frame_rate_den; + uint32_t vbv_buf_lv; + uint32_t max_au_size; + uint32_t qp_initial_mode; + uint32_t target_bits_picture; + uint32_t peak_bits_picture_integer; + uint32_t peak_bits_picture_fraction; + uint32_t min_qp; + uint32_t max_qp; + uint32_t skip_frame_enable; + uint32_t fill_data_enable; + uint32_t enforce_hrd; + uint32_t b_pics_delta_qp; + uint32_t ref_b_pics_delta_qp; + uint32_t rc_reinit_disable; + uint32_t enc_lcvbr_init_qp_flag; + uint32_t lcvbrsatd_based_nonlinear_bit_budget_flag; +}; + +struct rvce_motion_estimation { + uint32_t enc_ime_decimation_search; + uint32_t motion_est_half_pixel; + uint32_t motion_est_quarter_pixel; + uint32_t disable_favor_pmv_point; + uint32_t force_zero_point_center; + uint32_t lsmvert; + uint32_t enc_search_range_x; + uint32_t enc_search_range_y; + uint32_t enc_search1_range_x; + uint32_t enc_search1_range_y; + uint32_t disable_16x16_frame1; + uint32_t disable_satd; + uint32_t enable_amd; + uint32_t enc_disable_sub_mode; + uint32_t enc_ime_skip_x; + uint32_t enc_ime_skip_y; + uint32_t enc_en_ime_overw_dis_subm; + uint32_t enc_ime_overw_dis_subm_no; + uint32_t enc_ime2_search_range_x; + uint32_t enc_ime2_search_range_y; + uint32_t parallel_mode_speedup_enable; + uint32_t fme0_enc_disable_sub_mode; + uint32_t fme1_enc_disable_sub_mode; + uint32_t ime_sw_speedup_enable; +}; + +struct rvce_pic_control { + uint32_t enc_use_constrained_intra_pred; + uint32_t enc_cabac_enable; + uint32_t enc_cabac_idc; + uint32_t enc_loop_filter_disable; + int32_t enc_lf_beta_offset; + int32_t enc_lf_alpha_c0_offset; + uint32_t enc_crop_left_offset; + uint32_t enc_crop_right_offset; + uint32_t enc_crop_top_offset; + uint32_t enc_crop_bottom_offset; + uint32_t enc_num_mbs_per_slice; + uint32_t enc_intra_refresh_num_mbs_per_slot; + uint32_t enc_force_intra_refresh; + uint32_t enc_force_imb_period; + uint32_t enc_pic_order_cnt_type; + uint32_t log2_max_pic_order_cnt_lsb_minus4; + uint32_t enc_sps_id; + uint32_t enc_pps_id; + uint32_t enc_constraint_set_flags; + uint32_t enc_b_pic_pattern; + uint32_t weight_pred_mode_b_picture; + uint32_t enc_number_of_reference_frames; + uint32_t enc_max_num_ref_frames; + uint32_t enc_num_default_active_ref_l0; + uint32_t enc_num_default_active_ref_l1; + uint32_t enc_slice_mode; + uint32_t enc_max_slice_size; +}; + +struct rvce_task_info { + uint32_t offset_of_next_task_info; + uint32_t task_operation; + uint32_t reference_picture_dependency; + uint32_t collocate_flag_dependency; + uint32_t feedback_index; + uint32_t video_bitstream_ring_index; +}; + +struct rvce_feedback_buf_pkg { + uint32_t feedback_ring_address_hi; + uint32_t feedback_ring_address_lo; + uint32_t feedback_ring_size; +}; + +struct rvce_rdo { + uint32_t enc_disable_tbe_pred_i_frame; + uint32_t enc_disable_tbe_pred_p_frame; + uint32_t use_fme_interpol_y; + uint32_t use_fme_interpol_uv; + uint32_t use_fme_intrapol_y; + uint32_t use_fme_intrapol_uv; + uint32_t use_fme_interpol_y_1; + uint32_t use_fme_interpol_uv_1; + uint32_t use_fme_intrapol_y_1; + uint32_t use_fme_intrapol_uv_1; + uint32_t enc_16x16_cost_adj; + uint32_t enc_skip_cost_adj; + uint32_t enc_force_16x16_skip; + uint32_t enc_disable_threshold_calc_a; + uint32_t enc_luma_coeff_cost; + uint32_t enc_luma_mb_coeff_cost; + uint32_t enc_chroma_coeff_cost; +}; + +struct rvce_vui { + uint32_t aspect_ratio_info_present_flag; + uint32_t aspect_ratio_idc; + uint32_t sar_width; + uint32_t sar_height; + uint32_t overscan_info_present_flag; + uint32_t overscan_Approp_flag; + uint32_t video_signal_type_present_flag; + uint32_t video_format; + uint32_t video_full_range_flag; + uint32_t color_description_present_flag; + uint32_t color_prim; + uint32_t transfer_char; + uint32_t matrix_coef; + uint32_t chroma_loc_info_present_flag; + uint32_t chroma_loc_top; + uint32_t chroma_loc_bottom; + uint32_t timing_info_present_flag; + uint32_t num_units_in_tick; + uint32_t time_scale; + uint32_t fixed_frame_rate_flag; + uint32_t nal_hrd_parameters_present_flag; + uint32_t cpb_cnt_minus1; + uint32_t bit_rate_scale; + uint32_t cpb_size_scale; + uint32_t bit_rate_value_minus; + uint32_t cpb_size_value_minus; + uint32_t cbr_flag; + uint32_t initial_cpb_removal_delay_length_minus1; + uint32_t cpb_removal_delay_length_minus1; + uint32_t dpb_output_delay_length_minus1; + uint32_t time_offset_length; + uint32_t low_delay_hrd_flag; + uint32_t pic_struct_present_flag; + uint32_t bitstream_restriction_present_flag; + uint32_t motion_vectors_over_pic_boundaries_flag; + uint32_t max_bytes_per_pic_denom; + uint32_t max_bits_per_mb_denom; + uint32_t log2_max_mv_length_hori; + uint32_t log2_max_mv_length_vert; + uint32_t num_reorder_frames; + uint32_t max_dec_frame_buffering; +}; + +struct rvce_enc_operation { + uint32_t insert_headers; + uint32_t picture_structure; + uint32_t allowed_max_bitstream_size; + uint32_t force_refresh_map; + uint32_t insert_aud; + uint32_t end_of_sequence; + uint32_t end_of_stream; + uint32_t input_picture_luma_address_hi; + uint32_t input_picture_luma_address_lo; + uint32_t input_picture_chroma_address_hi; + uint32_t input_picture_chroma_address_lo; + uint32_t enc_input_frame_y_pitch; + uint32_t enc_input_pic_luma_pitch; + uint32_t enc_input_pic_chroma_pitch;; + uint32_t enc_input_pic_addr_array; + uint32_t enc_input_pic_addr_array_disable2pipe_disablemboffload; + uint32_t enc_input_pic_tile_config; + uint32_t enc_pic_type; + uint32_t enc_idr_flag; + uint32_t enc_idr_pic_id; + uint32_t enc_mgs_key_pic; + uint32_t enc_reference_flag; + uint32_t enc_temporal_layer_index; + uint32_t num_ref_idx_active_override_flag; + uint32_t num_ref_idx_l0_active_minus1; + uint32_t num_ref_idx_l1_active_minus1; + uint32_t enc_ref_list_modification_op; + uint32_t enc_ref_list_modification_num; + uint32_t enc_decoded_picture_marking_op; + uint32_t enc_decoded_picture_marking_num; + uint32_t enc_decoded_picture_marking_idx; + uint32_t enc_decoded_ref_base_picture_marking_op; + uint32_t enc_decoded_ref_base_picture_marking_num; + uint32_t l0_picture_structure; + uint32_t l0_enc_pic_type; + uint32_t l0_frame_number; + uint32_t l0_picture_order_count; + uint32_t l0_luma_offset; + uint32_t l0_chroma_offset; + uint32_t l1_picture_structure; + uint32_t l1_enc_pic_type; + uint32_t l1_frame_number; + uint32_t l1_picture_order_count; + uint32_t l1_luma_offset; + uint32_t l1_chroma_offset; + uint32_t enc_reconstructed_luma_offset; + uint32_t enc_reconstructed_chroma_offset;; + uint32_t enc_coloc_buffer_offset; + uint32_t enc_reconstructed_ref_base_picture_luma_offset; + uint32_t enc_reconstructed_ref_base_picture_chroma_offset; + uint32_t enc_reference_ref_base_picture_luma_offset; + uint32_t enc_reference_ref_base_picture_chroma_offset; + uint32_t picture_count; + uint32_t frame_number; + uint32_t picture_order_count; + uint32_t num_i_pic_remain_in_rcgop; + uint32_t num_p_pic_remain_in_rcgop; + uint32_t num_b_pic_remain_in_rcgop; + uint32_t num_ir_pic_remain_in_rcgop; + uint32_t enable_intra_refresh; + uint32_t aq_variance_en; + uint32_t aq_block_size; + uint32_t aq_mb_variance_sel; + uint32_t aq_frame_variance_sel; + uint32_t aq_param_a; + uint32_t aq_param_b; + uint32_t aq_param_c; + uint32_t aq_param_d; + uint32_t aq_param_e; + uint32_t context_in_sfb; +}; + +struct rvce_enc_create { + uint32_t enc_use_circular_buffer; + uint32_t enc_profile; + uint32_t enc_level; + uint32_t enc_pic_struct_restriction; + uint32_t enc_image_width; + uint32_t enc_image_height; + uint32_t enc_ref_pic_luma_pitch; + uint32_t enc_ref_pic_chroma_pitch; + uint32_t enc_ref_y_height_in_qw; + uint32_t enc_ref_pic_addr_array_enc_pic_struct_restriction_disable_rdo; + uint32_t enc_pre_encode_context_buffer_offset; + uint32_t enc_pre_encode_input_luma_buffer_offset; + uint32_t enc_pre_encode_input_chroma_buffer_offset; + uint32_t enc_pre_encode_mode_chromaflag_vbaqmode_scenechangesensitivity; +}; + +struct rvce_config_ext { + uint32_t enc_enable_perf_logging; +}; + +struct rvce_h264_enc_pic { + struct rvce_rate_control rc; + struct rvce_motion_estimation me; + struct rvce_pic_control pc; + struct rvce_task_info ti; + struct rvce_feedback_buf_pkg fb; + struct rvce_rdo rdo; + struct rvce_vui vui; + struct rvce_enc_operation eo; + struct rvce_enc_create ec; + struct rvce_config_ext ce; + + unsigned quant_i_frames; + unsigned quant_p_frames; + unsigned quant_b_frames; + + enum pipe_h264_enc_picture_type picture_type; + unsigned frame_num; + unsigned frame_num_cnt; + unsigned p_remain; + unsigned i_remain; + unsigned idr_pic_id; + unsigned gop_cnt; + unsigned gop_size; + unsigned pic_order_cnt; + unsigned ref_idx_l0; + unsigned ref_idx_l1; + unsigned addrmode_arraymode_disrdo_distwoinstants; + + bool not_referenced; + bool is_idr; + bool has_ref_pic_list; + bool enable_vui; + unsigned int ref_pic_list_0[32]; + unsigned int ref_pic_list_1[32]; + unsigned int frame_idx[32]; +}; + +/* VCE encoder representation */ +struct rvce_encoder { + struct pipe_video_codec base; + + /* version specific packets */ + void (*session)(struct rvce_encoder *enc); + void (*create)(struct rvce_encoder *enc); + void (*feedback)(struct rvce_encoder *enc); + void (*rate_control)(struct rvce_encoder *enc); + void (*config_extension)(struct rvce_encoder *enc); + void (*pic_control)(struct rvce_encoder *enc); + void (*motion_estimation)(struct rvce_encoder *enc); + void (*rdo)(struct rvce_encoder *enc); + void (*vui)(struct rvce_encoder *enc); + void (*config)(struct rvce_encoder *enc); + void (*encode)(struct rvce_encoder *enc); + void (*destroy)(struct rvce_encoder *enc); + void (*task_info)(struct rvce_encoder *enc, uint32_t op, + uint32_t dep, uint32_t fb_idx, + uint32_t ring_idx); + + unsigned stream_handle; + + struct pipe_screen *screen; + struct radeon_winsys* ws; + struct radeon_winsys_cs* cs; + + rvce_get_buffer get_buffer; + + struct pb_buffer* handle; + struct radeon_surf* luma; + struct radeon_surf* chroma; + + struct pb_buffer* bs_handle; + unsigned bs_size; + + struct rvce_cpb_slot *cpb_array; + struct list_head cpb_slots; + unsigned cpb_num; + + struct rvid_buffer *fb; + struct rvid_buffer cpb; + struct pipe_h264_enc_picture_desc pic; + struct rvce_h264_enc_pic enc_pic; + + unsigned task_info_idx; + unsigned bs_idx; + + bool use_vm; + bool use_vui; + bool dual_pipe; + bool dual_inst; +}; + +/* CPB handling functions */ +struct rvce_cpb_slot *current_slot(struct rvce_encoder *enc); +struct rvce_cpb_slot *l0_slot(struct rvce_encoder *enc); +struct rvce_cpb_slot *l1_slot(struct rvce_encoder *enc); +void rvce_frame_offset(struct rvce_encoder *enc, struct rvce_cpb_slot *slot, + signed *luma_offset, signed *chroma_offset); + +struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, + const struct pipe_video_codec *templat, + struct radeon_winsys* ws, + rvce_get_buffer get_buffer); + +bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen); + +void rvce_add_buffer(struct rvce_encoder *enc, struct pb_buffer *buf, + enum radeon_bo_usage usage, enum radeon_bo_domain domain, + signed offset); + +/* init vce fw 40.2.2 specific callbacks */ +void radeon_vce_40_2_2_init(struct rvce_encoder *enc); + +/* init vce fw 50 specific callbacks */ +void radeon_vce_50_init(struct rvce_encoder *enc); + +/* init vce fw 52 specific callbacks */ +void radeon_vce_52_init(struct rvce_encoder *enc); + +/* version specific function for getting parameters */ +void (*get_pic_param)(struct rvce_encoder *enc, + struct pipe_h264_enc_picture_desc *pic); + +/* get parameters for vce 40.2.2 */ +void radeon_vce_40_2_2_get_param(struct rvce_encoder *enc, + struct pipe_h264_enc_picture_desc *pic); + +/* get parameters for vce 50 */ +void radeon_vce_50_get_param(struct rvce_encoder *enc, + struct pipe_h264_enc_picture_desc *pic); + +/* get parameters for vce 52 */ +void radeon_vce_52_get_param(struct rvce_encoder *enc, + struct pipe_h264_enc_picture_desc *pic); + +#endif diff --git a/lib/mesa/src/gallium/drivers/r600/radeon_video.c b/lib/mesa/src/gallium/drivers/r600/radeon_video.c new file mode 100644 index 000000000..c7acc3d6e --- /dev/null +++ b/lib/mesa/src/gallium/drivers/r600/radeon_video.c @@ -0,0 +1,349 @@ +/************************************************************************** + * + * Copyright 2013 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* + * Authors: + * Christian König <christian.koenig@amd.com> + * + */ + +#include <unistd.h> + +#include "util/u_memory.h" +#include "util/u_video.h" + +#include "vl/vl_defines.h" +#include "vl/vl_video_buffer.h" + +#include "r600_pipe_common.h" +#include "radeon_video.h" +#include "radeon_vce.h" + +#define UVD_FW_1_66_16 ((1 << 24) | (66 << 16) | (16 << 8)) + +/* generate an stream handle */ +unsigned rvid_alloc_stream_handle() +{ + static unsigned counter = 0; + unsigned stream_handle = 0; + unsigned pid = getpid(); + int i; + + for (i = 0; i < 32; ++i) + stream_handle |= ((pid >> i) & 1) << (31 - i); + + stream_handle ^= ++counter; + return stream_handle; +} + +/* create a buffer in the winsys */ +bool rvid_create_buffer(struct pipe_screen *screen, struct rvid_buffer *buffer, + unsigned size, unsigned usage) +{ + memset(buffer, 0, sizeof(*buffer)); + buffer->usage = usage; + + /* Hardware buffer placement restrictions require the kernel to be + * able to move buffers around individually, so request a + * non-sub-allocated buffer. + */ + buffer->res = (struct r600_resource *) + pipe_buffer_create(screen, PIPE_BIND_SHARED, + usage, size); + + return buffer->res != NULL; +} + +/* destroy a buffer */ +void rvid_destroy_buffer(struct rvid_buffer *buffer) +{ + r600_resource_reference(&buffer->res, NULL); +} + +/* reallocate a buffer, preserving its content */ +bool rvid_resize_buffer(struct pipe_screen *screen, struct radeon_winsys_cs *cs, + struct rvid_buffer *new_buf, unsigned new_size) +{ + struct r600_common_screen *rscreen = (struct r600_common_screen *)screen; + struct radeon_winsys* ws = rscreen->ws; + unsigned bytes = MIN2(new_buf->res->buf->size, new_size); + struct rvid_buffer old_buf = *new_buf; + void *src = NULL, *dst = NULL; + + if (!rvid_create_buffer(screen, new_buf, new_size, new_buf->usage)) + goto error; + + src = ws->buffer_map(old_buf.res->buf, cs, PIPE_TRANSFER_READ); + if (!src) + goto error; + + dst = ws->buffer_map(new_buf->res->buf, cs, PIPE_TRANSFER_WRITE); + if (!dst) + goto error; + + memcpy(dst, src, bytes); + if (new_size > bytes) { + new_size -= bytes; + dst += bytes; + memset(dst, 0, new_size); + } + ws->buffer_unmap(new_buf->res->buf); + ws->buffer_unmap(old_buf.res->buf); + rvid_destroy_buffer(&old_buf); + return true; + +error: + if (src) + ws->buffer_unmap(old_buf.res->buf); + rvid_destroy_buffer(new_buf); + *new_buf = old_buf; + return false; +} + +/* clear the buffer with zeros */ +void rvid_clear_buffer(struct pipe_context *context, struct rvid_buffer* buffer) +{ + struct r600_common_context *rctx = (struct r600_common_context*)context; + + rctx->dma_clear_buffer(context, &buffer->res->b.b, 0, + buffer->res->buf->size, 0); + context->flush(context, NULL, 0); +} + +/** + * join surfaces into the same buffer with identical tiling params + * sumup their sizes and replace the backend buffers with a single bo + */ +void rvid_join_surfaces(struct r600_common_context *rctx, + struct pb_buffer** buffers[VL_NUM_COMPONENTS], + struct radeon_surf *surfaces[VL_NUM_COMPONENTS]) +{ + struct radeon_winsys* ws; + unsigned best_tiling, best_wh, off; + unsigned size, alignment; + struct pb_buffer *pb; + unsigned i, j; + + ws = rctx->ws; + + for (i = 0, best_tiling = 0, best_wh = ~0; i < VL_NUM_COMPONENTS; ++i) { + unsigned wh; + + if (!surfaces[i]) + continue; + + /* choose the smallest bank w/h for now */ + wh = surfaces[i]->u.legacy.bankw * surfaces[i]->u.legacy.bankh; + if (wh < best_wh) { + best_wh = wh; + best_tiling = i; + } + } + + for (i = 0, off = 0; i < VL_NUM_COMPONENTS; ++i) { + if (!surfaces[i]) + continue; + + /* adjust the texture layer offsets */ + off = align(off, surfaces[i]->surf_alignment); + + /* copy the tiling parameters */ + surfaces[i]->u.legacy.bankw = surfaces[best_tiling]->u.legacy.bankw; + surfaces[i]->u.legacy.bankh = surfaces[best_tiling]->u.legacy.bankh; + surfaces[i]->u.legacy.mtilea = surfaces[best_tiling]->u.legacy.mtilea; + surfaces[i]->u.legacy.tile_split = surfaces[best_tiling]->u.legacy.tile_split; + + for (j = 0; j < ARRAY_SIZE(surfaces[i]->u.legacy.level); ++j) + surfaces[i]->u.legacy.level[j].offset += off; + + off += surfaces[i]->surf_size; + } + + for (i = 0, size = 0, alignment = 0; i < VL_NUM_COMPONENTS; ++i) { + if (!buffers[i] || !*buffers[i]) + continue; + + size = align(size, (*buffers[i])->alignment); + size += (*buffers[i])->size; + alignment = MAX2(alignment, (*buffers[i])->alignment * 1); + } + + if (!size) + return; + + /* TODO: 2D tiling workaround */ + alignment *= 2; + + pb = ws->buffer_create(ws, size, alignment, RADEON_DOMAIN_VRAM, + RADEON_FLAG_GTT_WC); + if (!pb) + return; + + for (i = 0; i < VL_NUM_COMPONENTS; ++i) { + if (!buffers[i] || !*buffers[i]) + continue; + + pb_reference(buffers[i], pb); + } + + pb_reference(&pb, NULL); +} + +int rvid_get_video_param(struct pipe_screen *screen, + enum pipe_video_profile profile, + enum pipe_video_entrypoint entrypoint, + enum pipe_video_cap param) +{ + struct r600_common_screen *rscreen = (struct r600_common_screen *)screen; + enum pipe_video_format codec = u_reduce_video_profile(profile); + struct radeon_info info; + + rscreen->ws->query_info(rscreen->ws, &info); + + if (entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) { + switch (param) { + case PIPE_VIDEO_CAP_SUPPORTED: + return codec == PIPE_VIDEO_FORMAT_MPEG4_AVC && + rvce_is_fw_version_supported(rscreen); + case PIPE_VIDEO_CAP_NPOT_TEXTURES: + return 1; + case PIPE_VIDEO_CAP_MAX_WIDTH: + return 2048; + case PIPE_VIDEO_CAP_MAX_HEIGHT: + return 1152; + case PIPE_VIDEO_CAP_PREFERED_FORMAT: + return PIPE_FORMAT_NV12; + case PIPE_VIDEO_CAP_PREFERS_INTERLACED: + return false; + case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: + return false; + case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE: + return true; + case PIPE_VIDEO_CAP_STACKED_FRAMES: + return 1; + default: + return 0; + } + } + + switch (param) { + case PIPE_VIDEO_CAP_SUPPORTED: + switch (codec) { + case PIPE_VIDEO_FORMAT_MPEG12: + return profile != PIPE_VIDEO_PROFILE_MPEG1; + case PIPE_VIDEO_FORMAT_MPEG4: + /* no support for MPEG4 on older hw */ + return rscreen->family >= CHIP_PALM; + case PIPE_VIDEO_FORMAT_MPEG4_AVC: + return true; + case PIPE_VIDEO_FORMAT_VC1: + return true; + case PIPE_VIDEO_FORMAT_HEVC: + return false; + case PIPE_VIDEO_FORMAT_JPEG: + return false; + default: + return false; + } + case PIPE_VIDEO_CAP_NPOT_TEXTURES: + return 1; + case PIPE_VIDEO_CAP_MAX_WIDTH: + return 2048; + case PIPE_VIDEO_CAP_MAX_HEIGHT: + return 1152; + case PIPE_VIDEO_CAP_PREFERED_FORMAT: + if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) + return PIPE_FORMAT_P016; + else + return PIPE_FORMAT_NV12; + + case PIPE_VIDEO_CAP_PREFERS_INTERLACED: + case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: + if (rscreen->family < CHIP_PALM) { + /* MPEG2 only with shaders and no support for + interlacing on R6xx style UVD */ + return codec != PIPE_VIDEO_FORMAT_MPEG12 && + rscreen->family > CHIP_RV770; + } else { + enum pipe_video_format format = u_reduce_video_profile(profile); + + if (format == PIPE_VIDEO_FORMAT_HEVC) + return false; //The firmware doesn't support interlaced HEVC. + else if (format == PIPE_VIDEO_FORMAT_JPEG) + return false; + return true; + } + case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE: + return true; + case PIPE_VIDEO_CAP_MAX_LEVEL: + switch (profile) { + case PIPE_VIDEO_PROFILE_MPEG1: + return 0; + case PIPE_VIDEO_PROFILE_MPEG2_SIMPLE: + case PIPE_VIDEO_PROFILE_MPEG2_MAIN: + return 3; + case PIPE_VIDEO_PROFILE_MPEG4_SIMPLE: + return 3; + case PIPE_VIDEO_PROFILE_MPEG4_ADVANCED_SIMPLE: + return 5; + case PIPE_VIDEO_PROFILE_VC1_SIMPLE: + return 1; + case PIPE_VIDEO_PROFILE_VC1_MAIN: + return 2; + case PIPE_VIDEO_PROFILE_VC1_ADVANCED: + return 4; + case PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE: + case PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN: + case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH: + return 41; + case PIPE_VIDEO_PROFILE_HEVC_MAIN: + case PIPE_VIDEO_PROFILE_HEVC_MAIN_10: + return 186; + default: + return 0; + } + default: + return 0; + } +} + +boolean rvid_is_format_supported(struct pipe_screen *screen, + enum pipe_format format, + enum pipe_video_profile profile, + enum pipe_video_entrypoint entrypoint) +{ + /* HEVC 10 bit decoding should use P016 instead of NV12 if possible */ + if (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) + return (format == PIPE_FORMAT_NV12) || + (format == PIPE_FORMAT_P016); + + /* we can only handle this one with UVD */ + if (profile != PIPE_VIDEO_PROFILE_UNKNOWN) + return format == PIPE_FORMAT_NV12; + + return vl_video_buffer_is_format_supported(screen, format, profile, entrypoint); +} diff --git a/lib/mesa/src/gallium/drivers/r600/radeon_video.h b/lib/mesa/src/gallium/drivers/r600/radeon_video.h new file mode 100644 index 000000000..3347c4ebc --- /dev/null +++ b/lib/mesa/src/gallium/drivers/r600/radeon_video.h @@ -0,0 +1,85 @@ +/************************************************************************** + * + * Copyright 2013 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* + * Authors: + * Christian König <christian.koenig@amd.com> + * + */ + +#ifndef RADEON_VIDEO_H +#define RADEON_VIDEO_H + +#include "radeon/radeon_winsys.h" +#include "vl/vl_video_buffer.h" + +#define RVID_ERR(fmt, args...) \ + fprintf(stderr, "EE %s:%d %s UVD - "fmt, __FILE__, __LINE__, __func__, ##args) + +/* video buffer representation */ +struct rvid_buffer +{ + unsigned usage; + struct r600_resource *res; +}; + +/* generate an stream handle */ +unsigned rvid_alloc_stream_handle(void); + +/* create a buffer in the winsys */ +bool rvid_create_buffer(struct pipe_screen *screen, struct rvid_buffer *buffer, + unsigned size, unsigned usage); + +/* destroy a buffer */ +void rvid_destroy_buffer(struct rvid_buffer *buffer); + +/* reallocate a buffer, preserving its content */ +bool rvid_resize_buffer(struct pipe_screen *screen, struct radeon_winsys_cs *cs, + struct rvid_buffer *new_buf, unsigned new_size); + +/* clear the buffer with zeros */ +void rvid_clear_buffer(struct pipe_context *context, struct rvid_buffer* buffer); + +/* join surfaces into the same buffer with identical tiling params + sumup their sizes and replace the backend buffers with a single bo */ +void rvid_join_surfaces(struct r600_common_context *rctx, + struct pb_buffer** buffers[VL_NUM_COMPONENTS], + struct radeon_surf *surfaces[VL_NUM_COMPONENTS]); + +/* returns supported codecs and other parameters */ +int rvid_get_video_param(struct pipe_screen *screen, + enum pipe_video_profile profile, + enum pipe_video_entrypoint entrypoint, + enum pipe_video_cap param); + +/* the hardware only supports NV12 */ +boolean rvid_is_format_supported(struct pipe_screen *screen, + enum pipe_format format, + enum pipe_video_profile profile, + enum pipe_video_entrypoint entrypoint); + +#endif // RADEON_VIDEO_H diff --git a/lib/mesa/src/gallium/drivers/r600/sb/sb_bc_parser.cpp b/lib/mesa/src/gallium/drivers/r600/sb/sb_bc_parser.cpp index ae92a767b..09a326ef6 100644 --- a/lib/mesa/src/gallium/drivers/r600/sb/sb_bc_parser.cpp +++ b/lib/mesa/src/gallium/drivers/r600/sb/sb_bc_parser.cpp @@ -516,7 +516,7 @@ int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) { n->src.push_back(get_cf_index_value(1)); } - if ((n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX0 || n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX1) && + if ((flags & AF_MOVA) && (n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX0 || n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX1) && ctx.is_cayman()) // Move CF_IDX value into tex instruction operands, scheduler will later re-emit setting of CF_IDX save_set_cf_index(n->src[0], n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX1); |