From 851ab880f8cbf08cc4d343259210addcb2715c09 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Wed, 22 Jan 2020 02:09:57 +0000 Subject: Import Mesa 19.2.8 --- .../src/gallium/drivers/radeonsi/gfx10_query.c | 792 +++++++++++---------- 1 file changed, 426 insertions(+), 366 deletions(-) (limited to 'lib') diff --git a/lib/mesa/src/gallium/drivers/radeonsi/gfx10_query.c b/lib/mesa/src/gallium/drivers/radeonsi/gfx10_query.c index 98ee6ba3d..56ecbd548 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/gfx10_query.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/gfx10_query.c @@ -22,442 +22,502 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include "si_pipe.h" #include "si_query.h" -#include "sid.h" #include "util/u_memory.h" #include "util/u_suballoc.h" +#include "sid.h" -#include +/** + * The query buffer is written to by ESGS NGG shaders with statistics about + * generated and (streamout-)emitted primitives. + * + * The context maintains a ring of these query buffers, and queries simply + * point into the ring, allowing an arbitrary number of queries to be active + * without additional GPU cost. + */ +struct gfx10_sh_query_buffer { + struct list_head list; + struct si_resource *buf; + unsigned refcount; + + /* Offset into the buffer in bytes; points at the first un-emitted entry. */ + unsigned head; +}; + +/* Memory layout of the query buffer. Must be kept in sync with shaders + * (including QBO shaders) and should be aligned to cachelines. + * + * The somewhat awkward memory layout is for compatibility with the + * SET_PREDICATION packet, which also means that we're setting the high bit + * of all those values unconditionally. + */ +struct gfx10_sh_query_buffer_mem { + struct { + uint64_t generated_primitives_start_dummy; + uint64_t emitted_primitives_start_dummy; + uint64_t generated_primitives; + uint64_t emitted_primitives; + } stream[4]; + uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */ + uint32_t pad[31]; +}; + +/* Shader-based queries. */ +struct gfx10_sh_query { + struct si_query b; + + struct gfx10_sh_query_buffer *first; + struct gfx10_sh_query_buffer *last; + unsigned first_begin; + unsigned last_end; + + unsigned stream; +}; static void emit_shader_query(struct si_context *sctx) { - assert(!list_is_empty(&sctx->shader_query_buffers)); + assert(!LIST_IS_EMPTY(&sctx->shader_query_buffers)); - struct gfx10_sh_query_buffer *qbuf = - list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list); - qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem); + struct gfx10_sh_query_buffer *qbuf = list_last_entry(&sctx->shader_query_buffers, + struct gfx10_sh_query_buffer, list); + qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem); } static void gfx10_release_query_buffers(struct si_context *sctx, - struct gfx10_sh_query_buffer *first, - struct gfx10_sh_query_buffer *last) + struct gfx10_sh_query_buffer *first, + struct gfx10_sh_query_buffer *last) { - while (first) { - struct gfx10_sh_query_buffer *qbuf = first; - if (first != last) - first = list_entry(qbuf->list.next, struct gfx10_sh_query_buffer, list); - else - first = NULL; - - qbuf->refcount--; - if (qbuf->refcount) - continue; - - if (qbuf->list.next == &sctx->shader_query_buffers) - continue; /* keep the most recent buffer; it may not be full yet */ - if (qbuf->list.prev == &sctx->shader_query_buffers) - continue; /* keep the oldest buffer for recycling */ - - list_del(&qbuf->list); - si_resource_reference(&qbuf->buf, NULL); - FREE(qbuf); - } + while (first) { + struct gfx10_sh_query_buffer *qbuf = first; + if (first != last) + first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list); + else + first = NULL; + + qbuf->refcount--; + if (qbuf->refcount) + continue; + + if (qbuf->list.next == &sctx->shader_query_buffers) + continue; /* keep the most recent buffer; it may not be full yet */ + if (qbuf->list.prev == &sctx->shader_query_buffers) + continue; /* keep the oldest buffer for recycling */ + + LIST_DEL(&qbuf->list); + si_resource_reference(&qbuf->buf, NULL); + FREE(qbuf); + } } static bool gfx10_alloc_query_buffer(struct si_context *sctx) { - if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query)) - return true; - - struct gfx10_sh_query_buffer *qbuf = NULL; - - if (!list_is_empty(&sctx->shader_query_buffers)) { - qbuf = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list); - if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0) - goto success; - - qbuf = list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list); - if (!qbuf->refcount && - !si_cs_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) && - sctx->ws->buffer_wait(sctx->ws, qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) { - /* Can immediately re-use the oldest buffer */ - list_del(&qbuf->list); - } else { - qbuf = NULL; - } - } - - if (!qbuf) { - qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer); - if (unlikely(!qbuf)) - return false; - - struct si_screen *screen = sctx->screen; - unsigned buf_size = - MAX2(sizeof(struct gfx10_sh_query_buffer_mem), screen->info.min_alloc_size); - qbuf->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size)); - if (unlikely(!qbuf->buf)) { - FREE(qbuf); - return false; - } - } - - /* The buffer is currently unused by the GPU. Initialize it. - * - * We need to set the high bit of all the primitive counters for - * compatibility with the SET_PREDICATION packet. - */ - uint64_t *results = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL, - PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED); - assert(results); - - for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem); i < e; - ++i) { - for (unsigned j = 0; j < 16; ++j) - results[32 * i + j] = (uint64_t)1 << 63; - results[32 * i + 16] = 0; - } - - list_addtail(&qbuf->list, &sctx->shader_query_buffers); - qbuf->head = 0; - qbuf->refcount = sctx->num_active_shader_queries; + if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query)) + return true; + + struct gfx10_sh_query_buffer *qbuf = NULL; + + if (!LIST_IS_EMPTY(&sctx->shader_query_buffers)) { + qbuf = list_last_entry(&sctx->shader_query_buffers, + struct gfx10_sh_query_buffer, list); + if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0) + goto success; + + qbuf = list_first_entry(&sctx->shader_query_buffers, + struct gfx10_sh_query_buffer, list); + if (!qbuf->refcount && + !si_rings_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) && + sctx->ws->buffer_wait(qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) { + /* Can immediately re-use the oldest buffer */ + LIST_DEL(&qbuf->list); + } else { + qbuf = NULL; + } + } + + if (!qbuf) { + qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer); + if (unlikely(!qbuf)) + return false; + + struct si_screen *screen = sctx->screen; + unsigned buf_size = MAX2(sizeof(struct gfx10_sh_query_buffer_mem), + screen->info.min_alloc_size); + qbuf->buf = si_resource( + pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size)); + if (unlikely(!qbuf->buf)) { + FREE(qbuf); + return false; + } + } + + /* The buffer is currently unused by the GPU. Initialize it. + * + * We need to set the high bit of all the primitive counters for + * compatibility with the SET_PREDICATION packet. + */ + uint64_t *results = sctx->ws->buffer_map(qbuf->buf->buf, NULL, + PIPE_TRANSFER_WRITE | + PIPE_TRANSFER_UNSYNCHRONIZED); + assert(results); + + for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem); + i < e; ++i) { + for (unsigned j = 0; j < 16; ++j) + results[32 * i + j] = (uint64_t)1 << 63; + results[32 * i + 16] = 0; + } + + LIST_ADDTAIL(&qbuf->list, &sctx->shader_query_buffers); + qbuf->head = 0; + qbuf->refcount = sctx->num_active_shader_queries; success:; - struct pipe_shader_buffer sbuf; - sbuf.buffer = &qbuf->buf->b.b; - sbuf.buffer_offset = qbuf->head; - sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem); - si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, &sbuf); - SET_FIELD(sctx->current_gs_state, GS_STATE_STREAMOUT_QUERY_ENABLED, 1); - - si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query); - return true; + struct pipe_shader_buffer sbuf; + sbuf.buffer = &qbuf->buf->b.b; + sbuf.buffer_offset = qbuf->head; + sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem); + si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, &sbuf); + sctx->current_vs_state |= S_VS_STATE_STREAMOUT_QUERY_ENABLED(1); + + si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query); + return true; } static void gfx10_sh_query_destroy(struct si_context *sctx, struct si_query *rquery) { - struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; - gfx10_release_query_buffers(sctx, query->first, query->last); - FREE(query); + struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; + gfx10_release_query_buffers(sctx, query->first, query->last); + FREE(query); } static bool gfx10_sh_query_begin(struct si_context *sctx, struct si_query *rquery) { - struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; + struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; - gfx10_release_query_buffers(sctx, query->first, query->last); - query->first = query->last = NULL; + gfx10_release_query_buffers(sctx, query->first, query->last); + query->first = query->last = NULL; - if (unlikely(!gfx10_alloc_query_buffer(sctx))) - return false; + if (unlikely(!gfx10_alloc_query_buffer(sctx))) + return false; - query->first = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list); - query->first_begin = query->first->head; + query->first = list_last_entry(&sctx->shader_query_buffers, + struct gfx10_sh_query_buffer, list); + query->first_begin = query->first->head; - sctx->num_active_shader_queries++; - query->first->refcount++; + sctx->num_active_shader_queries++; + query->first->refcount++; - return true; + return true; } static bool gfx10_sh_query_end(struct si_context *sctx, struct si_query *rquery) { - struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; - - if (unlikely(!query->first)) - return false; /* earlier out of memory error */ - - query->last = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list); - query->last_end = query->last->head; - - /* Signal the fence of the previous chunk */ - if (query->last_end != 0) { - uint64_t fence_va = query->last->buf->gpu_address; - fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem); - fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence); - si_cp_release_mem(sctx, &sctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, - EOP_INT_SEL_NONE, EOP_DATA_SEL_VALUE_32BIT, query->last->buf, fence_va, - 0xffffffff, PIPE_QUERY_GPU_FINISHED); - } - - sctx->num_active_shader_queries--; - - if (sctx->num_active_shader_queries <= 0 || !si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query)) { - si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, NULL); - SET_FIELD(sctx->current_gs_state, GS_STATE_STREAMOUT_QUERY_ENABLED, 0); - - /* If a query_begin is followed by a query_end without a draw - * in-between, we need to clear the atom to ensure that the - * next query_begin will re-initialize the shader buffer. */ - si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false); - } - - return true; + struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; + + if (unlikely(!query->first)) + return false; /* earlier out of memory error */ + + query->last = list_last_entry(&sctx->shader_query_buffers, + struct gfx10_sh_query_buffer, list); + query->last_end = query->last->head; + + /* Signal the fence of the previous chunk */ + if (query->last_end != 0) { + uint64_t fence_va = query->last->buf->gpu_address; + fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem); + fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence); + si_cp_release_mem(sctx, sctx->gfx_cs, + V_028A90_BOTTOM_OF_PIPE_TS, 0, + EOP_DST_SEL_MEM, EOP_INT_SEL_NONE, + EOP_DATA_SEL_VALUE_32BIT, + query->last->buf, fence_va, 0xffffffff, + PIPE_QUERY_GPU_FINISHED); + } + + sctx->num_active_shader_queries--; + + if (sctx->num_active_shader_queries > 0) { + gfx10_alloc_query_buffer(sctx); + } else { + si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, NULL); + sctx->current_vs_state &= C_VS_STATE_STREAMOUT_QUERY_ENABLED; + + /* If a query_begin is followed by a query_end without a draw + * in-between, we need to clear the atom to ensure that the + * next query_begin will re-initialize the shader buffer. */ + si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false); + } + + return true; } static void gfx10_sh_query_add_result(struct gfx10_sh_query *query, - struct gfx10_sh_query_buffer_mem *qmem, - union pipe_query_result *result) + struct gfx10_sh_query_buffer_mem *qmem, + union pipe_query_result *result) { - static const uint64_t mask = ((uint64_t)1 << 63) - 1; - - switch (query->b.type) { - case PIPE_QUERY_PRIMITIVES_EMITTED: - result->u64 += qmem->stream[query->stream].emitted_primitives & mask; - break; - case PIPE_QUERY_PRIMITIVES_GENERATED: - result->u64 += qmem->stream[query->stream].generated_primitives & mask; - break; - case PIPE_QUERY_SO_STATISTICS: - result->so_statistics.num_primitives_written += - qmem->stream[query->stream].emitted_primitives & mask; - result->so_statistics.primitives_storage_needed += - qmem->stream[query->stream].generated_primitives & mask; - break; - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - result->b |= qmem->stream[query->stream].emitted_primitives != - qmem->stream[query->stream].generated_primitives; - break; - case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: - for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) { - result->b |= qmem->stream[stream].emitted_primitives != - qmem->stream[stream].generated_primitives; - } - break; - default: - assert(0); - } + static const uint64_t mask = ((uint64_t)1 << 63) - 1; + + switch (query->b.type) { + case PIPE_QUERY_PRIMITIVES_EMITTED: + result->u64 += qmem->stream[query->stream].emitted_primitives & mask; + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: + result->u64 += qmem->stream[query->stream].generated_primitives & mask; + break; + case PIPE_QUERY_SO_STATISTICS: + result->so_statistics.num_primitives_written += + qmem->stream[query->stream].emitted_primitives & mask; + result->so_statistics.primitives_storage_needed += + qmem->stream[query->stream].generated_primitives & mask; + break; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + result->b |= qmem->stream[query->stream].emitted_primitives != + qmem->stream[query->stream].generated_primitives; + break; + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) { + result->b |= qmem->stream[query->stream].emitted_primitives != + qmem->stream[query->stream].generated_primitives; + } + break; + default: + assert(0); + } } -static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery, bool wait, - union pipe_query_result *result) +static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery, + bool wait, union pipe_query_result *result) { - struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; + struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; - util_query_clear_result(result, query->b.type); + util_query_clear_result(result, query->b.type); - if (unlikely(!query->first)) - return false; /* earlier out of memory error */ - assert(query->last); + if (unlikely(!query->first)) + return false; /* earlier out of memory error */ + assert(query->last); - for (struct gfx10_sh_query_buffer *qbuf = query->last;; - qbuf = list_entry(qbuf->list.prev, struct gfx10_sh_query_buffer, list)) { - unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK); - void *map; + for (struct gfx10_sh_query_buffer *qbuf = query->last;; + qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.prev, list)) { + unsigned usage = PIPE_TRANSFER_READ | + (wait ? 0 : PIPE_TRANSFER_DONTBLOCK); + void *map; - if (rquery->b.flushed) - map = sctx->ws->buffer_map(sctx->ws, qbuf->buf->buf, NULL, usage); - else - map = si_buffer_map(sctx, qbuf->buf, usage); + if (rquery->b.flushed) + map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage); + else + map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage); - if (!map) - return false; + if (!map) + return false; - unsigned results_begin = 0; - unsigned results_end = qbuf->head; - if (qbuf == query->first) - results_begin = query->first_begin; - if (qbuf == query->last) - results_end = query->last_end; + unsigned results_begin = 0; + unsigned results_end = qbuf->head; + if (qbuf == query->first) + results_begin = query->first_begin; + if (qbuf == query->last) + results_end = query->last_end; - while (results_begin != results_end) { - struct gfx10_sh_query_buffer_mem *qmem = map + results_begin; - results_begin += sizeof(*qmem); + while (results_begin != results_end) { + struct gfx10_sh_query_buffer_mem *qmem = map + results_begin; + results_begin += sizeof(*qmem); - gfx10_sh_query_add_result(query, qmem, result); - } + gfx10_sh_query_add_result(query, qmem, result); + } - if (qbuf == query->first) - break; - } + if (qbuf == query->first) + break; + } - return true; + return true; } -static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct si_query *rquery, - enum pipe_query_flags flags, - enum pipe_query_value_type result_type, - int index, struct pipe_resource *resource, - unsigned offset) +static void gfx10_sh_query_get_result_resource(struct si_context *sctx, + struct si_query *rquery, + bool wait, + enum pipe_query_value_type result_type, + int index, + struct pipe_resource *resource, + unsigned offset) { - struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; - struct si_qbo_state saved_state = {}; - struct pipe_resource *tmp_buffer = NULL; - unsigned tmp_buffer_offset = 0; - - if (!sctx->sh_query_result_shader) { - sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx); - if (!sctx->sh_query_result_shader) - return; - } - - if (query->first != query->last) { - u_suballocator_alloc(&sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer); - if (!tmp_buffer) - return; - } - - si_save_qbo_state(sctx, &saved_state); - - /* Pre-fill the constants configuring the shader behavior. */ - struct { - uint32_t config; - uint32_t offset; - uint32_t chain; - uint32_t result_count; - } consts; - struct pipe_constant_buffer constant_buffer = {}; - - if (index >= 0) { - switch (query->b.type) { - case PIPE_QUERY_PRIMITIVES_GENERATED: - consts.offset = 4 * sizeof(uint64_t) * query->stream + 2 * sizeof(uint64_t); - consts.config = 0; - break; - case PIPE_QUERY_PRIMITIVES_EMITTED: - consts.offset = 4 * sizeof(uint64_t) * query->stream + 3 * sizeof(uint64_t); - consts.config = 0; - break; - case PIPE_QUERY_SO_STATISTICS: - consts.offset = sizeof(uint32_t) * (4 * index + query->stream); - consts.config = 0; - break; - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - consts.offset = 4 * sizeof(uint64_t) * query->stream; - consts.config = 2; - break; - case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: - consts.offset = 0; - consts.config = 3; - break; - default: - unreachable("bad query type"); - } - } else { - /* Check result availability. */ - consts.offset = 0; - consts.config = 1; - } - - if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64) - consts.config |= 8; - - constant_buffer.buffer_size = sizeof(consts); - constant_buffer.user_buffer = &consts; - - /* Pre-fill the SSBOs and grid. */ - struct pipe_shader_buffer ssbo[3]; - struct pipe_grid_info grid = {}; - - ssbo[1].buffer = tmp_buffer; - ssbo[1].buffer_offset = tmp_buffer_offset; - ssbo[1].buffer_size = 16; - - ssbo[2] = ssbo[1]; - - grid.block[0] = 1; - grid.block[1] = 1; - grid.block[2] = 1; - grid.grid[0] = 1; - grid.grid[1] = 1; - grid.grid[2] = 1; - - struct gfx10_sh_query_buffer *qbuf = query->first; - for (;;) { - unsigned begin = qbuf == query->first ? query->first_begin : 0; - unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0; - if (!end) - continue; - - ssbo[0].buffer = &qbuf->buf->b.b; - ssbo[0].buffer_offset = begin; - ssbo[0].buffer_size = end - begin; - - consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem); - consts.chain = 0; - if (qbuf != query->first) - consts.chain |= 1; - if (qbuf != query->last) - consts.chain |= 2; - - if (qbuf == query->last) { - ssbo[2].buffer = resource; - ssbo[2].buffer_offset = offset; - ssbo[2].buffer_size = 8; - } - - sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, false, &constant_buffer); - - if (flags & PIPE_QUERY_WAIT) { - uint64_t va; - - /* Wait for result availability. Wait only for readiness - * of the last entry, since the fence writes should be - * serialized in the CP. - */ - va = qbuf->buf->gpu_address; - va += end - sizeof(struct gfx10_sh_query_buffer_mem); - va += offsetof(struct gfx10_sh_query_buffer_mem, fence); - - si_cp_wait_mem(sctx, &sctx->gfx_cs, va, 0x00000001, 0x00000001, 0); - } - - /* ssbo[2] is either tmp_buffer or resource */ - assert(ssbo[2].buffer); - si_launch_grid_internal_ssbos(sctx, &grid, sctx->sh_query_result_shader, - SI_OP_SYNC_PS_BEFORE | SI_OP_SYNC_AFTER, SI_COHERENCY_SHADER, - 3, ssbo, (1 << 2) | (ssbo[1].buffer ? 1 << 1 : 0)); - - if (qbuf == query->last) - break; - qbuf = list_entry(qbuf->list.next, struct gfx10_sh_query_buffer, list); - } - - si_restore_qbo_state(sctx, &saved_state); - pipe_resource_reference(&tmp_buffer, NULL); + struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery; + struct si_qbo_state saved_state = {}; + struct pipe_resource *tmp_buffer = NULL; + unsigned tmp_buffer_offset = 0; + + if (!sctx->sh_query_result_shader) { + sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx); + if (!sctx->sh_query_result_shader) + return; + } + + if (query->first != query->last) { + u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16, + &tmp_buffer_offset, &tmp_buffer); + if (!tmp_buffer) + return; + } + + si_save_qbo_state(sctx, &saved_state); + + /* Pre-fill the constants configuring the shader behavior. */ + struct { + uint32_t config; + uint32_t offset; + uint32_t chain; + uint32_t result_count; + } consts; + struct pipe_constant_buffer constant_buffer = {}; + + if (index >= 0) { + switch (query->b.type) { + case PIPE_QUERY_PRIMITIVES_GENERATED: + consts.offset = sizeof(uint32_t) * query->stream; + consts.config = 0; + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + consts.offset = sizeof(uint32_t) * (4 + query->stream); + consts.config = 0; + break; + case PIPE_QUERY_SO_STATISTICS: + consts.offset = sizeof(uint32_t) * (4 * index + query->stream); + consts.config = 0; + break; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + consts.offset = sizeof(uint32_t) * query->stream; + consts.config = 2; + break; + case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: + consts.offset = 0; + consts.config = 3; + break; + default: unreachable("bad query type"); + } + } else { + /* Check result availability. */ + consts.offset = 0; + consts.config = 1; + } + + if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64) + consts.config |= 8; + + constant_buffer.buffer_size = sizeof(consts); + constant_buffer.user_buffer = &consts; + + /* Pre-fill the SSBOs and grid. */ + struct pipe_shader_buffer ssbo[3]; + struct pipe_grid_info grid = {}; + + ssbo[1].buffer = tmp_buffer; + ssbo[1].buffer_offset = tmp_buffer_offset; + ssbo[1].buffer_size = 16; + + ssbo[2] = ssbo[1]; + + sctx->b.bind_compute_state(&sctx->b, sctx->sh_query_result_shader); + + grid.block[0] = 1; + grid.block[1] = 1; + grid.block[2] = 1; + grid.grid[0] = 1; + grid.grid[1] = 1; + grid.grid[2] = 1; + + struct gfx10_sh_query_buffer *qbuf = query->first; + for (;;) { + unsigned begin = qbuf == query->first ? query->first_begin : 0; + unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0; + if (!end) + continue; + + ssbo[0].buffer = &qbuf->buf->b.b; + ssbo[0].buffer_offset = begin; + ssbo[0].buffer_size = end - begin; + + consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem); + consts.chain = 0; + if (qbuf != query->first) + consts.chain |= 1; + if (qbuf != query->last) + consts.chain |= 2; + + if (qbuf == query->last) { + ssbo[2].buffer = resource; + ssbo[2].buffer_offset = offset; + ssbo[2].buffer_size = 8; + } + + sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer); + sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 0x6); + + if (wait) { + uint64_t va; + + /* Wait for result availability. Wait only for readiness + * of the last entry, since the fence writes should be + * serialized in the CP. + */ + va = qbuf->buf->gpu_address; + va += end - sizeof(struct gfx10_sh_query_buffer_mem); + va += offsetof(struct gfx10_sh_query_buffer_mem, fence); + + si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x00000001, 0x00000001, 0); + } + + sctx->b.launch_grid(&sctx->b, &grid); + sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; + + if (qbuf == query->last) + break; + qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list); + } + + si_restore_qbo_state(sctx, &saved_state); + pipe_resource_reference(&tmp_buffer, NULL); } static const struct si_query_ops gfx10_sh_query_ops = { - .destroy = gfx10_sh_query_destroy, - .begin = gfx10_sh_query_begin, - .end = gfx10_sh_query_end, - .get_result = gfx10_sh_query_get_result, - .get_result_resource = gfx10_sh_query_get_result_resource, + .destroy = gfx10_sh_query_destroy, + .begin = gfx10_sh_query_begin, + .end = gfx10_sh_query_end, + .get_result = gfx10_sh_query_get_result, + .get_result_resource = gfx10_sh_query_get_result_resource, }; -struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type, - unsigned index) +struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, + enum pipe_query_type query_type, + unsigned index) { - struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query); - if (unlikely(!query)) - return NULL; + struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query); + if (unlikely(!query)) + return NULL; - query->b.ops = &gfx10_sh_query_ops; - query->b.type = query_type; - query->stream = index; + query->b.ops = &gfx10_sh_query_ops; + query->b.type = query_type; + query->stream = index; - return (struct pipe_query *)query; + return (struct pipe_query *)query; } void gfx10_init_query(struct si_context *sctx) { - list_inithead(&sctx->shader_query_buffers); - sctx->atoms.s.shader_query.emit = emit_shader_query; + LIST_INITHEAD(&sctx->shader_query_buffers); + sctx->atoms.s.shader_query.emit = emit_shader_query; } void gfx10_destroy_query(struct si_context *sctx) { - if (!sctx->shader_query_buffers.next) - return; - - while (!list_is_empty(&sctx->shader_query_buffers)) { - struct gfx10_sh_query_buffer *qbuf = - list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list); - list_del(&qbuf->list); - - assert(!qbuf->refcount); - si_resource_reference(&qbuf->buf, NULL); - FREE(qbuf); - } + while (!LIST_IS_EMPTY(&sctx->shader_query_buffers)) { + struct gfx10_sh_query_buffer *qbuf = + list_first_entry(&sctx->shader_query_buffers, + struct gfx10_sh_query_buffer, list); + LIST_DEL(&qbuf->list); + + assert(!qbuf->refcount); + si_resource_reference(&qbuf->buf, NULL); + FREE(qbuf); + } } -- cgit v1.2.3