| author | Jonathan Gray <jsg@cvs.openbsd.org> | 2023-01-28 08:15:04 +0000 |
|---|---|---|
| committer | Jonathan Gray <jsg@cvs.openbsd.org> | 2023-01-28 08:15:04 +0000 |
| commit | 1c5c7896c1d54abd25c0f33ca996165b359eecb3 (patch) | |
| tree | bc5a10bbe89ecd4008a5bed7a029babc992f66b5 /lib/mesa/src/gallium/drivers/radeonsi/si_query.c | |
| parent | c55dff45ea3a2c70e45601240b248017acc48198 (diff) | |
Import Mesa 22.3.4
Diffstat (limited to 'lib/mesa/src/gallium/drivers/radeonsi/si_query.c')
-rw-r--r-- | lib/mesa/src/gallium/drivers/radeonsi/si_query.c | 189 |
1 file changed, 143 insertions, 46 deletions
```diff
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_query.c b/lib/mesa/src/gallium/drivers/radeonsi/si_query.c
index 8a666710b..ba82464bc 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_query.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_query.c
@@ -465,7 +465,7 @@ static bool si_query_sw_get_result(struct si_context *sctx, struct si_query *squ
       result->u32 = 0;
       return true;
    case SI_QUERY_GPIN_NUM_SIMD:
-      result->u32 = sctx->screen->info.num_good_compute_units;
+      result->u32 = sctx->screen->info.num_cu;
       return true;
    case SI_QUERY_GPIN_NUM_RB:
       result->u32 = sctx->screen->info.max_render_backends;
@@ -637,6 +637,40 @@ static bool si_query_hw_prepare_buffer(struct si_context *sctx, struct si_query_
    return true;
 }
 
+static unsigned si_query_pipestats_num_results(struct si_screen *sscreen)
+{
+   return sscreen->info.gfx_level >= GFX11 ? 14 : 11;
+}
+
+static unsigned si_query_pipestat_dw_offset(enum pipe_statistics_query_index index)
+{
+   switch (index) {
+   case PIPE_STAT_QUERY_PS_INVOCATIONS: return 0;
+   case PIPE_STAT_QUERY_C_PRIMITIVES: return 2;
+   case PIPE_STAT_QUERY_C_INVOCATIONS: return 4;
+   case PIPE_STAT_QUERY_VS_INVOCATIONS: return 6;
+   case PIPE_STAT_QUERY_GS_INVOCATIONS: return 8;
+   case PIPE_STAT_QUERY_GS_PRIMITIVES: return 10;
+   case PIPE_STAT_QUERY_IA_PRIMITIVES: return 12;
+   case PIPE_STAT_QUERY_IA_VERTICES: return 14;
+   case PIPE_STAT_QUERY_HS_INVOCATIONS: return 16;
+   case PIPE_STAT_QUERY_DS_INVOCATIONS: return 18;
+   case PIPE_STAT_QUERY_CS_INVOCATIONS: return 20;
+   /* gfx11: MS_INVOCATIONS */
+   /* gfx11: MS_PRIMITIVES */
+   /* gfx11: TS_INVOCATIONS */
+   default:
+      assert(false);
+   }
+   return ~0;
+}
+
+unsigned si_query_pipestat_end_dw_offset(struct si_screen *sscreen,
+                                         enum pipe_statistics_query_index index)
+{
+   return si_query_pipestats_num_results(sscreen) * 2 + si_query_pipestat_dw_offset(index);
+}
+
 static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_query *squery,
                                             enum pipe_query_flags flags,
                                             enum pipe_query_value_type result_type,
@@ -702,10 +736,13 @@ static struct pipe_query *si_query_hw_create(struct si_screen *sscreen, unsigned
       query->b.num_cs_dw_suspend = 6 * SI_MAX_STREAMS;
       break;
    case PIPE_QUERY_PIPELINE_STATISTICS:
-      /* 11 values on GCN. */
-      query->result_size = 11 * 16;
+      query->result_size = si_query_pipestats_num_results(sscreen) * 16;
       query->result_size += 8; /* for the fence + alignment */
       query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
+      query->index = index;
+      if ((index == PIPE_STAT_QUERY_GS_PRIMITIVES || index == PIPE_STAT_QUERY_GS_INVOCATIONS) &&
+          sscreen->use_ngg && (sscreen->info.gfx_level >= GFX10 && sscreen->info.gfx_level <= GFX10_3))
+         query->flags |= SI_QUERY_EMULATE_GS_COUNTERS;
       break;
    default:
       assert(0);
@@ -776,8 +813,22 @@ static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_h
    case PIPE_QUERY_OCCLUSION_PREDICATE:
    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
       radeon_begin(cs);
+      if (sctx->gfx_level >= GFX11) {
+         uint64_t rb_mask = BITFIELD64_MASK(sctx->screen->info.max_render_backends);
+
+         radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
+         radeon_emit(EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_CONTROL) | EVENT_INDEX(1));
+         radeon_emit(PIXEL_PIPE_STATE_CNTL_COUNTER_ID(0) |
+                     PIXEL_PIPE_STATE_CNTL_STRIDE(2) |
+                     PIXEL_PIPE_STATE_CNTL_INSTANCE_EN_LO(rb_mask));
+         radeon_emit(PIXEL_PIPE_STATE_CNTL_INSTANCE_EN_HI(rb_mask));
+      }
+
       radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
-      radeon_emit(EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
+      if (sctx->gfx_level >= GFX11)
+         radeon_emit(EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
+      else
+         radeon_emit(EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
       radeon_emit(va);
       radeon_emit(va >> 32);
       radeon_end();
@@ -798,12 +849,44 @@ static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_h
                         EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type);
       break;
    case PIPE_QUERY_PIPELINE_STATISTICS: {
-      radeon_begin(cs);
-      radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
-      radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
-      radeon_emit(va);
-      radeon_emit(va >> 32);
-      radeon_end();
+      if (sctx->screen->use_ngg && query->flags & SI_QUERY_EMULATE_GS_COUNTERS) {
+         /* The hw GS primitive counter doesn't work when ngg is active.
+          * So if use_ngg is true, we don't use the hw version but instead
+          * emulate it in the GS shader.
+          * The value is written at the same position, so we don't need to
+          * change anything else.
+          * If ngg is enabled for the draw, the primitive count is written in
+          * gfx10_ngg_gs_emit_epilogue. If ngg is disabled, the number of exported
+          * vertices is stored in gs_emitted_vertices and the number of prim
+          * is computed based on the output prim type in emit_gs_epilogue.
+          */
+         struct pipe_shader_buffer sbuf;
+         sbuf.buffer = &buffer->b.b;
+         sbuf.buffer_offset = query->buffer.results_end;
+         sbuf.buffer_size = buffer->bo_size;
+         si_set_internal_shader_buffer(sctx, SI_GS_QUERY_EMULATED_COUNTERS_BUF, &sbuf);
+         SET_FIELD(sctx->current_gs_state, GS_STATE_PIPELINE_STATS_EMU, 1);
+
+         const uint32_t zero = 0;
+         radeon_begin(cs);
+         /* Clear the emulated counter end value. We don't clear start because it's unused. */
+         va += si_query_pipestat_end_dw_offset(sctx->screen, query->index) * 4;
+         radeon_emit(PKT3(PKT3_WRITE_DATA, 2 + 1, 0));
+         radeon_emit(S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
+         radeon_emit(va);
+         radeon_emit(va >> 32);
+         radeon_emit(zero);
+         radeon_end();
+
+         sctx->num_pipeline_stat_emulated_queries++;
+      } else {
+         radeon_begin(cs);
+         radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
+         radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
+         radeon_emit(va);
+         radeon_emit(va >> 32);
+         radeon_end();
+      }
       break;
    }
    default:
@@ -817,9 +900,17 @@ static void si_query_hw_emit_start(struct si_context *sctx, struct si_query_hw *
 {
    uint64_t va;
 
-   if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer, query->result_size))
+   if (!query->buffer.buf && query->flags & SI_QUERY_EMULATE_GS_COUNTERS)
+      si_resource_reference(&query->buffer.buf, sctx->pipeline_stats_query_buf);
+
+   /* Don't realloc pipeline_stats_query_buf */
+   if ((!(query->flags & SI_QUERY_EMULATE_GS_COUNTERS) || !sctx->pipeline_stats_query_buf) &&
+       !si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer, query->result_size))
       return;
 
+   if (query->flags & SI_QUERY_EMULATE_GS_COUNTERS)
+      si_resource_reference(&sctx->pipeline_stats_query_buf, query->buffer.buf);
+
    si_update_occlusion_query_state(sctx, query->b.type, 1);
    si_update_prims_generated_query_state(sctx, query->b.type, 1);
 
@@ -845,7 +936,10 @@ static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw
       va += 8;
       radeon_begin(cs);
       radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
-      radeon_emit(EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
+      if (sctx->gfx_level >= GFX11)
+         radeon_emit(EVENT_TYPE(V_028A90_PIXEL_PIPE_STAT_DUMP) | EVENT_INDEX(1));
+      else
+         radeon_emit(EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
       radeon_emit(va);
       radeon_emit(va >> 32);
       radeon_end();
@@ -877,11 +971,22 @@ static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw
       unsigned sample_size = (query->result_size - 8) / 2;
 
       va += sample_size;
+
       radeon_begin(cs);
-      radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
-      radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
-      radeon_emit(va);
-      radeon_emit(va >> 32);
+      if (sctx->screen->use_ngg && query->flags & SI_QUERY_EMULATE_GS_COUNTERS) {
+         radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
+         radeon_emit(EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+         if (--sctx->num_pipeline_stat_emulated_queries == 0) {
+            si_set_internal_shader_buffer(sctx, SI_GS_QUERY_BUF, NULL);
+            SET_FIELD(sctx->current_gs_state, GS_STATE_PIPELINE_STATS_EMU, 0);
+         }
+      } else {
+         radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
+         radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
+         radeon_emit(va);
+         radeon_emit(va >> 32);
+      }
       radeon_end();
 
       fence_va = va + sample_size;
@@ -936,7 +1041,7 @@ static void emit_set_predicate(struct si_context *ctx, struct si_resource *buf,
    radeon_begin(cs);
 
-   if (ctx->chip_class >= GFX9) {
+   if (ctx->gfx_level >= GFX9) {
       radeon_emit(PKT3(PKT3_SET_PREDICATION, 2, 0));
       radeon_emit(op);
       radeon_emit(va);
@@ -985,7 +1090,7 @@ static void si_emit_query_predication(struct si_context *ctx)
       while (first) {
          qbuf = first;
         if (first != last)
-            first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
+            first = list_entry(qbuf->list.next, struct gfx10_sh_query_buffer, list);
         else
            first = NULL;
 
@@ -1227,10 +1332,9 @@ static void si_get_hw_query_params(struct si_context *sctx, struct si_query_hw *
       params->fence_offset = squery->result_size - 4;
       break;
    case PIPE_QUERY_PIPELINE_STATISTICS: {
-      static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80};
-      params->start_offset = offsets[index];
-      params->end_offset = 88 + offsets[index];
-      params->fence_offset = 2 * 88;
+      params->start_offset = si_query_pipestat_dw_offset(index) * 4;
+      params->end_offset = si_query_pipestat_end_dw_offset(sctx->screen, index) * 4;
+      params->fence_offset = si_query_pipestats_num_results(sctx->screen) * 16;
       break;
    }
    default:
@@ -1309,17 +1413,11 @@ static void si_query_hw_add_result(struct si_screen *sscreen, struct si_query_hw
       }
       break;
    case PIPE_QUERY_PIPELINE_STATISTICS:
-      result->pipeline_statistics.ps_invocations += si_query_read_result(buffer, 0, 22, false);
-      result->pipeline_statistics.c_primitives += si_query_read_result(buffer, 2, 24, false);
-      result->pipeline_statistics.c_invocations += si_query_read_result(buffer, 4, 26, false);
-      result->pipeline_statistics.vs_invocations += si_query_read_result(buffer, 6, 28, false);
-      result->pipeline_statistics.gs_invocations += si_query_read_result(buffer, 8, 30, false);
-      result->pipeline_statistics.gs_primitives += si_query_read_result(buffer, 10, 32, false);
-      result->pipeline_statistics.ia_primitives += si_query_read_result(buffer, 12, 34, false);
-      result->pipeline_statistics.ia_vertices += si_query_read_result(buffer, 14, 36, false);
-      result->pipeline_statistics.hs_invocations += si_query_read_result(buffer, 16, 38, false);
-      result->pipeline_statistics.ds_invocations += si_query_read_result(buffer, 18, 40, false);
-      result->pipeline_statistics.cs_invocations += si_query_read_result(buffer, 20, 42, false);
+      for (int i = 0; i < 11; i++) {
+         result->pipeline_statistics.counters[i] +=
+            si_query_read_result(buffer, si_query_pipestat_dw_offset(i),
+                                 si_query_pipestat_end_dw_offset(sscreen, i), false);
+      }
 #if 0 /* for testing */
       printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, "
              "DS=%llu, GS=%llu, GS prims=%llu, Clipper=%llu, "
@@ -1538,7 +1636,8 @@ static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_q
       if (!qbuf->previous) {
          ssbo[2].buffer = resource;
          ssbo[2].buffer_offset = offset;
-         ssbo[2].buffer_size = 8;
+         ssbo[2].buffer_size = resource->width0 - offset;
+         /* assert size is correct, based on result_type ? */
          si_resource(resource)->TC_L2_dirty = true;
       }
 
@@ -1578,8 +1677,8 @@ static void si_render_condition(struct pipe_context *ctx, struct pipe_query *que
        * SET_PREDICATION packets to give the wrong answer for
        * non-inverted stream overflow predication.
        */
-      if (((sctx->chip_class == GFX8 && sctx->screen->info.pfp_fw_feature < 49) ||
-           (sctx->chip_class == GFX9 && sctx->screen->info.pfp_fw_feature < 38)) &&
+      if (((sctx->gfx_level == GFX8 && sctx->screen->info.pfp_fw_feature < 49) ||
+           (sctx->gfx_level == GFX9 && sctx->screen->info.pfp_fw_feature < 38)) &&
          !condition &&
          (squery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE ||
           (squery->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE &&
@@ -1747,19 +1846,17 @@ static unsigned si_get_num_queries(struct si_screen *sscreen)
 {
    /* amdgpu */
    if (sscreen->info.is_amdgpu) {
-      if (sscreen->info.chip_class >= GFX8)
+      if (sscreen->info.gfx_level >= GFX8)
         return ARRAY_SIZE(si_driver_query_list);
      else
         return ARRAY_SIZE(si_driver_query_list) - 7;
   }
 
   /* radeon */
-   if (sscreen->info.has_read_registers_query) {
-      if (sscreen->info.chip_class == GFX7)
-         return ARRAY_SIZE(si_driver_query_list) - 6;
-      else
-         return ARRAY_SIZE(si_driver_query_list) - 7;
-   }
+   if (sscreen->info.gfx_level == GFX7)
+      return ARRAY_SIZE(si_driver_query_list) - 6;
+   else
+      return ARRAY_SIZE(si_driver_query_list) - 7;
 
    return ARRAY_SIZE(si_driver_query_list) - 21;
 }
@@ -1786,19 +1883,19 @@ static int si_get_driver_query_info(struct pipe_screen *screen, unsigned index,
    case SI_QUERY_VRAM_USAGE:
    case SI_QUERY_MAPPED_VRAM:
    case SI_QUERY_SLAB_WASTED_VRAM:
-      info->max_value.u64 = sscreen->info.vram_size;
+      info->max_value.u64 = (uint64_t)sscreen->info.vram_size_kb * 1024;
       break;
    case SI_QUERY_REQUESTED_GTT:
    case SI_QUERY_GTT_USAGE:
    case SI_QUERY_MAPPED_GTT:
    case SI_QUERY_SLAB_WASTED_GTT:
-      info->max_value.u64 = sscreen->info.gart_size;
+      info->max_value.u64 = (uint64_t)sscreen->info.gart_size_kb * 1024;
       break;
    case SI_QUERY_GPU_TEMPERATURE:
       info->max_value.u64 = 125;
       break;
    case SI_QUERY_VRAM_VIS_USAGE:
-      info->max_value.u64 = sscreen->info.vram_vis_size;
+      info->max_value.u64 = (uint64_t)sscreen->info.vram_vis_size_kb * 1024;
       break;
    }
```
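
Editor's note (not part of the upstream diff): the change above stops hard-coding the pipeline-statistics buffer layout (11 counters, end block at byte 88, fence at 2 * 88) and instead derives every offset from the counter count, which grows from 11 to 14 on GFX11. Each counter is a 64-bit value (2 dwords), the end-of-query snapshot block is laid out directly after the whole start-of-query block, and the fence plus alignment padding follow both blocks. The sketch below illustrates that arithmetic in isolation; the helper names echo the diff, but the simplified signatures (a plain bool for the GFX11 check and a raw dword offset instead of the enum index) are assumptions made for the example.

```c
/* Standalone sketch of the pipeline-statistics query buffer layout encoded
 * by the new helpers in the diff.  Per-counter dword offsets come from
 * si_query_pipestat_dw_offset() upstream; here one is passed in directly
 * so the arithmetic stands on its own. */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* 11 counters before GFX11; GFX11 adds MS_INVOCATIONS, MS_PRIMITIVES and
 * TS_INVOCATIONS for a total of 14. */
static unsigned num_results(bool gfx11_or_later)
{
   return gfx11_or_later ? 14 : 11;
}

/* Each counter is 64 bits (2 dwords).  The "end of query" snapshot block
 * is laid out right after the full "start of query" block. */
static unsigned end_dw_offset(bool gfx11_or_later, unsigned start_dw_offset)
{
   return num_results(gfx11_or_later) * 2 + start_dw_offset;
}

int main(void)
{
   bool gfx11 = false;
   unsigned gs_prim_dw = 10; /* PIPE_STAT_QUERY_GS_PRIMITIVES in the diff's table */

   unsigned start_byte  = gs_prim_dw * 4;
   unsigned end_byte    = end_dw_offset(gfx11, gs_prim_dw) * 4;
   unsigned fence_byte  = num_results(gfx11) * 16;      /* start block + end block */
   unsigned result_size = num_results(gfx11) * 16 + 8;  /* + fence and alignment */

   /* For 11 counters this reproduces the old hard-coded layout:
    * end offsets were "88 + start" and the fence sat at byte 2 * 88. */
   assert(end_byte == 88 + start_byte);
   assert(fence_byte == 2 * 88);

   printf("start=%u end=%u fence=%u result_size=%u\n",
          start_byte, end_byte, fence_byte, result_size);
   return 0;
}
```

For the pre-GFX11 case the asserts reduce to the constants the old code used, which is why si_get_hw_query_params and si_query_hw_add_result could switch to the helper-based offsets without changing the resulting layout.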