Diffstat (limited to 'lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c')
-rw-r--r-- | lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 164
1 file changed, 99 insertions, 65 deletions
diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index 0f42298c2..83da740f6 100644
--- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -200,46 +200,46 @@ amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx)
 
 static bool amdgpu_get_new_ib(struct amdgpu_cs *cs)
 {
-   /* The maximum size is 4MB - 1B, which is unaligned.
-    * Use aligned size 4MB - 16B. */
-   const unsigned max_ib_size = (1024 * 1024 - 16) * 4;
-   const unsigned min_ib_size = 24 * 1024 * 4;
+   /* Small IBs are better than big IBs, because the GPU goes idle quicker
+    * and there is less waiting for buffers and fences. Proof:
+    *   http://www.phoronix.com/scan.php?page=article&item=mesa-111-si&num=1
+    */
+   const unsigned buffer_size = 128 * 1024 * 4;
+   const unsigned ib_size = 20 * 1024 * 4;
 
    cs->base.cdw = 0;
    cs->base.buf = NULL;
 
    /* Allocate a new buffer for IBs if the current buffer is all used. */
    if (!cs->big_ib_buffer ||
-       cs->used_ib_space + min_ib_size > cs->big_ib_buffer->size) {
+       cs->used_ib_space + ib_size > cs->big_ib_buffer->size) {
       struct radeon_winsys *ws = &cs->ctx->ws->base;
-      struct radeon_winsys_cs_handle *winsys_bo;
 
       pb_reference(&cs->big_ib_buffer, NULL);
       cs->big_ib_winsys_buffer = NULL;
       cs->ib_mapped = NULL;
       cs->used_ib_space = 0;
 
-      cs->big_ib_buffer = ws->buffer_create(ws, max_ib_size,
+      cs->big_ib_buffer = ws->buffer_create(ws, buffer_size,
                                             4096, true,
                                             RADEON_DOMAIN_GTT,
                                             RADEON_FLAG_CPU_ACCESS);
       if (!cs->big_ib_buffer)
          return false;
 
-      winsys_bo = ws->buffer_get_cs_handle(cs->big_ib_buffer);
-
-      cs->ib_mapped = ws->buffer_map(winsys_bo, NULL, PIPE_TRANSFER_WRITE);
+      cs->ib_mapped = ws->buffer_map(cs->big_ib_buffer, NULL,
+                                     PIPE_TRANSFER_WRITE);
       if (!cs->ib_mapped) {
          pb_reference(&cs->big_ib_buffer, NULL);
          return false;
       }
 
-      cs->big_ib_winsys_buffer = (struct amdgpu_winsys_bo*)winsys_bo;
+      cs->big_ib_winsys_buffer = (struct amdgpu_winsys_bo*)cs->big_ib_buffer;
    }
 
    cs->ib.ib_mc_address = cs->big_ib_winsys_buffer->va + cs->used_ib_space;
    cs->base.buf = (uint32_t*)(cs->ib_mapped + cs->used_ib_space);
-   cs->base.max_dw = (cs->big_ib_buffer->size - cs->used_ib_space) / 4;
+   cs->base.max_dw = ib_size / 4;
    return true;
 }
 
@@ -336,7 +336,7 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
                  void (*flush)(void *ctx, unsigned flags,
                                struct pipe_fence_handle **fence),
                  void *flush_ctx,
-                 struct radeon_winsys_cs_handle *trace_buf)
+                 struct pb_buffer *trace_buf)
 {
    struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
    struct amdgpu_cs *cs;
@@ -368,7 +368,7 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
 
 #define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value)
 
-int amdgpu_get_reloc(struct amdgpu_cs *cs, struct amdgpu_winsys_bo *bo)
+int amdgpu_lookup_buffer(struct amdgpu_cs *cs, struct amdgpu_winsys_bo *bo)
 {
    unsigned hash = bo->unique_id & (Elements(cs->buffer_indices_hashlist)-1);
    int i = cs->buffer_indices_hashlist[hash];
@@ -377,15 +377,15 @@ int amdgpu_get_reloc(struct amdgpu_cs *cs, struct amdgpu_winsys_bo *bo)
    if (i == -1 || cs->buffers[i].bo == bo)
       return i;
 
-   /* Hash collision, look for the BO in the list of relocs linearly. */
+   /* Hash collision, look for the BO in the list of buffers linearly. */
    for (i = cs->num_buffers - 1; i >= 0; i--) {
       if (cs->buffers[i].bo == bo) {
-         /* Put this reloc in the hash list.
+         /* Put this buffer in the hash list.
           * This will prevent additional hash collisions if there are
-          * several consecutive get_reloc calls for the same buffer.
+          * several consecutive lookup_buffer calls for the same buffer.
           *
           * Example: Assuming buffers A,B,C collide in the hash list,
-          * the following sequence of relocs:
+          * the following sequence of buffers:
           *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
           * will collide here: ^ and here:   ^,
           * meaning that we should get very few collisions in the end. */
@@ -396,32 +396,33 @@ int amdgpu_get_reloc(struct amdgpu_cs *cs, struct amdgpu_winsys_bo *bo)
    return -1;
 }
 
-static unsigned amdgpu_add_reloc(struct amdgpu_cs *cs,
+static unsigned amdgpu_add_buffer(struct amdgpu_cs *cs,
                                  struct amdgpu_winsys_bo *bo,
                                  enum radeon_bo_usage usage,
                                  enum radeon_bo_domain domains,
                                  unsigned priority,
                                  enum radeon_bo_domain *added_domains)
 {
-   struct amdgpu_cs_buffer *reloc;
+   struct amdgpu_cs_buffer *buffer;
    unsigned hash = bo->unique_id & (Elements(cs->buffer_indices_hashlist)-1);
    int i = -1;
 
-   priority = MIN2(priority, 15);
+   assert(priority < 64);
    *added_domains = 0;
 
-   i = amdgpu_get_reloc(cs, bo);
+   i = amdgpu_lookup_buffer(cs, bo);
 
    if (i >= 0) {
-      reloc = &cs->buffers[i];
-      reloc->usage |= usage;
-      *added_domains = domains & ~reloc->domains;
-      reloc->domains |= domains;
-      cs->flags[i] = MAX2(cs->flags[i], priority);
+      buffer = &cs->buffers[i];
+      buffer->priority_usage |= 1llu << priority;
+      buffer->usage |= usage;
+      *added_domains = domains & ~buffer->domains;
+      buffer->domains |= domains;
+      cs->flags[i] = MAX2(cs->flags[i], priority / 4);
       return i;
    }
 
-   /* New relocation, check if the backing array is large enough. */
+   /* New buffer, check if the backing array is large enough. */
    if (cs->num_buffers >= cs->max_num_buffers) {
      uint32_t size;
      cs->max_num_buffers += 10;
@@ -435,16 +436,17 @@ static unsigned amdgpu_add_reloc(struct amdgpu_cs *cs,
       cs->flags = realloc(cs->flags, cs->max_num_buffers);
    }
 
-   /* Initialize the new relocation. */
+   /* Initialize the new buffer. */
    cs->buffers[cs->num_buffers].bo = NULL;
    amdgpu_winsys_bo_reference(&cs->buffers[cs->num_buffers].bo, bo);
    cs->handles[cs->num_buffers] = bo->bo;
-   cs->flags[cs->num_buffers] = priority;
+   cs->flags[cs->num_buffers] = priority / 4;
    p_atomic_inc(&bo->num_cs_references);
-   reloc = &cs->buffers[cs->num_buffers];
-   reloc->bo = bo;
-   reloc->usage = usage;
-   reloc->domains = domains;
+   buffer = &cs->buffers[cs->num_buffers];
+   buffer->bo = bo;
+   buffer->priority_usage = 1llu << priority;
+   buffer->usage = usage;
+   buffer->domains = domains;
 
    cs->buffer_indices_hashlist[hash] = cs->num_buffers;
 
@@ -452,8 +454,8 @@ static unsigned amdgpu_add_reloc(struct amdgpu_cs *cs,
    return cs->num_buffers++;
 }
 
-static unsigned amdgpu_cs_add_reloc(struct radeon_winsys_cs *rcs,
-                                    struct radeon_winsys_cs_handle *buf,
+static unsigned amdgpu_cs_add_buffer(struct radeon_winsys_cs *rcs,
+                                     struct pb_buffer *buf,
                                     enum radeon_bo_usage usage,
                                     enum radeon_bo_domain domains,
                                     enum radeon_bo_priority priority)
@@ -464,7 +466,7 @@ static unsigned amdgpu_cs_add_reloc(struct radeon_winsys_cs *rcs,
    struct amdgpu_cs *cs = amdgpu_cs(rcs);
    struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
    enum radeon_bo_domain added_domains;
-   unsigned index = amdgpu_add_reloc(cs, bo, usage, bo->initial_domain,
+   unsigned index = amdgpu_add_buffer(cs, bo, usage, bo->initial_domain,
                                      priority, &added_domains);
 
    if (added_domains & RADEON_DOMAIN_GTT)
@@ -475,12 +477,12 @@ static unsigned amdgpu_cs_add_reloc(struct radeon_winsys_cs *rcs,
    return index;
 }
 
-static int amdgpu_cs_get_reloc(struct radeon_winsys_cs *rcs,
-                               struct radeon_winsys_cs_handle *buf)
+static int amdgpu_cs_lookup_buffer(struct radeon_winsys_cs *rcs,
+                                   struct pb_buffer *buf)
 {
    struct amdgpu_cs *cs = amdgpu_cs(rcs);
 
-   return amdgpu_get_reloc(cs, (struct amdgpu_winsys_bo*)buf);
+   return amdgpu_lookup_buffer(cs, (struct amdgpu_winsys_bo*)buf);
 }
 
 static boolean amdgpu_cs_validate(struct radeon_winsys_cs *rcs)
@@ -498,6 +500,22 @@ static boolean amdgpu_cs_memory_below_limit(struct radeon_winsys_cs *rcs, uint64
    return status;
 }
 
+static unsigned amdgpu_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
+                                          struct radeon_bo_list_item *list)
+{
+   struct amdgpu_cs *cs = amdgpu_cs(rcs);
+   int i;
+
+   if (list) {
+      for (i = 0; i < cs->num_buffers; i++) {
+         pb_reference(&list[i].buf, &cs->buffers[i].bo->base);
+         list[i].vm_address = cs->buffers[i].bo->va;
+         list[i].priority_usage = cs->buffers[i].priority_usage;
+      }
+   }
+   return cs->num_buffers;
+}
+
 static void amdgpu_cs_do_submission(struct amdgpu_cs *cs,
                                     struct pipe_fence_handle **out_fence)
 {
@@ -587,6 +605,7 @@ static void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs)
 }
 
 DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", FALSE)
+DEBUG_GET_ONCE_BOOL_OPTION(all_bos, "RADEON_ALL_BOS", FALSE)
 
 static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
                             unsigned flags,
@@ -599,25 +618,13 @@ static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
    switch (cs->base.ring_type) {
    case RING_DMA:
       /* pad DMA ring to 8 DWs */
-      if (ws->info.chip_class <= SI) {
-         while (rcs->cdw & 7)
-            OUT_CS(&cs->base, 0xf0000000); /* NOP packet */
-      } else {
-         while (rcs->cdw & 7)
-            OUT_CS(&cs->base, 0x00000000); /* NOP packet */
-      }
+      while (rcs->cdw & 7)
+         OUT_CS(&cs->base, 0x00000000); /* NOP packet */
       break;
    case RING_GFX:
-      /* pad DMA ring to 8 DWs to meet CP fetch alignment requirements
-       * r6xx, requires at least 4 dw alignment to avoid a hw bug.
-       */
-      if (ws->info.chip_class <= SI) {
-         while (rcs->cdw & 7)
-            OUT_CS(&cs->base, 0x80000000); /* type2 nop packet */
-      } else {
-         while (rcs->cdw & 7)
-            OUT_CS(&cs->base, 0xffff1000); /* type3 nop packet */
-      }
+      /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements */
+      while (rcs->cdw & 7)
+         OUT_CS(&cs->base, 0xffff1000); /* type3 nop packet */
       break;
    case RING_UVD:
       while (rcs->cdw & 15)
@@ -631,16 +638,42 @@ static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
       fprintf(stderr, "amdgpu: command stream overflowed\n");
    }
 
-   amdgpu_cs_add_reloc(rcs, (void*)cs->big_ib_winsys_buffer,
-                       RADEON_USAGE_READ, 0, RADEON_PRIO_MIN);
+   amdgpu_cs_add_buffer(rcs, (void*)cs->big_ib_winsys_buffer,
+                        RADEON_USAGE_READ, 0, RADEON_PRIO_IB1);
 
    /* If the CS is not empty or overflowed.... */
    if (cs->base.cdw && cs->base.cdw <= cs->base.max_dw && !debug_get_option_noop()) {
       int r;
 
-      r = amdgpu_bo_list_create(ws->dev, cs->num_buffers,
-                                cs->handles, cs->flags,
-                                &cs->request.resources);
+      /* Use a buffer list containing all allocated buffers if requested. */
+      if (debug_get_option_all_bos()) {
+         struct amdgpu_winsys_bo *bo;
+         amdgpu_bo_handle *handles;
+         unsigned num = 0;
+
+         pipe_mutex_lock(ws->global_bo_list_lock);
+
+         handles = malloc(sizeof(handles[0]) * ws->num_buffers);
+         if (!handles) {
+            pipe_mutex_unlock(ws->global_bo_list_lock);
+            goto cleanup;
+         }
+
+         LIST_FOR_EACH_ENTRY(bo, &ws->global_bo_list, global_list_item) {
+            assert(num < ws->num_buffers);
+            handles[num++] = bo->bo;
+         }
+
+         r = amdgpu_bo_list_create(ws->dev, ws->num_buffers,
+                                   handles, NULL,
+                                   &cs->request.resources);
+         free(handles);
+         pipe_mutex_unlock(ws->global_bo_list_lock);
+      } else {
+         r = amdgpu_bo_list_create(ws->dev, cs->num_buffers,
+                                   cs->handles, cs->flags,
+                                   &cs->request.resources);
+      }
       if (r) {
         fprintf(stderr, "amdgpu: resource list creation failed (%d)\n", r);
        cs->request.resources = NULL;
@@ -676,7 +709,7 @@ static void amdgpu_cs_destroy(struct radeon_winsys_cs *rcs)
 }
 
 static boolean amdgpu_bo_is_referenced(struct radeon_winsys_cs *rcs,
-                                       struct radeon_winsys_cs_handle *_buf,
+                                       struct pb_buffer *_buf,
                                        enum radeon_bo_usage usage)
 {
    struct amdgpu_cs *cs = amdgpu_cs(rcs);
@@ -692,10 +725,11 @@ void amdgpu_cs_init_functions(struct amdgpu_winsys *ws)
    ws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status;
    ws->base.cs_create = amdgpu_cs_create;
    ws->base.cs_destroy = amdgpu_cs_destroy;
-   ws->base.cs_add_reloc = amdgpu_cs_add_reloc;
-   ws->base.cs_get_reloc = amdgpu_cs_get_reloc;
+   ws->base.cs_add_buffer = amdgpu_cs_add_buffer;
+   ws->base.cs_lookup_buffer = amdgpu_cs_lookup_buffer;
    ws->base.cs_validate = amdgpu_cs_validate;
    ws->base.cs_memory_below_limit = amdgpu_cs_memory_below_limit;
+   ws->base.cs_get_buffer_list = amdgpu_cs_get_buffer_list;
    ws->base.cs_flush = amdgpu_cs_flush;
    ws->base.cs_is_buffer_referenced = amdgpu_bo_is_referenced;
    ws->base.cs_sync_flush = amdgpu_cs_sync_flush;
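
The get_reloc/add_reloc to lookup_buffer/add_buffer rename above does not change the lookup scheme itself, which the in-tree comment describes: a small direct-mapped table of indices into the buffer array, a linear scan on collision, and the table slot re-pointed at the hit so a run of lookups for the same buffer stays cheap (presumably because recently added buffers are the likeliest to be looked up again). The following is a minimal, self-contained sketch of that idea, not the Mesa code; all names here (cs_cache, lookup_buffer, add_buffer, NUM_SLOTS, MAX_BUFFERS) are invented for the example.

#include <stdint.h>
#include <string.h>

#define NUM_SLOTS   256    /* power of two, like buffer_indices_hashlist */
#define MAX_BUFFERS 1024

struct bo { uint32_t unique_id; };

struct cs_cache {
   int hash[NUM_SLOTS];              /* -1 = empty, otherwise index into bufs[] */
   const struct bo *bufs[MAX_BUFFERS];
   int num_bufs;
};

static void cache_init(struct cs_cache *c)
{
   memset(c->hash, 0xff, sizeof(c->hash));   /* every slot starts at -1 */
   c->num_bufs = 0;
}

/* Try the hash slot first, fall back to a linear scan on collision, then
 * re-point the slot at the hit so repeated lookups of the same BO only pay
 * for the scan once. */
static int lookup_buffer(struct cs_cache *c, const struct bo *bo)
{
   unsigned slot = bo->unique_id & (NUM_SLOTS - 1);
   int i = c->hash[slot];

   if (i == -1 || c->bufs[i] == bo)
      return i;                      /* empty slot (not found) or direct hit */

   for (i = c->num_bufs - 1; i >= 0; i--) {
      if (c->bufs[i] == bo) {
         c->hash[slot] = i;          /* cache the colliding entry */
         return i;
      }
   }
   return -1;
}

static int add_buffer(struct cs_cache *c, const struct bo *bo)
{
   int i = lookup_buffer(c, bo);
   if (i >= 0)
      return i;                      /* already on the list */
   if (c->num_bufs == MAX_BUFFERS)
      return -1;                     /* out of space in this sketch */

   i = c->num_bufs++;
   c->bufs[i] = bo;
   c->hash[bo->unique_id & (NUM_SLOTS - 1)] = i;
   return i;
}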
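The amdgpu_add_buffer hunk also changes how priorities are tracked: instead of clamping with MIN2(priority, 15), it asserts priority < 64, records the exact priority in a per-buffer priority_usage bitmask (1llu << priority), and derives the coarse value handed to the kernel buffer list from priority / 4. A small sketch of that bookkeeping, with invented names (buffer_acct, account_priority) and assuming the same 0-63 priority range:

#include <assert.h>
#include <stdint.h>

/* Per-buffer accounting as implied by amdgpu_add_buffer after this change:
 * 64 fine-grained priorities are remembered in a bitmask, while the kernel
 * list only receives a coarse 0-15 flag. Names are illustrative, not Mesa API. */
struct buffer_acct {
   uint64_t priority_usage;   /* bit N set => buffer was added with priority N */
   uint8_t  kernel_flag;      /* 4-bit value passed to amdgpu_bo_list_create */
};

static void account_priority(struct buffer_acct *b, unsigned priority)
{
   assert(priority < 64);                  /* mirrors assert(priority < 64) */
   b->priority_usage |= 1llu << priority;  /* remember the exact priority */
   if (priority / 4 > b->kernel_flag)
      b->kernel_flag = priority / 4;       /* MAX2(flags, priority / 4) */
}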
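The new debug path is guarded by DEBUG_GET_ONCE_BOOL_OPTION(all_bos, "RADEON_ALL_BOS", FALSE), i.e. an environment variable read once and cached. A rough hand-written equivalent of that behaviour, offered as an assumption about what the macro provides rather than a copy of Mesa's implementation:

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

/* Read RADEON_ALL_BOS once and remember the answer. Unset keeps the default
 * (false); any value other than an explicit "no"-style spelling enables it. */
static bool debug_get_option_all_bos(void)
{
   static bool cached, value;

   if (!cached) {
      const char *s = getenv("RADEON_ALL_BOS");

      if (s)
         value = strcmp(s, "0") != 0 && strcasecmp(s, "false") != 0 &&
                 strcasecmp(s, "no") != 0 && strcasecmp(s, "n") != 0 &&
                 strcasecmp(s, "f") != 0;
      cached = true;
   }
   return value;
}

With the variable set in the environment, amdgpu_cs_flush takes the branch that builds the kernel buffer list from the winsys-wide global_bo_list instead of the per-CS handle array.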