Diffstat (limited to 'lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c')
-rw-r--r--  lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 164
1 file changed, 99 insertions(+), 65 deletions(-)
diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index 0f42298c2..83da740f6 100644
--- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -200,46 +200,46 @@ amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx)
static bool amdgpu_get_new_ib(struct amdgpu_cs *cs)
{
- /* The maximum size is 4MB - 1B, which is unaligned.
- * Use aligned size 4MB - 16B. */
- const unsigned max_ib_size = (1024 * 1024 - 16) * 4;
- const unsigned min_ib_size = 24 * 1024 * 4;
+ /* Small IBs are better than big IBs, because the GPU goes idle quicker
+ * and there is less waiting for buffers and fences. Proof:
+ * http://www.phoronix.com/scan.php?page=article&item=mesa-111-si&num=1
+ */
+ const unsigned buffer_size = 128 * 1024 * 4;
+ const unsigned ib_size = 20 * 1024 * 4;
cs->base.cdw = 0;
cs->base.buf = NULL;
/* Allocate a new buffer for IBs if the current buffer is all used. */
if (!cs->big_ib_buffer ||
- cs->used_ib_space + min_ib_size > cs->big_ib_buffer->size) {
+ cs->used_ib_space + ib_size > cs->big_ib_buffer->size) {
struct radeon_winsys *ws = &cs->ctx->ws->base;
- struct radeon_winsys_cs_handle *winsys_bo;
pb_reference(&cs->big_ib_buffer, NULL);
cs->big_ib_winsys_buffer = NULL;
cs->ib_mapped = NULL;
cs->used_ib_space = 0;
- cs->big_ib_buffer = ws->buffer_create(ws, max_ib_size,
+ cs->big_ib_buffer = ws->buffer_create(ws, buffer_size,
4096, true,
RADEON_DOMAIN_GTT,
RADEON_FLAG_CPU_ACCESS);
if (!cs->big_ib_buffer)
return false;
- winsys_bo = ws->buffer_get_cs_handle(cs->big_ib_buffer);
-
- cs->ib_mapped = ws->buffer_map(winsys_bo, NULL, PIPE_TRANSFER_WRITE);
+ cs->ib_mapped = ws->buffer_map(cs->big_ib_buffer, NULL,
+ PIPE_TRANSFER_WRITE);
if (!cs->ib_mapped) {
pb_reference(&cs->big_ib_buffer, NULL);
return false;
}
- cs->big_ib_winsys_buffer = (struct amdgpu_winsys_bo*)winsys_bo;
+ cs->big_ib_winsys_buffer = (struct amdgpu_winsys_bo*)cs->big_ib_buffer;
}
cs->ib.ib_mc_address = cs->big_ib_winsys_buffer->va + cs->used_ib_space;
cs->base.buf = (uint32_t*)(cs->ib_mapped + cs->used_ib_space);
- cs->base.max_dw = (cs->big_ib_buffer->size - cs->used_ib_space) / 4;
+ cs->base.max_dw = ib_size / 4;
return true;
}
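
The hunk above drops the old "one 4 MB IB" scheme: IBs are now sub-allocated in fixed 20K-dword chunks out of a 128K-dword (512 KiB) GTT buffer, and a new backing buffer is created only once the current one is used up. A standalone sketch of that arithmetic, with the struct reduced to the two fields involved (the real code advances used_ib_space by the space actually consumed at flush time; the full IB_SIZE below is the worst case):

#include <stdbool.h>
#include <stdio.h>

#define BUFFER_SIZE (128u * 1024 * 4)  /* 512 KiB backing buffer */
#define IB_SIZE     (20u * 1024 * 4)   /* 80 KiB per IB, i.e. 20480 dwords */

struct fake_cs {
   unsigned used_ib_space; /* bytes consumed in the backing buffer */
   unsigned max_dw;        /* dwords available to the current IB */
};

/* Returns true while another IB still fits in the current backing buffer. */
static bool get_new_ib(struct fake_cs *cs)
{
   if (cs->used_ib_space + IB_SIZE > BUFFER_SIZE)
      return false;              /* the real code allocates a fresh buffer here */
   cs->max_dw = IB_SIZE / 4;     /* 20480 dwords, matching cs->base.max_dw */
   cs->used_ib_space += IB_SIZE; /* worst-case advance; see note above */
   return true;
}

int main(void)
{
   struct fake_cs cs = {0};
   unsigned n = 0;
   while (get_new_ib(&cs))
      n++;
   printf("%u IBs of %u dwords fit in one %u-byte buffer\n", n, IB_SIZE / 4, BUFFER_SIZE);
   return 0; /* prints: 6 IBs of 20480 dwords fit in one 524288-byte buffer */
}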
@@ -336,7 +336,7 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
void (*flush)(void *ctx, unsigned flags,
struct pipe_fence_handle **fence),
void *flush_ctx,
- struct radeon_winsys_cs_handle *trace_buf)
+ struct pb_buffer *trace_buf)
{
struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
struct amdgpu_cs *cs;
@@ -368,7 +368,7 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
#define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value)
-int amdgpu_get_reloc(struct amdgpu_cs *cs, struct amdgpu_winsys_bo *bo)
+int amdgpu_lookup_buffer(struct amdgpu_cs *cs, struct amdgpu_winsys_bo *bo)
{
unsigned hash = bo->unique_id & (Elements(cs->buffer_indices_hashlist)-1);
int i = cs->buffer_indices_hashlist[hash];
@@ -377,15 +377,15 @@ int amdgpu_get_reloc(struct amdgpu_cs *cs, struct amdgpu_winsys_bo *bo)
if (i == -1 || cs->buffers[i].bo == bo)
return i;
- /* Hash collision, look for the BO in the list of relocs linearly. */
+ /* Hash collision, look for the BO in the list of buffers linearly. */
for (i = cs->num_buffers - 1; i >= 0; i--) {
if (cs->buffers[i].bo == bo) {
- /* Put this reloc in the hash list.
+ /* Put this buffer in the hash list.
* This will prevent additional hash collisions if there are
- * several consecutive get_reloc calls for the same buffer.
+ * several consecutive lookup_buffer calls for the same buffer.
*
* Example: Assuming buffers A,B,C collide in the hash list,
- * the following sequence of relocs:
+ * the following sequence of buffers:
* AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
* will collide here: ^ and here: ^,
* meaning that we should get very few collisions in the end. */
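
A self-contained sketch of the lookup strategy amdgpu_lookup_buffer() uses: buffer_indices_hashlist acts as a small direct-mapped cache in front of the buffer array, and a hit found by the linear fallback is written back into the cache so repeated lookups of the same BO stay cheap. Names and sizes below are illustrative, not the winsys' real declarations:

#define HASHLIST_SIZE 256   /* must be a power of two for the mask below */

struct toy_cs {
   int   hashlist[HASHLIST_SIZE]; /* -1 means "empty slot" */
   void *buffers[1024];           /* plays the role of cs->buffers[i].bo */
   int   num_buffers;
};

static int toy_lookup_buffer(struct toy_cs *cs, void *bo, unsigned unique_id)
{
   unsigned hash = unique_id & (HASHLIST_SIZE - 1);
   int i = cs->hashlist[hash];

   /* Fast path: in the real code a slot is only -1 if nothing hashing there
    * has been added since the last reset, so -1 doubles as "not found". */
   if (i == -1 || cs->buffers[i] == bo)
      return i;

   /* Hash collision: fall back to a linear scan, newest entries first. */
   for (i = cs->num_buffers - 1; i >= 0; i--) {
      if (cs->buffers[i] == bo) {
         /* Re-cache the hit so repeated lookups of the same BO stay O(1). */
         cs->hashlist[hash] = i;
         return i;
      }
   }
   return -1;
}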
@@ -396,32 +396,33 @@ int amdgpu_get_reloc(struct amdgpu_cs *cs, struct amdgpu_winsys_bo *bo)
return -1;
}
-static unsigned amdgpu_add_reloc(struct amdgpu_cs *cs,
+static unsigned amdgpu_add_buffer(struct amdgpu_cs *cs,
struct amdgpu_winsys_bo *bo,
enum radeon_bo_usage usage,
enum radeon_bo_domain domains,
unsigned priority,
enum radeon_bo_domain *added_domains)
{
- struct amdgpu_cs_buffer *reloc;
+ struct amdgpu_cs_buffer *buffer;
unsigned hash = bo->unique_id & (Elements(cs->buffer_indices_hashlist)-1);
int i = -1;
- priority = MIN2(priority, 15);
+ assert(priority < 64);
*added_domains = 0;
- i = amdgpu_get_reloc(cs, bo);
+ i = amdgpu_lookup_buffer(cs, bo);
if (i >= 0) {
- reloc = &cs->buffers[i];
- reloc->usage |= usage;
- *added_domains = domains & ~reloc->domains;
- reloc->domains |= domains;
- cs->flags[i] = MAX2(cs->flags[i], priority);
+ buffer = &cs->buffers[i];
+ buffer->priority_usage |= 1llu << priority;
+ buffer->usage |= usage;
+ *added_domains = domains & ~buffer->domains;
+ buffer->domains |= domains;
+ cs->flags[i] = MAX2(cs->flags[i], priority / 4);
return i;
}
- /* New relocation, check if the backing array is large enough. */
+ /* New buffer, check if the backing array is large enough. */
if (cs->num_buffers >= cs->max_num_buffers) {
uint32_t size;
cs->max_num_buffers += 10;
@@ -435,16 +436,17 @@ static unsigned amdgpu_add_reloc(struct amdgpu_cs *cs,
cs->flags = realloc(cs->flags, cs->max_num_buffers);
}
- /* Initialize the new relocation. */
+ /* Initialize the new buffer. */
cs->buffers[cs->num_buffers].bo = NULL;
amdgpu_winsys_bo_reference(&cs->buffers[cs->num_buffers].bo, bo);
cs->handles[cs->num_buffers] = bo->bo;
- cs->flags[cs->num_buffers] = priority;
+ cs->flags[cs->num_buffers] = priority / 4;
p_atomic_inc(&bo->num_cs_references);
- reloc = &cs->buffers[cs->num_buffers];
- reloc->bo = bo;
- reloc->usage = usage;
- reloc->domains = domains;
+ buffer = &cs->buffers[cs->num_buffers];
+ buffer->bo = bo;
+ buffer->priority_usage = 1llu << priority;
+ buffer->usage = usage;
+ buffer->domains = domains;
cs->buffer_indices_hashlist[hash] = cs->num_buffers;
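
The priority handling also changes here: instead of clamping to 15 with MIN2, the code now accepts up to 64 fine-grained RADEON_PRIO_* values, remembers every one a buffer was used with in a 64-bit priority_usage mask, and keeps the old 0-15 range for the per-buffer flag by dividing by 4. A small sketch of that bookkeeping, with the struct reduced to the two fields touched above:

#include <assert.h>
#include <stdint.h>

struct toy_buffer {
   uint64_t priority_usage; /* bit n set if the BO was ever added with priority n */
   uint8_t  flag;           /* highest priority seen, compressed to 0..15 */
};

static void toy_add_with_priority(struct toy_buffer *buf, unsigned priority)
{
   assert(priority < 64);
   buf->priority_usage |= 1llu << priority;
   if (priority / 4 > buf->flag)
      buf->flag = priority / 4;
}
/* e.g. priority 21 sets bit 21 in priority_usage and raises flag to 5 */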
@@ -452,8 +454,8 @@ static unsigned amdgpu_add_reloc(struct amdgpu_cs *cs,
return cs->num_buffers++;
}
-static unsigned amdgpu_cs_add_reloc(struct radeon_winsys_cs *rcs,
- struct radeon_winsys_cs_handle *buf,
+static unsigned amdgpu_cs_add_buffer(struct radeon_winsys_cs *rcs,
+ struct pb_buffer *buf,
enum radeon_bo_usage usage,
enum radeon_bo_domain domains,
enum radeon_bo_priority priority)
@@ -464,7 +466,7 @@ static unsigned amdgpu_cs_add_reloc(struct radeon_winsys_cs *rcs,
struct amdgpu_cs *cs = amdgpu_cs(rcs);
struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
enum radeon_bo_domain added_domains;
- unsigned index = amdgpu_add_reloc(cs, bo, usage, bo->initial_domain,
+ unsigned index = amdgpu_add_buffer(cs, bo, usage, bo->initial_domain,
priority, &added_domains);
if (added_domains & RADEON_DOMAIN_GTT)
@@ -475,12 +477,12 @@ static unsigned amdgpu_cs_add_reloc(struct radeon_winsys_cs *rcs,
return index;
}
-static int amdgpu_cs_get_reloc(struct radeon_winsys_cs *rcs,
- struct radeon_winsys_cs_handle *buf)
+static int amdgpu_cs_lookup_buffer(struct radeon_winsys_cs *rcs,
+ struct pb_buffer *buf)
{
struct amdgpu_cs *cs = amdgpu_cs(rcs);
- return amdgpu_get_reloc(cs, (struct amdgpu_winsys_bo*)buf);
+ return amdgpu_lookup_buffer(cs, (struct amdgpu_winsys_bo*)buf);
}
static boolean amdgpu_cs_validate(struct radeon_winsys_cs *rcs)
@@ -498,6 +500,22 @@ static boolean amdgpu_cs_memory_below_limit(struct radeon_winsys_cs *rcs, uint64
return status;
}
+static unsigned amdgpu_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
+ struct radeon_bo_list_item *list)
+{
+ struct amdgpu_cs *cs = amdgpu_cs(rcs);
+ int i;
+
+ if (list) {
+ for (i = 0; i < cs->num_buffers; i++) {
+ pb_reference(&list[i].buf, &cs->buffers[i].bo->base);
+ list[i].vm_address = cs->buffers[i].bo->va;
+ list[i].priority_usage = cs->buffers[i].priority_usage;
+ }
+ }
+ return cs->num_buffers;
+}
+
static void amdgpu_cs_do_submission(struct amdgpu_cs *cs,
struct pipe_fence_handle **out_fence)
{
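
The new cs_get_buffer_list hook returns the number of buffers in the CS and, when given a non-NULL array, fills it with referenced pb_buffers, GPU virtual addresses and the per-buffer priority_usage masks. A hypothetical caller using the count-then-fill pattern the NULL check above allows (radeon_winsys.h and the pipebuffer headers are assumed to be included; names outside the hunk are assumptions):

#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>

/* sketch: dump the buffer list of a CS; 'ws' and 'rcs' come from the winsys */
static void dump_cs_buffers(struct radeon_winsys *ws, struct radeon_winsys_cs *rcs)
{
   unsigned count = ws->cs_get_buffer_list(rcs, NULL); /* NULL = count only */
   struct radeon_bo_list_item *list = calloc(count, sizeof(*list));
   if (!list)
      return;                       /* calloc also zeroes list[i].buf, which
                                     * pb_reference() below relies on */
   ws->cs_get_buffer_list(rcs, list);
   for (unsigned i = 0; i < count; i++)
      printf("bo va=0x%" PRIx64 " priority_usage=0x%" PRIx64 "\n",
             list[i].vm_address, list[i].priority_usage);
   for (unsigned i = 0; i < count; i++)
      pb_reference(&list[i].buf, NULL); /* drop the references taken by the call */
   free(list);
}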
@@ -587,6 +605,7 @@ static void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs)
}
DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", FALSE)
+DEBUG_GET_ONCE_BOOL_OPTION(all_bos, "RADEON_ALL_BOS", FALSE)
static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
unsigned flags,
@@ -599,25 +618,13 @@ static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
switch (cs->base.ring_type) {
case RING_DMA:
/* pad DMA ring to 8 DWs */
- if (ws->info.chip_class <= SI) {
- while (rcs->cdw & 7)
- OUT_CS(&cs->base, 0xf0000000); /* NOP packet */
- } else {
- while (rcs->cdw & 7)
- OUT_CS(&cs->base, 0x00000000); /* NOP packet */
- }
+ while (rcs->cdw & 7)
+ OUT_CS(&cs->base, 0x00000000); /* NOP packet */
break;
case RING_GFX:
- /* pad DMA ring to 8 DWs to meet CP fetch alignment requirements
- * r6xx, requires at least 4 dw alignment to avoid a hw bug.
- */
- if (ws->info.chip_class <= SI) {
- while (rcs->cdw & 7)
- OUT_CS(&cs->base, 0x80000000); /* type2 nop packet */
- } else {
- while (rcs->cdw & 7)
- OUT_CS(&cs->base, 0xffff1000); /* type3 nop packet */
- }
+ /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements */
+ while (rcs->cdw & 7)
+ OUT_CS(&cs->base, 0xffff1000); /* type3 nop packet */
break;
case RING_UVD:
while (rcs->cdw & 15)
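
Since the amdgpu winsys only drives CIK and newer parts, the SI-only padding branches are dropped: the DMA ring is padded with 0x00000000 and the GFX ring with a single-dword type-3 NOP. A quick decode of that GFX padding value, assuming the usual PKT3 header layout (type in bits 31:30, count in 29:16, IT opcode in 15:8); the count of 0x3fff is -1 in the 14-bit field, i.e. a NOP with no payload beyond the header:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
   uint32_t nop = 0xffff1000;
   unsigned type   = nop >> 30;            /* 3      -> type-3 packet      */
   unsigned count  = (nop >> 16) & 0x3fff; /* 0x3fff -> "no payload" NOP   */
   unsigned opcode = (nop >> 8) & 0xff;    /* 0x10   -> IT_NOP             */
   printf("type=%u count=0x%x opcode=0x%x\n", type, count, opcode);
   return 0;
}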
@@ -631,16 +638,42 @@ static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
fprintf(stderr, "amdgpu: command stream overflowed\n");
}
- amdgpu_cs_add_reloc(rcs, (void*)cs->big_ib_winsys_buffer,
- RADEON_USAGE_READ, 0, RADEON_PRIO_MIN);
+ amdgpu_cs_add_buffer(rcs, (void*)cs->big_ib_winsys_buffer,
+ RADEON_USAGE_READ, 0, RADEON_PRIO_IB1);
/* If the CS is not empty or overflowed.... */
if (cs->base.cdw && cs->base.cdw <= cs->base.max_dw && !debug_get_option_noop()) {
int r;
- r = amdgpu_bo_list_create(ws->dev, cs->num_buffers,
- cs->handles, cs->flags,
- &cs->request.resources);
+ /* Use a buffer list containing all allocated buffers if requested. */
+ if (debug_get_option_all_bos()) {
+ struct amdgpu_winsys_bo *bo;
+ amdgpu_bo_handle *handles;
+ unsigned num = 0;
+
+ pipe_mutex_lock(ws->global_bo_list_lock);
+
+ handles = malloc(sizeof(handles[0]) * ws->num_buffers);
+ if (!handles) {
+ pipe_mutex_unlock(ws->global_bo_list_lock);
+ goto cleanup;
+ }
+
+ LIST_FOR_EACH_ENTRY(bo, &ws->global_bo_list, global_list_item) {
+ assert(num < ws->num_buffers);
+ handles[num++] = bo->bo;
+ }
+
+ r = amdgpu_bo_list_create(ws->dev, ws->num_buffers,
+ handles, NULL,
+ &cs->request.resources);
+ free(handles);
+ pipe_mutex_unlock(ws->global_bo_list_lock);
+ } else {
+ r = amdgpu_bo_list_create(ws->dev, cs->num_buffers,
+ cs->handles, cs->flags,
+ &cs->request.resources);
+ }
if (r) {
fprintf(stderr, "amdgpu: resource list creation failed (%d)\n", r);
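
RADEON_ALL_BOS is a debugging knob: when set, the submission ignores the per-CS buffer list and hands the kernel a list of every buffer the winsys has allocated (the global_bo_list walked under its mutex above), which can help tell a missing cs_add_buffer call apart from other causes of VM faults. DEBUG_GET_ONCE_BOOL_OPTION comes from Mesa's util/u_debug.h; roughly, it provides a cached environment-variable lookup along these lines (an approximation, not the real macro):

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

static bool debug_get_option_all_bos(void)
{
   static bool initialized, value;
   if (!initialized) {
      const char *s = getenv("RADEON_ALL_BOS");
      /* the real debug_get_bool_option() accepts a few more spellings */
      value = s && (!strcmp(s, "1") || !strcmp(s, "true") || !strcmp(s, "y"));
      initialized = true;
   }
   return value;
}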
@@ -676,7 +709,7 @@ static void amdgpu_cs_destroy(struct radeon_winsys_cs *rcs)
}
static boolean amdgpu_bo_is_referenced(struct radeon_winsys_cs *rcs,
- struct radeon_winsys_cs_handle *_buf,
+ struct pb_buffer *_buf,
enum radeon_bo_usage usage)
{
struct amdgpu_cs *cs = amdgpu_cs(rcs);
@@ -692,10 +725,11 @@ void amdgpu_cs_init_functions(struct amdgpu_winsys *ws)
ws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status;
ws->base.cs_create = amdgpu_cs_create;
ws->base.cs_destroy = amdgpu_cs_destroy;
- ws->base.cs_add_reloc = amdgpu_cs_add_reloc;
- ws->base.cs_get_reloc = amdgpu_cs_get_reloc;
+ ws->base.cs_add_buffer = amdgpu_cs_add_buffer;
+ ws->base.cs_lookup_buffer = amdgpu_cs_lookup_buffer;
ws->base.cs_validate = amdgpu_cs_validate;
ws->base.cs_memory_below_limit = amdgpu_cs_memory_below_limit;
+ ws->base.cs_get_buffer_list = amdgpu_cs_get_buffer_list;
ws->base.cs_flush = amdgpu_cs_flush;
ws->base.cs_is_buffer_referenced = amdgpu_bo_is_referenced;
ws->base.cs_sync_flush = amdgpu_cs_sync_flush;