| author | Jonathan Gray <jsg@cvs.openbsd.org> | 2019-05-23 05:33:34 +0000 |
|---|---|---|
| committer | Jonathan Gray <jsg@cvs.openbsd.org> | 2019-05-23 05:33:34 +0000 |
| commit | 9886815a25d84be79f51e65ebd8e458bb5d26ca8 (patch) | |
| tree | a65edf018dd992543337433f7303fb29a6c8e8cf /lib/mesa/src/gallium/winsys/amdgpu/drm | |
| parent | e2a3acb64af2657b1181806818eacad061103c23 (diff) | |
Merge Mesa 19.0.5
Diffstat (limited to 'lib/mesa/src/gallium/winsys/amdgpu/drm')
| -rw-r--r-- | lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.am | 1 |
| -rw-r--r-- | lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.in | 1 |
| -rw-r--r-- | lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c | 293 |
| -rw-r--r-- | lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h | 7 |
| -rw-r--r-- | lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 129 |
| -rw-r--r-- | lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h | 11 |
| -rw-r--r-- | lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c | 80 |
| -rw-r--r-- | lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h | 12 |
8 files changed, 355 insertions(+), 179 deletions(-)
diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.am b/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.am index e35fa2cd0..1c2ec010f 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.am +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.am @@ -4,6 +4,7 @@ include $(top_srcdir)/src/gallium/Automake.inc AM_CFLAGS = \ $(GALLIUM_WINSYS_CFLAGS) \ $(AMDGPU_CFLAGS) \ + $(LLVM_CFLAGS) \ -I$(top_srcdir)/src/amd/ AM_CXXFLAGS = $(AM_CFLAGS) diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.in b/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.in index 2cba679ac..2730755f8 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.in +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.in @@ -539,6 +539,7 @@ GALLIUM_PIPE_LOADER_WINSYS_LIBS = \ AM_CFLAGS = \ $(GALLIUM_WINSYS_CFLAGS) \ $(AMDGPU_CFLAGS) \ + $(LLVM_CFLAGS) \ -I$(top_srcdir)/src/amd/ AM_CXXFLAGS = $(AM_CFLAGS) diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c index f10805805..58979bd4e 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c @@ -56,6 +56,7 @@ amdgpu_bo_create(struct radeon_winsys *rws, unsigned alignment, enum radeon_bo_domain domain, enum radeon_bo_flag flags); +static void amdgpu_bo_unmap(struct pb_buffer *buf); static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout, enum radeon_bo_usage usage) @@ -173,6 +174,12 @@ void amdgpu_bo_destroy(struct pb_buffer *_buf) assert(bo->bo && "must not be called for slab entries"); + if (!bo->is_user_ptr && bo->cpu_ptr) { + bo->cpu_ptr = NULL; + amdgpu_bo_unmap(&bo->base); + } + assert(bo->is_user_ptr || bo->u.real.map_count == 0); + if (ws->debug_all_bos) { simple_mtx_lock(&ws->global_bo_list_lock); LIST_DEL(&bo->u.real.global_list_item); @@ -184,8 +191,10 @@ void amdgpu_bo_destroy(struct pb_buffer *_buf) util_hash_table_remove(ws->bo_export_table, bo->bo); simple_mtx_unlock(&ws->bo_export_table_lock); - amdgpu_bo_va_op(bo->bo, 0, bo->base.size, bo->va, 0, AMDGPU_VA_OP_UNMAP); - amdgpu_va_range_free(bo->u.real.va_handle); + if (bo->initial_domain & RADEON_DOMAIN_VRAM_GTT) { + amdgpu_bo_va_op(bo->bo, 0, bo->base.size, bo->va, 0, AMDGPU_VA_OP_UNMAP); + amdgpu_va_range_free(bo->u.real.va_handle); + } amdgpu_bo_free(bo->bo); amdgpu_bo_remove_fences(bo); @@ -195,14 +204,7 @@ void amdgpu_bo_destroy(struct pb_buffer *_buf) else if (bo->initial_domain & RADEON_DOMAIN_GTT) ws->allocated_gtt -= align64(bo->base.size, ws->info.gart_page_size); - if (bo->u.real.map_count >= 1) { - if (bo->initial_domain & RADEON_DOMAIN_VRAM) - ws->mapped_vram -= bo->base.size; - else if (bo->initial_domain & RADEON_DOMAIN_GTT) - ws->mapped_gtt -= bo->base.size; - ws->num_mapped_buffers--; - } - + simple_mtx_destroy(&bo->lock); FREE(bo); } @@ -218,6 +220,37 @@ static void amdgpu_bo_destroy_or_cache(struct pb_buffer *_buf) amdgpu_bo_destroy(_buf); } +static void amdgpu_clean_up_buffer_managers(struct amdgpu_winsys *ws) +{ + for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) + pb_slabs_reclaim(&ws->bo_slabs[i]); + + pb_cache_release_all_buffers(&ws->bo_cache); +} + +static bool amdgpu_bo_do_map(struct amdgpu_winsys_bo *bo, void **cpu) +{ + assert(!bo->sparse && bo->bo && !bo->is_user_ptr); + int r = amdgpu_bo_cpu_map(bo->bo, cpu); + if (r) { + /* Clean up buffer managers and try again. 
*/ + amdgpu_clean_up_buffer_managers(bo->ws); + r = amdgpu_bo_cpu_map(bo->bo, cpu); + if (r) + return false; + } + + if (p_atomic_inc_return(&bo->u.real.map_count) == 1) { + if (bo->initial_domain & RADEON_DOMAIN_VRAM) + bo->ws->mapped_vram += bo->base.size; + else if (bo->initial_domain & RADEON_DOMAIN_GTT) + bo->ws->mapped_gtt += bo->base.size; + bo->ws->num_mapped_buffers++; + } + + return true; +} + static void *amdgpu_bo_map(struct pb_buffer *buf, struct radeon_cmdbuf *rcs, enum pipe_transfer_usage usage) @@ -225,9 +258,6 @@ static void *amdgpu_bo_map(struct pb_buffer *buf, struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf; struct amdgpu_winsys_bo *real; struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs; - int r; - void *cpu = NULL; - uint64_t offset = 0; assert(!bo->sparse); @@ -312,9 +342,9 @@ static void *amdgpu_bo_map(struct pb_buffer *buf, } } - /* If the buffer is created from user memory, return the user pointer. */ - if (bo->user_ptr) - return bo->user_ptr; + /* Buffer synchronization has been checked, now actually map the buffer. */ + void *cpu = NULL; + uint64_t offset = 0; if (bo->bo) { real = bo; @@ -323,22 +353,31 @@ static void *amdgpu_bo_map(struct pb_buffer *buf, offset = bo->va - real->va; } - r = amdgpu_bo_cpu_map(real->bo, &cpu); - if (r) { - /* Clear the cache and try again. */ - pb_cache_release_all_buffers(&real->ws->bo_cache); - r = amdgpu_bo_cpu_map(real->bo, &cpu); - if (r) - return NULL; + if (usage & RADEON_TRANSFER_TEMPORARY) { + if (real->is_user_ptr) { + cpu = real->cpu_ptr; + } else { + if (!amdgpu_bo_do_map(real, &cpu)) + return NULL; + } + } else { + cpu = p_atomic_read(&real->cpu_ptr); + if (!cpu) { + simple_mtx_lock(&real->lock); + /* Must re-check due to the possibility of a race. Re-check need not + * be atomic thanks to the lock. */ + cpu = real->cpu_ptr; + if (!cpu) { + if (!amdgpu_bo_do_map(real, &cpu)) { + simple_mtx_unlock(&real->lock); + return NULL; + } + p_atomic_set(&real->cpu_ptr, cpu); + } + simple_mtx_unlock(&real->lock); + } } - if (p_atomic_inc_return(&real->u.real.map_count) == 1) { - if (real->initial_domain & RADEON_DOMAIN_VRAM) - real->ws->mapped_vram += real->base.size; - else if (real->initial_domain & RADEON_DOMAIN_GTT) - real->ws->mapped_gtt += real->base.size; - real->ws->num_mapped_buffers++; - } return (uint8_t*)cpu + offset; } @@ -349,12 +388,15 @@ static void amdgpu_bo_unmap(struct pb_buffer *buf) assert(!bo->sparse); - if (bo->user_ptr) + if (bo->is_user_ptr) return; real = bo->bo ? bo : bo->u.slab.real; - + assert(real->u.real.map_count != 0 && "too many unmaps"); if (p_atomic_dec_zero(&real->u.real.map_count)) { + assert(!real->cpu_ptr && + "too many unmaps or forgot RADEON_TRANSFER_TEMPORARY flag"); + if (real->initial_domain & RADEON_DOMAIN_VRAM) real->ws->mapped_vram -= real->base.size; else if (real->initial_domain & RADEON_DOMAIN_GTT) @@ -384,6 +426,27 @@ static void amdgpu_add_buffer_to_global_list(struct amdgpu_winsys_bo *bo) } } +static uint64_t amdgpu_get_optimal_vm_alignment(struct amdgpu_winsys *ws, + uint64_t size, unsigned alignment) +{ + uint64_t vm_alignment = alignment; + + /* Increase the VM alignment for faster address translation. */ + if (size >= ws->info.pte_fragment_size) + vm_alignment = MAX2(vm_alignment, ws->info.pte_fragment_size); + + /* Gfx9: Increase the VM alignment to the most significant bit set + * in the size for faster address translation. + */ + if (ws->info.chip_class >= GFX9) { + unsigned msb = util_last_bit64(size); /* 0 = no bit is set */ + uint64_t msb_alignment = msb ? 
1ull << (msb - 1) : 0; + + vm_alignment = MAX2(vm_alignment, msb_alignment); + } + return vm_alignment; +} + static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws, uint64_t size, unsigned alignment, @@ -396,11 +459,12 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws, uint64_t va = 0; struct amdgpu_winsys_bo *bo; amdgpu_va_handle va_handle; - unsigned va_gap_size; int r; /* VRAM or GTT must be specified, but not both at the same time. */ - assert(util_bitcount(initial_domain & RADEON_DOMAIN_VRAM_GTT) == 1); + assert(util_bitcount(initial_domain & (RADEON_DOMAIN_VRAM_GTT | + RADEON_DOMAIN_GDS | + RADEON_DOMAIN_OA)) == 1); bo = CALLOC_STRUCT(amdgpu_winsys_bo); if (!bo) { @@ -418,6 +482,10 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws, request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM; if (initial_domain & RADEON_DOMAIN_GTT) request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT; + if (initial_domain & RADEON_DOMAIN_GDS) + request.preferred_heap |= AMDGPU_GEM_DOMAIN_GDS; + if (initial_domain & RADEON_DOMAIN_OA) + request.preferred_heap |= AMDGPU_GEM_DOMAIN_OA; /* Since VRAM and GTT have almost the same performance on APUs, we could * just set GTT. However, in order to decrease GTT(RAM) usage, which is @@ -447,27 +515,31 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws, goto error_bo_alloc; } - va_gap_size = ws->check_vm ? MAX2(4 * alignment, 64 * 1024) : 0; - if (size > ws->info.pte_fragment_size) - alignment = MAX2(alignment, ws->info.pte_fragment_size); - r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general, - size + va_gap_size, alignment, 0, &va, &va_handle, - (flags & RADEON_FLAG_32BIT ? AMDGPU_VA_RANGE_32_BIT : 0) | - AMDGPU_VA_RANGE_HIGH); - if (r) - goto error_va_alloc; + if (initial_domain & RADEON_DOMAIN_VRAM_GTT) { + unsigned va_gap_size = ws->check_vm ? MAX2(4 * alignment, 64 * 1024) : 0; + + r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general, + size + va_gap_size, + amdgpu_get_optimal_vm_alignment(ws, size, alignment), + 0, &va, &va_handle, + (flags & RADEON_FLAG_32BIT ? 
AMDGPU_VA_RANGE_32_BIT : 0) | + AMDGPU_VA_RANGE_HIGH); + if (r) + goto error_va_alloc; - unsigned vm_flags = AMDGPU_VM_PAGE_READABLE | - AMDGPU_VM_PAGE_EXECUTABLE; + unsigned vm_flags = AMDGPU_VM_PAGE_READABLE | + AMDGPU_VM_PAGE_EXECUTABLE; - if (!(flags & RADEON_FLAG_READ_ONLY)) - vm_flags |= AMDGPU_VM_PAGE_WRITEABLE; + if (!(flags & RADEON_FLAG_READ_ONLY)) + vm_flags |= AMDGPU_VM_PAGE_WRITEABLE; - r = amdgpu_bo_va_op_raw(ws->dev, buf_handle, 0, size, va, vm_flags, + r = amdgpu_bo_va_op_raw(ws->dev, buf_handle, 0, size, va, vm_flags, AMDGPU_VA_OP_MAP); - if (r) - goto error_va_map; + if (r) + goto error_va_map; + } + simple_mtx_init(&bo->lock, mtx_plain); pipe_reference_init(&bo->base.reference, 1); bo->base.alignment = alignment; bo->base.usage = 0; @@ -486,7 +558,7 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws, else if (initial_domain & RADEON_DOMAIN_GTT) ws->allocated_gtt += align64(size, ws->info.gart_page_size); - amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms_noimport, &bo->u.real.kms_handle); + amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms, &bo->u.real.kms_handle); amdgpu_add_buffer_to_global_list(bo); @@ -522,13 +594,27 @@ bool amdgpu_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry) return amdgpu_bo_can_reclaim(&bo->base); } +static struct pb_slabs *get_slabs(struct amdgpu_winsys *ws, uint64_t size) +{ + /* Find the correct slab allocator for the given size. */ + for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) { + struct pb_slabs *slabs = &ws->bo_slabs[i]; + + if (size <= 1 << (slabs->min_order + slabs->num_orders - 1)) + return slabs; + } + + assert(0); + return NULL; +} + static void amdgpu_bo_slab_destroy(struct pb_buffer *_buf) { struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); assert(!bo->bo); - pb_slab_free(&bo->ws->bo_slabs, &bo->u.slab.entry); + pb_slab_free(get_slabs(bo->ws, bo->base.size), &bo->u.slab.entry); } static const struct pb_vtbl amdgpu_winsys_bo_slab_vtbl = { @@ -545,19 +631,37 @@ struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap, enum radeon_bo_domain domains = radeon_domain_from_heap(heap); enum radeon_bo_flag flags = radeon_flags_from_heap(heap); uint32_t base_id; + unsigned slab_size = 0; if (!slab) return NULL; - unsigned slab_size = 1 << AMDGPU_SLAB_BO_SIZE_LOG2; + /* Determine the slab buffer size. */ + for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) { + struct pb_slabs *slabs = &ws->bo_slabs[i]; + unsigned max_entry_size = 1 << (slabs->min_order + slabs->num_orders - 1); + + if (entry_size <= max_entry_size) { + /* The slab size is twice the size of the largest possible entry. */ + slab_size = max_entry_size * 2; + + /* The largest slab should have the same size as the PTE fragment + * size to get faster address translation. 
+ */ + if (i == NUM_SLAB_ALLOCATORS - 1 && + slab_size < ws->info.pte_fragment_size) + slab_size = ws->info.pte_fragment_size; + break; + } + } + assert(slab_size != 0); + slab->buffer = amdgpu_winsys_bo(amdgpu_bo_create(&ws->base, slab_size, slab_size, domains, flags)); if (!slab->buffer) goto fail; - assert(slab->buffer->bo); - slab->base.num_entries = slab->buffer->base.size / entry_size; slab->base.num_free = slab->base.num_entries; slab->entries = CALLOC(slab->base.num_entries, sizeof(*slab->entries)); @@ -571,6 +675,7 @@ struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap, for (unsigned i = 0; i < slab->base.num_entries; ++i) { struct amdgpu_winsys_bo *bo = &slab->entries[i]; + simple_mtx_init(&bo->lock, mtx_plain); bo->base.alignment = entry_size; bo->base.usage = slab->buffer->base.usage; bo->base.size = entry_size; @@ -581,7 +686,15 @@ struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap, bo->unique_id = base_id + i; bo->u.slab.entry.slab = &slab->base; bo->u.slab.entry.group_index = group_index; - bo->u.slab.real = slab->buffer; + + if (slab->buffer->bo) { + /* The slab is not suballocated. */ + bo->u.slab.real = slab->buffer; + } else { + /* The slab is allocated out of a bigger slab. */ + bo->u.slab.real = slab->buffer->u.slab.real; + assert(bo->u.slab.real->bo); + } LIST_ADDTAIL(&bo->u.slab.entry.head, &slab->base.free); } @@ -599,8 +712,10 @@ void amdgpu_bo_slab_free(void *priv, struct pb_slab *pslab) { struct amdgpu_slab *slab = amdgpu_slab(pslab); - for (unsigned i = 0; i < slab->base.num_entries; ++i) + for (unsigned i = 0; i < slab->base.num_entries; ++i) { amdgpu_bo_remove_fences(&slab->entries[i]); + simple_mtx_destroy(&slab->entries[i].lock); + } FREE(slab->entries); amdgpu_winsys_bo_reference(&slab->buffer, NULL); @@ -858,8 +973,8 @@ static void amdgpu_bo_sparse_destroy(struct pb_buffer *_buf) } amdgpu_va_range_free(bo->u.sparse.va_handle); - simple_mtx_destroy(&bo->u.sparse.commit_lock); FREE(bo->u.sparse.commitments); + simple_mtx_destroy(&bo->lock); FREE(bo); } @@ -889,6 +1004,7 @@ amdgpu_bo_sparse_create(struct amdgpu_winsys *ws, uint64_t size, if (!bo) return NULL; + simple_mtx_init(&bo->lock, mtx_plain); pipe_reference_init(&bo->base.reference, 1); bo->base.alignment = RADEON_SPARSE_PAGE_SIZE; bo->base.size = size; @@ -905,7 +1021,6 @@ amdgpu_bo_sparse_create(struct amdgpu_winsys *ws, uint64_t size, if (!bo->u.sparse.commitments) goto error_alloc_commitments; - simple_mtx_init(&bo->u.sparse.commit_lock, mtx_plain); LIST_INITHEAD(&bo->u.sparse.backing); /* For simplicity, we always map a multiple of the page size. 
*/ @@ -928,9 +1043,9 @@ amdgpu_bo_sparse_create(struct amdgpu_winsys *ws, uint64_t size, error_va_map: amdgpu_va_range_free(bo->u.sparse.va_handle); error_va_alloc: - simple_mtx_destroy(&bo->u.sparse.commit_lock); FREE(bo->u.sparse.commitments); error_alloc_commitments: + simple_mtx_destroy(&bo->lock); FREE(bo); return NULL; } @@ -955,7 +1070,7 @@ amdgpu_bo_sparse_commit(struct pb_buffer *buf, uint64_t offset, uint64_t size, va_page = offset / RADEON_SPARSE_PAGE_SIZE; end_va_page = va_page + DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE); - simple_mtx_lock(&bo->u.sparse.commit_lock); + simple_mtx_lock(&bo->lock); #if DEBUG_SPARSE_COMMITS sparse_dump(bo, __func__); @@ -1059,7 +1174,7 @@ amdgpu_bo_sparse_commit(struct pb_buffer *buf, uint64_t offset, uint64_t size, } out: - simple_mtx_unlock(&bo->u.sparse.commit_lock); + simple_mtx_unlock(&bo->lock); return ok; } @@ -1193,22 +1308,28 @@ amdgpu_bo_create(struct radeon_winsys *rws, /* Sparse buffers must have NO_CPU_ACCESS set. */ assert(!(flags & RADEON_FLAG_SPARSE) || flags & RADEON_FLAG_NO_CPU_ACCESS); + struct pb_slabs *last_slab = &ws->bo_slabs[NUM_SLAB_ALLOCATORS - 1]; + unsigned max_slab_entry_size = 1 << (last_slab->min_order + last_slab->num_orders - 1); + /* Sub-allocate small buffers from slabs. */ if (!(flags & (RADEON_FLAG_NO_SUBALLOC | RADEON_FLAG_SPARSE)) && - size <= (1 << AMDGPU_SLAB_MAX_SIZE_LOG2) && - alignment <= MAX2(1 << AMDGPU_SLAB_MIN_SIZE_LOG2, util_next_power_of_two(size))) { + size <= max_slab_entry_size && + /* The alignment must be at most the size of the smallest slab entry or + * the next power of two. */ + alignment <= MAX2(1 << ws->bo_slabs[0].min_order, util_next_power_of_two(size))) { struct pb_slab_entry *entry; int heap = radeon_get_heap_index(domain, flags); if (heap < 0 || heap >= RADEON_MAX_SLAB_HEAPS) goto no_slab; - entry = pb_slab_alloc(&ws->bo_slabs, size, heap); + struct pb_slabs *slabs = get_slabs(ws, size); + entry = pb_slab_alloc(slabs, size, heap); if (!entry) { - /* Clear the cache and try again. */ - pb_cache_release_all_buffers(&ws->bo_cache); + /* Clean up buffer managers and try again. */ + amdgpu_clean_up_buffer_managers(ws); - entry = pb_slab_alloc(&ws->bo_slabs, size, heap); + entry = pb_slab_alloc(slabs, size, heap); } if (!entry) return NULL; @@ -1235,8 +1356,10 @@ no_slab: * BOs. Aligning this here helps the cached bufmgr. Especially small BOs, * like constant/uniform buffers, can benefit from better and more reuse. */ - size = align64(size, ws->info.gart_page_size); - alignment = align(alignment, ws->info.gart_page_size); + if (domain & RADEON_DOMAIN_VRAM_GTT) { + size = align64(size, ws->info.gart_page_size); + alignment = align(alignment, ws->info.gart_page_size); + } bool use_reusable_pool = flags & RADEON_FLAG_NO_INTERPROCESS_SHARING; @@ -1254,9 +1377,9 @@ no_slab: /* Create a new one. */ bo = amdgpu_create_bo(ws, size, alignment, domain, flags, heap); if (!bo) { - /* Clear the cache and try again. */ - pb_slabs_reclaim(&ws->bo_slabs); - pb_cache_release_all_buffers(&ws->bo_cache); + /* Clean up buffer managers and try again. 
*/ + amdgpu_clean_up_buffer_managers(ws); + bo = amdgpu_create_bo(ws, size, alignment, domain, flags, heap); if (!bo) return NULL; @@ -1268,6 +1391,7 @@ no_slab: static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws, struct winsys_handle *whandle, + unsigned vm_alignment, unsigned *stride, unsigned *offset) { @@ -1325,8 +1449,10 @@ static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws, goto error; r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general, - result.alloc_size, 1 << 20, 0, &va, &va_handle, - AMDGPU_VA_RANGE_HIGH); + result.alloc_size, + amdgpu_get_optimal_vm_alignment(ws, result.alloc_size, + vm_alignment), + 0, &va, &va_handle, AMDGPU_VA_RANGE_HIGH); if (r) goto error; @@ -1344,6 +1470,7 @@ static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws, initial |= RADEON_DOMAIN_GTT; /* Initialize the structure. */ + simple_mtx_init(&bo->lock, mtx_plain); pipe_reference_init(&bo->base.reference, 1); bo->base.alignment = info.phys_alignment; bo->bo = result.buf_handle; @@ -1361,7 +1488,7 @@ static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws, else if (bo->initial_domain & RADEON_DOMAIN_GTT) ws->allocated_gtt += align64(bo->base.size, ws->info.gart_page_size); - amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms_noimport, &bo->u.real.kms_handle); + amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms, &bo->u.real.kms_handle); amdgpu_add_buffer_to_global_list(bo); @@ -1445,21 +1572,25 @@ static struct pb_buffer *amdgpu_bo_from_ptr(struct radeon_winsys *rws, goto error; if (amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general, - aligned_size, 1 << 12, 0, &va, &va_handle, - AMDGPU_VA_RANGE_HIGH)) + aligned_size, + amdgpu_get_optimal_vm_alignment(ws, aligned_size, + ws->info.gart_page_size), + 0, &va, &va_handle, AMDGPU_VA_RANGE_HIGH)) goto error_va_alloc; if (amdgpu_bo_va_op(buf_handle, 0, aligned_size, va, 0, AMDGPU_VA_OP_MAP)) goto error_va_map; /* Initialize it. 
*/ + bo->is_user_ptr = true; pipe_reference_init(&bo->base.reference, 1); + simple_mtx_init(&bo->lock, mtx_plain); bo->bo = buf_handle; bo->base.alignment = 0; bo->base.size = size; bo->base.vtbl = &amdgpu_winsys_bo_vtbl; bo->ws = ws; - bo->user_ptr = pointer; + bo->cpu_ptr = pointer; bo->va = va; bo->u.real.va_handle = va_handle; bo->initial_domain = RADEON_DOMAIN_GTT; @@ -1469,7 +1600,7 @@ static struct pb_buffer *amdgpu_bo_from_ptr(struct radeon_winsys *rws, amdgpu_add_buffer_to_global_list(bo); - amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms_noimport, &bo->u.real.kms_handle); + amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms, &bo->u.real.kms_handle); return (struct pb_buffer*)bo; @@ -1486,7 +1617,7 @@ error: static bool amdgpu_bo_is_user_ptr(struct pb_buffer *buf) { - return ((struct amdgpu_winsys_bo*)buf)->user_ptr != NULL; + return ((struct amdgpu_winsys_bo*)buf)->is_user_ptr; } static bool amdgpu_bo_is_suballocated(struct pb_buffer *buf) diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h index 1e07e4734..88f424132 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h @@ -74,7 +74,6 @@ struct amdgpu_winsys_bo { struct amdgpu_winsys_bo *real; } slab; struct { - simple_mtx_t commit_lock; amdgpu_va_handle va_handle; enum radeon_bo_flag flags; @@ -89,10 +88,12 @@ struct amdgpu_winsys_bo { } u; struct amdgpu_winsys *ws; - void *user_ptr; /* from buffer_from_ptr */ + void *cpu_ptr; /* for user_ptr and permanent maps */ amdgpu_bo_handle bo; /* NULL for slab entries and sparse buffers */ bool sparse; + bool is_user_ptr; + bool is_local; uint32_t unique_id; uint64_t va; enum radeon_bo_domain initial_domain; @@ -114,7 +115,7 @@ struct amdgpu_winsys_bo { unsigned max_fences; struct pipe_fence_handle **fences; - bool is_local; + simple_mtx_t lock; }; struct amdgpu_slab { diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c index 3f0a6a1c9..2e595e5a1 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c @@ -172,45 +172,45 @@ static void amdgpu_fence_submitted(struct pipe_fence_handle *fence, uint64_t seq_no, uint64_t *user_fence_cpu_address) { - struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence; + struct amdgpu_fence *afence = (struct amdgpu_fence*)fence; - rfence->fence.fence = seq_no; - rfence->user_fence_cpu_address = user_fence_cpu_address; - util_queue_fence_signal(&rfence->submitted); + afence->fence.fence = seq_no; + afence->user_fence_cpu_address = user_fence_cpu_address; + util_queue_fence_signal(&afence->submitted); } static void amdgpu_fence_signalled(struct pipe_fence_handle *fence) { - struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence; + struct amdgpu_fence *afence = (struct amdgpu_fence*)fence; - rfence->signalled = true; - util_queue_fence_signal(&rfence->submitted); + afence->signalled = true; + util_queue_fence_signal(&afence->submitted); } bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout, bool absolute) { - struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence; + struct amdgpu_fence *afence = (struct amdgpu_fence*)fence; uint32_t expired; int64_t abs_timeout; uint64_t *user_fence_cpu; int r; - if (rfence->signalled) + if (afence->signalled) return true; /* Handle syncobjs. 
*/ - if (amdgpu_fence_is_syncobj(rfence)) { + if (amdgpu_fence_is_syncobj(afence)) { /* Absolute timeouts are only be used by BO fences, which aren't * backed by syncobjs. */ assert(!absolute); - if (amdgpu_cs_syncobj_wait(rfence->ws->dev, &rfence->syncobj, 1, + if (amdgpu_cs_syncobj_wait(afence->ws->dev, &afence->syncobj, 1, timeout, 0, NULL)) return false; - rfence->signalled = true; + afence->signalled = true; return true; } @@ -222,13 +222,13 @@ bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout, /* The fence might not have a number assigned if its IB is being * submitted in the other thread right now. Wait until the submission * is done. */ - if (!util_queue_fence_wait_timeout(&rfence->submitted, abs_timeout)) + if (!util_queue_fence_wait_timeout(&afence->submitted, abs_timeout)) return false; - user_fence_cpu = rfence->user_fence_cpu_address; + user_fence_cpu = afence->user_fence_cpu_address; if (user_fence_cpu) { - if (*user_fence_cpu >= rfence->fence.fence) { - rfence->signalled = true; + if (*user_fence_cpu >= afence->fence.fence) { + afence->signalled = true; return true; } @@ -238,7 +238,7 @@ bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout, } /* Now use the libdrm query. */ - r = amdgpu_cs_query_fence_status(&rfence->fence, + r = amdgpu_cs_query_fence_status(&afence->fence, abs_timeout, AMDGPU_QUERY_FENCE_TIMEOUT_IS_ABSOLUTE, &expired); @@ -250,7 +250,7 @@ bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout, if (expired) { /* This variable can only transition from false to true, so it doesn't * matter if threads race for it. */ - rfence->signalled = true; + afence->signalled = true; return true; } return false; @@ -386,7 +386,8 @@ static bool amdgpu_cs_has_user_fence(struct amdgpu_cs_context *cs) cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCE && cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_UVD_ENC && cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_DEC && - cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_ENC; + cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_ENC && + cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_JPEG; } static bool amdgpu_cs_has_chaining(struct amdgpu_cs *cs) @@ -598,7 +599,7 @@ static int amdgpu_lookup_or_add_sparse_buffer(struct amdgpu_cs *acs, /* We delay adding the backing buffers until we really have to. However, * we cannot delay accounting for memory use. 
*/ - simple_mtx_lock(&bo->u.sparse.commit_lock); + simple_mtx_lock(&bo->lock); list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) { if (bo->initial_domain & RADEON_DOMAIN_VRAM) @@ -607,7 +608,7 @@ static int amdgpu_lookup_or_add_sparse_buffer(struct amdgpu_cs *acs, acs->main.base.used_gart += backing->bo->base.size; } - simple_mtx_unlock(&bo->u.sparse.commit_lock); + simple_mtx_unlock(&bo->lock); return idx; } @@ -923,7 +924,8 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx, enum ring_type ring_type, void (*flush)(void *ctx, unsigned flags, struct pipe_fence_handle **fence), - void *flush_ctx) + void *flush_ctx, + bool stop_exec_on_failure) { struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx; struct amdgpu_cs *cs; @@ -939,6 +941,7 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx, cs->flush_cs = flush; cs->flush_data = flush_ctx; cs->ring_type = ring_type; + cs->stop_exec_on_failure = stop_exec_on_failure; struct amdgpu_cs_fence_info fence_info; fence_info.handle = cs->ctx->user_fence_bo; @@ -1263,7 +1266,7 @@ static bool amdgpu_add_sparse_backing_buffers(struct amdgpu_cs_context *cs) struct amdgpu_cs_buffer *buffer = &cs->sparse_buffers[i]; struct amdgpu_winsys_bo *bo = buffer->bo; - simple_mtx_lock(&bo->u.sparse.commit_lock); + simple_mtx_lock(&bo->lock); list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) { /* We can directly add the buffer here, because we know that each @@ -1272,7 +1275,7 @@ static bool amdgpu_add_sparse_backing_buffers(struct amdgpu_cs_context *cs) int idx = amdgpu_do_add_real_buffer(cs, backing->bo); if (idx < 0) { fprintf(stderr, "%s: failed to add buffer\n", __FUNCTION__); - simple_mtx_unlock(&bo->u.sparse.commit_lock); + simple_mtx_unlock(&bo->lock); return false; } @@ -1281,7 +1284,7 @@ static bool amdgpu_add_sparse_backing_buffers(struct amdgpu_cs_context *cs) p_atomic_inc(&backing->bo->num_active_ioctls); } - simple_mtx_unlock(&bo->u.sparse.commit_lock); + simple_mtx_unlock(&bo->lock); } return true; @@ -1293,7 +1296,7 @@ void amdgpu_cs_submit_ib(void *job, int thread_index) struct amdgpu_winsys *ws = acs->ctx->ws; struct amdgpu_cs_context *cs = acs->cst; int i, r; - amdgpu_bo_list_handle bo_list = NULL; + uint32_t bo_list = 0; uint64_t seq_no = 0; bool has_user_fence = amdgpu_cs_has_user_fence(cs); bool use_bo_list_create = ws->info.drm_minor < 27; @@ -1304,27 +1307,28 @@ void amdgpu_cs_submit_ib(void *job, int thread_index) /* The buffer list contains all buffers. This is a slow path that * ensures that no buffer is missing in the BO list. 
*/ + unsigned num_handles = 0; + struct drm_amdgpu_bo_list_entry *list = + alloca(ws->num_buffers * sizeof(struct drm_amdgpu_bo_list_entry)); struct amdgpu_winsys_bo *bo; - amdgpu_bo_handle *handles; - unsigned num = 0; simple_mtx_lock(&ws->global_bo_list_lock); - handles = alloca(sizeof(handles[0]) * ws->num_buffers); - LIST_FOR_EACH_ENTRY(bo, &ws->global_bo_list, u.real.global_list_item) { - assert(num < ws->num_buffers); - handles[num++] = bo->bo; + if (bo->is_local) + continue; + + list[num_handles].bo_handle = bo->u.real.kms_handle; + list[num_handles].bo_priority = 0; + ++num_handles; } - r = amdgpu_bo_list_create(ws->dev, ws->num_buffers, - handles, NULL, &bo_list); + r = amdgpu_bo_list_create_raw(ws->dev, ws->num_buffers, list, &bo_list); simple_mtx_unlock(&ws->global_bo_list_lock); if (r) { fprintf(stderr, "amdgpu: buffer list creation failed (%d)\n", r); goto cleanup; } - } else if (!use_bo_list_create) { - /* Standard path passing the buffer list via the CS ioctl. */ + } else { if (!amdgpu_add_sparse_backing_buffers(cs)) { fprintf(stderr, "amdgpu: amdgpu_add_sparse_backing_buffers failed\n"); r = -ENOMEM; @@ -1348,52 +1352,27 @@ void amdgpu_cs_submit_ib(void *job, int thread_index) ++num_handles; } - bo_list_in.operation = ~0; - bo_list_in.list_handle = ~0; - bo_list_in.bo_number = num_handles; - bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry); - bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)list; - } else { - /* Legacy path creating the buffer list handle and passing it to the CS ioctl. */ - unsigned num_handles; - - if (!amdgpu_add_sparse_backing_buffers(cs)) { - fprintf(stderr, "amdgpu: amdgpu_add_sparse_backing_buffers failed\n"); - r = -ENOMEM; - goto cleanup; - } - - amdgpu_bo_handle *handles = alloca(sizeof(*handles) * cs->num_real_buffers); - uint8_t *flags = alloca(sizeof(*flags) * cs->num_real_buffers); - - num_handles = 0; - for (i = 0; i < cs->num_real_buffers; ++i) { - struct amdgpu_cs_buffer *buffer = &cs->real_buffers[i]; - - if (buffer->bo->is_local) - continue; - - assert(buffer->u.real.priority_usage != 0); - - handles[num_handles] = buffer->bo->bo; - flags[num_handles] = (util_last_bit(buffer->u.real.priority_usage) - 1) / 2; - ++num_handles; - } - - if (num_handles) { - r = amdgpu_bo_list_create(ws->dev, num_handles, - handles, flags, &bo_list); + if (use_bo_list_create) { + /* Legacy path creating the buffer list handle and passing it to the CS ioctl. */ + r = amdgpu_bo_list_create_raw(ws->dev, num_handles, list, &bo_list); if (r) { fprintf(stderr, "amdgpu: buffer list creation failed (%d)\n", r); goto cleanup; } + } else { + /* Standard path passing the buffer list via the CS ioctl. */ + bo_list_in.operation = ~0; + bo_list_in.list_handle = ~0; + bo_list_in.bo_number = num_handles; + bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry); + bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)list; } } if (acs->ring_type == RING_GFX) ws->gfx_bo_list_counter += cs->num_real_buffers; - if (acs->ctx->num_rejected_cs) { + if (acs->stop_exec_on_failure && acs->ctx->num_rejected_cs) { r = -ECANCELED; } else { struct drm_amdgpu_cs_chunk chunks[6]; @@ -1497,8 +1476,8 @@ void amdgpu_cs_submit_ib(void *job, int thread_index) assert(num_chunks <= ARRAY_SIZE(chunks)); - r = amdgpu_cs_submit_raw(ws->dev, acs->ctx->ctx, bo_list, - num_chunks, chunks, &seq_no); + r = amdgpu_cs_submit_raw2(ws->dev, acs->ctx->ctx, bo_list, + num_chunks, chunks, &seq_no); } if (r) { @@ -1523,7 +1502,7 @@ void amdgpu_cs_submit_ib(void *job, int thread_index) /* Cleanup. 
*/ if (bo_list) - amdgpu_bo_list_destroy(bo_list); + amdgpu_bo_list_destroy_raw(ws->dev, bo_list); cleanup: /* If there was an error, signal the fence, because it won't be signalled diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h index 9f5a4fd99..07b5d4b35 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h @@ -129,6 +129,7 @@ struct amdgpu_cs { /* Flush CS. */ void (*flush_cs)(void *ctx, unsigned flags, struct pipe_fence_handle **fence); void *flush_data; + bool stop_exec_on_failure; struct util_queue_fence flush_completed; struct pipe_fence_handle *next_fence; @@ -169,11 +170,11 @@ static inline void amdgpu_ctx_unref(struct amdgpu_ctx *ctx) static inline void amdgpu_fence_reference(struct pipe_fence_handle **dst, struct pipe_fence_handle *src) { - struct amdgpu_fence **rdst = (struct amdgpu_fence **)dst; - struct amdgpu_fence *rsrc = (struct amdgpu_fence *)src; + struct amdgpu_fence **adst = (struct amdgpu_fence **)dst; + struct amdgpu_fence *asrc = (struct amdgpu_fence *)src; - if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) { - struct amdgpu_fence *fence = *rdst; + if (pipe_reference(&(*adst)->reference, &asrc->reference)) { + struct amdgpu_fence *fence = *adst; if (amdgpu_fence_is_syncobj(fence)) amdgpu_cs_destroy_syncobj(fence->ws->dev, fence->syncobj); @@ -183,7 +184,7 @@ static inline void amdgpu_fence_reference(struct pipe_fence_handle **dst, util_queue_fence_destroy(&fence->submitted); FREE(fence); } - *rdst = rsrc; + *adst = asrc; } int amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo); diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c index b20d70267..45e54b479 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c @@ -38,6 +38,7 @@ #include <xf86drm.h> #include <stdio.h> #include <sys/stat.h> +#include "amd/common/ac_llvm_util.h" #include "amd/common/sid.h" #include "amd/common/gfx9d.h" @@ -50,6 +51,39 @@ static simple_mtx_t dev_tab_mutex = _SIMPLE_MTX_INITIALIZER_NP; DEBUG_GET_ONCE_BOOL_OPTION(all_bos, "RADEON_ALL_BOS", false) +static void handle_env_var_force_family(struct amdgpu_winsys *ws) +{ + const char *family = debug_get_option("SI_FORCE_FAMILY", NULL); + unsigned i; + + if (!family) + return; + + for (i = CHIP_TAHITI; i < CHIP_LAST; i++) { + if (!strcmp(family, ac_get_llvm_processor_name(i))) { + /* Override family and chip_class. */ + ws->info.family = i; + ws->info.name = "GCN-NOOP"; + + if (i >= CHIP_VEGA10) + ws->info.chip_class = GFX9; + else if (i >= CHIP_TONGA) + ws->info.chip_class = VI; + else if (i >= CHIP_BONAIRE) + ws->info.chip_class = CIK; + else + ws->info.chip_class = SI; + + /* Don't submit any IBs. */ + setenv("RADEON_NOOP", "1", 1); + return; + } + } + + fprintf(stderr, "radeonsi: Unknown family: %s\n", family); + exit(1); +} + /* Helper function to do the ioctls needed for setup and init. */ static bool do_winsys_init(struct amdgpu_winsys *ws, const struct pipe_screen_config *config, @@ -58,6 +92,12 @@ static bool do_winsys_init(struct amdgpu_winsys *ws, if (!ac_query_gpu_info(fd, ws->dev, &ws->info, &ws->amdinfo)) goto fail; + /* TODO: Enable this once the kernel handles it efficiently. 
*/ + if (ws->info.has_dedicated_vram) + ws->info.has_local_buffers = false; + + handle_env_var_force_family(ws); + ws->addrlib = amdgpu_addr_create(&ws->info, &ws->amdinfo, &ws->info.max_alignment); if (!ws->addrlib) { fprintf(stderr, "amdgpu: Cannot create addrlib.\n"); @@ -95,7 +135,10 @@ static void amdgpu_winsys_destroy(struct radeon_winsys *rws) util_queue_destroy(&ws->cs_queue); simple_mtx_destroy(&ws->bo_fence_lock); - pb_slabs_deinit(&ws->bo_slabs); + for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) { + if (ws->bo_slabs[i].groups) + pb_slabs_deinit(&ws->bo_slabs[i]); + } pb_cache_deinit(&ws->bo_cache); util_hash_table_destroy(ws->bo_export_table); simple_mtx_destroy(&ws->global_bo_list_lock); @@ -307,16 +350,33 @@ amdgpu_winsys_create(int fd, const struct pipe_screen_config *config, (ws->info.vram_size + ws->info.gart_size) / 8, amdgpu_bo_destroy, amdgpu_bo_can_reclaim); - if (!pb_slabs_init(&ws->bo_slabs, - AMDGPU_SLAB_MIN_SIZE_LOG2, AMDGPU_SLAB_MAX_SIZE_LOG2, - RADEON_MAX_SLAB_HEAPS, - ws, - amdgpu_bo_can_reclaim_slab, - amdgpu_bo_slab_alloc, - amdgpu_bo_slab_free)) - goto fail_cache; + unsigned min_slab_order = 9; /* 512 bytes */ + unsigned max_slab_order = 18; /* 256 KB - higher numbers increase memory usage */ + unsigned num_slab_orders_per_allocator = (max_slab_order - min_slab_order) / + NUM_SLAB_ALLOCATORS; + + /* Divide the size order range among slab managers. */ + for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) { + unsigned min_order = min_slab_order; + unsigned max_order = MIN2(min_order + num_slab_orders_per_allocator, + max_slab_order); + + if (!pb_slabs_init(&ws->bo_slabs[i], + min_order, max_order, + RADEON_MAX_SLAB_HEAPS, + ws, + amdgpu_bo_can_reclaim_slab, + amdgpu_bo_slab_alloc, + amdgpu_bo_slab_free)) { + amdgpu_winsys_destroy(&ws->base); + simple_mtx_unlock(&dev_tab_mutex); + return NULL; + } + + min_slab_order = max_order + 1; + } - ws->info.min_alloc_size = 1 << AMDGPU_SLAB_MIN_SIZE_LOG2; + ws->info.min_alloc_size = 1 << ws->bo_slabs[0].min_order; /* init reference */ pipe_reference_init(&ws->reference, 1); diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h index c355eff52..4f0b1262e 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h @@ -31,22 +31,24 @@ #include "pipebuffer/pb_cache.h" #include "pipebuffer/pb_slab.h" #include "gallium/drivers/radeon/radeon_winsys.h" -#include "addrlib/addrinterface.h" +#include "addrlib/inc/addrinterface.h" #include "util/simple_mtx.h" #include "util/u_queue.h" #include <amdgpu.h> struct amdgpu_cs; -#define AMDGPU_SLAB_MIN_SIZE_LOG2 9 /* 512 bytes */ -#define AMDGPU_SLAB_MAX_SIZE_LOG2 16 /* 64 KB */ -#define AMDGPU_SLAB_BO_SIZE_LOG2 17 /* 128 KB */ +#define NUM_SLAB_ALLOCATORS 3 struct amdgpu_winsys { struct radeon_winsys base; struct pipe_reference reference; struct pb_cache bo_cache; - struct pb_slabs bo_slabs; + + /* Each slab buffer can only contain suballocations of equal sizes, so we + * need to layer the allocators, so that we don't waste too much memory. + */ + struct pb_slabs bo_slabs[NUM_SLAB_ALLOCATORS]; amdgpu_device_handle dev; |