author     Jonathan Gray <jsg@cvs.openbsd.org>    2019-05-23 05:33:34 +0000
committer  Jonathan Gray <jsg@cvs.openbsd.org>    2019-05-23 05:33:34 +0000
commit     9886815a25d84be79f51e65ebd8e458bb5d26ca8
tree       a65edf018dd992543337433f7303fb29a6c8e8cf
parent     e2a3acb64af2657b1181806818eacad061103c23
Merge Mesa 19.0.5
Diffstat (limited to 'lib/mesa/src/gallium/winsys/amdgpu/drm')
 -rw-r--r--  lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.am      |   1
 -rw-r--r--  lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.in      |   1
 -rw-r--r--  lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c      | 293
 -rw-r--r--  lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h      |   7
 -rw-r--r--  lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c      | 129
 -rw-r--r--  lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h      |  11
 -rw-r--r--  lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c  |  80
 -rw-r--r--  lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h  |  12
 8 files changed, 355 insertions, 179 deletions
diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.am b/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.am
index e35fa2cd0..1c2ec010f 100644
--- a/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.am
+++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.am
@@ -4,6 +4,7 @@ include $(top_srcdir)/src/gallium/Automake.inc
AM_CFLAGS = \
$(GALLIUM_WINSYS_CFLAGS) \
$(AMDGPU_CFLAGS) \
+ $(LLVM_CFLAGS) \
-I$(top_srcdir)/src/amd/
AM_CXXFLAGS = $(AM_CFLAGS)
diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.in b/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.in
index 2cba679ac..2730755f8 100644
--- a/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.in
+++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.in
@@ -539,6 +539,7 @@ GALLIUM_PIPE_LOADER_WINSYS_LIBS = \
AM_CFLAGS = \
$(GALLIUM_WINSYS_CFLAGS) \
$(AMDGPU_CFLAGS) \
+ $(LLVM_CFLAGS) \
-I$(top_srcdir)/src/amd/
AM_CXXFLAGS = $(AM_CFLAGS)
diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index f10805805..58979bd4e 100644
--- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -56,6 +56,7 @@ amdgpu_bo_create(struct radeon_winsys *rws,
unsigned alignment,
enum radeon_bo_domain domain,
enum radeon_bo_flag flags);
+static void amdgpu_bo_unmap(struct pb_buffer *buf);
static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
enum radeon_bo_usage usage)
@@ -173,6 +174,12 @@ void amdgpu_bo_destroy(struct pb_buffer *_buf)
assert(bo->bo && "must not be called for slab entries");
+ if (!bo->is_user_ptr && bo->cpu_ptr) {
+ bo->cpu_ptr = NULL;
+ amdgpu_bo_unmap(&bo->base);
+ }
+ assert(bo->is_user_ptr || bo->u.real.map_count == 0);
+
if (ws->debug_all_bos) {
simple_mtx_lock(&ws->global_bo_list_lock);
LIST_DEL(&bo->u.real.global_list_item);
@@ -184,8 +191,10 @@ void amdgpu_bo_destroy(struct pb_buffer *_buf)
util_hash_table_remove(ws->bo_export_table, bo->bo);
simple_mtx_unlock(&ws->bo_export_table_lock);
- amdgpu_bo_va_op(bo->bo, 0, bo->base.size, bo->va, 0, AMDGPU_VA_OP_UNMAP);
- amdgpu_va_range_free(bo->u.real.va_handle);
+ if (bo->initial_domain & RADEON_DOMAIN_VRAM_GTT) {
+ amdgpu_bo_va_op(bo->bo, 0, bo->base.size, bo->va, 0, AMDGPU_VA_OP_UNMAP);
+ amdgpu_va_range_free(bo->u.real.va_handle);
+ }
amdgpu_bo_free(bo->bo);
amdgpu_bo_remove_fences(bo);
@@ -195,14 +204,7 @@ void amdgpu_bo_destroy(struct pb_buffer *_buf)
else if (bo->initial_domain & RADEON_DOMAIN_GTT)
ws->allocated_gtt -= align64(bo->base.size, ws->info.gart_page_size);
- if (bo->u.real.map_count >= 1) {
- if (bo->initial_domain & RADEON_DOMAIN_VRAM)
- ws->mapped_vram -= bo->base.size;
- else if (bo->initial_domain & RADEON_DOMAIN_GTT)
- ws->mapped_gtt -= bo->base.size;
- ws->num_mapped_buffers--;
- }
-
+ simple_mtx_destroy(&bo->lock);
FREE(bo);
}
@@ -218,6 +220,37 @@ static void amdgpu_bo_destroy_or_cache(struct pb_buffer *_buf)
amdgpu_bo_destroy(_buf);
}
+static void amdgpu_clean_up_buffer_managers(struct amdgpu_winsys *ws)
+{
+ for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++)
+ pb_slabs_reclaim(&ws->bo_slabs[i]);
+
+ pb_cache_release_all_buffers(&ws->bo_cache);
+}
+
+static bool amdgpu_bo_do_map(struct amdgpu_winsys_bo *bo, void **cpu)
+{
+ assert(!bo->sparse && bo->bo && !bo->is_user_ptr);
+ int r = amdgpu_bo_cpu_map(bo->bo, cpu);
+ if (r) {
+ /* Clean up buffer managers and try again. */
+ amdgpu_clean_up_buffer_managers(bo->ws);
+ r = amdgpu_bo_cpu_map(bo->bo, cpu);
+ if (r)
+ return false;
+ }
+
+ if (p_atomic_inc_return(&bo->u.real.map_count) == 1) {
+ if (bo->initial_domain & RADEON_DOMAIN_VRAM)
+ bo->ws->mapped_vram += bo->base.size;
+ else if (bo->initial_domain & RADEON_DOMAIN_GTT)
+ bo->ws->mapped_gtt += bo->base.size;
+ bo->ws->num_mapped_buffers++;
+ }
+
+ return true;
+}
+
static void *amdgpu_bo_map(struct pb_buffer *buf,
struct radeon_cmdbuf *rcs,
enum pipe_transfer_usage usage)
@@ -225,9 +258,6 @@ static void *amdgpu_bo_map(struct pb_buffer *buf,
struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
struct amdgpu_winsys_bo *real;
struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs;
- int r;
- void *cpu = NULL;
- uint64_t offset = 0;
assert(!bo->sparse);
@@ -312,9 +342,9 @@ static void *amdgpu_bo_map(struct pb_buffer *buf,
}
}
- /* If the buffer is created from user memory, return the user pointer. */
- if (bo->user_ptr)
- return bo->user_ptr;
+ /* Buffer synchronization has been checked, now actually map the buffer. */
+ void *cpu = NULL;
+ uint64_t offset = 0;
if (bo->bo) {
real = bo;
@@ -323,22 +353,31 @@ static void *amdgpu_bo_map(struct pb_buffer *buf,
offset = bo->va - real->va;
}
- r = amdgpu_bo_cpu_map(real->bo, &cpu);
- if (r) {
- /* Clear the cache and try again. */
- pb_cache_release_all_buffers(&real->ws->bo_cache);
- r = amdgpu_bo_cpu_map(real->bo, &cpu);
- if (r)
- return NULL;
+ if (usage & RADEON_TRANSFER_TEMPORARY) {
+ if (real->is_user_ptr) {
+ cpu = real->cpu_ptr;
+ } else {
+ if (!amdgpu_bo_do_map(real, &cpu))
+ return NULL;
+ }
+ } else {
+ cpu = p_atomic_read(&real->cpu_ptr);
+ if (!cpu) {
+ simple_mtx_lock(&real->lock);
+ /* Must re-check due to the possibility of a race. Re-check need not
+ * be atomic thanks to the lock. */
+ cpu = real->cpu_ptr;
+ if (!cpu) {
+ if (!amdgpu_bo_do_map(real, &cpu)) {
+ simple_mtx_unlock(&real->lock);
+ return NULL;
+ }
+ p_atomic_set(&real->cpu_ptr, cpu);
+ }
+ simple_mtx_unlock(&real->lock);
+ }
}
- if (p_atomic_inc_return(&real->u.real.map_count) == 1) {
- if (real->initial_domain & RADEON_DOMAIN_VRAM)
- real->ws->mapped_vram += real->base.size;
- else if (real->initial_domain & RADEON_DOMAIN_GTT)
- real->ws->mapped_gtt += real->base.size;
- real->ws->num_mapped_buffers++;
- }
return (uint8_t*)cpu + offset;
}
@@ -349,12 +388,15 @@ static void amdgpu_bo_unmap(struct pb_buffer *buf)
assert(!bo->sparse);
- if (bo->user_ptr)
+ if (bo->is_user_ptr)
return;
real = bo->bo ? bo : bo->u.slab.real;
-
+ assert(real->u.real.map_count != 0 && "too many unmaps");
if (p_atomic_dec_zero(&real->u.real.map_count)) {
+ assert(!real->cpu_ptr &&
+ "too many unmaps or forgot RADEON_TRANSFER_TEMPORARY flag");
+
if (real->initial_domain & RADEON_DOMAIN_VRAM)
real->ws->mapped_vram -= real->base.size;
else if (real->initial_domain & RADEON_DOMAIN_GTT)
@@ -384,6 +426,27 @@ static void amdgpu_add_buffer_to_global_list(struct amdgpu_winsys_bo *bo)
}
}
+static uint64_t amdgpu_get_optimal_vm_alignment(struct amdgpu_winsys *ws,
+ uint64_t size, unsigned alignment)
+{
+ uint64_t vm_alignment = alignment;
+
+ /* Increase the VM alignment for faster address translation. */
+ if (size >= ws->info.pte_fragment_size)
+ vm_alignment = MAX2(vm_alignment, ws->info.pte_fragment_size);
+
+ /* Gfx9: Increase the VM alignment to the most significant bit set
+ * in the size for faster address translation.
+ */
+ if (ws->info.chip_class >= GFX9) {
+ unsigned msb = util_last_bit64(size); /* 0 = no bit is set */
+ uint64_t msb_alignment = msb ? 1ull << (msb - 1) : 0;
+
+ vm_alignment = MAX2(vm_alignment, msb_alignment);
+ }
+ return vm_alignment;
+}
+
static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
uint64_t size,
unsigned alignment,
@@ -396,11 +459,12 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
uint64_t va = 0;
struct amdgpu_winsys_bo *bo;
amdgpu_va_handle va_handle;
- unsigned va_gap_size;
int r;
/* VRAM or GTT must be specified, but not both at the same time. */
- assert(util_bitcount(initial_domain & RADEON_DOMAIN_VRAM_GTT) == 1);
+ assert(util_bitcount(initial_domain & (RADEON_DOMAIN_VRAM_GTT |
+ RADEON_DOMAIN_GDS |
+ RADEON_DOMAIN_OA)) == 1);
bo = CALLOC_STRUCT(amdgpu_winsys_bo);
if (!bo) {
@@ -418,6 +482,10 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM;
if (initial_domain & RADEON_DOMAIN_GTT)
request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
+ if (initial_domain & RADEON_DOMAIN_GDS)
+ request.preferred_heap |= AMDGPU_GEM_DOMAIN_GDS;
+ if (initial_domain & RADEON_DOMAIN_OA)
+ request.preferred_heap |= AMDGPU_GEM_DOMAIN_OA;
/* Since VRAM and GTT have almost the same performance on APUs, we could
* just set GTT. However, in order to decrease GTT(RAM) usage, which is
@@ -447,27 +515,31 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
goto error_bo_alloc;
}
- va_gap_size = ws->check_vm ? MAX2(4 * alignment, 64 * 1024) : 0;
- if (size > ws->info.pte_fragment_size)
- alignment = MAX2(alignment, ws->info.pte_fragment_size);
- r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
- size + va_gap_size, alignment, 0, &va, &va_handle,
- (flags & RADEON_FLAG_32BIT ? AMDGPU_VA_RANGE_32_BIT : 0) |
- AMDGPU_VA_RANGE_HIGH);
- if (r)
- goto error_va_alloc;
+ if (initial_domain & RADEON_DOMAIN_VRAM_GTT) {
+ unsigned va_gap_size = ws->check_vm ? MAX2(4 * alignment, 64 * 1024) : 0;
+
+ r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
+ size + va_gap_size,
+ amdgpu_get_optimal_vm_alignment(ws, size, alignment),
+ 0, &va, &va_handle,
+ (flags & RADEON_FLAG_32BIT ? AMDGPU_VA_RANGE_32_BIT : 0) |
+ AMDGPU_VA_RANGE_HIGH);
+ if (r)
+ goto error_va_alloc;
- unsigned vm_flags = AMDGPU_VM_PAGE_READABLE |
- AMDGPU_VM_PAGE_EXECUTABLE;
+ unsigned vm_flags = AMDGPU_VM_PAGE_READABLE |
+ AMDGPU_VM_PAGE_EXECUTABLE;
- if (!(flags & RADEON_FLAG_READ_ONLY))
- vm_flags |= AMDGPU_VM_PAGE_WRITEABLE;
+ if (!(flags & RADEON_FLAG_READ_ONLY))
+ vm_flags |= AMDGPU_VM_PAGE_WRITEABLE;
- r = amdgpu_bo_va_op_raw(ws->dev, buf_handle, 0, size, va, vm_flags,
+ r = amdgpu_bo_va_op_raw(ws->dev, buf_handle, 0, size, va, vm_flags,
AMDGPU_VA_OP_MAP);
- if (r)
- goto error_va_map;
+ if (r)
+ goto error_va_map;
+ }
+ simple_mtx_init(&bo->lock, mtx_plain);
pipe_reference_init(&bo->base.reference, 1);
bo->base.alignment = alignment;
bo->base.usage = 0;
@@ -486,7 +558,7 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
else if (initial_domain & RADEON_DOMAIN_GTT)
ws->allocated_gtt += align64(size, ws->info.gart_page_size);
- amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms_noimport, &bo->u.real.kms_handle);
+ amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms, &bo->u.real.kms_handle);
amdgpu_add_buffer_to_global_list(bo);
@@ -522,13 +594,27 @@ bool amdgpu_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry)
return amdgpu_bo_can_reclaim(&bo->base);
}
+static struct pb_slabs *get_slabs(struct amdgpu_winsys *ws, uint64_t size)
+{
+ /* Find the correct slab allocator for the given size. */
+ for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
+ struct pb_slabs *slabs = &ws->bo_slabs[i];
+
+ if (size <= 1 << (slabs->min_order + slabs->num_orders - 1))
+ return slabs;
+ }
+
+ assert(0);
+ return NULL;
+}
+
static void amdgpu_bo_slab_destroy(struct pb_buffer *_buf)
{
struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
assert(!bo->bo);
- pb_slab_free(&bo->ws->bo_slabs, &bo->u.slab.entry);
+ pb_slab_free(get_slabs(bo->ws, bo->base.size), &bo->u.slab.entry);
}
static const struct pb_vtbl amdgpu_winsys_bo_slab_vtbl = {
@@ -545,19 +631,37 @@ struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
enum radeon_bo_domain domains = radeon_domain_from_heap(heap);
enum radeon_bo_flag flags = radeon_flags_from_heap(heap);
uint32_t base_id;
+ unsigned slab_size = 0;
if (!slab)
return NULL;
- unsigned slab_size = 1 << AMDGPU_SLAB_BO_SIZE_LOG2;
+ /* Determine the slab buffer size. */
+ for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
+ struct pb_slabs *slabs = &ws->bo_slabs[i];
+ unsigned max_entry_size = 1 << (slabs->min_order + slabs->num_orders - 1);
+
+ if (entry_size <= max_entry_size) {
+ /* The slab size is twice the size of the largest possible entry. */
+ slab_size = max_entry_size * 2;
+
+ /* The largest slab should have the same size as the PTE fragment
+ * size to get faster address translation.
+ */
+ if (i == NUM_SLAB_ALLOCATORS - 1 &&
+ slab_size < ws->info.pte_fragment_size)
+ slab_size = ws->info.pte_fragment_size;
+ break;
+ }
+ }
+ assert(slab_size != 0);
+
slab->buffer = amdgpu_winsys_bo(amdgpu_bo_create(&ws->base,
slab_size, slab_size,
domains, flags));
if (!slab->buffer)
goto fail;
- assert(slab->buffer->bo);
-
slab->base.num_entries = slab->buffer->base.size / entry_size;
slab->base.num_free = slab->base.num_entries;
slab->entries = CALLOC(slab->base.num_entries, sizeof(*slab->entries));
@@ -571,6 +675,7 @@ struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
for (unsigned i = 0; i < slab->base.num_entries; ++i) {
struct amdgpu_winsys_bo *bo = &slab->entries[i];
+ simple_mtx_init(&bo->lock, mtx_plain);
bo->base.alignment = entry_size;
bo->base.usage = slab->buffer->base.usage;
bo->base.size = entry_size;
@@ -581,7 +686,15 @@ struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
bo->unique_id = base_id + i;
bo->u.slab.entry.slab = &slab->base;
bo->u.slab.entry.group_index = group_index;
- bo->u.slab.real = slab->buffer;
+
+ if (slab->buffer->bo) {
+ /* The slab is not suballocated. */
+ bo->u.slab.real = slab->buffer;
+ } else {
+ /* The slab is allocated out of a bigger slab. */
+ bo->u.slab.real = slab->buffer->u.slab.real;
+ assert(bo->u.slab.real->bo);
+ }
LIST_ADDTAIL(&bo->u.slab.entry.head, &slab->base.free);
}
@@ -599,8 +712,10 @@ void amdgpu_bo_slab_free(void *priv, struct pb_slab *pslab)
{
struct amdgpu_slab *slab = amdgpu_slab(pslab);
- for (unsigned i = 0; i < slab->base.num_entries; ++i)
+ for (unsigned i = 0; i < slab->base.num_entries; ++i) {
amdgpu_bo_remove_fences(&slab->entries[i]);
+ simple_mtx_destroy(&slab->entries[i].lock);
+ }
FREE(slab->entries);
amdgpu_winsys_bo_reference(&slab->buffer, NULL);
@@ -858,8 +973,8 @@ static void amdgpu_bo_sparse_destroy(struct pb_buffer *_buf)
}
amdgpu_va_range_free(bo->u.sparse.va_handle);
- simple_mtx_destroy(&bo->u.sparse.commit_lock);
FREE(bo->u.sparse.commitments);
+ simple_mtx_destroy(&bo->lock);
FREE(bo);
}
@@ -889,6 +1004,7 @@ amdgpu_bo_sparse_create(struct amdgpu_winsys *ws, uint64_t size,
if (!bo)
return NULL;
+ simple_mtx_init(&bo->lock, mtx_plain);
pipe_reference_init(&bo->base.reference, 1);
bo->base.alignment = RADEON_SPARSE_PAGE_SIZE;
bo->base.size = size;
@@ -905,7 +1021,6 @@ amdgpu_bo_sparse_create(struct amdgpu_winsys *ws, uint64_t size,
if (!bo->u.sparse.commitments)
goto error_alloc_commitments;
- simple_mtx_init(&bo->u.sparse.commit_lock, mtx_plain);
LIST_INITHEAD(&bo->u.sparse.backing);
/* For simplicity, we always map a multiple of the page size. */
@@ -928,9 +1043,9 @@ amdgpu_bo_sparse_create(struct amdgpu_winsys *ws, uint64_t size,
error_va_map:
amdgpu_va_range_free(bo->u.sparse.va_handle);
error_va_alloc:
- simple_mtx_destroy(&bo->u.sparse.commit_lock);
FREE(bo->u.sparse.commitments);
error_alloc_commitments:
+ simple_mtx_destroy(&bo->lock);
FREE(bo);
return NULL;
}
@@ -955,7 +1070,7 @@ amdgpu_bo_sparse_commit(struct pb_buffer *buf, uint64_t offset, uint64_t size,
va_page = offset / RADEON_SPARSE_PAGE_SIZE;
end_va_page = va_page + DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE);
- simple_mtx_lock(&bo->u.sparse.commit_lock);
+ simple_mtx_lock(&bo->lock);
#if DEBUG_SPARSE_COMMITS
sparse_dump(bo, __func__);
@@ -1059,7 +1174,7 @@ amdgpu_bo_sparse_commit(struct pb_buffer *buf, uint64_t offset, uint64_t size,
}
out:
- simple_mtx_unlock(&bo->u.sparse.commit_lock);
+ simple_mtx_unlock(&bo->lock);
return ok;
}
@@ -1193,22 +1308,28 @@ amdgpu_bo_create(struct radeon_winsys *rws,
/* Sparse buffers must have NO_CPU_ACCESS set. */
assert(!(flags & RADEON_FLAG_SPARSE) || flags & RADEON_FLAG_NO_CPU_ACCESS);
+ struct pb_slabs *last_slab = &ws->bo_slabs[NUM_SLAB_ALLOCATORS - 1];
+ unsigned max_slab_entry_size = 1 << (last_slab->min_order + last_slab->num_orders - 1);
+
/* Sub-allocate small buffers from slabs. */
if (!(flags & (RADEON_FLAG_NO_SUBALLOC | RADEON_FLAG_SPARSE)) &&
- size <= (1 << AMDGPU_SLAB_MAX_SIZE_LOG2) &&
- alignment <= MAX2(1 << AMDGPU_SLAB_MIN_SIZE_LOG2, util_next_power_of_two(size))) {
+ size <= max_slab_entry_size &&
+ /* The alignment must be at most the size of the smallest slab entry or
+ * the next power of two. */
+ alignment <= MAX2(1 << ws->bo_slabs[0].min_order, util_next_power_of_two(size))) {
struct pb_slab_entry *entry;
int heap = radeon_get_heap_index(domain, flags);
if (heap < 0 || heap >= RADEON_MAX_SLAB_HEAPS)
goto no_slab;
- entry = pb_slab_alloc(&ws->bo_slabs, size, heap);
+ struct pb_slabs *slabs = get_slabs(ws, size);
+ entry = pb_slab_alloc(slabs, size, heap);
if (!entry) {
- /* Clear the cache and try again. */
- pb_cache_release_all_buffers(&ws->bo_cache);
+ /* Clean up buffer managers and try again. */
+ amdgpu_clean_up_buffer_managers(ws);
- entry = pb_slab_alloc(&ws->bo_slabs, size, heap);
+ entry = pb_slab_alloc(slabs, size, heap);
}
if (!entry)
return NULL;
@@ -1235,8 +1356,10 @@ no_slab:
* BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
* like constant/uniform buffers, can benefit from better and more reuse.
*/
- size = align64(size, ws->info.gart_page_size);
- alignment = align(alignment, ws->info.gart_page_size);
+ if (domain & RADEON_DOMAIN_VRAM_GTT) {
+ size = align64(size, ws->info.gart_page_size);
+ alignment = align(alignment, ws->info.gart_page_size);
+ }
bool use_reusable_pool = flags & RADEON_FLAG_NO_INTERPROCESS_SHARING;
@@ -1254,9 +1377,9 @@ no_slab:
/* Create a new one. */
bo = amdgpu_create_bo(ws, size, alignment, domain, flags, heap);
if (!bo) {
- /* Clear the cache and try again. */
- pb_slabs_reclaim(&ws->bo_slabs);
- pb_cache_release_all_buffers(&ws->bo_cache);
+ /* Clean up buffer managers and try again. */
+ amdgpu_clean_up_buffer_managers(ws);
+
bo = amdgpu_create_bo(ws, size, alignment, domain, flags, heap);
if (!bo)
return NULL;
@@ -1268,6 +1391,7 @@ no_slab:
static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws,
struct winsys_handle *whandle,
+ unsigned vm_alignment,
unsigned *stride,
unsigned *offset)
{
@@ -1325,8 +1449,10 @@ static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws,
goto error;
r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
- result.alloc_size, 1 << 20, 0, &va, &va_handle,
- AMDGPU_VA_RANGE_HIGH);
+ result.alloc_size,
+ amdgpu_get_optimal_vm_alignment(ws, result.alloc_size,
+ vm_alignment),
+ 0, &va, &va_handle, AMDGPU_VA_RANGE_HIGH);
if (r)
goto error;
@@ -1344,6 +1470,7 @@ static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws,
initial |= RADEON_DOMAIN_GTT;
/* Initialize the structure. */
+ simple_mtx_init(&bo->lock, mtx_plain);
pipe_reference_init(&bo->base.reference, 1);
bo->base.alignment = info.phys_alignment;
bo->bo = result.buf_handle;
@@ -1361,7 +1488,7 @@ static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws,
else if (bo->initial_domain & RADEON_DOMAIN_GTT)
ws->allocated_gtt += align64(bo->base.size, ws->info.gart_page_size);
- amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms_noimport, &bo->u.real.kms_handle);
+ amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms, &bo->u.real.kms_handle);
amdgpu_add_buffer_to_global_list(bo);
@@ -1445,21 +1572,25 @@ static struct pb_buffer *amdgpu_bo_from_ptr(struct radeon_winsys *rws,
goto error;
if (amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
- aligned_size, 1 << 12, 0, &va, &va_handle,
- AMDGPU_VA_RANGE_HIGH))
+ aligned_size,
+ amdgpu_get_optimal_vm_alignment(ws, aligned_size,
+ ws->info.gart_page_size),
+ 0, &va, &va_handle, AMDGPU_VA_RANGE_HIGH))
goto error_va_alloc;
if (amdgpu_bo_va_op(buf_handle, 0, aligned_size, va, 0, AMDGPU_VA_OP_MAP))
goto error_va_map;
/* Initialize it. */
+ bo->is_user_ptr = true;
pipe_reference_init(&bo->base.reference, 1);
+ simple_mtx_init(&bo->lock, mtx_plain);
bo->bo = buf_handle;
bo->base.alignment = 0;
bo->base.size = size;
bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
bo->ws = ws;
- bo->user_ptr = pointer;
+ bo->cpu_ptr = pointer;
bo->va = va;
bo->u.real.va_handle = va_handle;
bo->initial_domain = RADEON_DOMAIN_GTT;
@@ -1469,7 +1600,7 @@ static struct pb_buffer *amdgpu_bo_from_ptr(struct radeon_winsys *rws,
amdgpu_add_buffer_to_global_list(bo);
- amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms_noimport, &bo->u.real.kms_handle);
+ amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms, &bo->u.real.kms_handle);
return (struct pb_buffer*)bo;
@@ -1486,7 +1617,7 @@ error:
static bool amdgpu_bo_is_user_ptr(struct pb_buffer *buf)
{
- return ((struct amdgpu_winsys_bo*)buf)->user_ptr != NULL;
+ return ((struct amdgpu_winsys_bo*)buf)->is_user_ptr;
}
static bool amdgpu_bo_is_suballocated(struct pb_buffer *buf)
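
Note on the amdgpu_bo.c hunks above: amdgpu_get_optimal_vm_alignment() raises the VM alignment to the PTE fragment size for large buffers and, on GFX9, to the highest power of two not exceeding the buffer size. The following standalone sketch illustrates just that computation; it is not the Mesa code, and MAX2, the 2 MiB fragment size and the is_gfx9 flag are stand-ins for the winsys fields.

#include <stdint.h>
#include <stdio.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))

/* Highest power of two <= size (0 for size == 0); equivalent to
 * util_last_bit64(size) followed by 1ull << (msb - 1). */
static uint64_t msb_alignment(uint64_t size)
{
   uint64_t align = size ? 1 : 0;
   while (size >>= 1)
      align <<= 1;
   return align;
}

static uint64_t optimal_vm_alignment(uint64_t size, uint64_t alignment,
                                     uint64_t pte_fragment_size, int is_gfx9)
{
   uint64_t vm_alignment = alignment;

   /* Large buffers: align to the PTE fragment size so the kernel can map
    * them with larger page-table fragments. */
   if (size >= pte_fragment_size)
      vm_alignment = MAX2(vm_alignment, pte_fragment_size);

   /* GFX9: also align to the most significant bit of the size. */
   if (is_gfx9)
      vm_alignment = MAX2(vm_alignment, msb_alignment(size));

   return vm_alignment;
}

int main(void)
{
   /* A 3 MiB buffer with 4 KiB alignment and an assumed 2 MiB PTE fragment
    * size ends up 2 MiB-aligned on GFX9: prints 2097152. */
   printf("%llu\n", (unsigned long long)
          optimal_vm_alignment(3ull << 20, 4096, 2ull << 20, 1));
   return 0;
}
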
diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
index 1e07e4734..88f424132 100644
--- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
+++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
@@ -74,7 +74,6 @@ struct amdgpu_winsys_bo {
struct amdgpu_winsys_bo *real;
} slab;
struct {
- simple_mtx_t commit_lock;
amdgpu_va_handle va_handle;
enum radeon_bo_flag flags;
@@ -89,10 +88,12 @@ struct amdgpu_winsys_bo {
} u;
struct amdgpu_winsys *ws;
- void *user_ptr; /* from buffer_from_ptr */
+ void *cpu_ptr; /* for user_ptr and permanent maps */
amdgpu_bo_handle bo; /* NULL for slab entries and sparse buffers */
bool sparse;
+ bool is_user_ptr;
+ bool is_local;
uint32_t unique_id;
uint64_t va;
enum radeon_bo_domain initial_domain;
@@ -114,7 +115,7 @@ struct amdgpu_winsys_bo {
unsigned max_fences;
struct pipe_fence_handle **fences;
- bool is_local;
+ simple_mtx_t lock;
};
struct amdgpu_slab {
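
Note on the amdgpu_bo.h hunks above: the new per-BO lock and cpu_ptr pair back the permanent-mapping path in amdgpu_bo_map(); readers take the cached pointer atomically and only the first mapper enters the lock. A minimal double-checked-locking sketch of that pattern follows, with a hypothetical backend_map() standing in for amdgpu_bo_cpu_map() and a reduced struct standing in for amdgpu_winsys_bo.

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct bo {
   _Atomic(void *) cpu_ptr;   /* cached permanent mapping, NULL if unmapped */
   pthread_mutex_t lock;      /* guards the slow (first-map) path */
   int map_count;
};

/* Hypothetical mapper standing in for amdgpu_bo_cpu_map(). */
static void *backend_map(struct bo *bo)
{
   (void)bo;
   return malloc(4096);
}

static void *bo_map_cached(struct bo *bo)
{
   void *cpu = atomic_load(&bo->cpu_ptr);   /* fast path: already mapped */
   if (cpu)
      return cpu;

   pthread_mutex_lock(&bo->lock);
   cpu = atomic_load(&bo->cpu_ptr);         /* re-check under the lock */
   if (!cpu) {
      cpu = backend_map(bo);
      if (cpu) {
         bo->map_count++;
         atomic_store(&bo->cpu_ptr, cpu);   /* publish only after mapping */
      }
   }
   pthread_mutex_unlock(&bo->lock);
   return cpu;
}

int main(void)
{
   struct bo bo = { NULL, PTHREAD_MUTEX_INITIALIZER, 0 };
   void *a = bo_map_cached(&bo);
   void *b = bo_map_cached(&bo);   /* second call hits the fast path */
   return (a && a == b) ? 0 : 1;
}
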
diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index 3f0a6a1c9..2e595e5a1 100644
--- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -172,45 +172,45 @@ static void amdgpu_fence_submitted(struct pipe_fence_handle *fence,
uint64_t seq_no,
uint64_t *user_fence_cpu_address)
{
- struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence;
+ struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
- rfence->fence.fence = seq_no;
- rfence->user_fence_cpu_address = user_fence_cpu_address;
- util_queue_fence_signal(&rfence->submitted);
+ afence->fence.fence = seq_no;
+ afence->user_fence_cpu_address = user_fence_cpu_address;
+ util_queue_fence_signal(&afence->submitted);
}
static void amdgpu_fence_signalled(struct pipe_fence_handle *fence)
{
- struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence;
+ struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
- rfence->signalled = true;
- util_queue_fence_signal(&rfence->submitted);
+ afence->signalled = true;
+ util_queue_fence_signal(&afence->submitted);
}
bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
bool absolute)
{
- struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence;
+ struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
uint32_t expired;
int64_t abs_timeout;
uint64_t *user_fence_cpu;
int r;
- if (rfence->signalled)
+ if (afence->signalled)
return true;
/* Handle syncobjs. */
- if (amdgpu_fence_is_syncobj(rfence)) {
+ if (amdgpu_fence_is_syncobj(afence)) {
/* Absolute timeouts are only be used by BO fences, which aren't
* backed by syncobjs.
*/
assert(!absolute);
- if (amdgpu_cs_syncobj_wait(rfence->ws->dev, &rfence->syncobj, 1,
+ if (amdgpu_cs_syncobj_wait(afence->ws->dev, &afence->syncobj, 1,
timeout, 0, NULL))
return false;
- rfence->signalled = true;
+ afence->signalled = true;
return true;
}
@@ -222,13 +222,13 @@ bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
/* The fence might not have a number assigned if its IB is being
* submitted in the other thread right now. Wait until the submission
* is done. */
- if (!util_queue_fence_wait_timeout(&rfence->submitted, abs_timeout))
+ if (!util_queue_fence_wait_timeout(&afence->submitted, abs_timeout))
return false;
- user_fence_cpu = rfence->user_fence_cpu_address;
+ user_fence_cpu = afence->user_fence_cpu_address;
if (user_fence_cpu) {
- if (*user_fence_cpu >= rfence->fence.fence) {
- rfence->signalled = true;
+ if (*user_fence_cpu >= afence->fence.fence) {
+ afence->signalled = true;
return true;
}
@@ -238,7 +238,7 @@ bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
}
/* Now use the libdrm query. */
- r = amdgpu_cs_query_fence_status(&rfence->fence,
+ r = amdgpu_cs_query_fence_status(&afence->fence,
abs_timeout,
AMDGPU_QUERY_FENCE_TIMEOUT_IS_ABSOLUTE,
&expired);
@@ -250,7 +250,7 @@ bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
if (expired) {
/* This variable can only transition from false to true, so it doesn't
* matter if threads race for it. */
- rfence->signalled = true;
+ afence->signalled = true;
return true;
}
return false;
@@ -386,7 +386,8 @@ static bool amdgpu_cs_has_user_fence(struct amdgpu_cs_context *cs)
cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCE &&
cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_UVD_ENC &&
cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_DEC &&
- cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_ENC;
+ cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_ENC &&
+ cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_JPEG;
}
static bool amdgpu_cs_has_chaining(struct amdgpu_cs *cs)
@@ -598,7 +599,7 @@ static int amdgpu_lookup_or_add_sparse_buffer(struct amdgpu_cs *acs,
/* We delay adding the backing buffers until we really have to. However,
* we cannot delay accounting for memory use.
*/
- simple_mtx_lock(&bo->u.sparse.commit_lock);
+ simple_mtx_lock(&bo->lock);
list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
if (bo->initial_domain & RADEON_DOMAIN_VRAM)
@@ -607,7 +608,7 @@ static int amdgpu_lookup_or_add_sparse_buffer(struct amdgpu_cs *acs,
acs->main.base.used_gart += backing->bo->base.size;
}
- simple_mtx_unlock(&bo->u.sparse.commit_lock);
+ simple_mtx_unlock(&bo->lock);
return idx;
}
@@ -923,7 +924,8 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
enum ring_type ring_type,
void (*flush)(void *ctx, unsigned flags,
struct pipe_fence_handle **fence),
- void *flush_ctx)
+ void *flush_ctx,
+ bool stop_exec_on_failure)
{
struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
struct amdgpu_cs *cs;
@@ -939,6 +941,7 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
cs->flush_cs = flush;
cs->flush_data = flush_ctx;
cs->ring_type = ring_type;
+ cs->stop_exec_on_failure = stop_exec_on_failure;
struct amdgpu_cs_fence_info fence_info;
fence_info.handle = cs->ctx->user_fence_bo;
@@ -1263,7 +1266,7 @@ static bool amdgpu_add_sparse_backing_buffers(struct amdgpu_cs_context *cs)
struct amdgpu_cs_buffer *buffer = &cs->sparse_buffers[i];
struct amdgpu_winsys_bo *bo = buffer->bo;
- simple_mtx_lock(&bo->u.sparse.commit_lock);
+ simple_mtx_lock(&bo->lock);
list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
/* We can directly add the buffer here, because we know that each
@@ -1272,7 +1275,7 @@ static bool amdgpu_add_sparse_backing_buffers(struct amdgpu_cs_context *cs)
int idx = amdgpu_do_add_real_buffer(cs, backing->bo);
if (idx < 0) {
fprintf(stderr, "%s: failed to add buffer\n", __FUNCTION__);
- simple_mtx_unlock(&bo->u.sparse.commit_lock);
+ simple_mtx_unlock(&bo->lock);
return false;
}
@@ -1281,7 +1284,7 @@ static bool amdgpu_add_sparse_backing_buffers(struct amdgpu_cs_context *cs)
p_atomic_inc(&backing->bo->num_active_ioctls);
}
- simple_mtx_unlock(&bo->u.sparse.commit_lock);
+ simple_mtx_unlock(&bo->lock);
}
return true;
@@ -1293,7 +1296,7 @@ void amdgpu_cs_submit_ib(void *job, int thread_index)
struct amdgpu_winsys *ws = acs->ctx->ws;
struct amdgpu_cs_context *cs = acs->cst;
int i, r;
- amdgpu_bo_list_handle bo_list = NULL;
+ uint32_t bo_list = 0;
uint64_t seq_no = 0;
bool has_user_fence = amdgpu_cs_has_user_fence(cs);
bool use_bo_list_create = ws->info.drm_minor < 27;
@@ -1304,27 +1307,28 @@ void amdgpu_cs_submit_ib(void *job, int thread_index)
/* The buffer list contains all buffers. This is a slow path that
* ensures that no buffer is missing in the BO list.
*/
+ unsigned num_handles = 0;
+ struct drm_amdgpu_bo_list_entry *list =
+ alloca(ws->num_buffers * sizeof(struct drm_amdgpu_bo_list_entry));
struct amdgpu_winsys_bo *bo;
- amdgpu_bo_handle *handles;
- unsigned num = 0;
simple_mtx_lock(&ws->global_bo_list_lock);
- handles = alloca(sizeof(handles[0]) * ws->num_buffers);
-
LIST_FOR_EACH_ENTRY(bo, &ws->global_bo_list, u.real.global_list_item) {
- assert(num < ws->num_buffers);
- handles[num++] = bo->bo;
+ if (bo->is_local)
+ continue;
+
+ list[num_handles].bo_handle = bo->u.real.kms_handle;
+ list[num_handles].bo_priority = 0;
+ ++num_handles;
}
- r = amdgpu_bo_list_create(ws->dev, ws->num_buffers,
- handles, NULL, &bo_list);
+ r = amdgpu_bo_list_create_raw(ws->dev, ws->num_buffers, list, &bo_list);
simple_mtx_unlock(&ws->global_bo_list_lock);
if (r) {
fprintf(stderr, "amdgpu: buffer list creation failed (%d)\n", r);
goto cleanup;
}
- } else if (!use_bo_list_create) {
- /* Standard path passing the buffer list via the CS ioctl. */
+ } else {
if (!amdgpu_add_sparse_backing_buffers(cs)) {
fprintf(stderr, "amdgpu: amdgpu_add_sparse_backing_buffers failed\n");
r = -ENOMEM;
@@ -1348,52 +1352,27 @@ void amdgpu_cs_submit_ib(void *job, int thread_index)
++num_handles;
}
- bo_list_in.operation = ~0;
- bo_list_in.list_handle = ~0;
- bo_list_in.bo_number = num_handles;
- bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry);
- bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)list;
- } else {
- /* Legacy path creating the buffer list handle and passing it to the CS ioctl. */
- unsigned num_handles;
-
- if (!amdgpu_add_sparse_backing_buffers(cs)) {
- fprintf(stderr, "amdgpu: amdgpu_add_sparse_backing_buffers failed\n");
- r = -ENOMEM;
- goto cleanup;
- }
-
- amdgpu_bo_handle *handles = alloca(sizeof(*handles) * cs->num_real_buffers);
- uint8_t *flags = alloca(sizeof(*flags) * cs->num_real_buffers);
-
- num_handles = 0;
- for (i = 0; i < cs->num_real_buffers; ++i) {
- struct amdgpu_cs_buffer *buffer = &cs->real_buffers[i];
-
- if (buffer->bo->is_local)
- continue;
-
- assert(buffer->u.real.priority_usage != 0);
-
- handles[num_handles] = buffer->bo->bo;
- flags[num_handles] = (util_last_bit(buffer->u.real.priority_usage) - 1) / 2;
- ++num_handles;
- }
-
- if (num_handles) {
- r = amdgpu_bo_list_create(ws->dev, num_handles,
- handles, flags, &bo_list);
+ if (use_bo_list_create) {
+ /* Legacy path creating the buffer list handle and passing it to the CS ioctl. */
+ r = amdgpu_bo_list_create_raw(ws->dev, num_handles, list, &bo_list);
if (r) {
fprintf(stderr, "amdgpu: buffer list creation failed (%d)\n", r);
goto cleanup;
}
+ } else {
+ /* Standard path passing the buffer list via the CS ioctl. */
+ bo_list_in.operation = ~0;
+ bo_list_in.list_handle = ~0;
+ bo_list_in.bo_number = num_handles;
+ bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry);
+ bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)list;
}
}
if (acs->ring_type == RING_GFX)
ws->gfx_bo_list_counter += cs->num_real_buffers;
- if (acs->ctx->num_rejected_cs) {
+ if (acs->stop_exec_on_failure && acs->ctx->num_rejected_cs) {
r = -ECANCELED;
} else {
struct drm_amdgpu_cs_chunk chunks[6];
@@ -1497,8 +1476,8 @@ void amdgpu_cs_submit_ib(void *job, int thread_index)
assert(num_chunks <= ARRAY_SIZE(chunks));
- r = amdgpu_cs_submit_raw(ws->dev, acs->ctx->ctx, bo_list,
- num_chunks, chunks, &seq_no);
+ r = amdgpu_cs_submit_raw2(ws->dev, acs->ctx->ctx, bo_list,
+ num_chunks, chunks, &seq_no);
}
if (r) {
@@ -1523,7 +1502,7 @@ void amdgpu_cs_submit_ib(void *job, int thread_index)
/* Cleanup. */
if (bo_list)
- amdgpu_bo_list_destroy(bo_list);
+ amdgpu_bo_list_destroy_raw(ws->dev, bo_list);
cleanup:
/* If there was an error, signal the fence, because it won't be signalled
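
Note on the amdgpu_cs.c hunks above: the submit path now builds a single array of struct drm_amdgpu_bo_list_entry and either hands it to amdgpu_bo_list_create_raw() (kernels with DRM minor < 27) or passes it directly in the CS ioctl chunk. The sketch below shows only the list-building loop, using local stand-in types rather than the real amdgpu_drm.h and winsys structures.

#include <stdint.h>

/* Local stand-in for struct drm_amdgpu_bo_list_entry (amdgpu_drm.h). */
struct bo_list_entry {
   uint32_t bo_handle;
   uint32_t bo_priority;
};

/* Hypothetical per-buffer record; kms_handle and is_local mirror the
 * winsys BO fields used in the diff. */
struct buffer {
   uint32_t kms_handle;
   int is_local;
};

/* Fill the raw list, skipping buffers the kernel already tracks as local
 * to the VM. Returns the number of entries written. */
static unsigned build_bo_list(const struct buffer *bufs, unsigned count,
                              struct bo_list_entry *out)
{
   unsigned n = 0;
   for (unsigned i = 0; i < count; i++) {
      if (bufs[i].is_local)
         continue;                     /* local BOs need no list entry */
      out[n].bo_handle = bufs[i].kms_handle;
      out[n].bo_priority = 0;
      n++;
   }
   return n;
}

int main(void)
{
   struct buffer bufs[3] = { { 11, 0 }, { 12, 1 }, { 13, 0 } };
   struct bo_list_entry list[3];
   unsigned n = build_bo_list(bufs, 3, list);   /* n == 2: local BO skipped */
   return n == 2 ? 0 : 1;
}
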
diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
index 9f5a4fd99..07b5d4b35 100644
--- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
+++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
@@ -129,6 +129,7 @@ struct amdgpu_cs {
/* Flush CS. */
void (*flush_cs)(void *ctx, unsigned flags, struct pipe_fence_handle **fence);
void *flush_data;
+ bool stop_exec_on_failure;
struct util_queue_fence flush_completed;
struct pipe_fence_handle *next_fence;
@@ -169,11 +170,11 @@ static inline void amdgpu_ctx_unref(struct amdgpu_ctx *ctx)
static inline void amdgpu_fence_reference(struct pipe_fence_handle **dst,
struct pipe_fence_handle *src)
{
- struct amdgpu_fence **rdst = (struct amdgpu_fence **)dst;
- struct amdgpu_fence *rsrc = (struct amdgpu_fence *)src;
+ struct amdgpu_fence **adst = (struct amdgpu_fence **)dst;
+ struct amdgpu_fence *asrc = (struct amdgpu_fence *)src;
- if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) {
- struct amdgpu_fence *fence = *rdst;
+ if (pipe_reference(&(*adst)->reference, &asrc->reference)) {
+ struct amdgpu_fence *fence = *adst;
if (amdgpu_fence_is_syncobj(fence))
amdgpu_cs_destroy_syncobj(fence->ws->dev, fence->syncobj);
@@ -183,7 +184,7 @@ static inline void amdgpu_fence_reference(struct pipe_fence_handle **dst,
util_queue_fence_destroy(&fence->submitted);
FREE(fence);
}
- *rdst = rsrc;
+ *adst = asrc;
}
int amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo);
diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
index b20d70267..45e54b479 100644
--- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -38,6 +38,7 @@
#include <xf86drm.h>
#include <stdio.h>
#include <sys/stat.h>
+#include "amd/common/ac_llvm_util.h"
#include "amd/common/sid.h"
#include "amd/common/gfx9d.h"
@@ -50,6 +51,39 @@ static simple_mtx_t dev_tab_mutex = _SIMPLE_MTX_INITIALIZER_NP;
DEBUG_GET_ONCE_BOOL_OPTION(all_bos, "RADEON_ALL_BOS", false)
+static void handle_env_var_force_family(struct amdgpu_winsys *ws)
+{
+ const char *family = debug_get_option("SI_FORCE_FAMILY", NULL);
+ unsigned i;
+
+ if (!family)
+ return;
+
+ for (i = CHIP_TAHITI; i < CHIP_LAST; i++) {
+ if (!strcmp(family, ac_get_llvm_processor_name(i))) {
+ /* Override family and chip_class. */
+ ws->info.family = i;
+ ws->info.name = "GCN-NOOP";
+
+ if (i >= CHIP_VEGA10)
+ ws->info.chip_class = GFX9;
+ else if (i >= CHIP_TONGA)
+ ws->info.chip_class = VI;
+ else if (i >= CHIP_BONAIRE)
+ ws->info.chip_class = CIK;
+ else
+ ws->info.chip_class = SI;
+
+ /* Don't submit any IBs. */
+ setenv("RADEON_NOOP", "1", 1);
+ return;
+ }
+ }
+
+ fprintf(stderr, "radeonsi: Unknown family: %s\n", family);
+ exit(1);
+}
+
/* Helper function to do the ioctls needed for setup and init. */
static bool do_winsys_init(struct amdgpu_winsys *ws,
const struct pipe_screen_config *config,
@@ -58,6 +92,12 @@ static bool do_winsys_init(struct amdgpu_winsys *ws,
if (!ac_query_gpu_info(fd, ws->dev, &ws->info, &ws->amdinfo))
goto fail;
+ /* TODO: Enable this once the kernel handles it efficiently. */
+ if (ws->info.has_dedicated_vram)
+ ws->info.has_local_buffers = false;
+
+ handle_env_var_force_family(ws);
+
ws->addrlib = amdgpu_addr_create(&ws->info, &ws->amdinfo, &ws->info.max_alignment);
if (!ws->addrlib) {
fprintf(stderr, "amdgpu: Cannot create addrlib.\n");
@@ -95,7 +135,10 @@ static void amdgpu_winsys_destroy(struct radeon_winsys *rws)
util_queue_destroy(&ws->cs_queue);
simple_mtx_destroy(&ws->bo_fence_lock);
- pb_slabs_deinit(&ws->bo_slabs);
+ for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
+ if (ws->bo_slabs[i].groups)
+ pb_slabs_deinit(&ws->bo_slabs[i]);
+ }
pb_cache_deinit(&ws->bo_cache);
util_hash_table_destroy(ws->bo_export_table);
simple_mtx_destroy(&ws->global_bo_list_lock);
@@ -307,16 +350,33 @@ amdgpu_winsys_create(int fd, const struct pipe_screen_config *config,
(ws->info.vram_size + ws->info.gart_size) / 8,
amdgpu_bo_destroy, amdgpu_bo_can_reclaim);
- if (!pb_slabs_init(&ws->bo_slabs,
- AMDGPU_SLAB_MIN_SIZE_LOG2, AMDGPU_SLAB_MAX_SIZE_LOG2,
- RADEON_MAX_SLAB_HEAPS,
- ws,
- amdgpu_bo_can_reclaim_slab,
- amdgpu_bo_slab_alloc,
- amdgpu_bo_slab_free))
- goto fail_cache;
+ unsigned min_slab_order = 9; /* 512 bytes */
+ unsigned max_slab_order = 18; /* 256 KB - higher numbers increase memory usage */
+ unsigned num_slab_orders_per_allocator = (max_slab_order - min_slab_order) /
+ NUM_SLAB_ALLOCATORS;
+
+ /* Divide the size order range among slab managers. */
+ for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
+ unsigned min_order = min_slab_order;
+ unsigned max_order = MIN2(min_order + num_slab_orders_per_allocator,
+ max_slab_order);
+
+ if (!pb_slabs_init(&ws->bo_slabs[i],
+ min_order, max_order,
+ RADEON_MAX_SLAB_HEAPS,
+ ws,
+ amdgpu_bo_can_reclaim_slab,
+ amdgpu_bo_slab_alloc,
+ amdgpu_bo_slab_free)) {
+ amdgpu_winsys_destroy(&ws->base);
+ simple_mtx_unlock(&dev_tab_mutex);
+ return NULL;
+ }
+
+ min_slab_order = max_order + 1;
+ }
- ws->info.min_alloc_size = 1 << AMDGPU_SLAB_MIN_SIZE_LOG2;
+ ws->info.min_alloc_size = 1 << ws->bo_slabs[0].min_order;
/* init reference */
pipe_reference_init(&ws->reference, 1);
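
Note on the amdgpu_winsys.c hunks above: with min_slab_order = 9, max_slab_order = 18 and NUM_SLAB_ALLOCATORS = 3, the initialization loop assigns each allocator a contiguous range of power-of-two entry sizes. The following sketch reproduces only that arithmetic (not the pb_slabs API) and prints the resulting byte ranges.

#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))
#define NUM_SLAB_ALLOCATORS 3

int main(void)
{
   unsigned min_slab_order = 9;   /* 512 bytes */
   unsigned max_slab_order = 18;  /* 256 KB    */
   unsigned orders_per_allocator =
      (max_slab_order - min_slab_order) / NUM_SLAB_ALLOCATORS;

   for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
      unsigned min_order = min_slab_order;
      unsigned max_order = MIN2(min_order + orders_per_allocator,
                                max_slab_order);

      /* Prints: allocator 0: 512 - 4096, allocator 1: 8192 - 65536,
       * allocator 2: 131072 - 262144 bytes. */
      printf("allocator %u: %u - %u bytes\n",
             i, 1u << min_order, 1u << max_order);

      min_slab_order = max_order + 1;
   }
   return 0;
}
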
diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
index c355eff52..4f0b1262e 100644
--- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
+++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
@@ -31,22 +31,24 @@
#include "pipebuffer/pb_cache.h"
#include "pipebuffer/pb_slab.h"
#include "gallium/drivers/radeon/radeon_winsys.h"
-#include "addrlib/addrinterface.h"
+#include "addrlib/inc/addrinterface.h"
#include "util/simple_mtx.h"
#include "util/u_queue.h"
#include <amdgpu.h>
struct amdgpu_cs;
-#define AMDGPU_SLAB_MIN_SIZE_LOG2 9 /* 512 bytes */
-#define AMDGPU_SLAB_MAX_SIZE_LOG2 16 /* 64 KB */
-#define AMDGPU_SLAB_BO_SIZE_LOG2 17 /* 128 KB */
+#define NUM_SLAB_ALLOCATORS 3
struct amdgpu_winsys {
struct radeon_winsys base;
struct pipe_reference reference;
struct pb_cache bo_cache;
- struct pb_slabs bo_slabs;
+
+ /* Each slab buffer can only contain suballocations of equal sizes, so we
+ * need to layer the allocators, so that we don't waste too much memory.
+ */
+ struct pb_slabs bo_slabs[NUM_SLAB_ALLOCATORS];
amdgpu_device_handle dev;
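
Note on the layered slab allocators declared above: because each slab buffer only holds entries of a single size, amdgpu_bo.c picks the allocator whose largest entry still covers the requested size (see get_slabs() in the amdgpu_bo.c diff). Below is a hedged, self-contained sketch of that selection, with hard-coded orders mirroring the ranges configured in amdgpu_winsys_create(); it is an illustration, not the Mesa code.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define NUM_SLAB_ALLOCATORS 3

struct slabs_desc {
   unsigned min_order;   /* log2 of the smallest entry size */
   unsigned num_orders;  /* number of power-of-two entry sizes */
};

/* Orders 9-12, 13-16 and 17-18, matching the winsys setup. */
static const struct slabs_desc slabs[NUM_SLAB_ALLOCATORS] = {
   { 9, 4 }, { 13, 4 }, { 17, 2 },
};

/* Return the index of the first allocator whose largest entry
 * (1 << (min_order + num_orders - 1)) can hold "size". */
static int get_slabs_index(uint64_t size)
{
   for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
      uint64_t max_entry =
         1ull << (slabs[i].min_order + slabs[i].num_orders - 1);
      if (size <= max_entry)
         return (int)i;
   }
   return -1;   /* larger requests bypass the slab allocators */
}

int main(void)
{
   assert(get_slabs_index(600) == 0);       /* 512 B - 4 KB allocator   */
   assert(get_slabs_index(20000) == 1);     /* 8 KB - 64 KB allocator   */
   assert(get_slabs_index(200000) == 2);    /* 128 KB - 256 KB allocator */
   assert(get_slabs_index(1 << 20) == -1);  /* too big for any slab     */
   printf("ok\n");
   return 0;
}
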