Diffstat (limited to 'lib/mesa/src/gallium/winsys/amdgpu/drm')
-rw-r--r--  lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.in       |   47
-rw-r--r--  lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c       |  768
-rw-r--r--  lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h       |   65
-rw-r--r--  lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c       | 1165
-rw-r--r--  lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h       |  136
-rw-r--r--  lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c  |  250
-rw-r--r--  lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c   |  263
-rw-r--r--  lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h   |   23
8 files changed, 1933 insertions, 784 deletions
diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.in b/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.in index 1487d6a5a..5e197a855 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.in +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.in @@ -54,13 +54,10 @@ target_triplet = @target@ DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \ $(srcdir)/Makefile.sources $(top_srcdir)/bin/depcomp \ $(top_srcdir)/src/gallium/Automake.inc -@HAVE_LIBDRM_TRUE@am__append_1 = \ -@HAVE_LIBDRM_TRUE@ $(LIBDRM_LIBS) - -@HAVE_DRISW_TRUE@am__append_2 = \ +@HAVE_DRISW_TRUE@am__append_1 = \ @HAVE_DRISW_TRUE@ $(top_builddir)/src/gallium/winsys/sw/dri/libswdri.la -@HAVE_DRISW_KMS_TRUE@am__append_3 = \ +@HAVE_DRISW_KMS_TRUE@am__append_2 = \ @HAVE_DRISW_KMS_TRUE@ $(top_builddir)/src/gallium/winsys/sw/kms-dri/libswkmsdri.la \ @HAVE_DRISW_KMS_TRUE@ $(LIBDRM_LIBS) @@ -141,8 +138,6 @@ AMDGPU_CFLAGS = @AMDGPU_CFLAGS@ AMDGPU_LIBS = @AMDGPU_LIBS@ AMTAR = @AMTAR@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ -ANDROID_CFLAGS = @ANDROID_CFLAGS@ -ANDROID_LIBS = @ANDROID_LIBS@ AR = @AR@ AUTOCONF = @AUTOCONF@ AUTOHEADER = @AUTOHEADER@ @@ -173,6 +168,8 @@ DLLTOOL = @DLLTOOL@ DLOPEN_LIBS = @DLOPEN_LIBS@ DRI2PROTO_CFLAGS = @DRI2PROTO_CFLAGS@ DRI2PROTO_LIBS = @DRI2PROTO_LIBS@ +DRI3PROTO_CFLAGS = @DRI3PROTO_CFLAGS@ +DRI3PROTO_LIBS = @DRI3PROTO_LIBS@ DRIGL_CFLAGS = @DRIGL_CFLAGS@ DRIGL_LIBS = @DRIGL_LIBS@ DRI_DRIVER_INSTALL_DIR = @DRI_DRIVER_INSTALL_DIR@ @@ -185,11 +182,10 @@ ECHO_C = @ECHO_C@ ECHO_N = @ECHO_N@ ECHO_T = @ECHO_T@ EGL_CFLAGS = @EGL_CFLAGS@ +EGL_CLIENT_APIS = @EGL_CLIENT_APIS@ EGL_LIB_DEPS = @EGL_LIB_DEPS@ EGL_NATIVE_PLATFORM = @EGL_NATIVE_PLATFORM@ EGREP = @EGREP@ -ETNAVIV_CFLAGS = @ETNAVIV_CFLAGS@ -ETNAVIV_LIBS = @ETNAVIV_LIBS@ EXEEXT = @EXEEXT@ EXPAT_CFLAGS = @EXPAT_CFLAGS@ EXPAT_LIBS = @EXPAT_LIBS@ @@ -237,27 +233,31 @@ LIBDRM_CFLAGS = @LIBDRM_CFLAGS@ LIBDRM_LIBS = @LIBDRM_LIBS@ LIBELF_CFLAGS = @LIBELF_CFLAGS@ LIBELF_LIBS = @LIBELF_LIBS@ -LIBGLVND_DATADIR = @LIBGLVND_DATADIR@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ -LIBSENSORS_LIBS = @LIBSENSORS_LIBS@ +LIBSENSORS_LDFLAGS = @LIBSENSORS_LDFLAGS@ +LIBSHA1_CFLAGS = @LIBSHA1_CFLAGS@ +LIBSHA1_LIBS = @LIBSHA1_LIBS@ LIBTOOL = @LIBTOOL@ -LIBUNWIND_CFLAGS = @LIBUNWIND_CFLAGS@ -LIBUNWIND_LIBS = @LIBUNWIND_LIBS@ LIB_DIR = @LIB_DIR@ LIB_EXT = @LIB_EXT@ LIPO = @LIPO@ +LLVM_BINDIR = @LLVM_BINDIR@ LLVM_CFLAGS = @LLVM_CFLAGS@ LLVM_CONFIG = @LLVM_CONFIG@ +LLVM_CPPFLAGS = @LLVM_CPPFLAGS@ LLVM_CXXFLAGS = @LLVM_CXXFLAGS@ LLVM_INCLUDEDIR = @LLVM_INCLUDEDIR@ LLVM_LDFLAGS = @LLVM_LDFLAGS@ +LLVM_LIBDIR = @LLVM_LIBDIR@ LLVM_LIBS = @LLVM_LIBS@ +LLVM_VERSION = @LLVM_VERSION@ LN_S = @LN_S@ LTLIBOBJS = @LTLIBOBJS@ MAINT = @MAINT@ MAKEINFO = @MAKEINFO@ MANIFEST_TOOL = @MANIFEST_TOOL@ +MESA_LLVM = @MESA_LLVM@ MKDIR_P = @MKDIR_P@ MSVC2013_COMPAT_CFLAGS = @MSVC2013_COMPAT_CFLAGS@ MSVC2013_COMPAT_CXXFLAGS = @MSVC2013_COMPAT_CXXFLAGS@ @@ -278,6 +278,8 @@ OMX_LIBS = @OMX_LIBS@ OMX_LIB_INSTALL_DIR = @OMX_LIB_INSTALL_DIR@ OPENCL_LIBNAME = @OPENCL_LIBNAME@ OPENCL_VERSION = @OPENCL_VERSION@ +OPENSSL_CFLAGS = @OPENSSL_CFLAGS@ +OPENSSL_LIBS = @OPENSSL_LIBS@ OSMESA_LIB = @OSMESA_LIB@ OSMESA_LIB_DEPS = @OSMESA_LIB_DEPS@ OSMESA_PC_LIB_PRIV = @OSMESA_PC_LIB_PRIV@ @@ -297,6 +299,8 @@ PKG_CONFIG = @PKG_CONFIG@ PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@ PKG_CONFIG_PATH = @PKG_CONFIG_PATH@ POSIX_SHELL = @POSIX_SHELL@ +PRESENTPROTO_CFLAGS = @PRESENTPROTO_CFLAGS@ +PRESENTPROTO_LIBS = @PRESENTPROTO_LIBS@ PTHREADSTUBS_CFLAGS = @PTHREADSTUBS_CFLAGS@ PTHREADSTUBS_LIBS = @PTHREADSTUBS_LIBS@ 
PTHREAD_CC = @PTHREAD_CC@ @@ -312,6 +316,8 @@ SED = @SED@ SELINUX_CFLAGS = @SELINUX_CFLAGS@ SELINUX_LIBS = @SELINUX_LIBS@ SET_MAKE = @SET_MAKE@ +SHA1_CFLAGS = @SHA1_CFLAGS@ +SHA1_LIBS = @SHA1_LIBS@ SHELL = @SHELL@ SIMPENROSE_CFLAGS = @SIMPENROSE_CFLAGS@ SIMPENROSE_LIBS = @SIMPENROSE_LIBS@ @@ -320,6 +326,7 @@ STRIP = @STRIP@ SWR_AVX2_CXXFLAGS = @SWR_AVX2_CXXFLAGS@ SWR_AVX_CXXFLAGS = @SWR_AVX_CXXFLAGS@ SWR_CXX11_CXXFLAGS = @SWR_CXX11_CXXFLAGS@ +TIMESTAMP_CMD = @TIMESTAMP_CMD@ VALGRIND_CFLAGS = @VALGRIND_CFLAGS@ VALGRIND_LIBS = @VALGRIND_LIBS@ VA_CFLAGS = @VA_CFLAGS@ @@ -335,6 +342,7 @@ VDPAU_LIB_INSTALL_DIR = @VDPAU_LIB_INSTALL_DIR@ VDPAU_MAJOR = @VDPAU_MAJOR@ VDPAU_MINOR = @VDPAU_MINOR@ VERSION = @VERSION@ +VG_LIB_DEPS = @VG_LIB_DEPS@ VISIBILITY_CFLAGS = @VISIBILITY_CFLAGS@ VISIBILITY_CXXFLAGS = @VISIBILITY_CXXFLAGS@ VL_CFLAGS = @VL_CFLAGS@ @@ -363,10 +371,9 @@ XVMC_LIBS = @XVMC_LIBS@ XVMC_LIB_INSTALL_DIR = @XVMC_LIB_INSTALL_DIR@ XVMC_MAJOR = @XVMC_MAJOR@ XVMC_MINOR = @XVMC_MINOR@ +XXD = @XXD@ YACC = @YACC@ YFLAGS = @YFLAGS@ -ZLIB_CFLAGS = @ZLIB_CFLAGS@ -ZLIB_LIBS = @ZLIB_LIBS@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ @@ -484,8 +491,12 @@ GALLIUM_TARGET_CFLAGS = \ $(LIBDRM_CFLAGS) \ $(VISIBILITY_CFLAGS) -GALLIUM_COMMON_LIB_DEPS = -lm $(LIBUNWIND_LIBS) $(LIBSENSORS_LIBS) \ - $(CLOCK_LIB) $(PTHREAD_LIBS) $(DLOPEN_LIBS) $(am__append_1) +GALLIUM_COMMON_LIB_DEPS = \ + -lm \ + $(CLOCK_LIB) \ + $(PTHREAD_LIBS) \ + $(DLOPEN_LIBS) + GALLIUM_WINSYS_CFLAGS = \ -I$(top_srcdir)/src \ -I$(top_srcdir)/include \ @@ -497,7 +508,7 @@ GALLIUM_WINSYS_CFLAGS = \ GALLIUM_PIPE_LOADER_WINSYS_LIBS = \ $(top_builddir)/src/gallium/winsys/sw/null/libws_null.la \ $(top_builddir)/src/gallium/winsys/sw/wrapper/libwsw.la \ - $(am__append_2) $(am__append_3) + $(am__append_1) $(am__append_2) AM_CFLAGS = \ $(GALLIUM_WINSYS_CFLAGS) \ $(AMDGPU_CFLAGS) \ diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c index fe55dc310..e7ea51978 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c @@ -36,50 +36,34 @@ #include <amdgpu_drm.h> #include <xf86drm.h> #include <stdio.h> +#include <inttypes.h> -static const struct pb_vtbl amdgpu_winsys_bo_vtbl; - -static inline struct amdgpu_winsys_bo *amdgpu_winsys_bo(struct pb_buffer *bo) -{ - assert(bo->vtbl == &amdgpu_winsys_bo_vtbl); - return (struct amdgpu_winsys_bo *)bo; -} - -struct amdgpu_bomgr { - struct pb_manager base; - struct amdgpu_winsys *rws; -}; +static struct pb_buffer * +amdgpu_bo_create(struct radeon_winsys *rws, + uint64_t size, + unsigned alignment, + enum radeon_bo_domain domain, + enum radeon_bo_flag flags); -static struct amdgpu_winsys *get_winsys(struct pb_manager *mgr) +static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout, + enum radeon_bo_usage usage) { - return ((struct amdgpu_bomgr*)mgr)->rws; -} + struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); + struct amdgpu_winsys *ws = bo->ws; + int64_t abs_timeout; -static struct amdgpu_winsys_bo *get_amdgpu_winsys_bo(struct pb_buffer *_buf) -{ - struct amdgpu_winsys_bo *bo = NULL; + if (timeout == 0) { + if (p_atomic_read(&bo->num_active_ioctls)) + return false; - if (_buf->vtbl == &amdgpu_winsys_bo_vtbl) { - bo = amdgpu_winsys_bo(_buf); } else { - struct pb_buffer *base_buf; - pb_size offset; - pb_get_base_buffer(_buf, &base_buf, &offset); + abs_timeout = os_time_get_absolute_timeout(timeout); - if (base_buf->vtbl == 
&amdgpu_winsys_bo_vtbl) - bo = amdgpu_winsys_bo(base_buf); + /* Wait if any ioctl is being submitted with this buffer. */ + if (!os_wait_until_zero_abs_timeout(&bo->num_active_ioctls, abs_timeout)) + return false; } - return bo; -} - -static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout, - enum radeon_bo_usage usage) -{ - struct amdgpu_winsys_bo *bo = get_amdgpu_winsys_bo(_buf); - struct amdgpu_winsys *ws = bo->rws; - int i; - if (bo->is_shared) { /* We can't use user fences for shared buffers, because user fences * are local to this process only. If we want to wait for all buffer @@ -96,51 +80,57 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout, } if (timeout == 0) { - /* Timeout == 0 is quite simple. */ + unsigned idle_fences; + bool buffer_idle; + pipe_mutex_lock(ws->bo_fence_lock); - for (i = 0; i < RING_LAST; i++) - if (bo->fence[i]) { - if (amdgpu_fence_wait(bo->fence[i], 0, false)) { - /* Release the idle fence to avoid checking it again later. */ - amdgpu_fence_reference(&bo->fence[i], NULL); - } else { - pipe_mutex_unlock(ws->bo_fence_lock); - return false; - } - } + + for (idle_fences = 0; idle_fences < bo->num_fences; ++idle_fences) { + if (!amdgpu_fence_wait(bo->fences[idle_fences], 0, false)) + break; + } + + /* Release the idle fences to avoid checking them again later. */ + for (unsigned i = 0; i < idle_fences; ++i) + amdgpu_fence_reference(&bo->fences[i], NULL); + + memmove(&bo->fences[0], &bo->fences[idle_fences], + (bo->num_fences - idle_fences) * sizeof(*bo->fences)); + bo->num_fences -= idle_fences; + + buffer_idle = !bo->num_fences; pipe_mutex_unlock(ws->bo_fence_lock); - return true; + return buffer_idle; } else { - struct pipe_fence_handle *fence[RING_LAST] = {}; - bool fence_idle[RING_LAST] = {}; bool buffer_idle = true; - int64_t abs_timeout = os_time_get_absolute_timeout(timeout); - /* Take references to all fences, so that we can wait for them - * without the lock. */ pipe_mutex_lock(ws->bo_fence_lock); - for (i = 0; i < RING_LAST; i++) - amdgpu_fence_reference(&fence[i], bo->fence[i]); - pipe_mutex_unlock(ws->bo_fence_lock); - - /* Now wait for the fences. */ - for (i = 0; i < RING_LAST; i++) { - if (fence[i]) { - if (amdgpu_fence_wait(fence[i], abs_timeout, true)) - fence_idle[i] = true; - else - buffer_idle = false; + while (bo->num_fences && buffer_idle) { + struct pipe_fence_handle *fence = NULL; + bool fence_idle = false; + + amdgpu_fence_reference(&fence, bo->fences[0]); + + /* Wait for the fence. */ + pipe_mutex_unlock(ws->bo_fence_lock); + if (amdgpu_fence_wait(fence, abs_timeout, true)) + fence_idle = true; + else + buffer_idle = false; + pipe_mutex_lock(ws->bo_fence_lock); + + /* Release an idle fence to avoid checking it again later, keeping in + * mind that the fence array may have been modified by other threads. + */ + if (fence_idle && bo->num_fences && bo->fences[0] == fence) { + amdgpu_fence_reference(&bo->fences[0], NULL); + memmove(&bo->fences[0], &bo->fences[1], + (bo->num_fences - 1) * sizeof(*bo->fences)); + bo->num_fences--; } - } - /* Release idle fences to avoid checking them again later. 
*/ - pipe_mutex_lock(ws->bo_fence_lock); - for (i = 0; i < RING_LAST; i++) { - if (fence[i] == bo->fence[i] && fence_idle[i]) - amdgpu_fence_reference(&bo->fence[i], NULL); - - amdgpu_fence_reference(&fence[i], NULL); + amdgpu_fence_reference(&fence, NULL); } pipe_mutex_unlock(ws->bo_fence_lock); @@ -149,38 +139,75 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout, } static enum radeon_bo_domain amdgpu_bo_get_initial_domain( - struct radeon_winsys_cs_handle *buf) + struct pb_buffer *buf) { return ((struct amdgpu_winsys_bo*)buf)->initial_domain; } -static void amdgpu_bo_destroy(struct pb_buffer *_buf) +static void amdgpu_bo_remove_fences(struct amdgpu_winsys_bo *bo) +{ + for (unsigned i = 0; i < bo->num_fences; ++i) + amdgpu_fence_reference(&bo->fences[i], NULL); + + FREE(bo->fences); + bo->num_fences = 0; + bo->max_fences = 0; +} + +void amdgpu_bo_destroy(struct pb_buffer *_buf) { struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); - int i; + + assert(bo->bo && "must not be called for slab entries"); + + pipe_mutex_lock(bo->ws->global_bo_list_lock); + LIST_DEL(&bo->u.real.global_list_item); + bo->ws->num_buffers--; + pipe_mutex_unlock(bo->ws->global_bo_list_lock); amdgpu_bo_va_op(bo->bo, 0, bo->base.size, bo->va, 0, AMDGPU_VA_OP_UNMAP); - amdgpu_va_range_free(bo->va_handle); + amdgpu_va_range_free(bo->u.real.va_handle); amdgpu_bo_free(bo->bo); - for (i = 0; i < RING_LAST; i++) - amdgpu_fence_reference(&bo->fence[i], NULL); + amdgpu_bo_remove_fences(bo); if (bo->initial_domain & RADEON_DOMAIN_VRAM) - bo->rws->allocated_vram -= align(bo->base.size, bo->rws->gart_page_size); + bo->ws->allocated_vram -= align64(bo->base.size, bo->ws->info.gart_page_size); else if (bo->initial_domain & RADEON_DOMAIN_GTT) - bo->rws->allocated_gtt -= align(bo->base.size, bo->rws->gart_page_size); + bo->ws->allocated_gtt -= align64(bo->base.size, bo->ws->info.gart_page_size); + + if (bo->u.real.map_count >= 1) { + if (bo->initial_domain & RADEON_DOMAIN_VRAM) + bo->ws->mapped_vram -= bo->base.size; + else if (bo->initial_domain & RADEON_DOMAIN_GTT) + bo->ws->mapped_gtt -= bo->base.size; + } + FREE(bo); } -static void *amdgpu_bo_map(struct radeon_winsys_cs_handle *buf, +static void amdgpu_bo_destroy_or_cache(struct pb_buffer *_buf) +{ + struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); + + assert(bo->bo); /* slab buffers have a separate vtbl */ + + if (bo->u.real.use_reusable_pool) + pb_cache_add_buffer(&bo->u.real.cache_entry); + else + amdgpu_bo_destroy(_buf); +} + +static void *amdgpu_bo_map(struct pb_buffer *buf, struct radeon_winsys_cs *rcs, enum pipe_transfer_usage usage) { struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf; + struct amdgpu_winsys_bo *real; struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs; int r; void *cpu = NULL; + uint64_t offset = 0; /* If it's not unsynchronized bo_map, flush CS if needed and then wait. */ if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) { @@ -226,114 +253,156 @@ static void *amdgpu_bo_map(struct radeon_winsys_cs_handle *buf, * (neither one is changing it). * * Only check whether the buffer is being used for write. */ - if (cs && amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo, - RADEON_USAGE_WRITE)) { - cs->flush_cs(cs->flush_data, 0, NULL); + if (cs) { + if (amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo, + RADEON_USAGE_WRITE)) { + cs->flush_cs(cs->flush_data, 0, NULL); + } else { + /* Try to avoid busy-waiting in amdgpu_bo_wait. 
*/ + if (p_atomic_read(&bo->num_active_ioctls)) + amdgpu_cs_sync_flush(rcs); + } } + amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE, RADEON_USAGE_WRITE); } else { /* Mapping for write. */ - if (cs && amdgpu_bo_is_referenced_by_cs(cs, bo)) - cs->flush_cs(cs->flush_data, 0, NULL); + if (cs) { + if (amdgpu_bo_is_referenced_by_cs(cs, bo)) { + cs->flush_cs(cs->flush_data, 0, NULL); + } else { + /* Try to avoid busy-waiting in amdgpu_bo_wait. */ + if (p_atomic_read(&bo->num_active_ioctls)) + amdgpu_cs_sync_flush(rcs); + } + } amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE, RADEON_USAGE_READWRITE); } - bo->rws->buffer_wait_time += os_time_get_nano() - time; + bo->ws->buffer_wait_time += os_time_get_nano() - time; } } /* If the buffer is created from user memory, return the user pointer. */ if (bo->user_ptr) - return bo->user_ptr; + return bo->user_ptr; + + if (bo->bo) { + real = bo; + } else { + real = bo->u.slab.real; + offset = bo->va - real->va; + } + + r = amdgpu_bo_cpu_map(real->bo, &cpu); + if (r) { + /* Clear the cache and try again. */ + pb_cache_release_all_buffers(&real->ws->bo_cache); + r = amdgpu_bo_cpu_map(real->bo, &cpu); + if (r) + return NULL; + } - r = amdgpu_bo_cpu_map(bo->bo, &cpu); - return r ? NULL : cpu; + if (p_atomic_inc_return(&real->u.real.map_count) == 1) { + if (real->initial_domain & RADEON_DOMAIN_VRAM) + real->ws->mapped_vram += real->base.size; + else if (real->initial_domain & RADEON_DOMAIN_GTT) + real->ws->mapped_gtt += real->base.size; + } + return (uint8_t*)cpu + offset; } -static void amdgpu_bo_unmap(struct radeon_winsys_cs_handle *buf) +static void amdgpu_bo_unmap(struct pb_buffer *buf) { struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf; + struct amdgpu_winsys_bo *real; - amdgpu_bo_cpu_unmap(bo->bo); -} + if (bo->user_ptr) + return; -static void amdgpu_bo_get_base_buffer(struct pb_buffer *buf, - struct pb_buffer **base_buf, - unsigned *offset) -{ - *base_buf = buf; - *offset = 0; -} + real = bo->bo ? 
bo : bo->u.slab.real; -static enum pipe_error amdgpu_bo_validate(struct pb_buffer *_buf, - struct pb_validate *vl, - unsigned flags) -{ - /* Always pinned */ - return PIPE_OK; -} + if (p_atomic_dec_zero(&real->u.real.map_count)) { + if (real->initial_domain & RADEON_DOMAIN_VRAM) + real->ws->mapped_vram -= real->base.size; + else if (real->initial_domain & RADEON_DOMAIN_GTT) + real->ws->mapped_gtt -= real->base.size; + } -static void amdgpu_bo_fence(struct pb_buffer *buf, - struct pipe_fence_handle *fence) -{ + amdgpu_bo_cpu_unmap(real->bo); } static const struct pb_vtbl amdgpu_winsys_bo_vtbl = { - amdgpu_bo_destroy, - NULL, /* never called */ - NULL, /* never called */ - amdgpu_bo_validate, - amdgpu_bo_fence, - amdgpu_bo_get_base_buffer, + amdgpu_bo_destroy_or_cache + /* other functions are never called */ }; -static struct pb_buffer *amdgpu_bomgr_create_bo(struct pb_manager *_mgr, - pb_size size, - const struct pb_desc *desc) +static void amdgpu_add_buffer_to_global_list(struct amdgpu_winsys_bo *bo) +{ + struct amdgpu_winsys *ws = bo->ws; + + assert(bo->bo); + + pipe_mutex_lock(ws->global_bo_list_lock); + LIST_ADDTAIL(&bo->u.real.global_list_item, &ws->global_bo_list); + ws->num_buffers++; + pipe_mutex_unlock(ws->global_bo_list_lock); +} + +static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws, + uint64_t size, + unsigned alignment, + unsigned usage, + enum radeon_bo_domain initial_domain, + unsigned flags, + unsigned pb_cache_bucket) { - struct amdgpu_winsys *rws = get_winsys(_mgr); - struct amdgpu_bo_desc *rdesc = (struct amdgpu_bo_desc*)desc; struct amdgpu_bo_alloc_request request = {0}; amdgpu_bo_handle buf_handle; uint64_t va = 0; struct amdgpu_winsys_bo *bo; amdgpu_va_handle va_handle; + unsigned va_gap_size; int r; - assert(rdesc->initial_domain & RADEON_DOMAIN_VRAM_GTT); + assert(initial_domain & RADEON_DOMAIN_VRAM_GTT); bo = CALLOC_STRUCT(amdgpu_winsys_bo); if (!bo) { return NULL; } + pb_cache_init_entry(&ws->bo_cache, &bo->u.real.cache_entry, &bo->base, + pb_cache_bucket); request.alloc_size = size; - request.phys_alignment = desc->alignment; + request.phys_alignment = alignment; - if (rdesc->initial_domain & RADEON_DOMAIN_VRAM) { + if (initial_domain & RADEON_DOMAIN_VRAM) request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM; - if (rdesc->flags & RADEON_FLAG_CPU_ACCESS) - request.flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; - } - if (rdesc->initial_domain & RADEON_DOMAIN_GTT) { + if (initial_domain & RADEON_DOMAIN_GTT) request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT; - if (rdesc->flags & RADEON_FLAG_GTT_WC) - request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC; - } - r = amdgpu_bo_alloc(rws->dev, &request, &buf_handle); + if (flags & RADEON_FLAG_CPU_ACCESS) + request.flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; + if (flags & RADEON_FLAG_NO_CPU_ACCESS) + request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS; + if (flags & RADEON_FLAG_GTT_WC) + request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC; + + r = amdgpu_bo_alloc(ws->dev, &request, &buf_handle); if (r) { fprintf(stderr, "amdgpu: Failed to allocate a buffer:\n"); - fprintf(stderr, "amdgpu: size : %d bytes\n", size); - fprintf(stderr, "amdgpu: alignment : %d bytes\n", desc->alignment); - fprintf(stderr, "amdgpu: domains : %d\n", rdesc->initial_domain); + fprintf(stderr, "amdgpu: size : %"PRIu64" bytes\n", size); + fprintf(stderr, "amdgpu: alignment : %u bytes\n", alignment); + fprintf(stderr, "amdgpu: domains : %u\n", initial_domain); goto error_bo_alloc; } - r = amdgpu_va_range_alloc(rws->dev, 
amdgpu_gpu_va_range_general, - size, desc->alignment, 0, &va, &va_handle, 0); + va_gap_size = ws->check_vm ? MAX2(4 * alignment, 64 * 1024) : 0; + r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general, + size + va_gap_size, alignment, 0, &va, &va_handle, 0); if (r) goto error_va_alloc; @@ -342,23 +411,25 @@ static struct pb_buffer *amdgpu_bomgr_create_bo(struct pb_manager *_mgr, goto error_va_map; pipe_reference_init(&bo->base.reference, 1); - bo->base.alignment = desc->alignment; - bo->base.usage = desc->usage; + bo->base.alignment = alignment; + bo->base.usage = usage; bo->base.size = size; bo->base.vtbl = &amdgpu_winsys_bo_vtbl; - bo->rws = rws; + bo->ws = ws; bo->bo = buf_handle; bo->va = va; - bo->va_handle = va_handle; - bo->initial_domain = rdesc->initial_domain; - bo->unique_id = __sync_fetch_and_add(&rws->next_bo_unique_id, 1); + bo->u.real.va_handle = va_handle; + bo->initial_domain = initial_domain; + bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1); - if (rdesc->initial_domain & RADEON_DOMAIN_VRAM) - rws->allocated_vram += align(size, rws->gart_page_size); - else if (rdesc->initial_domain & RADEON_DOMAIN_GTT) - rws->allocated_gtt += align(size, rws->gart_page_size); + if (initial_domain & RADEON_DOMAIN_VRAM) + ws->allocated_vram += align64(size, ws->info.gart_page_size); + else if (initial_domain & RADEON_DOMAIN_GTT) + ws->allocated_gtt += align64(size, ws->info.gart_page_size); - return &bo->base; + amdgpu_add_buffer_to_global_list(bo); + + return bo; error_va_map: amdgpu_va_range_free(va_handle); @@ -371,48 +442,125 @@ error_bo_alloc: return NULL; } -static void amdgpu_bomgr_flush(struct pb_manager *mgr) -{ - /* NOP */ -} - -/* This is for the cache bufmgr. */ -static boolean amdgpu_bomgr_is_buffer_busy(struct pb_manager *_mgr, - struct pb_buffer *_buf) +bool amdgpu_bo_can_reclaim(struct pb_buffer *_buf) { struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); if (amdgpu_bo_is_referenced_by_any_cs(bo)) { - return TRUE; + return false; } - if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0, RADEON_USAGE_READWRITE)) { - return TRUE; - } + return amdgpu_bo_wait(_buf, 0, RADEON_USAGE_READWRITE); +} - return FALSE; +bool amdgpu_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry) +{ + struct amdgpu_winsys_bo *bo = NULL; /* fix container_of */ + bo = container_of(entry, bo, u.slab.entry); + + return amdgpu_bo_can_reclaim(&bo->base); } -static void amdgpu_bomgr_destroy(struct pb_manager *mgr) +static void amdgpu_bo_slab_destroy(struct pb_buffer *_buf) { - FREE(mgr); + struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); + + assert(!bo->bo); + + pb_slab_free(&bo->ws->bo_slabs, &bo->u.slab.entry); } -struct pb_manager *amdgpu_bomgr_create(struct amdgpu_winsys *rws) +static const struct pb_vtbl amdgpu_winsys_bo_slab_vtbl = { + amdgpu_bo_slab_destroy + /* other functions are never called */ +}; + +struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap, + unsigned entry_size, + unsigned group_index) { - struct amdgpu_bomgr *mgr; + struct amdgpu_winsys *ws = priv; + struct amdgpu_slab *slab = CALLOC_STRUCT(amdgpu_slab); + enum radeon_bo_domain domains; + enum radeon_bo_flag flags = 0; + uint32_t base_id; - mgr = CALLOC_STRUCT(amdgpu_bomgr); - if (!mgr) + if (!slab) return NULL; - mgr->base.destroy = amdgpu_bomgr_destroy; - mgr->base.create_buffer = amdgpu_bomgr_create_bo; - mgr->base.flush = amdgpu_bomgr_flush; - mgr->base.is_buffer_busy = amdgpu_bomgr_is_buffer_busy; + if (heap & 1) + flags |= RADEON_FLAG_GTT_WC; + if (heap & 2) + flags |= 
RADEON_FLAG_CPU_ACCESS; + + switch (heap >> 2) { + case 0: + domains = RADEON_DOMAIN_VRAM; + break; + default: + case 1: + domains = RADEON_DOMAIN_VRAM_GTT; + break; + case 2: + domains = RADEON_DOMAIN_GTT; + break; + } + + slab->buffer = amdgpu_winsys_bo(amdgpu_bo_create(&ws->base, + 64 * 1024, 64 * 1024, + domains, flags)); + if (!slab->buffer) + goto fail; + + assert(slab->buffer->bo); + + slab->base.num_entries = slab->buffer->base.size / entry_size; + slab->base.num_free = slab->base.num_entries; + slab->entries = CALLOC(slab->base.num_entries, sizeof(*slab->entries)); + if (!slab->entries) + goto fail_buffer; + + LIST_INITHEAD(&slab->base.free); + + base_id = __sync_fetch_and_add(&ws->next_bo_unique_id, slab->base.num_entries); + + for (unsigned i = 0; i < slab->base.num_entries; ++i) { + struct amdgpu_winsys_bo *bo = &slab->entries[i]; + + bo->base.alignment = entry_size; + bo->base.usage = slab->buffer->base.usage; + bo->base.size = entry_size; + bo->base.vtbl = &amdgpu_winsys_bo_slab_vtbl; + bo->ws = ws; + bo->va = slab->buffer->va + i * entry_size; + bo->initial_domain = domains; + bo->unique_id = base_id + i; + bo->u.slab.entry.slab = &slab->base; + bo->u.slab.entry.group_index = group_index; + bo->u.slab.real = slab->buffer; + + LIST_ADDTAIL(&bo->u.slab.entry.head, &slab->base.free); + } + + return &slab->base; + +fail_buffer: + amdgpu_winsys_bo_reference(&slab->buffer, NULL); +fail: + FREE(slab); + return NULL; +} + +void amdgpu_bo_slab_free(void *priv, struct pb_slab *pslab) +{ + struct amdgpu_slab *slab = amdgpu_slab(pslab); + + for (unsigned i = 0; i < slab->base.num_entries; ++i) + amdgpu_bo_remove_fences(&slab->entries[i]); - mgr->rws = rws; - return &mgr->base; + FREE(slab->entries); + amdgpu_winsys_bo_reference(&slab->buffer, NULL); + FREE(slab); } static unsigned eg_tile_split(unsigned tile_split) @@ -444,152 +592,192 @@ static unsigned eg_tile_split_rev(unsigned eg_tile_split) } } -static void amdgpu_bo_get_tiling(struct pb_buffer *_buf, - enum radeon_bo_layout *microtiled, - enum radeon_bo_layout *macrotiled, - unsigned *bankw, unsigned *bankh, - unsigned *tile_split, - unsigned *stencil_tile_split, - unsigned *mtilea, - bool *scanout) +static void amdgpu_buffer_get_metadata(struct pb_buffer *_buf, + struct radeon_bo_metadata *md) { - struct amdgpu_winsys_bo *bo = get_amdgpu_winsys_bo(_buf); + struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); struct amdgpu_bo_info info = {0}; uint32_t tiling_flags; int r; + assert(bo->bo && "must not be called for slab entries"); + r = amdgpu_bo_query_info(bo->bo, &info); if (r) return; tiling_flags = info.metadata.tiling_info; - *microtiled = RADEON_LAYOUT_LINEAR; - *macrotiled = RADEON_LAYOUT_LINEAR; + md->microtile = RADEON_LAYOUT_LINEAR; + md->macrotile = RADEON_LAYOUT_LINEAR; if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 4) /* 2D_TILED_THIN1 */ - *macrotiled = RADEON_LAYOUT_TILED; + md->macrotile = RADEON_LAYOUT_TILED; else if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 2) /* 1D_TILED_THIN1 */ - *microtiled = RADEON_LAYOUT_TILED; - - if (bankw && tile_split && mtilea && tile_split) { - *bankw = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_WIDTH); - *bankh = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_HEIGHT); - *tile_split = eg_tile_split(AMDGPU_TILING_GET(tiling_flags, TILE_SPLIT)); - *mtilea = 1 << AMDGPU_TILING_GET(tiling_flags, MACRO_TILE_ASPECT); - } - if (scanout) - *scanout = AMDGPU_TILING_GET(tiling_flags, MICRO_TILE_MODE) == 0; /* DISPLAY */ + md->microtile = RADEON_LAYOUT_TILED; + + md->pipe_config = 
AMDGPU_TILING_GET(tiling_flags, PIPE_CONFIG); + md->bankw = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_WIDTH); + md->bankh = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_HEIGHT); + md->tile_split = eg_tile_split(AMDGPU_TILING_GET(tiling_flags, TILE_SPLIT)); + md->mtilea = 1 << AMDGPU_TILING_GET(tiling_flags, MACRO_TILE_ASPECT); + md->num_banks = 2 << AMDGPU_TILING_GET(tiling_flags, NUM_BANKS); + md->scanout = AMDGPU_TILING_GET(tiling_flags, MICRO_TILE_MODE) == 0; /* DISPLAY */ + + md->size_metadata = info.metadata.size_metadata; + memcpy(md->metadata, info.metadata.umd_metadata, sizeof(md->metadata)); } -static void amdgpu_bo_set_tiling(struct pb_buffer *_buf, - struct radeon_winsys_cs *rcs, - enum radeon_bo_layout microtiled, - enum radeon_bo_layout macrotiled, - unsigned pipe_config, - unsigned bankw, unsigned bankh, - unsigned tile_split, - unsigned stencil_tile_split, - unsigned mtilea, unsigned num_banks, - uint32_t pitch, - bool scanout) +static void amdgpu_buffer_set_metadata(struct pb_buffer *_buf, + struct radeon_bo_metadata *md) { - struct amdgpu_winsys_bo *bo = get_amdgpu_winsys_bo(_buf); + struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); struct amdgpu_bo_metadata metadata = {0}; uint32_t tiling_flags = 0; - if (macrotiled == RADEON_LAYOUT_TILED) + assert(bo->bo && "must not be called for slab entries"); + + if (md->macrotile == RADEON_LAYOUT_TILED) tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 4); /* 2D_TILED_THIN1 */ - else if (microtiled == RADEON_LAYOUT_TILED) + else if (md->microtile == RADEON_LAYOUT_TILED) tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 2); /* 1D_TILED_THIN1 */ else tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 1); /* LINEAR_ALIGNED */ - tiling_flags |= AMDGPU_TILING_SET(PIPE_CONFIG, pipe_config); - tiling_flags |= AMDGPU_TILING_SET(BANK_WIDTH, util_logbase2(bankw)); - tiling_flags |= AMDGPU_TILING_SET(BANK_HEIGHT, util_logbase2(bankh)); - if (tile_split) - tiling_flags |= AMDGPU_TILING_SET(TILE_SPLIT, eg_tile_split_rev(tile_split)); - tiling_flags |= AMDGPU_TILING_SET(MACRO_TILE_ASPECT, util_logbase2(mtilea)); - tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(num_banks)-1); + tiling_flags |= AMDGPU_TILING_SET(PIPE_CONFIG, md->pipe_config); + tiling_flags |= AMDGPU_TILING_SET(BANK_WIDTH, util_logbase2(md->bankw)); + tiling_flags |= AMDGPU_TILING_SET(BANK_HEIGHT, util_logbase2(md->bankh)); + if (md->tile_split) + tiling_flags |= AMDGPU_TILING_SET(TILE_SPLIT, eg_tile_split_rev(md->tile_split)); + tiling_flags |= AMDGPU_TILING_SET(MACRO_TILE_ASPECT, util_logbase2(md->mtilea)); + tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(md->num_banks)-1); - if (scanout) + if (md->scanout) tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 0); /* DISPLAY_MICRO_TILING */ else tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 1); /* THIN_MICRO_TILING */ metadata.tiling_info = tiling_flags; + metadata.size_metadata = md->size_metadata; + memcpy(metadata.umd_metadata, md->metadata, sizeof(md->metadata)); amdgpu_bo_set_metadata(bo->bo, &metadata); } -static struct radeon_winsys_cs_handle *amdgpu_get_cs_handle(struct pb_buffer *_buf) -{ - /* return a direct pointer to amdgpu_winsys_bo. 
*/ - return (struct radeon_winsys_cs_handle*)get_amdgpu_winsys_bo(_buf); -} - static struct pb_buffer * amdgpu_bo_create(struct radeon_winsys *rws, - unsigned size, + uint64_t size, unsigned alignment, - boolean use_reusable_pool, enum radeon_bo_domain domain, enum radeon_bo_flag flags) { struct amdgpu_winsys *ws = amdgpu_winsys(rws); - struct amdgpu_bo_desc desc; - struct pb_manager *provider; - struct pb_buffer *buffer; - - /* Don't use VRAM if the GPU doesn't have much. This is only the initial - * domain. The kernel is free to move the buffer if it wants to. - * - * 64MB means no VRAM by todays standards. - */ - if (domain & RADEON_DOMAIN_VRAM && ws->info.vram_size <= 64*1024*1024) { - domain = RADEON_DOMAIN_GTT; - flags = RADEON_FLAG_GTT_WC; + struct amdgpu_winsys_bo *bo; + unsigned usage = 0, pb_cache_bucket; + + /* Sub-allocate small buffers from slabs. */ + if (!(flags & RADEON_FLAG_HANDLE) && + size <= (1 << AMDGPU_SLAB_MAX_SIZE_LOG2) && + alignment <= MAX2(1 << AMDGPU_SLAB_MIN_SIZE_LOG2, util_next_power_of_two(size))) { + struct pb_slab_entry *entry; + unsigned heap = 0; + + if (flags & RADEON_FLAG_GTT_WC) + heap |= 1; + if (flags & RADEON_FLAG_CPU_ACCESS) + heap |= 2; + if (flags & ~(RADEON_FLAG_GTT_WC | RADEON_FLAG_CPU_ACCESS)) + goto no_slab; + + switch (domain) { + case RADEON_DOMAIN_VRAM: + heap |= 0 * 4; + break; + case RADEON_DOMAIN_VRAM_GTT: + heap |= 1 * 4; + break; + case RADEON_DOMAIN_GTT: + heap |= 2 * 4; + break; + default: + goto no_slab; + } + + entry = pb_slab_alloc(&ws->bo_slabs, size, heap); + if (!entry) { + /* Clear the cache and try again. */ + pb_cache_release_all_buffers(&ws->bo_cache); + + entry = pb_slab_alloc(&ws->bo_slabs, size, heap); + } + if (!entry) + return NULL; + + bo = NULL; + bo = container_of(entry, bo, u.slab.entry); + + pipe_reference_init(&bo->base.reference, 1); + + return &bo->base; } +no_slab: - memset(&desc, 0, sizeof(desc)); - desc.base.alignment = alignment; + /* This flag is irrelevant for the cache. */ + flags &= ~RADEON_FLAG_HANDLE; /* Align size to page size. This is the minimum alignment for normal * BOs. Aligning this here helps the cached bufmgr. Especially small BOs, * like constant/uniform buffers, can benefit from better and more reuse. */ - size = align(size, ws->gart_page_size); + size = align64(size, ws->info.gart_page_size); + alignment = align(alignment, ws->info.gart_page_size); /* Only set one usage bit each for domains and flags, or the cache manager * might consider different sets of domains / flags compatible */ if (domain == RADEON_DOMAIN_VRAM_GTT) - desc.base.usage = 1 << 2; - else - desc.base.usage = domain >> 1; - assert(flags < sizeof(desc.base.usage) * 8 - 3); - desc.base.usage |= 1 << (flags + 3); - - desc.initial_domain = domain; - desc.flags = flags; - - /* Assign a buffer manager. */ - if (use_reusable_pool) - provider = ws->cman; + usage = 1 << 2; else - provider = ws->kman; - - buffer = provider->create_buffer(provider, size, &desc.base); - if (!buffer) - return NULL; + usage = domain >> 1; + assert(flags < sizeof(usage) * 8 - 3); + usage |= 1 << (flags + 3); + + /* Determine the pb_cache bucket for minimizing pb_cache misses. */ + pb_cache_bucket = 0; + if (domain & RADEON_DOMAIN_VRAM) /* VRAM or VRAM+GTT */ + pb_cache_bucket += 1; + if (flags == RADEON_FLAG_GTT_WC) /* WC */ + pb_cache_bucket += 2; + assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets)); + + /* Get a buffer from the cache. 
*/ + bo = (struct amdgpu_winsys_bo*) + pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage, + pb_cache_bucket); + if (bo) + return &bo->base; + + /* Create a new one. */ + bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags, + pb_cache_bucket); + if (!bo) { + /* Clear the cache and try again. */ + pb_slabs_reclaim(&ws->bo_slabs); + pb_cache_release_all_buffers(&ws->bo_cache); + bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags, + pb_cache_bucket); + if (!bo) + return NULL; + } - return (struct pb_buffer*)buffer; + bo->u.real.use_reusable_pool = true; + return &bo->base; } static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws, struct winsys_handle *whandle, - unsigned *stride) + unsigned *stride, + unsigned *offset) { struct amdgpu_winsys *ws = amdgpu_winsys(rws); struct amdgpu_winsys_bo *bo; @@ -644,24 +832,27 @@ static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws, pipe_reference_init(&bo->base.reference, 1); bo->base.alignment = info.phys_alignment; - bo->base.usage = PB_USAGE_GPU_WRITE | PB_USAGE_GPU_READ; bo->bo = result.buf_handle; bo->base.size = result.alloc_size; bo->base.vtbl = &amdgpu_winsys_bo_vtbl; - bo->rws = ws; + bo->ws = ws; bo->va = va; - bo->va_handle = va_handle; + bo->u.real.va_handle = va_handle; bo->initial_domain = initial; bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1); bo->is_shared = true; if (stride) *stride = whandle->stride; + if (offset) + *offset = whandle->offset; if (bo->initial_domain & RADEON_DOMAIN_VRAM) - ws->allocated_vram += align(bo->base.size, ws->gart_page_size); + ws->allocated_vram += align64(bo->base.size, ws->info.gart_page_size); else if (bo->initial_domain & RADEON_DOMAIN_GTT) - ws->allocated_gtt += align(bo->base.size, ws->gart_page_size); + ws->allocated_gtt += align64(bo->base.size, ws->info.gart_page_size); + + amdgpu_add_buffer_to_global_list(bo); return &bo->base; @@ -676,16 +867,21 @@ error: return NULL; } -static boolean amdgpu_bo_get_handle(struct pb_buffer *buffer, - unsigned stride, - struct winsys_handle *whandle) +static bool amdgpu_bo_get_handle(struct pb_buffer *buffer, + unsigned stride, unsigned offset, + unsigned slice_size, + struct winsys_handle *whandle) { - struct amdgpu_winsys_bo *bo = get_amdgpu_winsys_bo(buffer); + struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buffer); enum amdgpu_bo_handle_type type; int r; - if ((void*)bo != (void*)buffer) - pb_cache_manager_remove_buffer(buffer); + if (!bo->bo) { + offset += bo->va - bo->u.slab.real->va; + bo = bo->u.slab.real; + } + + bo->u.real.use_reusable_pool = false; switch (whandle->type) { case DRM_API_HANDLE_TYPE_SHARED: @@ -698,20 +894,22 @@ static boolean amdgpu_bo_get_handle(struct pb_buffer *buffer, type = amdgpu_bo_handle_type_kms; break; default: - return FALSE; + return false; } r = amdgpu_bo_export(bo->bo, type, &whandle->handle); if (r) - return FALSE; + return false; whandle->stride = stride; + whandle->offset = offset; + whandle->offset += slice_size * whandle->layer; bo->is_shared = true; - return TRUE; + return true; } static struct pb_buffer *amdgpu_bo_from_ptr(struct radeon_winsys *rws, - void *pointer, unsigned size) + void *pointer, uint64_t size) { struct amdgpu_winsys *ws = amdgpu_winsys(rws); amdgpu_bo_handle buf_handle; @@ -737,17 +935,18 @@ static struct pb_buffer *amdgpu_bo_from_ptr(struct radeon_winsys *rws, pipe_reference_init(&bo->base.reference, 1); bo->bo = buf_handle; bo->base.alignment = 0; - bo->base.usage = PB_USAGE_GPU_WRITE | PB_USAGE_GPU_READ; 
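The buffer-allocation path in amdgpu_bo_create() above follows a try/evict/retry pattern: first reclaim a compatible idle buffer from the pb_cache, and only if a fresh allocation also fails, empty the slab and cache pools once and retry. A condensed sketch of just that control flow, reusing the helpers visible in this diff (the wrapper name is hypothetical; error handling elided):

/* Condensed sketch of the amdgpu_bo_create() reclaim/retry fallback shown
 * above; not the verbatim function. */
static struct pb_buffer *
try_alloc_with_eviction(struct amdgpu_winsys *ws, uint64_t size,
                        unsigned alignment, unsigned usage,
                        enum radeon_bo_domain domain, unsigned flags,
                        unsigned pb_cache_bucket)
{
   struct amdgpu_winsys_bo *bo;

   /* Fast path: recycle a compatible idle buffer from the cache. */
   bo = (struct amdgpu_winsys_bo*)
        pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage,
                                pb_cache_bucket);
   if (bo)
      return &bo->base;

   /* Slow path: allocate; on failure, release pooled memory and retry once. */
   bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
                         pb_cache_bucket);
   if (!bo) {
      pb_slabs_reclaim(&ws->bo_slabs);
      pb_cache_release_all_buffers(&ws->bo_cache);
      bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
                            pb_cache_bucket);
   }
   return bo ? &bo->base : NULL;
}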
bo->base.size = size; bo->base.vtbl = &amdgpu_winsys_bo_vtbl; - bo->rws = ws; + bo->ws = ws; bo->user_ptr = pointer; bo->va = va; - bo->va_handle = va_handle; + bo->u.real.va_handle = va_handle; bo->initial_domain = RADEON_DOMAIN_GTT; bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1); - ws->allocated_gtt += align(bo->base.size, ws->gart_page_size); + ws->allocated_gtt += align64(bo->base.size, ws->info.gart_page_size); + + amdgpu_add_buffer_to_global_list(bo); return (struct pb_buffer*)bo; @@ -762,22 +961,27 @@ error: return NULL; } -static uint64_t amdgpu_bo_get_va(struct radeon_winsys_cs_handle *buf) +static bool amdgpu_bo_is_user_ptr(struct pb_buffer *buf) +{ + return ((struct amdgpu_winsys_bo*)buf)->user_ptr != NULL; +} + +static uint64_t amdgpu_bo_get_va(struct pb_buffer *buf) { return ((struct amdgpu_winsys_bo*)buf)->va; } -void amdgpu_bomgr_init_functions(struct amdgpu_winsys *ws) +void amdgpu_bo_init_functions(struct amdgpu_winsys *ws) { - ws->base.buffer_get_cs_handle = amdgpu_get_cs_handle; - ws->base.buffer_set_tiling = amdgpu_bo_set_tiling; - ws->base.buffer_get_tiling = amdgpu_bo_get_tiling; + ws->base.buffer_set_metadata = amdgpu_buffer_set_metadata; + ws->base.buffer_get_metadata = amdgpu_buffer_get_metadata; ws->base.buffer_map = amdgpu_bo_map; ws->base.buffer_unmap = amdgpu_bo_unmap; ws->base.buffer_wait = amdgpu_bo_wait; ws->base.buffer_create = amdgpu_bo_create; ws->base.buffer_from_handle = amdgpu_bo_from_handle; ws->base.buffer_from_ptr = amdgpu_bo_from_ptr; + ws->base.buffer_is_user_ptr = amdgpu_bo_is_user_ptr; ws->base.buffer_get_handle = amdgpu_bo_get_handle; ws->base.buffer_get_virtual_address = amdgpu_bo_get_va; ws->base.buffer_get_initial_domain = amdgpu_bo_get_initial_domain; diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h index 3739fd136..1e25897b6 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h @@ -34,41 +34,80 @@ #define AMDGPU_BO_H #include "amdgpu_winsys.h" -#include "pipebuffer/pb_bufmgr.h" -struct amdgpu_bo_desc { - struct pb_desc base; - - enum radeon_bo_domain initial_domain; - unsigned flags; -}; +#include "pipebuffer/pb_slab.h" struct amdgpu_winsys_bo { struct pb_buffer base; + union { + struct { + struct pb_cache_entry cache_entry; + + amdgpu_va_handle va_handle; + int map_count; + bool use_reusable_pool; + + struct list_head global_list_item; + } real; + struct { + struct pb_slab_entry entry; + struct amdgpu_winsys_bo *real; + } slab; + } u; - struct amdgpu_winsys *rws; + struct amdgpu_winsys *ws; void *user_ptr; /* from buffer_from_ptr */ - amdgpu_bo_handle bo; + amdgpu_bo_handle bo; /* NULL for slab entries */ uint32_t unique_id; - amdgpu_va_handle va_handle; uint64_t va; enum radeon_bo_domain initial_domain; /* how many command streams is this bo referenced in? */ int num_cs_references; + /* how many command streams, which are being emitted in a separate + * thread, is this bo referenced in? */ + volatile int num_active_ioctls; + /* whether buffer_get_handle or buffer_from_handle was called, * it can only transition from false to true */ volatile int is_shared; /* bool (int for atomicity) */ /* Fences for buffer synchronization. 
*/ - struct pipe_fence_handle *fence[RING_LAST]; + unsigned num_fences; + unsigned max_fences; + struct pipe_fence_handle **fences; +}; + +struct amdgpu_slab { + struct pb_slab base; + struct amdgpu_winsys_bo *buffer; + struct amdgpu_winsys_bo *entries; }; -struct pb_manager *amdgpu_bomgr_create(struct amdgpu_winsys *rws); -void amdgpu_bomgr_init_functions(struct amdgpu_winsys *ws); +bool amdgpu_bo_can_reclaim(struct pb_buffer *_buf); +void amdgpu_bo_destroy(struct pb_buffer *_buf); +void amdgpu_bo_init_functions(struct amdgpu_winsys *ws); + +bool amdgpu_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry); +struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap, + unsigned entry_size, + unsigned group_index); +void amdgpu_bo_slab_free(void *priv, struct pb_slab *slab); + +static inline +struct amdgpu_winsys_bo *amdgpu_winsys_bo(struct pb_buffer *bo) +{ + return (struct amdgpu_winsys_bo *)bo; +} + +static inline +struct amdgpu_slab *amdgpu_slab(struct pb_slab *slab) +{ + return (struct amdgpu_slab *)slab; +} static inline void amdgpu_winsys_bo_reference(struct amdgpu_winsys_bo **dst, diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c index 0f42298c2..2b86827ff 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c @@ -35,6 +35,9 @@ #include <stdio.h> #include <amdgpu_drm.h> +#include "amd/common/sid.h" + +DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", false) /* FENCES */ @@ -50,6 +53,7 @@ amdgpu_fence_create(struct amdgpu_ctx *ctx, unsigned ip_type, fence->fence.ip_type = ip_type; fence->fence.ip_instance = ip_instance; fence->fence.ring = ring; + fence->submission_in_progress = true; p_atomic_inc(&ctx->refcount); return (struct pipe_fence_handle *)fence; } @@ -62,6 +66,7 @@ static void amdgpu_fence_submitted(struct pipe_fence_handle *fence, rfence->fence.fence = request->seq_no; rfence->user_fence_cpu_address = user_fence_cpu_address; + rfence->submission_in_progress = false; } static void amdgpu_fence_signalled(struct pipe_fence_handle *fence) @@ -69,6 +74,7 @@ static void amdgpu_fence_signalled(struct pipe_fence_handle *fence) struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence; rfence->signalled = true; + rfence->submission_in_progress = false; } bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout, @@ -88,11 +94,25 @@ bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout, else abs_timeout = os_time_get_absolute_timeout(timeout); + /* The fence might not have a number assigned if its IB is being + * submitted in the other thread right now. Wait until the submission + * is done. */ + if (!os_wait_until_zero_abs_timeout(&rfence->submission_in_progress, + abs_timeout)) + return false; + user_fence_cpu = rfence->user_fence_cpu_address; - if (user_fence_cpu && *user_fence_cpu >= rfence->fence.fence) { - rfence->signalled = true; - return true; + if (user_fence_cpu) { + if (*user_fence_cpu >= rfence->fence.fence) { + rfence->signalled = true; + return true; + } + + /* No timeout, just query: no need for the ioctl. */ + if (!absolute && !timeout) + return false; } + /* Now use the libdrm query. 
*/ r = amdgpu_cs_query_fence_status(&rfence->fence, abs_timeout, @@ -100,7 +120,7 @@ bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout, &expired); if (r) { fprintf(stderr, "amdgpu: amdgpu_cs_query_fence_status failed.\n"); - return FALSE; + return false; } if (expired) { @@ -119,6 +139,31 @@ static bool amdgpu_fence_wait_rel_timeout(struct radeon_winsys *rws, return amdgpu_fence_wait(fence, timeout, false); } +static struct pipe_fence_handle * +amdgpu_cs_get_next_fence(struct radeon_winsys_cs *rcs) +{ + struct amdgpu_cs *cs = amdgpu_cs(rcs); + struct pipe_fence_handle *fence = NULL; + + if (debug_get_option_noop()) + return NULL; + + if (cs->next_fence) { + amdgpu_fence_reference(&fence, cs->next_fence); + return fence; + } + + fence = amdgpu_fence_create(cs->ctx, + cs->csc->request.ip_type, + cs->csc->request.ip_instance, + cs->csc->request.ring); + if (!fence) + return NULL; + + amdgpu_fence_reference(&cs->next_fence, fence); + return fence; +} + /* CONTEXTS */ static struct radeon_winsys_ctx *amdgpu_ctx_create(struct radeon_winsys *ws) @@ -128,41 +173,46 @@ static struct radeon_winsys_ctx *amdgpu_ctx_create(struct radeon_winsys *ws) struct amdgpu_bo_alloc_request alloc_buffer = {}; amdgpu_bo_handle buf_handle; + if (!ctx) + return NULL; + ctx->ws = amdgpu_winsys(ws); ctx->refcount = 1; r = amdgpu_cs_ctx_create(ctx->ws->dev, &ctx->ctx); if (r) { fprintf(stderr, "amdgpu: amdgpu_cs_ctx_create failed. (%i)\n", r); - FREE(ctx); - return NULL; + goto error_create; } - alloc_buffer.alloc_size = 4 * 1024; - alloc_buffer.phys_alignment = 4 *1024; + alloc_buffer.alloc_size = ctx->ws->info.gart_page_size; + alloc_buffer.phys_alignment = ctx->ws->info.gart_page_size; alloc_buffer.preferred_heap = AMDGPU_GEM_DOMAIN_GTT; r = amdgpu_bo_alloc(ctx->ws->dev, &alloc_buffer, &buf_handle); if (r) { fprintf(stderr, "amdgpu: amdgpu_bo_alloc failed. (%i)\n", r); - amdgpu_cs_ctx_free(ctx->ctx); - FREE(ctx); - return NULL; + goto error_user_fence_alloc; } r = amdgpu_bo_cpu_map(buf_handle, (void**)&ctx->user_fence_cpu_address_base); if (r) { fprintf(stderr, "amdgpu: amdgpu_bo_cpu_map failed. (%i)\n", r); - amdgpu_bo_free(buf_handle); - amdgpu_cs_ctx_free(ctx->ctx); - FREE(ctx); - return NULL; + goto error_user_fence_map; } memset(ctx->user_fence_cpu_address_base, 0, alloc_buffer.alloc_size); ctx->user_fence_bo = buf_handle; return (struct radeon_winsys_ctx*)ctx; + +error_user_fence_map: + amdgpu_bo_free(buf_handle); +error_user_fence_alloc: + amdgpu_cs_ctx_free(ctx->ctx); +error_create: + FREE(ctx); + return NULL; } static void amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx) @@ -198,53 +248,366 @@ amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx) /* COMMAND SUBMISSION */ -static bool amdgpu_get_new_ib(struct amdgpu_cs *cs) +static bool amdgpu_cs_has_user_fence(struct amdgpu_cs_context *cs) +{ + return cs->request.ip_type != AMDGPU_HW_IP_UVD && + cs->request.ip_type != AMDGPU_HW_IP_VCE; +} + +static bool amdgpu_cs_has_chaining(struct amdgpu_cs *cs) { - /* The maximum size is 4MB - 1B, which is unaligned. - * Use aligned size 4MB - 16B. */ - const unsigned max_ib_size = (1024 * 1024 - 16) * 4; - const unsigned min_ib_size = 24 * 1024 * 4; + return cs->ctx->ws->info.chip_class >= CIK && + cs->ring_type == RING_GFX; +} - cs->base.cdw = 0; - cs->base.buf = NULL; +static unsigned amdgpu_cs_epilog_dws(enum ring_type ring_type) +{ + if (ring_type == RING_GFX) + return 4; /* for chaining */ - /* Allocate a new buffer for IBs if the current buffer is all used. 
*/ - if (!cs->big_ib_buffer || - cs->used_ib_space + min_ib_size > cs->big_ib_buffer->size) { - struct radeon_winsys *ws = &cs->ctx->ws->base; - struct radeon_winsys_cs_handle *winsys_bo; - - pb_reference(&cs->big_ib_buffer, NULL); - cs->big_ib_winsys_buffer = NULL; - cs->ib_mapped = NULL; - cs->used_ib_space = 0; - - cs->big_ib_buffer = ws->buffer_create(ws, max_ib_size, - 4096, true, - RADEON_DOMAIN_GTT, - RADEON_FLAG_CPU_ACCESS); - if (!cs->big_ib_buffer) - return false; + return 0; +} - winsys_bo = ws->buffer_get_cs_handle(cs->big_ib_buffer); +int amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo) +{ + unsigned hash = bo->unique_id & (ARRAY_SIZE(cs->buffer_indices_hashlist)-1); + int i = cs->buffer_indices_hashlist[hash]; + struct amdgpu_cs_buffer *buffers; + int num_buffers; - cs->ib_mapped = ws->buffer_map(winsys_bo, NULL, PIPE_TRANSFER_WRITE); - if (!cs->ib_mapped) { - pb_reference(&cs->big_ib_buffer, NULL); - return false; + if (bo->bo) { + buffers = cs->real_buffers; + num_buffers = cs->num_real_buffers; + } else { + buffers = cs->slab_buffers; + num_buffers = cs->num_slab_buffers; + } + + /* not found or found */ + if (i < 0 || (i < num_buffers && buffers[i].bo == bo)) + return i; + + /* Hash collision, look for the BO in the list of buffers linearly. */ + for (i = num_buffers - 1; i >= 0; i--) { + if (buffers[i].bo == bo) { + /* Put this buffer in the hash list. + * This will prevent additional hash collisions if there are + * several consecutive lookup_buffer calls for the same buffer. + * + * Example: Assuming buffers A,B,C collide in the hash list, + * the following sequence of buffers: + * AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC + * will collide here: ^ and here: ^, + * meaning that we should get very few collisions in the end. */ + cs->buffer_indices_hashlist[hash] = i; + return i; + } + } + return -1; +} + +static int +amdgpu_lookup_or_add_real_buffer(struct amdgpu_cs *acs, struct amdgpu_winsys_bo *bo) +{ + struct amdgpu_cs_context *cs = acs->csc; + struct amdgpu_cs_buffer *buffer; + unsigned hash; + int idx = amdgpu_lookup_buffer(cs, bo); + + if (idx >= 0) + return idx; + + /* New buffer, check if the backing array is large enough. 
*/ + if (cs->num_real_buffers >= cs->max_real_buffers) { + unsigned new_max = + MAX2(cs->max_real_buffers + 16, (unsigned)(cs->max_real_buffers * 1.3)); + struct amdgpu_cs_buffer *new_buffers; + amdgpu_bo_handle *new_handles; + uint8_t *new_flags; + + new_buffers = MALLOC(new_max * sizeof(*new_buffers)); + new_handles = MALLOC(new_max * sizeof(*new_handles)); + new_flags = MALLOC(new_max * sizeof(*new_flags)); + + if (!new_buffers || !new_handles || !new_flags) { + fprintf(stderr, "amdgpu_lookup_or_add_buffer: allocation failed\n"); + FREE(new_buffers); + FREE(new_handles); + FREE(new_flags); + return -1; + } + + memcpy(new_buffers, cs->real_buffers, cs->num_real_buffers * sizeof(*new_buffers)); + memcpy(new_handles, cs->handles, cs->num_real_buffers * sizeof(*new_handles)); + memcpy(new_flags, cs->flags, cs->num_real_buffers * sizeof(*new_flags)); + + FREE(cs->real_buffers); + FREE(cs->handles); + FREE(cs->flags); + + cs->max_real_buffers = new_max; + cs->real_buffers = new_buffers; + cs->handles = new_handles; + cs->flags = new_flags; + } + + idx = cs->num_real_buffers; + buffer = &cs->real_buffers[idx]; + + memset(buffer, 0, sizeof(*buffer)); + amdgpu_winsys_bo_reference(&buffer->bo, bo); + cs->handles[idx] = bo->bo; + cs->flags[idx] = 0; + p_atomic_inc(&bo->num_cs_references); + cs->num_real_buffers++; + + hash = bo->unique_id & (ARRAY_SIZE(cs->buffer_indices_hashlist)-1); + cs->buffer_indices_hashlist[hash] = idx; + + if (bo->initial_domain & RADEON_DOMAIN_VRAM) + acs->main.base.used_vram += bo->base.size; + else if (bo->initial_domain & RADEON_DOMAIN_GTT) + acs->main.base.used_gart += bo->base.size; + + return idx; +} + +static int amdgpu_lookup_or_add_slab_buffer(struct amdgpu_cs *acs, + struct amdgpu_winsys_bo *bo) +{ + struct amdgpu_cs_context *cs = acs->csc; + struct amdgpu_cs_buffer *buffer; + unsigned hash; + int idx = amdgpu_lookup_buffer(cs, bo); + int real_idx; + + if (idx >= 0) + return idx; + + real_idx = amdgpu_lookup_or_add_real_buffer(acs, bo->u.slab.real); + if (real_idx < 0) + return -1; + + /* New buffer, check if the backing array is large enough. */ + if (cs->num_slab_buffers >= cs->max_slab_buffers) { + unsigned new_max = + MAX2(cs->max_slab_buffers + 16, (unsigned)(cs->max_slab_buffers * 1.3)); + struct amdgpu_cs_buffer *new_buffers; + + new_buffers = REALLOC(cs->slab_buffers, + cs->max_slab_buffers * sizeof(*new_buffers), + new_max * sizeof(*new_buffers)); + if (!new_buffers) { + fprintf(stderr, "amdgpu_lookup_or_add_slab_buffer: allocation failed\n"); + return -1; } - cs->big_ib_winsys_buffer = (struct amdgpu_winsys_bo*)winsys_bo; + cs->max_slab_buffers = new_max; + cs->slab_buffers = new_buffers; } - cs->ib.ib_mc_address = cs->big_ib_winsys_buffer->va + cs->used_ib_space; - cs->base.buf = (uint32_t*)(cs->ib_mapped + cs->used_ib_space); - cs->base.max_dw = (cs->big_ib_buffer->size - cs->used_ib_space) / 4; + idx = cs->num_slab_buffers; + buffer = &cs->slab_buffers[idx]; + + memset(buffer, 0, sizeof(*buffer)); + amdgpu_winsys_bo_reference(&buffer->bo, bo); + buffer->u.slab.real_idx = real_idx; + p_atomic_inc(&bo->num_cs_references); + cs->num_slab_buffers++; + + hash = bo->unique_id & (ARRAY_SIZE(cs->buffer_indices_hashlist)-1); + cs->buffer_indices_hashlist[hash] = idx; + + return idx; +} + +static unsigned amdgpu_cs_add_buffer(struct radeon_winsys_cs *rcs, + struct pb_buffer *buf, + enum radeon_bo_usage usage, + enum radeon_bo_domain domains, + enum radeon_bo_priority priority) +{ + /* Don't use the "domains" parameter. 
Amdgpu doesn't support changing + * the buffer placement during command submission. + */ + struct amdgpu_cs *acs = amdgpu_cs(rcs); + struct amdgpu_cs_context *cs = acs->csc; + struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf; + struct amdgpu_cs_buffer *buffer; + int index; + + if (!bo->bo) { + index = amdgpu_lookup_or_add_slab_buffer(acs, bo); + if (index < 0) + return 0; + + buffer = &cs->slab_buffers[index]; + buffer->usage |= usage; + + usage &= ~RADEON_USAGE_SYNCHRONIZED; + index = buffer->u.slab.real_idx; + } else { + index = amdgpu_lookup_or_add_real_buffer(acs, bo); + if (index < 0) + return 0; + } + + buffer = &cs->real_buffers[index]; + buffer->u.real.priority_usage |= 1llu << priority; + buffer->usage |= usage; + cs->flags[index] = MAX2(cs->flags[index], priority / 4); + return index; +} + +static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws, struct amdgpu_ib *ib) +{ + struct pb_buffer *pb; + uint8_t *mapped; + unsigned buffer_size; + + /* Always create a buffer that is at least as large as the maximum seen IB + * size, aligned to a power of two (and multiplied by 4 to reduce internal + * fragmentation if chaining is not available). Limit to 512k dwords, which + * is the largest power of two that fits into the size field of the + * INDIRECT_BUFFER packet. + */ + if (amdgpu_cs_has_chaining(amdgpu_cs_from_ib(ib))) + buffer_size = 4 *util_next_power_of_two(ib->max_ib_size); + else + buffer_size = 4 *util_next_power_of_two(4 * ib->max_ib_size); + + buffer_size = MIN2(buffer_size, 4 * 512 * 1024); + + switch (ib->ib_type) { + case IB_CONST_PREAMBLE: + buffer_size = MAX2(buffer_size, 4 * 1024); + break; + case IB_CONST: + buffer_size = MAX2(buffer_size, 16 * 1024 * 4); + break; + case IB_MAIN: + buffer_size = MAX2(buffer_size, 8 * 1024 * 4); + break; + default: + unreachable("unhandled IB type"); + } + + pb = ws->base.buffer_create(&ws->base, buffer_size, + ws->info.gart_page_size, + RADEON_DOMAIN_GTT, + RADEON_FLAG_CPU_ACCESS); + if (!pb) + return false; + + mapped = ws->base.buffer_map(pb, NULL, PIPE_TRANSFER_WRITE); + if (!mapped) { + pb_reference(&pb, NULL); + return false; + } + + pb_reference(&ib->big_ib_buffer, pb); + pb_reference(&pb, NULL); + + ib->ib_mapped = mapped; + ib->used_ib_space = 0; + return true; } -static boolean amdgpu_init_cs_context(struct amdgpu_cs *cs, - enum ring_type ring_type) +static unsigned amdgpu_ib_max_submit_dwords(enum ib_type ib_type) +{ + switch (ib_type) { + case IB_MAIN: + /* Smaller submits means the GPU gets busy sooner and there is less + * waiting for buffers and fences. Proof: + * http://www.phoronix.com/scan.php?page=article&item=mesa-111-si&num=1 + */ + return 20 * 1024; + case IB_CONST_PREAMBLE: + case IB_CONST: + /* There isn't really any reason to limit CE IB size beyond the natural + * limit implied by the main IB, except perhaps GTT size. Just return + * an extremely large value that we never get anywhere close to. + */ + return 16 * 1024 * 1024; + default: + unreachable("bad ib_type"); + } +} + +static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_cs *cs, + enum ib_type ib_type) +{ + struct amdgpu_winsys *aws = (struct amdgpu_winsys*)ws; + /* Small IBs are better than big IBs, because the GPU goes idle quicker + * and there is less waiting for buffers and fences. 
Proof: + * http://www.phoronix.com/scan.php?page=article&item=mesa-111-si&num=1 + */ + struct amdgpu_ib *ib = NULL; + struct amdgpu_cs_ib_info *info = &cs->csc->ib[ib_type]; + unsigned ib_size = 0; + + switch (ib_type) { + case IB_CONST_PREAMBLE: + ib = &cs->const_preamble_ib; + ib_size = 256 * 4; + break; + case IB_CONST: + ib = &cs->const_ib; + ib_size = 8 * 1024 * 4; + break; + case IB_MAIN: + ib = &cs->main; + ib_size = 4 * 1024 * 4; + break; + default: + unreachable("unhandled IB type"); + } + + if (!amdgpu_cs_has_chaining(cs)) { + ib_size = MAX2(ib_size, + 4 * MIN2(util_next_power_of_two(ib->max_ib_size), + amdgpu_ib_max_submit_dwords(ib_type))); + } + + ib->max_ib_size = ib->max_ib_size - ib->max_ib_size / 32; + + ib->base.prev_dw = 0; + ib->base.num_prev = 0; + ib->base.current.cdw = 0; + ib->base.current.buf = NULL; + + /* Allocate a new buffer for IBs if the current buffer is all used. */ + if (!ib->big_ib_buffer || + ib->used_ib_space + ib_size > ib->big_ib_buffer->size) { + if (!amdgpu_ib_new_buffer(aws, ib)) + return false; + } + + info->ib_mc_address = amdgpu_winsys_bo(ib->big_ib_buffer)->va + + ib->used_ib_space; + info->size = 0; + ib->ptr_ib_size = &info->size; + + amdgpu_cs_add_buffer(&cs->main.base, ib->big_ib_buffer, + RADEON_USAGE_READ, 0, RADEON_PRIO_IB1); + + ib->base.current.buf = (uint32_t*)(ib->ib_mapped + ib->used_ib_space); + + ib_size = ib->big_ib_buffer->size - ib->used_ib_space; + ib->base.current.max_dw = ib_size / 4 - amdgpu_cs_epilog_dws(cs->ring_type); + return true; +} + +static void amdgpu_ib_finalize(struct amdgpu_ib *ib) +{ + *ib->ptr_ib_size |= ib->base.current.cdw; + ib->used_ib_space += ib->base.current.cdw * 4; + ib->max_ib_size = MAX2(ib->max_ib_size, ib->base.prev_dw + ib->base.current.cdw); +} + +static bool amdgpu_init_cs_context(struct amdgpu_cs_context *cs, + enum ring_type ring_type) { int i; @@ -271,61 +634,49 @@ static boolean amdgpu_init_cs_context(struct amdgpu_cs *cs, break; } - cs->request.number_of_ibs = 1; - cs->request.ibs = &cs->ib; - - cs->max_num_buffers = 512; - cs->buffers = (struct amdgpu_cs_buffer*) - CALLOC(1, cs->max_num_buffers * sizeof(struct amdgpu_cs_buffer)); - if (!cs->buffers) { - return FALSE; + for (i = 0; i < ARRAY_SIZE(cs->buffer_indices_hashlist); i++) { + cs->buffer_indices_hashlist[i] = -1; } - cs->handles = CALLOC(1, cs->max_num_buffers * sizeof(amdgpu_bo_handle)); - if (!cs->handles) { - FREE(cs->buffers); - return FALSE; - } + cs->request.number_of_ibs = 1; + cs->request.ibs = &cs->ib[IB_MAIN]; - cs->flags = CALLOC(1, cs->max_num_buffers); - if (!cs->flags) { - FREE(cs->handles); - FREE(cs->buffers); - return FALSE; - } + cs->ib[IB_CONST].flags = AMDGPU_IB_FLAG_CE; + cs->ib[IB_CONST_PREAMBLE].flags = AMDGPU_IB_FLAG_CE | + AMDGPU_IB_FLAG_PREAMBLE; - for (i = 0; i < Elements(cs->buffer_indices_hashlist); i++) { - cs->buffer_indices_hashlist[i] = -1; - } - return TRUE; + return true; } -static void amdgpu_cs_context_cleanup(struct amdgpu_cs *cs) +static void amdgpu_cs_context_cleanup(struct amdgpu_cs_context *cs) { unsigned i; - for (i = 0; i < cs->num_buffers; i++) { - p_atomic_dec(&cs->buffers[i].bo->num_cs_references); - amdgpu_winsys_bo_reference(&cs->buffers[i].bo, NULL); - cs->handles[i] = NULL; - cs->flags[i] = 0; + for (i = 0; i < cs->num_real_buffers; i++) { + p_atomic_dec(&cs->real_buffers[i].bo->num_cs_references); + amdgpu_winsys_bo_reference(&cs->real_buffers[i].bo, NULL); + } + for (i = 0; i < cs->num_slab_buffers; i++) { + p_atomic_dec(&cs->slab_buffers[i].bo->num_cs_references); + 
amdgpu_winsys_bo_reference(&cs->slab_buffers[i].bo, NULL); } - cs->num_buffers = 0; - cs->used_gart = 0; - cs->used_vram = 0; + cs->num_real_buffers = 0; + cs->num_slab_buffers = 0; + amdgpu_fence_reference(&cs->fence, NULL); - for (i = 0; i < Elements(cs->buffer_indices_hashlist); i++) { + for (i = 0; i < ARRAY_SIZE(cs->buffer_indices_hashlist); i++) { cs->buffer_indices_hashlist[i] = -1; } } -static void amdgpu_destroy_cs_context(struct amdgpu_cs *cs) +static void amdgpu_destroy_cs_context(struct amdgpu_cs_context *cs) { amdgpu_cs_context_cleanup(cs); FREE(cs->flags); - FREE(cs->buffers); + FREE(cs->real_buffers); FREE(cs->handles); + FREE(cs->slab_buffers); FREE(cs->request.dependencies); } @@ -335,8 +686,7 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx, enum ring_type ring_type, void (*flush)(void *ctx, unsigned flags, struct pipe_fence_handle **fence), - void *flush_ctx, - struct radeon_winsys_cs_handle *trace_buf) + void *flush_ctx) { struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx; struct amdgpu_cs *cs; @@ -346,338 +696,540 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx, return NULL; } + util_queue_fence_init(&cs->flush_completed); + cs->ctx = ctx; cs->flush_cs = flush; cs->flush_data = flush_ctx; - cs->base.ring_type = ring_type; + cs->ring_type = ring_type; + + cs->main.ib_type = IB_MAIN; + cs->const_ib.ib_type = IB_CONST; + cs->const_preamble_ib.ib_type = IB_CONST_PREAMBLE; - if (!amdgpu_init_cs_context(cs, ring_type)) { + if (!amdgpu_init_cs_context(&cs->csc1, ring_type)) { FREE(cs); return NULL; } - if (!amdgpu_get_new_ib(cs)) { - amdgpu_destroy_cs_context(cs); + if (!amdgpu_init_cs_context(&cs->csc2, ring_type)) { + amdgpu_destroy_cs_context(&cs->csc1); + FREE(cs); + return NULL; + } + + /* Set the first submission context as current. */ + cs->csc = &cs->csc1; + cs->cst = &cs->csc2; + + if (!amdgpu_get_new_ib(&ctx->ws->base, cs, IB_MAIN)) { + amdgpu_destroy_cs_context(&cs->csc2); + amdgpu_destroy_cs_context(&cs->csc1); FREE(cs); return NULL; } p_atomic_inc(&ctx->ws->num_cs); - return &cs->base; + return &cs->main.base; } -#define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value) +static struct radeon_winsys_cs * +amdgpu_cs_add_const_ib(struct radeon_winsys_cs *rcs) +{ + struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs; + struct amdgpu_winsys *ws = cs->ctx->ws; + + /* only one const IB can be added */ + if (cs->ring_type != RING_GFX || cs->const_ib.ib_mapped) + return NULL; + + if (!amdgpu_get_new_ib(&ws->base, cs, IB_CONST)) + return NULL; + + cs->csc->request.number_of_ibs = 2; + cs->csc->request.ibs = &cs->csc->ib[IB_CONST]; + + cs->cst->request.number_of_ibs = 2; + cs->cst->request.ibs = &cs->cst->ib[IB_CONST]; -int amdgpu_get_reloc(struct amdgpu_cs *cs, struct amdgpu_winsys_bo *bo) + return &cs->const_ib.base; +} + +static struct radeon_winsys_cs * +amdgpu_cs_add_const_preamble_ib(struct radeon_winsys_cs *rcs) { - unsigned hash = bo->unique_id & (Elements(cs->buffer_indices_hashlist)-1); - int i = cs->buffer_indices_hashlist[hash]; + struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs; + struct amdgpu_winsys *ws = cs->ctx->ws; - /* not found or found */ - if (i == -1 || cs->buffers[i].bo == bo) - return i; + /* only one const preamble IB can be added and only when the const IB has + * also been mapped */ + if (cs->ring_type != RING_GFX || !cs->const_ib.ib_mapped || + cs->const_preamble_ib.ib_mapped) + return NULL; - /* Hash collision, look for the BO in the list of relocs linearly. 
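[Editorial aside] The removed amdgpu_get_reloc() and its replacement amdgpu_lookup_buffer() both use the lookup scheme the comment above describes: a power-of-two table of hint indices, with a backwards linear scan on a miss that re-seeds the hint. A self-contained sketch (array sizes and names are illustrative only):

    #include <string.h>

    #define HASHLIST_SIZE 4096 /* power of two, as in the new header */

    struct bo { unsigned unique_id; };

    static struct bo *buffers[64];
    static int num_buffers;
    static int hashlist[HASHLIST_SIZE];

    static void init_hashlist(void)
    {
       memset(hashlist, 0xff, sizeof(hashlist)); /* every entry becomes -1 */
    }

    static int lookup_buffer(struct bo *b)
    {
       unsigned hash = b->unique_id & (HASHLIST_SIZE - 1);
       int i = hashlist[hash];

       /* The entry is only a hint: -1 means "not seen", a match confirms. */
       if (i == -1 || buffers[i] == b)
          return i;

       /* Collision: scan backwards, then re-seed the hint so consecutive
        * lookups of the same BO stay O(1), as the original comment notes. */
       for (i = num_buffers - 1; i >= 0; i--) {
          if (buffers[i] == b) {
             hashlist[hash] = i;
             return i;
          }
       }
       return -1;
    }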
*/ - for (i = cs->num_buffers - 1; i >= 0; i--) { - if (cs->buffers[i].bo == bo) { - /* Put this reloc in the hash list. - * This will prevent additional hash collisions if there are - * several consecutive get_reloc calls for the same buffer. - * - * Example: Assuming buffers A,B,C collide in the hash list, - * the following sequence of relocs: - * AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC - * will collide here: ^ and here: ^, - * meaning that we should get very few collisions in the end. */ - cs->buffer_indices_hashlist[hash] = i; - return i; - } - } - return -1; + if (!amdgpu_get_new_ib(&ws->base, cs, IB_CONST_PREAMBLE)) + return NULL; + + cs->csc->request.number_of_ibs = 3; + cs->csc->request.ibs = &cs->csc->ib[IB_CONST_PREAMBLE]; + + cs->cst->request.number_of_ibs = 3; + cs->cst->request.ibs = &cs->cst->ib[IB_CONST_PREAMBLE]; + + return &cs->const_preamble_ib.base; } -static unsigned amdgpu_add_reloc(struct amdgpu_cs *cs, - struct amdgpu_winsys_bo *bo, - enum radeon_bo_usage usage, - enum radeon_bo_domain domains, - unsigned priority, - enum radeon_bo_domain *added_domains) +static bool amdgpu_cs_validate(struct radeon_winsys_cs *rcs) { - struct amdgpu_cs_buffer *reloc; - unsigned hash = bo->unique_id & (Elements(cs->buffer_indices_hashlist)-1); - int i = -1; + return true; +} - priority = MIN2(priority, 15); - *added_domains = 0; +static bool amdgpu_cs_check_space(struct radeon_winsys_cs *rcs, unsigned dw) +{ + struct amdgpu_ib *ib = amdgpu_ib(rcs); + struct amdgpu_cs *cs = amdgpu_cs_from_ib(ib); + unsigned requested_size = rcs->prev_dw + rcs->current.cdw + dw; + uint64_t va; + uint32_t *new_ptr_ib_size; - i = amdgpu_get_reloc(cs, bo); + assert(rcs->current.cdw <= rcs->current.max_dw); - if (i >= 0) { - reloc = &cs->buffers[i]; - reloc->usage |= usage; - *added_domains = domains & ~reloc->domains; - reloc->domains |= domains; - cs->flags[i] = MAX2(cs->flags[i], priority); - return i; - } + if (requested_size > amdgpu_ib_max_submit_dwords(ib->ib_type)) + return false; + + ib->max_ib_size = MAX2(ib->max_ib_size, requested_size); - /* New relocation, check if the backing array is large enough. */ - if (cs->num_buffers >= cs->max_num_buffers) { - uint32_t size; - cs->max_num_buffers += 10; + if (rcs->current.max_dw - rcs->current.cdw >= dw) + return true; - size = cs->max_num_buffers * sizeof(struct amdgpu_cs_buffer); - cs->buffers = realloc(cs->buffers, size); + if (!amdgpu_cs_has_chaining(cs)) + return false; - size = cs->max_num_buffers * sizeof(amdgpu_bo_handle); - cs->handles = realloc(cs->handles, size); + /* Allocate a new chunk */ + if (rcs->num_prev >= rcs->max_prev) { + unsigned new_max_prev = MAX2(1, 2 * rcs->max_prev); + struct radeon_winsys_cs_chunk *new_prev; - cs->flags = realloc(cs->flags, cs->max_num_buffers); + new_prev = REALLOC(rcs->prev, + sizeof(*new_prev) * rcs->max_prev, + sizeof(*new_prev) * new_max_prev); + if (!new_prev) + return false; + + rcs->prev = new_prev; + rcs->max_prev = new_max_prev; } - /* Initialize the new relocation. 
*/ - cs->buffers[cs->num_buffers].bo = NULL; - amdgpu_winsys_bo_reference(&cs->buffers[cs->num_buffers].bo, bo); - cs->handles[cs->num_buffers] = bo->bo; - cs->flags[cs->num_buffers] = priority; - p_atomic_inc(&bo->num_cs_references); - reloc = &cs->buffers[cs->num_buffers]; - reloc->bo = bo; - reloc->usage = usage; - reloc->domains = domains; + if (!amdgpu_ib_new_buffer(cs->ctx->ws, ib)) + return false; - cs->buffer_indices_hashlist[hash] = cs->num_buffers; + assert(ib->used_ib_space == 0); + va = amdgpu_winsys_bo(ib->big_ib_buffer)->va; - *added_domains = domains; - return cs->num_buffers++; -} + /* This space was originally reserved. */ + rcs->current.max_dw += 4; + assert(ib->used_ib_space + 4 * rcs->current.max_dw <= ib->big_ib_buffer->size); -static unsigned amdgpu_cs_add_reloc(struct radeon_winsys_cs *rcs, - struct radeon_winsys_cs_handle *buf, - enum radeon_bo_usage usage, - enum radeon_bo_domain domains, - enum radeon_bo_priority priority) -{ - /* Don't use the "domains" parameter. Amdgpu doesn't support changing - * the buffer placement during command submission. - */ - struct amdgpu_cs *cs = amdgpu_cs(rcs); - struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf; - enum radeon_bo_domain added_domains; - unsigned index = amdgpu_add_reloc(cs, bo, usage, bo->initial_domain, - priority, &added_domains); + /* Pad with NOPs and add INDIRECT_BUFFER packet */ + while ((rcs->current.cdw & 7) != 4) + radeon_emit(rcs, 0xffff1000); /* type3 nop packet */ - if (added_domains & RADEON_DOMAIN_GTT) - cs->used_gart += bo->base.size; - if (added_domains & RADEON_DOMAIN_VRAM) - cs->used_vram += bo->base.size; + radeon_emit(rcs, PKT3(ib->ib_type == IB_MAIN ? PKT3_INDIRECT_BUFFER_CIK + : PKT3_INDIRECT_BUFFER_CONST, 2, 0)); + radeon_emit(rcs, va); + radeon_emit(rcs, va >> 32); + new_ptr_ib_size = &rcs->current.buf[rcs->current.cdw]; + radeon_emit(rcs, S_3F2_CHAIN(1) | S_3F2_VALID(1)); - return index; + assert((rcs->current.cdw & 7) == 0); + assert(rcs->current.cdw <= rcs->current.max_dw); + + *ib->ptr_ib_size |= rcs->current.cdw; + ib->ptr_ib_size = new_ptr_ib_size; + + /* Hook up the new chunk */ + rcs->prev[rcs->num_prev].buf = rcs->current.buf; + rcs->prev[rcs->num_prev].cdw = rcs->current.cdw; + rcs->prev[rcs->num_prev].max_dw = rcs->current.cdw; /* no modifications */ + rcs->num_prev++; + + ib->base.prev_dw += ib->base.current.cdw; + ib->base.current.cdw = 0; + + ib->base.current.buf = (uint32_t*)(ib->ib_mapped + ib->used_ib_space); + ib->base.current.max_dw = ib->big_ib_buffer->size / 4 - amdgpu_cs_epilog_dws(cs->ring_type); + + amdgpu_cs_add_buffer(&cs->main.base, ib->big_ib_buffer, + RADEON_USAGE_READ, 0, RADEON_PRIO_IB1); + + return true; } -static int amdgpu_cs_get_reloc(struct radeon_winsys_cs *rcs, - struct radeon_winsys_cs_handle *buf) +static unsigned amdgpu_cs_get_buffer_list(struct radeon_winsys_cs *rcs, + struct radeon_bo_list_item *list) { - struct amdgpu_cs *cs = amdgpu_cs(rcs); + struct amdgpu_cs_context *cs = amdgpu_cs(rcs)->csc; + int i; + + if (list) { + for (i = 0; i < cs->num_real_buffers; i++) { + list[i].bo_size = cs->real_buffers[i].bo->base.size; + list[i].vm_address = cs->real_buffers[i].bo->va; + list[i].priority_usage = cs->real_buffers[i].u.real.priority_usage; + } + } + return cs->num_real_buffers; +} + +DEBUG_GET_ONCE_BOOL_OPTION(all_bos, "RADEON_ALL_BOS", false) + +static void amdgpu_add_fence_dependency(struct amdgpu_cs *acs, + struct amdgpu_cs_buffer *buffer) +{ + struct amdgpu_cs_context *cs = acs->csc; + struct amdgpu_winsys_bo *bo = buffer->bo; + struct 
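[Editorial aside] The chaining path in amdgpu_cs_check_space() above hinges on deferred patching: the size dword of each INDIRECT_BUFFER packet is remembered through ptr_ib_size and filled in only once the next chunk's length is known. A minimal model of that mechanism with invented names; 0xc0023f00 encodes PKT3 with opcode INDIRECT_BUFFER_CIK and a count field of 2, i.e. three payload dwords:

    #include <stdint.h>

    struct chunk {
       uint32_t buf[512];     /* bounds checks omitted for brevity */
       unsigned cdw;          /* dwords written so far */
       uint32_t *ptr_ib_size; /* placeholder patched when the chunk ends */
    };

    /* Emit a chain packet pointing at the IB located at next_va. */
    static void emit_chain(struct chunk *c, uint64_t next_va)
    {
       while ((c->cdw & 7) != 4)          /* pad so the packet ends on an */
          c->buf[c->cdw++] = 0xffff1000;  /* 8-dword boundary (type3 NOP) */

       c->buf[c->cdw++] = 0xc0023f00;     /* PKT3(INDIRECT_BUFFER_CIK, 2) */
       c->buf[c->cdw++] = (uint32_t)next_va;
       c->buf[c->cdw++] = (uint32_t)(next_va >> 32);
       c->ptr_ib_size = &c->buf[c->cdw];
       c->buf[c->cdw++] = 0;              /* size/flags, patched later */
    }

    /* Once the next chunk's length is known, complete the packet. */
    static void patch_chain(struct chunk *c, uint32_t next_chunk_dwords)
    {
       *c->ptr_ib_size |= next_chunk_dwords;
    }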
amdgpu_cs_fence *dep; + unsigned new_num_fences = 0; + + for (unsigned j = 0; j < bo->num_fences; ++j) { + struct amdgpu_fence *bo_fence = (void *)bo->fences[j]; + unsigned idx; + + if (bo_fence->ctx == acs->ctx && + bo_fence->fence.ip_type == cs->request.ip_type && + bo_fence->fence.ip_instance == cs->request.ip_instance && + bo_fence->fence.ring == cs->request.ring) + continue; + + if (amdgpu_fence_wait((void *)bo_fence, 0, false)) + continue; + + amdgpu_fence_reference(&bo->fences[new_num_fences], bo->fences[j]); + new_num_fences++; + + if (!(buffer->usage & RADEON_USAGE_SYNCHRONIZED)) + continue; + + if (bo_fence->submission_in_progress) + os_wait_until_zero(&bo_fence->submission_in_progress, + PIPE_TIMEOUT_INFINITE); + + idx = cs->request.number_of_dependencies++; + if (idx >= cs->max_dependencies) { + unsigned size; + + cs->max_dependencies = idx + 8; + size = cs->max_dependencies * sizeof(struct amdgpu_cs_fence); + cs->request.dependencies = realloc(cs->request.dependencies, size); + } + + dep = &cs->request.dependencies[idx]; + memcpy(dep, &bo_fence->fence, sizeof(*dep)); + } + + for (unsigned j = new_num_fences; j < bo->num_fences; ++j) + amdgpu_fence_reference(&bo->fences[j], NULL); - return amdgpu_get_reloc(cs, (struct amdgpu_winsys_bo*)buf); + bo->num_fences = new_num_fences; } -static boolean amdgpu_cs_validate(struct radeon_winsys_cs *rcs) +/* Since the kernel driver doesn't synchronize execution between different + * rings automatically, we have to add fence dependencies manually. + */ +static void amdgpu_add_fence_dependencies(struct amdgpu_cs *acs) { - return TRUE; + struct amdgpu_cs_context *cs = acs->csc; + int i; + + cs->request.number_of_dependencies = 0; + + for (i = 0; i < cs->num_real_buffers; i++) + amdgpu_add_fence_dependency(acs, &cs->real_buffers[i]); + for (i = 0; i < cs->num_slab_buffers; i++) + amdgpu_add_fence_dependency(acs, &cs->slab_buffers[i]); } -static boolean amdgpu_cs_memory_below_limit(struct radeon_winsys_cs *rcs, uint64_t vram, uint64_t gtt) +static void amdgpu_add_fence(struct amdgpu_winsys_bo *bo, + struct pipe_fence_handle *fence) { - struct amdgpu_cs *cs = amdgpu_cs(rcs); - boolean status = - (cs->used_gart + gtt) < cs->ctx->ws->info.gart_size * 0.7 && - (cs->used_vram + vram) < cs->ctx->ws->info.vram_size * 0.7; + if (bo->num_fences >= bo->max_fences) { + unsigned new_max_fences = MAX2(1, bo->max_fences * 2); + struct pipe_fence_handle **new_fences = + REALLOC(bo->fences, + bo->num_fences * sizeof(*new_fences), + new_max_fences * sizeof(*new_fences)); + if (new_fences) { + bo->fences = new_fences; + bo->max_fences = new_max_fences; + } else { + fprintf(stderr, "amdgpu_add_fence: allocation failure, dropping fence\n"); + if (!bo->num_fences) + return; - return status; + bo->num_fences--; /* prefer to keep a more recent fence if possible */ + amdgpu_fence_reference(&bo->fences[bo->num_fences], NULL); + } + } + + bo->fences[bo->num_fences] = NULL; + amdgpu_fence_reference(&bo->fences[bo->num_fences], fence); + bo->num_fences++; } -static void amdgpu_cs_do_submission(struct amdgpu_cs *cs, - struct pipe_fence_handle **out_fence) +void amdgpu_cs_submit_ib(void *job, int thread_index) { - struct amdgpu_winsys *ws = cs->ctx->ws; - struct pipe_fence_handle *fence; - int i, j, r; + struct amdgpu_cs *acs = (struct amdgpu_cs*)job; + struct amdgpu_winsys *ws = acs->ctx->ws; + struct amdgpu_cs_context *cs = acs->cst; + int i, r; - /* Create a fence. 
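[Editorial aside] amdgpu_add_fence() above grows the per-BO fence array geometrically and, when allocation fails, overwrites the newest stored fence with the incoming one rather than losing the new fence. A reduced sketch of that policy using plain realloc(); the struct names are illustrative:

    #include <stdlib.h>

    struct fence; /* opaque for the sketch */

    struct bo_fences {
       struct fence **fences;
       unsigned num, max;
    };

    static void add_fence(struct bo_fences *b, struct fence *f)
    {
       if (b->num >= b->max) {
          unsigned new_max = b->max ? b->max * 2 : 1;
          struct fence **grown = realloc(b->fences, new_max * sizeof(*grown));

          if (grown) {
             b->fences = grown;
             b->max = new_max;
          } else if (b->num) {
             b->num--; /* overwrite the newest slot with the newer fence */
          } else {
             return;   /* no storage at all; the fence is dropped */
          }
       }
       b->fences[b->num++] = f;
    }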
*/ - fence = amdgpu_fence_create(cs->ctx, - cs->request.ip_type, - cs->request.ip_instance, - cs->request.ring); - if (out_fence) - amdgpu_fence_reference(out_fence, fence); + cs->request.fence_info.handle = NULL; + if (amdgpu_cs_has_user_fence(cs)) { + cs->request.fence_info.handle = acs->ctx->user_fence_bo; + cs->request.fence_info.offset = acs->ring_type; + } - cs->request.number_of_dependencies = 0; + /* Create the buffer list. + * Use a buffer list containing all allocated buffers if requested. + */ + if (debug_get_option_all_bos()) { + struct amdgpu_winsys_bo *bo; + amdgpu_bo_handle *handles; + unsigned num = 0; + + pipe_mutex_lock(ws->global_bo_list_lock); + + handles = malloc(sizeof(handles[0]) * ws->num_buffers); + if (!handles) { + pipe_mutex_unlock(ws->global_bo_list_lock); + amdgpu_cs_context_cleanup(cs); + cs->error_code = -ENOMEM; + return; + } - /* Since the kernel driver doesn't synchronize execution between different - * rings automatically, we have to add fence dependencies manually. */ - pipe_mutex_lock(ws->bo_fence_lock); - for (i = 0; i < cs->num_buffers; i++) { - for (j = 0; j < RING_LAST; j++) { - struct amdgpu_cs_fence *dep; - unsigned idx; - - struct amdgpu_fence *bo_fence = (void *)cs->buffers[i].bo->fence[j]; - if (!bo_fence) - continue; - - if (bo_fence->ctx == cs->ctx && - bo_fence->fence.ip_type == cs->request.ip_type && - bo_fence->fence.ip_instance == cs->request.ip_instance && - bo_fence->fence.ring == cs->request.ring) - continue; - - if (amdgpu_fence_wait((void *)bo_fence, 0, false)) - continue; - - idx = cs->request.number_of_dependencies++; - if (idx >= cs->max_dependencies) { - unsigned size; - - cs->max_dependencies = idx + 8; - size = cs->max_dependencies * sizeof(struct amdgpu_cs_fence); - cs->request.dependencies = realloc(cs->request.dependencies, size); - } - - dep = &cs->request.dependencies[idx]; - memcpy(dep, &bo_fence->fence, sizeof(*dep)); + LIST_FOR_EACH_ENTRY(bo, &ws->global_bo_list, u.real.global_list_item) { + assert(num < ws->num_buffers); + handles[num++] = bo->bo; } + + r = amdgpu_bo_list_create(ws->dev, ws->num_buffers, + handles, NULL, + &cs->request.resources); + free(handles); + pipe_mutex_unlock(ws->global_bo_list_lock); + } else { + r = amdgpu_bo_list_create(ws->dev, cs->num_real_buffers, + cs->handles, cs->flags, + &cs->request.resources); } - cs->request.fence_info.handle = NULL; - if (cs->request.ip_type != AMDGPU_HW_IP_UVD && cs->request.ip_type != AMDGPU_HW_IP_VCE) { - cs->request.fence_info.handle = cs->ctx->user_fence_bo; - cs->request.fence_info.offset = cs->base.ring_type; + if (r) { + fprintf(stderr, "amdgpu: buffer list creation failed (%d)\n", r); + cs->request.resources = NULL; + amdgpu_fence_signalled(cs->fence); + cs->error_code = r; + goto cleanup; } - r = amdgpu_cs_submit(cs->ctx->ctx, 0, &cs->request, 1); + r = amdgpu_cs_submit(acs->ctx->ctx, 0, &cs->request, 1); + cs->error_code = r; if (r) { if (r == -ENOMEM) fprintf(stderr, "amdgpu: Not enough memory for command submission.\n"); else fprintf(stderr, "amdgpu: The CS has been rejected, " - "see dmesg for more information.\n"); + "see dmesg for more information (%i).\n", r); - amdgpu_fence_signalled(fence); + amdgpu_fence_signalled(cs->fence); } else { /* Success. 
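[Editorial aside] The RADEON_ALL_BOS path above is gated by DEBUG_GET_ONCE_BOOL_OPTION, which reads an environment variable once and caches the answer. A rough equivalent of that helper, as a sketch only (the real one lives in Mesa's util code; this version is not thread-safe):

    #include <stdlib.h>
    #include <string.h>

    static int debug_all_bos(void)
    {
       static int cached = -1; /* -1 = not read yet */

       if (cached < 0) {
          const char *s = getenv("RADEON_ALL_BOS");
          cached = s && (!strcmp(s, "1") || !strcmp(s, "true"));
       }
       return cached;
    }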
*/ uint64_t *user_fence = NULL; - if (cs->request.ip_type != AMDGPU_HW_IP_UVD && cs->request.ip_type != AMDGPU_HW_IP_VCE) - user_fence = cs->ctx->user_fence_cpu_address_base + + if (amdgpu_cs_has_user_fence(cs)) + user_fence = acs->ctx->user_fence_cpu_address_base + cs->request.fence_info.offset; - amdgpu_fence_submitted(fence, &cs->request, user_fence); - - for (i = 0; i < cs->num_buffers; i++) - amdgpu_fence_reference(&cs->buffers[i].bo->fence[cs->base.ring_type], - fence); + amdgpu_fence_submitted(cs->fence, &cs->request, user_fence); } - pipe_mutex_unlock(ws->bo_fence_lock); - amdgpu_fence_reference(&fence, NULL); + + /* Cleanup. */ + if (cs->request.resources) + amdgpu_bo_list_destroy(cs->request.resources); + +cleanup: + for (i = 0; i < cs->num_real_buffers; i++) + p_atomic_dec(&cs->real_buffers[i].bo->num_active_ioctls); + for (i = 0; i < cs->num_slab_buffers; i++) + p_atomic_dec(&cs->slab_buffers[i].bo->num_active_ioctls); + + amdgpu_cs_context_cleanup(cs); } -static void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs) +/* Make sure the previous submission is completed. */ +void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs) { - /* no-op */ -} + struct amdgpu_cs *cs = amdgpu_cs(rcs); + struct amdgpu_winsys *ws = cs->ctx->ws; -DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", FALSE) + /* Wait for any pending ioctl of this CS to complete. */ + if (util_queue_is_initialized(&ws->cs_queue)) + util_queue_job_wait(&cs->flush_completed); +} -static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs, - unsigned flags, - struct pipe_fence_handle **fence, - uint32_t cs_trace_id) +static int amdgpu_cs_flush(struct radeon_winsys_cs *rcs, + unsigned flags, + struct pipe_fence_handle **fence) { struct amdgpu_cs *cs = amdgpu_cs(rcs); struct amdgpu_winsys *ws = cs->ctx->ws; + int error_code = 0; - switch (cs->base.ring_type) { + rcs->current.max_dw += amdgpu_cs_epilog_dws(cs->ring_type); + + switch (cs->ring_type) { case RING_DMA: /* pad DMA ring to 8 DWs */ if (ws->info.chip_class <= SI) { - while (rcs->cdw & 7) - OUT_CS(&cs->base, 0xf0000000); /* NOP packet */ + while (rcs->current.cdw & 7) + radeon_emit(rcs, 0xf0000000); /* NOP packet */ } else { - while (rcs->cdw & 7) - OUT_CS(&cs->base, 0x00000000); /* NOP packet */ + while (rcs->current.cdw & 7) + radeon_emit(rcs, 0x00000000); /* NOP packet */ } break; case RING_GFX: - /* pad DMA ring to 8 DWs to meet CP fetch alignment requirements - * r6xx, requires at least 4 dw alignment to avoid a hw bug. - */ - if (ws->info.chip_class <= SI) { - while (rcs->cdw & 7) - OUT_CS(&cs->base, 0x80000000); /* type2 nop packet */ + /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements */ + if (ws->info.gfx_ib_pad_with_type2) { + while (rcs->current.cdw & 7) + radeon_emit(rcs, 0x80000000); /* type2 nop packet */ } else { - while (rcs->cdw & 7) - OUT_CS(&cs->base, 0xffff1000); /* type3 nop packet */ + while (rcs->current.cdw & 7) + radeon_emit(rcs, 0xffff1000); /* type3 nop packet */ } + + /* Also pad the const IB. 
*/ + if (cs->const_ib.ib_mapped) + while (!cs->const_ib.base.current.cdw || (cs->const_ib.base.current.cdw & 7)) + radeon_emit(&cs->const_ib.base, 0xffff1000); /* type3 nop packet */ + + if (cs->const_preamble_ib.ib_mapped) + while (!cs->const_preamble_ib.base.current.cdw || (cs->const_preamble_ib.base.current.cdw & 7)) + radeon_emit(&cs->const_preamble_ib.base, 0xffff1000); break; case RING_UVD: - while (rcs->cdw & 15) - OUT_CS(&cs->base, 0x80000000); /* type2 nop packet */ + while (rcs->current.cdw & 15) + radeon_emit(rcs, 0x80000000); /* type2 nop packet */ break; default: break; } - if (rcs->cdw > rcs->max_dw) { + if (rcs->current.cdw > rcs->current.max_dw) { fprintf(stderr, "amdgpu: command stream overflowed\n"); } - amdgpu_cs_add_reloc(rcs, (void*)cs->big_ib_winsys_buffer, - RADEON_USAGE_READ, 0, RADEON_PRIO_MIN); - /* If the CS is not empty or overflowed.... */ - if (cs->base.cdw && cs->base.cdw <= cs->base.max_dw && !debug_get_option_noop()) { - int r; - - r = amdgpu_bo_list_create(ws->dev, cs->num_buffers, - cs->handles, cs->flags, - &cs->request.resources); + if (radeon_emitted(&cs->main.base, 0) && + cs->main.base.current.cdw <= cs->main.base.current.max_dw && + !debug_get_option_noop()) { + struct amdgpu_cs_context *cur = cs->csc; + unsigned i, num_buffers; + + /* Set IB sizes. */ + amdgpu_ib_finalize(&cs->main); + + if (cs->const_ib.ib_mapped) + amdgpu_ib_finalize(&cs->const_ib); + + if (cs->const_preamble_ib.ib_mapped) + amdgpu_ib_finalize(&cs->const_preamble_ib); + + /* Create a fence. */ + amdgpu_fence_reference(&cur->fence, NULL); + if (cs->next_fence) { + /* just move the reference */ + cur->fence = cs->next_fence; + cs->next_fence = NULL; + } else { + cur->fence = amdgpu_fence_create(cs->ctx, + cur->request.ip_type, + cur->request.ip_instance, + cur->request.ring); + } + if (fence) + amdgpu_fence_reference(fence, cur->fence); + + /* Prepare buffers. */ + pipe_mutex_lock(ws->bo_fence_lock); + amdgpu_add_fence_dependencies(cs); + + num_buffers = cur->num_real_buffers; + for (i = 0; i < num_buffers; i++) { + struct amdgpu_winsys_bo *bo = cur->real_buffers[i].bo; + p_atomic_inc(&bo->num_active_ioctls); + amdgpu_add_fence(bo, cur->fence); + } - if (r) { - fprintf(stderr, "amdgpu: resource list creation failed (%d)\n", r); - cs->request.resources = NULL; - goto cleanup; + num_buffers = cur->num_slab_buffers; + for (i = 0; i < num_buffers; i++) { + struct amdgpu_winsys_bo *bo = cur->slab_buffers[i].bo; + p_atomic_inc(&bo->num_active_ioctls); + amdgpu_add_fence(bo, cur->fence); } + pipe_mutex_unlock(ws->bo_fence_lock); - cs->ib.size = cs->base.cdw; - cs->used_ib_space += cs->base.cdw * 4; + amdgpu_cs_sync_flush(rcs); - amdgpu_cs_do_submission(cs, fence); + /* Swap command streams. "cst" is going to be submitted. */ + cs->csc = cs->cst; + cs->cst = cur; - /* Cleanup. */ - if (cs->request.resources) - amdgpu_bo_list_destroy(cs->request.resources); + /* Submit. 
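[Editorial aside] Before the submit that follows, the flush path swaps csc and cst so that command recording and kernel submission can overlap. The handoff reduces to a pointer swap, sketched here with placeholder types:

    struct cs_context_sketch; /* stands in for amdgpu_cs_context */

    struct cs_sketch {
       struct cs_context_sketch *csc; /* filled by the driver thread */
       struct cs_context_sketch *cst; /* consumed by the worker thread */
    };

    static void swap_contexts(struct cs_sketch *cs)
    {
       struct cs_context_sketch *cur = cs->csc;

       cs->csc = cs->cst; /* start recording into the drained context */
       cs->cst = cur;     /* hand the just-recorded one to the worker */
    }

This is why amdgpu_cs_sync_flush() must run first: the context being handed to the worker may still be owned by a previous async submission.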
*/ + if ((flags & RADEON_FLUSH_ASYNC) && + util_queue_is_initialized(&ws->cs_queue)) { + util_queue_add_job(&ws->cs_queue, cs, &cs->flush_completed, + amdgpu_cs_submit_ib, NULL); + } else { + amdgpu_cs_submit_ib(cs, 0); + error_code = cs->cst->error_code; + } + } else { + amdgpu_cs_context_cleanup(cs->csc); } -cleanup: - amdgpu_cs_context_cleanup(cs); - amdgpu_get_new_ib(cs); + amdgpu_get_new_ib(&ws->base, cs, IB_MAIN); + if (cs->const_ib.ib_mapped) + amdgpu_get_new_ib(&ws->base, cs, IB_CONST); + if (cs->const_preamble_ib.ib_mapped) + amdgpu_get_new_ib(&ws->base, cs, IB_CONST_PREAMBLE); + + cs->main.base.used_gart = 0; + cs->main.base.used_vram = 0; ws->num_cs_flushes++; + return error_code; } static void amdgpu_cs_destroy(struct radeon_winsys_cs *rcs) { struct amdgpu_cs *cs = amdgpu_cs(rcs); - amdgpu_destroy_cs_context(cs); + amdgpu_cs_sync_flush(rcs); + util_queue_fence_destroy(&cs->flush_completed); p_atomic_dec(&cs->ctx->ws->num_cs); - pb_reference(&cs->big_ib_buffer, NULL); + pb_reference(&cs->main.big_ib_buffer, NULL); + FREE(cs->main.base.prev); + pb_reference(&cs->const_ib.big_ib_buffer, NULL); + FREE(cs->const_ib.base.prev); + pb_reference(&cs->const_preamble_ib.big_ib_buffer, NULL); + FREE(cs->const_preamble_ib.base.prev); + amdgpu_destroy_cs_context(&cs->csc1); + amdgpu_destroy_cs_context(&cs->csc2); + amdgpu_fence_reference(&cs->next_fence, NULL); FREE(cs); } -static boolean amdgpu_bo_is_referenced(struct radeon_winsys_cs *rcs, - struct radeon_winsys_cs_handle *_buf, - enum radeon_bo_usage usage) +static bool amdgpu_bo_is_referenced(struct radeon_winsys_cs *rcs, + struct pb_buffer *_buf, + enum radeon_bo_usage usage) { struct amdgpu_cs *cs = amdgpu_cs(rcs); struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)_buf; @@ -691,12 +1243,15 @@ void amdgpu_cs_init_functions(struct amdgpu_winsys *ws) ws->base.ctx_destroy = amdgpu_ctx_destroy; ws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status; ws->base.cs_create = amdgpu_cs_create; + ws->base.cs_add_const_ib = amdgpu_cs_add_const_ib; + ws->base.cs_add_const_preamble_ib = amdgpu_cs_add_const_preamble_ib; ws->base.cs_destroy = amdgpu_cs_destroy; - ws->base.cs_add_reloc = amdgpu_cs_add_reloc; - ws->base.cs_get_reloc = amdgpu_cs_get_reloc; + ws->base.cs_add_buffer = amdgpu_cs_add_buffer; ws->base.cs_validate = amdgpu_cs_validate; - ws->base.cs_memory_below_limit = amdgpu_cs_memory_below_limit; + ws->base.cs_check_space = amdgpu_cs_check_space; + ws->base.cs_get_buffer_list = amdgpu_cs_get_buffer_list; ws->base.cs_flush = amdgpu_cs_flush; + ws->base.cs_get_next_fence = amdgpu_cs_get_next_fence; ws->base.cs_is_buffer_referenced = amdgpu_bo_is_referenced; ws->base.cs_sync_flush = amdgpu_cs_sync_flush; ws->base.fence_wait = amdgpu_fence_wait_rel_timeout; diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h index 12c6b624b..5f181a5da 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h @@ -45,42 +45,84 @@ struct amdgpu_ctx { struct amdgpu_cs_buffer { struct amdgpu_winsys_bo *bo; + union { + struct { + uint64_t priority_usage; + } real; + struct { + uint32_t real_idx; /* index of underlying real BO */ + } slab; + } u; enum radeon_bo_usage usage; - enum radeon_bo_domain domains; }; +enum ib_type { + IB_CONST_PREAMBLE = 0, + IB_CONST = 1, /* the const IB must be first */ + IB_MAIN = 2, + IB_NUM +}; -struct amdgpu_cs { +struct amdgpu_ib { struct radeon_winsys_cs base; - struct amdgpu_ctx *ctx; - - /* 
Flush CS. */ - void (*flush_cs)(void *ctx, unsigned flags, struct pipe_fence_handle **fence); - void *flush_data; /* A buffer out of which new IBs are allocated. */ - struct pb_buffer *big_ib_buffer; /* for holding the reference */ - struct amdgpu_winsys_bo *big_ib_winsys_buffer; - uint8_t *ib_mapped; - unsigned used_ib_space; + struct pb_buffer *big_ib_buffer; + uint8_t *ib_mapped; + unsigned used_ib_space; + unsigned max_ib_size; + uint32_t *ptr_ib_size; + enum ib_type ib_type; +}; - /* amdgpu_cs_submit parameters */ +struct amdgpu_cs_context { struct amdgpu_cs_request request; - struct amdgpu_cs_ib_info ib; + struct amdgpu_cs_ib_info ib[IB_NUM]; - /* Relocs. */ - unsigned max_num_buffers; - unsigned num_buffers; + /* Buffers. */ + unsigned max_real_buffers; + unsigned num_real_buffers; amdgpu_bo_handle *handles; uint8_t *flags; - struct amdgpu_cs_buffer *buffers; + struct amdgpu_cs_buffer *real_buffers; - int buffer_indices_hashlist[512]; + unsigned num_slab_buffers; + unsigned max_slab_buffers; + struct amdgpu_cs_buffer *slab_buffers; - uint64_t used_vram; - uint64_t used_gart; + int buffer_indices_hashlist[4096]; unsigned max_dependencies; + + struct pipe_fence_handle *fence; + + /* the error returned from cs_flush for non-async submissions */ + int error_code; +}; + +struct amdgpu_cs { + struct amdgpu_ib main; /* must be first because this is inherited */ + struct amdgpu_ib const_ib; /* optional constant engine IB */ + struct amdgpu_ib const_preamble_ib; + struct amdgpu_ctx *ctx; + enum ring_type ring_type; + + /* We flip between these two CS. While one is being consumed + * by the kernel in another thread, the other one is being filled + * by the pipe driver. */ + struct amdgpu_cs_context csc1; + struct amdgpu_cs_context csc2; + /* The currently-used CS. */ + struct amdgpu_cs_context *csc; + /* The CS being currently-owned by the other thread. */ + struct amdgpu_cs_context *cst; + + /* Flush CS. */ + void (*flush_cs)(void *ctx, unsigned flags, struct pipe_fence_handle **fence); + void *flush_data; + + struct util_queue_fence flush_completed; + struct pipe_fence_handle *next_fence; }; struct amdgpu_fence { @@ -90,6 +132,9 @@ struct amdgpu_fence { struct amdgpu_cs_fence fence; uint64_t *user_fence_cpu_address; + /* If the fence is unknown due to an IB still being submitted + * in the other thread. 
*/ + volatile int submission_in_progress; /* bool (int for atomicity) */ volatile int signalled; /* bool (int for atomicity) */ }; @@ -115,41 +160,70 @@ static inline void amdgpu_fence_reference(struct pipe_fence_handle **dst, *rdst = rsrc; } -int amdgpu_get_reloc(struct amdgpu_cs *csc, struct amdgpu_winsys_bo *bo); +int amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo); + +static inline struct amdgpu_ib * +amdgpu_ib(struct radeon_winsys_cs *base) +{ + return (struct amdgpu_ib *)base; +} static inline struct amdgpu_cs * amdgpu_cs(struct radeon_winsys_cs *base) { + assert(amdgpu_ib(base)->ib_type == IB_MAIN); return (struct amdgpu_cs*)base; } -static inline boolean +#define get_container(member_ptr, container_type, container_member) \ + (container_type *)((char *)(member_ptr) - offsetof(container_type, container_member)) + +static inline struct amdgpu_cs * +amdgpu_cs_from_ib(struct amdgpu_ib *ib) +{ + switch (ib->ib_type) { + case IB_MAIN: + return get_container(ib, struct amdgpu_cs, main); + case IB_CONST: + return get_container(ib, struct amdgpu_cs, const_ib); + case IB_CONST_PREAMBLE: + return get_container(ib, struct amdgpu_cs, const_preamble_ib); + default: + unreachable("bad ib_type"); + } +} + +static inline bool amdgpu_bo_is_referenced_by_cs(struct amdgpu_cs *cs, struct amdgpu_winsys_bo *bo) { int num_refs = bo->num_cs_references; - return num_refs == bo->rws->num_cs || - (num_refs && amdgpu_get_reloc(cs, bo) != -1); + return num_refs == bo->ws->num_cs || + (num_refs && amdgpu_lookup_buffer(cs->csc, bo) != -1); } -static inline boolean +static inline bool amdgpu_bo_is_referenced_by_cs_with_usage(struct amdgpu_cs *cs, struct amdgpu_winsys_bo *bo, enum radeon_bo_usage usage) { int index; + struct amdgpu_cs_buffer *buffer; if (!bo->num_cs_references) - return FALSE; + return false; - index = amdgpu_get_reloc(cs, bo); + index = amdgpu_lookup_buffer(cs->csc, bo); if (index == -1) - return FALSE; + return false; + + buffer = bo->bo ? &cs->csc->real_buffers[index] + : &cs->csc->slab_buffers[index]; - return (cs->buffers[index].usage & usage) != 0; + return (buffer->usage & usage) != 0; } -static inline boolean +static inline bool amdgpu_bo_is_referenced_by_any_cs(struct amdgpu_winsys_bo *bo) { return bo->num_cs_references != 0; @@ -157,6 +231,8 @@ amdgpu_bo_is_referenced_by_any_cs(struct amdgpu_winsys_bo *bo) bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout, bool absolute); +void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs); void amdgpu_cs_init_functions(struct amdgpu_winsys *ws); +void amdgpu_cs_submit_ib(void *job, int thread_index); #endif diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c index 358df3810..c5462bc0e 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c @@ -108,26 +108,6 @@ static ADDR_E_RETURNCODE ADDR_API freeSysMem(const ADDR_FREESYSMEM_INPUT * pInpu return ADDR_OK; } -/** - * This returns the number of banks for the surface. - * Possible values: 2, 4, 8, 16. 
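[Editorial aside] The get_container() macro introduced in the header above is the familiar container_of idiom: given a pointer to an embedded member, recover the enclosing object by subtracting the member's offset. A self-contained usage example (the types here are invented for the demonstration):

    #include <assert.h>
    #include <stddef.h>

    struct inner { int x; };
    struct outer { int tag; struct inner member; };

    #define container_of(ptr, type, field) \
       ((type *)((char *)(ptr) - offsetof(type, field)))

    int main(void)
    {
       struct outer o = { 42, { 7 } };
       struct inner *ip = &o.member;

       /* Recover the enclosing object from the embedded member. */
       assert(container_of(ip, struct outer, member) == &o);
       return 0;
    }

amdgpu_cs_from_ib() applies exactly this to map any of the three embedded amdgpu_ib structs back to their amdgpu_cs.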
- */ -static uint32_t cik_num_banks(struct amdgpu_winsys *ws, - struct radeon_surf *surf) -{ - unsigned index, tileb; - - tileb = 8 * 8 * surf->bpe; - tileb = MIN2(surf->tile_split, tileb); - - for (index = 0; tileb > 64; index++) { - tileb >>= 1; - } - assert(index < 16); - - return 2 << ((ws->amdinfo.gb_macro_tile_mode[index] >> 6) & 0x3); -} - ADDR_HANDLE amdgpu_addr_create(struct amdgpu_winsys *ws) { ADDR_CREATE_INPUT addrCreateInput = {0}; @@ -145,15 +125,19 @@ ADDR_HANDLE amdgpu_addr_create(struct amdgpu_winsys *ws) regValue.backendDisables = ws->amdinfo.backend_disable[0]; regValue.pTileConfig = ws->amdinfo.gb_tile_mode; - regValue.noOfEntries = sizeof(ws->amdinfo.gb_tile_mode) / - sizeof(ws->amdinfo.gb_tile_mode[0]); - regValue.pMacroTileConfig = ws->amdinfo.gb_macro_tile_mode; - regValue.noOfMacroEntries = sizeof(ws->amdinfo.gb_macro_tile_mode) / - sizeof(ws->amdinfo.gb_macro_tile_mode[0]); + regValue.noOfEntries = ARRAY_SIZE(ws->amdinfo.gb_tile_mode); + if (ws->info.chip_class == SI) { + regValue.pMacroTileConfig = NULL; + regValue.noOfMacroEntries = 0; + } else { + regValue.pMacroTileConfig = ws->amdinfo.gb_macro_tile_mode; + regValue.noOfMacroEntries = ARRAY_SIZE(ws->amdinfo.gb_macro_tile_mode); + } createFlags.value = 0; createFlags.useTileIndex = 1; createFlags.degradeBaseLevel = 1; + createFlags.useHtileSliceAlign = 1; addrCreateInput.chipEngine = CIASICIDGFXENGINE_SOUTHERNISLAND; addrCreateInput.chipFamily = ws->family; @@ -175,7 +159,11 @@ static int compute_level(struct amdgpu_winsys *ws, struct radeon_surf *surf, bool is_stencil, unsigned level, unsigned type, bool compressed, ADDR_COMPUTE_SURFACE_INFO_INPUT *AddrSurfInfoIn, - ADDR_COMPUTE_SURFACE_INFO_OUTPUT *AddrSurfInfoOut) + ADDR_COMPUTE_SURFACE_INFO_OUTPUT *AddrSurfInfoOut, + ADDR_COMPUTE_DCCINFO_INPUT *AddrDccIn, + ADDR_COMPUTE_DCCINFO_OUTPUT *AddrDccOut, + ADDR_COMPUTE_HTILE_INFO_INPUT *AddrHtileIn, + ADDR_COMPUTE_HTILE_INFO_OUTPUT *AddrHtileOut) { struct radeon_surf_level *surf_level; ADDR_E_RETURNCODE ret; @@ -212,7 +200,7 @@ static int compute_level(struct amdgpu_winsys *ws, } surf_level = is_stencil ? &surf->stencil_level[level] : &surf->level[level]; - surf_level->offset = align(surf->bo_size, AddrSurfInfoOut->baseAlign); + surf_level->offset = align64(surf->bo_size, AddrSurfInfoOut->baseAlign); surf_level->slice_size = AddrSurfInfoOut->sliceSize; surf_level->pitch_bytes = AddrSurfInfoOut->pitch * (is_stencil ? 1 : surf->bpe); surf_level->npix_x = u_minify(surf->npix_x, level); @@ -226,9 +214,6 @@ static int compute_level(struct amdgpu_winsys *ws, surf_level->nblk_z = 1; switch (AddrSurfInfoOut->tileMode) { - case ADDR_TM_LINEAR_GENERAL: - surf_level->mode = RADEON_SURF_MODE_LINEAR; - break; case ADDR_TM_LINEAR_ALIGNED: surf_level->mode = RADEON_SURF_MODE_LINEAR_ALIGNED; break; @@ -248,9 +233,90 @@ static int compute_level(struct amdgpu_winsys *ws, surf->tiling_index[level] = AddrSurfInfoOut->tileIndex; surf->bo_size = surf_level->offset + AddrSurfInfoOut->surfSize; + + /* Clear DCC fields at the beginning. */ + surf_level->dcc_offset = 0; + surf_level->dcc_enabled = false; + + /* The previous level's flag tells us if we can use DCC for this level. 
*/ + if (AddrSurfInfoIn->flags.dccCompatible && + (level == 0 || AddrDccOut->subLvlCompressible)) { + AddrDccIn->colorSurfSize = AddrSurfInfoOut->surfSize; + AddrDccIn->tileMode = AddrSurfInfoOut->tileMode; + AddrDccIn->tileInfo = *AddrSurfInfoOut->pTileInfo; + AddrDccIn->tileIndex = AddrSurfInfoOut->tileIndex; + AddrDccIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex; + + ret = AddrComputeDccInfo(ws->addrlib, + AddrDccIn, + AddrDccOut); + + if (ret == ADDR_OK) { + surf_level->dcc_offset = surf->dcc_size; + surf_level->dcc_fast_clear_size = AddrDccOut->dccFastClearSize; + surf_level->dcc_enabled = true; + surf->dcc_size = surf_level->dcc_offset + AddrDccOut->dccRamSize; + surf->dcc_alignment = MAX2(surf->dcc_alignment, AddrDccOut->dccRamBaseAlign); + } + } + + /* TC-compatible HTILE. */ + if (!is_stencil && + AddrSurfInfoIn->flags.depth && + AddrSurfInfoIn->flags.tcCompatible && + surf_level->mode == RADEON_SURF_MODE_2D && + level == 0) { + AddrHtileIn->flags.tcCompatible = 1; + AddrHtileIn->pitch = AddrSurfInfoOut->pitch; + AddrHtileIn->height = AddrSurfInfoOut->height; + AddrHtileIn->numSlices = AddrSurfInfoOut->depth; + AddrHtileIn->blockWidth = ADDR_HTILE_BLOCKSIZE_8; + AddrHtileIn->blockHeight = ADDR_HTILE_BLOCKSIZE_8; + AddrHtileIn->pTileInfo = AddrSurfInfoOut->pTileInfo; + AddrHtileIn->tileIndex = AddrSurfInfoOut->tileIndex; + AddrHtileIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex; + + ret = AddrComputeHtileInfo(ws->addrlib, + AddrHtileIn, + AddrHtileOut); + + if (ret == ADDR_OK) { + surf->htile_size = AddrHtileOut->htileBytes; + surf->htile_alignment = AddrHtileOut->baseAlign; + } + } + return 0; } +#define G_009910_MICRO_TILE_MODE(x) (((x) >> 0) & 0x03) +#define G_009910_MICRO_TILE_MODE_NEW(x) (((x) >> 22) & 0x07) + +static void set_micro_tile_mode(struct radeon_surf *surf, + struct radeon_info *info) +{ + uint32_t tile_mode = info->si_tile_mode_array[surf->tiling_index[0]]; + + if (info->chip_class >= CIK) + surf->micro_tile_mode = G_009910_MICRO_TILE_MODE_NEW(tile_mode); + else + surf->micro_tile_mode = G_009910_MICRO_TILE_MODE(tile_mode); +} + +static unsigned cik_get_macro_tile_index(struct radeon_surf *surf) +{ + unsigned index, tileb; + + tileb = 8 * 8 * surf->bpe; + tileb = MIN2(surf->tile_split, tileb); + + for (index = 0; tileb > 64; index++) + tileb >>= 1; + + assert(index < 16); + return index; +} + static int amdgpu_surface_init(struct radeon_winsys *rws, struct radeon_surf *surf) { @@ -259,6 +325,10 @@ static int amdgpu_surface_init(struct radeon_winsys *rws, bool compressed; ADDR_COMPUTE_SURFACE_INFO_INPUT AddrSurfInfoIn = {0}; ADDR_COMPUTE_SURFACE_INFO_OUTPUT AddrSurfInfoOut = {0}; + ADDR_COMPUTE_DCCINFO_INPUT AddrDccIn = {0}; + ADDR_COMPUTE_DCCINFO_OUTPUT AddrDccOut = {0}; + ADDR_COMPUTE_HTILE_INFO_INPUT AddrHtileIn = {0}; + ADDR_COMPUTE_HTILE_INFO_OUTPUT AddrHtileOut = {0}; ADDR_TILEINFO AddrTileInfoIn = {0}; ADDR_TILEINFO AddrTileInfoOut = {0}; int r; @@ -269,6 +339,10 @@ static int amdgpu_surface_init(struct radeon_winsys *rws, AddrSurfInfoIn.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_INPUT); AddrSurfInfoOut.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_OUTPUT); + AddrDccIn.size = sizeof(ADDR_COMPUTE_DCCINFO_INPUT); + AddrDccOut.size = sizeof(ADDR_COMPUTE_DCCINFO_OUTPUT); + AddrHtileIn.size = sizeof(ADDR_COMPUTE_HTILE_INFO_INPUT); + AddrHtileOut.size = sizeof(ADDR_COMPUTE_HTILE_INFO_OUTPUT); AddrSurfInfoOut.pTileInfo = &AddrTileInfoOut; type = RADEON_SURF_GET(surf->flags, TYPE); @@ -287,9 +361,6 @@ static int amdgpu_surface_init(struct radeon_winsys *rws, /* 
Set the requested tiling mode. */ switch (mode) { - case RADEON_SURF_MODE_LINEAR: - AddrSurfInfoIn.tileMode = ADDR_TM_LINEAR_GENERAL; - break; case RADEON_SURF_MODE_LINEAR_ALIGNED: AddrSurfInfoIn.tileMode = ADDR_TM_LINEAR_ALIGNED; break; @@ -318,10 +389,10 @@ static int amdgpu_surface_init(struct radeon_winsys *rws, } } else { - AddrSurfInfoIn.bpp = surf->bpe * 8; + AddrDccIn.bpp = AddrSurfInfoIn.bpp = surf->bpe * 8; } - AddrSurfInfoIn.numSamples = surf->nsamples; + AddrDccIn.numSamples = AddrSurfInfoIn.numSamples = surf->nsamples; AddrSurfInfoIn.tileIndex = -1; /* Set the micro tile type. */ @@ -334,14 +405,41 @@ static int amdgpu_surface_init(struct radeon_winsys *rws, AddrSurfInfoIn.flags.color = !(surf->flags & RADEON_SURF_Z_OR_SBUFFER); AddrSurfInfoIn.flags.depth = (surf->flags & RADEON_SURF_ZBUFFER) != 0; - AddrSurfInfoIn.flags.stencil = (surf->flags & RADEON_SURF_SBUFFER) != 0; AddrSurfInfoIn.flags.cube = type == RADEON_SURF_TYPE_CUBEMAP; AddrSurfInfoIn.flags.display = (surf->flags & RADEON_SURF_SCANOUT) != 0; AddrSurfInfoIn.flags.pow2Pad = surf->last_level > 0; - AddrSurfInfoIn.flags.degrade4Space = 1; - - /* This disables incorrect calculations (hacks) in addrlib. */ - AddrSurfInfoIn.flags.noStencil = 1; + AddrSurfInfoIn.flags.tcCompatible = (surf->flags & RADEON_SURF_TC_COMPATIBLE_HTILE) != 0; + + /* Only degrade the tile mode for space if TC-compatible HTILE hasn't been + * requested, because TC-compatible HTILE requires 2D tiling. + */ + AddrSurfInfoIn.flags.degrade4Space = !AddrSurfInfoIn.flags.tcCompatible; + + /* DCC notes: + * - If we add MSAA support, keep in mind that CB can't decompress 8bpp + * with samples >= 4. + * - Mipmapped array textures have low performance (discovered by a closed + * driver team). + */ + AddrSurfInfoIn.flags.dccCompatible = ws->info.chip_class >= VI && + !(surf->flags & RADEON_SURF_Z_OR_SBUFFER) && + !(surf->flags & RADEON_SURF_DISABLE_DCC) && + !compressed && AddrDccIn.numSamples <= 1 && + ((surf->array_size == 1 && surf->npix_z == 1) || + surf->last_level == 0); + + AddrSurfInfoIn.flags.noStencil = (surf->flags & RADEON_SURF_SBUFFER) == 0; + AddrSurfInfoIn.flags.compressZ = AddrSurfInfoIn.flags.depth; + + /* noStencil = 0 can result in a depth part that is incompatible with + * mipmapped texturing. So set noStencil = 1 when mipmaps are requested (in + * this case, we may end up setting stencil_adjusted). + * + * TODO: update addrlib to a newer version, remove this, and + * use flags.matchStencilTileCfg = 1 as an alternative fix. + */ + if (surf->last_level > 0) + AddrSurfInfoIn.flags.noStencil = 1; /* Set preferred macrotile parameters. This is usually required * for shared resources. This is for 2D tiling only. */ @@ -349,11 +447,12 @@ static int amdgpu_surface_init(struct radeon_winsys *rws, surf->bankw && surf->bankh && surf->mtilea && surf->tile_split) { /* If any of these parameters are incorrect, the calculation * will fail. 
*/ - AddrTileInfoIn.banks = cik_num_banks(ws, surf); + AddrTileInfoIn.banks = surf->num_banks; AddrTileInfoIn.bankWidth = surf->bankw; AddrTileInfoIn.bankHeight = surf->bankh; AddrTileInfoIn.macroAspectRatio = surf->mtilea; AddrTileInfoIn.tileSplitBytes = surf->tile_split; + AddrTileInfoIn.pipeConfig = surf->pipe_config + 1; /* +1 compared to GB_TILE_MODE */ AddrSurfInfoIn.flags.degrade4Space = 0; AddrSurfInfoIn.pTileInfo = &AddrTileInfoIn; @@ -368,24 +467,52 @@ static int amdgpu_surface_init(struct radeon_winsys *rws, assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER)); assert(AddrSurfInfoIn.tileMode == ADDR_TM_2D_TILED_THIN1); - if (AddrSurfInfoIn.tileType == ADDR_DISPLAYABLE) - AddrSurfInfoIn.tileIndex = 10; /* 2D displayable */ - else - AddrSurfInfoIn.tileIndex = 14; /* 2D non-displayable */ + if (ws->info.chip_class == SI) { + if (AddrSurfInfoIn.tileType == ADDR_DISPLAYABLE) { + if (surf->bpe == 2) + AddrSurfInfoIn.tileIndex = 11; /* 16bpp */ + else + AddrSurfInfoIn.tileIndex = 12; /* 32bpp */ + } else { + if (surf->bpe == 1) + AddrSurfInfoIn.tileIndex = 14; /* 8bpp */ + else if (surf->bpe == 2) + AddrSurfInfoIn.tileIndex = 15; /* 16bpp */ + else if (surf->bpe == 4) + AddrSurfInfoIn.tileIndex = 16; /* 32bpp */ + else + AddrSurfInfoIn.tileIndex = 17; /* 64bpp (and 128bpp) */ + } + } else { + /* CIK - VI */ + if (AddrSurfInfoIn.tileType == ADDR_DISPLAYABLE) + AddrSurfInfoIn.tileIndex = 10; /* 2D displayable */ + else + AddrSurfInfoIn.tileIndex = 14; /* 2D non-displayable */ + + /* Addrlib doesn't set this if tileIndex is forced like above. */ + AddrSurfInfoOut.macroModeIndex = cik_get_macro_tile_index(surf); + } } surf->bo_size = 0; + surf->dcc_size = 0; + surf->dcc_alignment = 1; + surf->htile_size = 0; + surf->htile_alignment = 1; /* Calculate texture layout information. */ for (level = 0; level <= surf->last_level; level++) { r = compute_level(ws, surf, false, level, type, compressed, - &AddrSurfInfoIn, &AddrSurfInfoOut); + &AddrSurfInfoIn, &AddrSurfInfoOut, + &AddrDccIn, &AddrDccOut, &AddrHtileIn, &AddrHtileOut); if (r) return r; if (level == 0) { surf->bo_alignment = AddrSurfInfoOut.baseAlign; surf->pipe_config = AddrSurfInfoOut.pTileInfo->pipeConfig - 1; + set_micro_tile_mode(surf, &ws->info); /* For 2D modes only. */ if (AddrSurfInfoOut.tileMode >= ADDR_TM_2D_TILED_THIN1) { @@ -394,6 +521,9 @@ static int amdgpu_surface_init(struct radeon_winsys *rws, surf->mtilea = AddrSurfInfoOut.pTileInfo->macroAspectRatio; surf->tile_split = AddrSurfInfoOut.pTileInfo->tileSplitBytes; surf->num_banks = AddrSurfInfoOut.pTileInfo->banks; + surf->macro_tile_index = AddrSurfInfoOut.macroModeIndex; + } else { + surf->macro_tile_index = 0; } } } @@ -401,18 +531,24 @@ static int amdgpu_surface_init(struct radeon_winsys *rws, /* Calculate texture layout information for stencil. */ if (surf->flags & RADEON_SURF_SBUFFER) { AddrSurfInfoIn.bpp = 8; + AddrSurfInfoIn.flags.depth = 0; + AddrSurfInfoIn.flags.stencil = 1; + AddrSurfInfoIn.flags.tcCompatible = 0; /* This will be ignored if AddrSurfInfoIn.pTileInfo is NULL. */ AddrTileInfoIn.tileSplitBytes = surf->stencil_tile_split; for (level = 0; level <= surf->last_level; level++) { r = compute_level(ws, surf, true, level, type, compressed, - &AddrSurfInfoIn, &AddrSurfInfoOut); + &AddrSurfInfoIn, &AddrSurfInfoOut, &AddrDccIn, &AddrDccOut, + NULL, NULL); if (r) return r; - if (level == 0) { - surf->stencil_offset = surf->stencil_level[0].offset; + /* DB uses the depth pitch for both stencil and depth. 
*/ + if (surf->stencil_level[level].nblk_x != surf->level[level].nblk_x) + surf->stencil_adjusted = true; + if (level == 0) { /* For 2D modes only. */ if (AddrSurfInfoOut.tileMode >= ADDR_TM_2D_TILED_THIN1) { surf->stencil_tile_split = @@ -422,6 +558,22 @@ static int amdgpu_surface_init(struct radeon_winsys *rws, } } + /* Recalculate the whole DCC miptree size including disabled levels. + * This is what addrlib does, but calling addrlib would be a lot more + * complicated. + */ + if (surf->dcc_size && surf->last_level > 0) { + surf->dcc_size = align64(surf->bo_size >> 8, + ws->info.pipe_interleave_bytes * + ws->info.num_tile_pipes); + } + + /* Make sure HTILE covers the whole miptree, because the shader reads + * TC-compatible HTILE even for levels where it's disabled by DB. + */ + if (surf->htile_size && surf->last_level) + surf->htile_size *= 2; + return 0; } diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c index 824f0d380..d92c0bd83 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c @@ -39,7 +39,7 @@ #include <xf86drm.h> #include <stdio.h> #include <sys/stat.h> -#include "amdgpu_id.h" +#include "amd/common/amdgpu_id.h" #define CIK_TILE_MODE_COLOR_2D 14 @@ -59,6 +59,10 @@ #define CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16 16 #define CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16 17 +#ifndef AMDGPU_INFO_NUM_EVICTIONS +#define AMDGPU_INFO_NUM_EVICTIONS 0x18 +#endif + static struct util_hash_table *dev_tab = NULL; pipe_static_mutex(dev_tab_mutex); @@ -68,7 +72,6 @@ static unsigned cik_get_num_tile_pipes(struct amdgpu_gpu_info *info) switch (CIK__GB_TILE_MODE__PIPE_CONFIG(mode2d)) { case CIK__PIPE_CONFIG__ADDR_SURF_P2: - default: return 2; case CIK__PIPE_CONFIG__ADDR_SURF_P4_8x16: case CIK__PIPE_CONFIG__ADDR_SURF_P4_16x16: @@ -86,31 +89,35 @@ static unsigned cik_get_num_tile_pipes(struct amdgpu_gpu_info *info) case CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16: case CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16: return 16; + default: + fprintf(stderr, "Invalid CIK pipe configuration, assuming P2\n"); + assert(!"this should never occur"); + return 2; } } -/* Convert Sea Islands register values GB_ADDR_CFG and MC_ADDR_CFG - * into GB_TILING_CONFIG register which is only present on R600-R700. */ -static unsigned r600_get_gb_tiling_config(struct amdgpu_gpu_info *info) -{ - unsigned num_pipes = info->gb_addr_cfg & 0x7; - unsigned num_banks = info->mc_arb_ramcfg & 0x3; - unsigned pipe_interleave_bytes = (info->gb_addr_cfg >> 4) & 0x7; - unsigned row_size = (info->gb_addr_cfg >> 28) & 0x3; - - return num_pipes | (num_banks << 4) | - (pipe_interleave_bytes << 8) | - (row_size << 12); -} - /* Helper function to do the ioctls needed for setup and init. */ -static boolean do_winsys_init(struct amdgpu_winsys *ws) +static bool do_winsys_init(struct amdgpu_winsys *ws, int fd) { struct amdgpu_buffer_size_alignments alignment_info = {}; struct amdgpu_heap_info vram, gtt; struct drm_amdgpu_info_hw_ip dma = {}, uvd = {}, vce = {}; - uint32_t vce_version = 0, vce_feature = 0; + uint32_t vce_version = 0, vce_feature = 0, uvd_version = 0, uvd_feature = 0; + uint32_t unused_feature; int r, i, j; + drmDevicePtr devinfo; + + /* Get PCI info. 
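[Editorial aside] The DCC recalculation shown a few lines above sizes the metadata at a 256:1 ratio to the surface (hence bo_size >> 8), rounded up to the pipe-interleave granularity across all tile pipes. Restated as a standalone helper; align_up64() here is a generic round-up, whereas Mesa's align64() assumes a power-of-two alignment and uses a bitmask:

    #include <stdint.h>

    static uint64_t align_up64(uint64_t v, uint64_t a)
    {
       return (v + a - 1) / a * a;
    }

    static uint64_t dcc_miptree_size(uint64_t bo_size,
                                     unsigned pipe_interleave_bytes,
                                     unsigned num_tile_pipes)
    {
       /* DCC covers the surface at 256:1, hence the shift by 8. */
       return align_up64(bo_size >> 8,
                         (uint64_t)pipe_interleave_bytes * num_tile_pipes);
    }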
*/ + r = drmGetDevice(fd, &devinfo); + if (r) { + fprintf(stderr, "amdgpu: drmGetDevice failed.\n"); + goto fail; + } + ws->info.pci_domain = devinfo->businfo.pci->domain; + ws->info.pci_bus = devinfo->businfo.pci->bus; + ws->info.pci_dev = devinfo->businfo.pci->dev; + ws->info.pci_func = devinfo->businfo.pci->func; + drmFreeDevice(&devinfo); /* Query hardware and driver information. */ r = amdgpu_query_gpu_info(ws->dev, &ws->amdinfo); @@ -149,6 +156,34 @@ static boolean do_winsys_init(struct amdgpu_winsys *ws) goto fail; } + r = amdgpu_query_firmware_version(ws->dev, AMDGPU_INFO_FW_GFX_ME, 0, 0, + &ws->info.me_fw_version, &unused_feature); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(me) failed.\n"); + goto fail; + } + + r = amdgpu_query_firmware_version(ws->dev, AMDGPU_INFO_FW_GFX_PFP, 0, 0, + &ws->info.pfp_fw_version, &unused_feature); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(pfp) failed.\n"); + goto fail; + } + + r = amdgpu_query_firmware_version(ws->dev, AMDGPU_INFO_FW_GFX_CE, 0, 0, + &ws->info.ce_fw_version, &unused_feature); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(ce) failed.\n"); + goto fail; + } + + r = amdgpu_query_firmware_version(ws->dev, AMDGPU_INFO_FW_UVD, 0, 0, + &uvd_version, &uvd_feature); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(uvd) failed.\n"); + goto fail; + } + r = amdgpu_query_hw_ip_info(ws->dev, AMDGPU_HW_IP_VCE, 0, &vce); if (r) { fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vce) failed.\n"); @@ -180,15 +215,16 @@ static boolean do_winsys_init(struct amdgpu_winsys *ws) ws->info.chip_class = VI; else if (ws->info.family >= CHIP_BONAIRE) ws->info.chip_class = CIK; + else if (ws->info.family >= CHIP_TAHITI) + ws->info.chip_class = SI; else { fprintf(stderr, "amdgpu: Unknown family.\n"); goto fail; } - /* LLVM 3.6 is required for VI. */ + /* LLVM 3.6.1 is required for VI. */ if (ws->info.chip_class >= VI && - (HAVE_LLVM < 0x0306 || - (HAVE_LLVM == 0x0306 && MESA_LLVM_VERSION_PATCH < 1))) { + HAVE_LLVM == 0x0306 && MESA_LLVM_VERSION_PATCH < 1) { fprintf(stderr, "amdgpu: LLVM 3.6.1 is required, got LLVM %i.%i.%i\n", HAVE_LLVM >> 8, HAVE_LLVM & 255, MESA_LLVM_VERSION_PATCH); goto fail; @@ -196,6 +232,26 @@ static boolean do_winsys_init(struct amdgpu_winsys *ws) /* family and rev_id are for addrlib */ switch (ws->info.family) { + case CHIP_TAHITI: + ws->family = FAMILY_SI; + ws->rev_id = SI_TAHITI_P_A0; + break; + case CHIP_PITCAIRN: + ws->family = FAMILY_SI; + ws->rev_id = SI_PITCAIRN_PM_A0; + break; + case CHIP_VERDE: + ws->family = FAMILY_SI; + ws->rev_id = SI_CAPEVERDE_M_A0; + break; + case CHIP_OLAND: + ws->family = FAMILY_SI; + ws->rev_id = SI_OLAND_M_A0; + break; + case CHIP_HAINAN: + ws->family = FAMILY_SI; + ws->rev_id = SI_HAINAN_V_A0; + break; case CHIP_BONAIRE: ws->family = FAMILY_CI; ws->rev_id = CI_BONAIRE_M_A0; @@ -236,6 +292,14 @@ static boolean do_winsys_init(struct amdgpu_winsys *ws) ws->family = FAMILY_VI; ws->rev_id = VI_FIJI_P_A0; break; + case CHIP_POLARIS10: + ws->family = FAMILY_VI; + ws->rev_id = VI_POLARIS10_P_A0; + break; + case CHIP_POLARIS11: + ws->family = FAMILY_VI; + ws->rev_id = VI_POLARIS11_M_A0; + break; default: fprintf(stderr, "amdgpu: Unknown family.\n"); goto fail; @@ -247,69 +311,83 @@ static boolean do_winsys_init(struct amdgpu_winsys *ws) goto fail; } + /* Set which chips have dedicated VRAM. */ + ws->info.has_dedicated_vram = + !(ws->amdinfo.ids_flags & AMDGPU_IDS_FLAGS_FUSION); + /* Set hardware information. 
*/ ws->info.gart_size = gtt.heap_size; ws->info.vram_size = vram.heap_size; + /* The kernel can split large buffers in VRAM but not in GTT, so large + * allocations can fail or cause buffer movement failures in the kernel. + */ + ws->info.max_alloc_size = MIN2(ws->info.vram_size * 0.9, ws->info.gart_size * 0.7); /* convert the shader clock from KHz to MHz */ - ws->info.max_sclk = ws->amdinfo.max_engine_clk / 1000; + ws->info.max_shader_clock = ws->amdinfo.max_engine_clk / 1000; ws->info.max_se = ws->amdinfo.num_shader_engines; ws->info.max_sh_per_se = ws->amdinfo.num_shader_arrays_per_engine; ws->info.has_uvd = uvd.available_rings != 0; + ws->info.uvd_fw_version = + uvd.available_rings ? uvd_version : 0; ws->info.vce_fw_version = vce.available_rings ? vce_version : 0; - ws->info.has_userptr = TRUE; - ws->info.r600_num_backends = ws->amdinfo.rb_pipes; - ws->info.r600_clock_crystal_freq = ws->amdinfo.gpu_counter_freq; - ws->info.r600_tiling_config = r600_get_gb_tiling_config(&ws->amdinfo); - ws->info.r600_num_tile_pipes = cik_get_num_tile_pipes(&ws->amdinfo); - ws->info.r600_max_pipes = ws->amdinfo.max_quad_shader_pipes; /* TODO: is this correct? */ - ws->info.r600_virtual_address = TRUE; - ws->info.r600_has_dma = dma.available_rings != 0; - - /* Guess what the maximum compute unit number is by looking at the mask - * of enabled CUs. - */ + ws->info.has_userptr = true; + ws->info.num_render_backends = ws->amdinfo.rb_pipes; + ws->info.clock_crystal_freq = ws->amdinfo.gpu_counter_freq; + ws->info.num_tile_pipes = cik_get_num_tile_pipes(&ws->amdinfo); + ws->info.pipe_interleave_bytes = 256 << ((ws->amdinfo.gb_addr_cfg >> 4) & 0x7); + ws->info.has_virtual_memory = true; + ws->info.has_sdma = dma.available_rings != 0; + + /* Get the number of good compute units. 
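[Editorial aside] The loop in the next hunk derives num_good_compute_units by popcounting each shader-array bitmap with util_bitcount(). An equivalent standalone count, using Kernighan's clear-lowest-set-bit trick in place of util_bitcount (the 4x4 bounds match amdgpu_gpu_info's cu_bitmap):

    #include <stdint.h>

    static unsigned count_cus(const uint32_t bitmap[4][4],
                              unsigned num_se, unsigned num_sh_per_se)
    {
       unsigned total = 0;

       for (unsigned se = 0; se < num_se; se++) {
          for (unsigned sh = 0; sh < num_sh_per_se; sh++) {
             uint32_t b = bitmap[se][sh];

             while (b) {     /* clear the lowest set bit each iteration */
                b &= b - 1;
                total++;
             }
          }
       }
       return total;
    }

This replaces the old heuristic, which only estimated the CU count from the highest set bit per shader array.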
*/ + ws->info.num_good_compute_units = 0; for (i = 0; i < ws->info.max_se; i++) - for (j = 0; j < ws->info.max_sh_per_se; j++) { - unsigned max = util_last_bit(ws->amdinfo.cu_bitmap[i][j]); - - if (ws->info.max_compute_units < max) - ws->info.max_compute_units = max; - } - ws->info.max_compute_units *= ws->info.max_se * ws->info.max_sh_per_se; + for (j = 0; j < ws->info.max_sh_per_se; j++) + ws->info.num_good_compute_units += + util_bitcount(ws->amdinfo.cu_bitmap[i][j]); memcpy(ws->info.si_tile_mode_array, ws->amdinfo.gb_tile_mode, sizeof(ws->amdinfo.gb_tile_mode)); - ws->info.si_tile_mode_array_valid = TRUE; - ws->info.si_backend_enabled_mask = ws->amdinfo.enabled_rb_pipes_mask; + ws->info.enabled_rb_mask = ws->amdinfo.enabled_rb_pipes_mask; memcpy(ws->info.cik_macrotile_mode_array, ws->amdinfo.gb_macro_tile_mode, sizeof(ws->amdinfo.gb_macro_tile_mode)); - ws->info.cik_macrotile_mode_array_valid = TRUE; - ws->gart_page_size = alignment_info.size_remote; + ws->info.gart_page_size = alignment_info.size_remote; + + if (ws->info.chip_class == SI) + ws->info.gfx_ib_pad_with_type2 = TRUE; + + ws->check_vm = strstr(debug_get_option("R600_DEBUG", ""), "check_vm") != NULL; - return TRUE; + return true; fail: if (ws->addrlib) AddrDestroy(ws->addrlib); amdgpu_device_deinitialize(ws->dev); ws->dev = NULL; - return FALSE; + return false; +} + +static void do_winsys_deinit(struct amdgpu_winsys *ws) +{ + AddrDestroy(ws->addrlib); + amdgpu_device_deinitialize(ws->dev); } static void amdgpu_winsys_destroy(struct radeon_winsys *rws) { struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws; - pipe_mutex_destroy(ws->bo_fence_lock); - - ws->cman->destroy(ws->cman); - ws->kman->destroy(ws->kman); - AddrDestroy(ws->addrlib); + if (util_queue_is_initialized(&ws->cs_queue)) + util_queue_destroy(&ws->cs_queue); - amdgpu_device_deinitialize(ws->dev); + pipe_mutex_destroy(ws->bo_fence_lock); + pb_slabs_deinit(&ws->bo_slabs); + pb_cache_deinit(&ws->bo_cache); + pipe_mutex_destroy(ws->global_bo_list_lock); + do_winsys_deinit(ws); FREE(rws); } @@ -319,11 +397,11 @@ static void amdgpu_winsys_query_info(struct radeon_winsys *rws, *info = ((struct amdgpu_winsys *)rws)->info; } -static boolean amdgpu_cs_request_feature(struct radeon_winsys_cs *rcs, - enum radeon_feature_id fid, - boolean enable) +static bool amdgpu_cs_request_feature(struct radeon_winsys_cs *rcs, + enum radeon_feature_id fid, + bool enable) { - return FALSE; + return false; } static uint64_t amdgpu_query_value(struct radeon_winsys *rws, @@ -338,6 +416,10 @@ static uint64_t amdgpu_query_value(struct radeon_winsys *rws, return ws->allocated_vram; case RADEON_REQUESTED_GTT_MEMORY: return ws->allocated_gtt; + case RADEON_MAPPED_VRAM: + return ws->mapped_vram; + case RADEON_MAPPED_GTT: + return ws->mapped_gtt; case RADEON_BUFFER_WAIT_TIME_NS: return ws->buffer_wait_time; case RADEON_TIMESTAMP: @@ -348,6 +430,9 @@ static uint64_t amdgpu_query_value(struct radeon_winsys *rws, case RADEON_NUM_BYTES_MOVED: amdgpu_query_info(ws->dev, AMDGPU_INFO_NUM_BYTES_MOVED, 8, &retval); return retval; + case RADEON_NUM_EVICTIONS: + amdgpu_query_info(ws->dev, AMDGPU_INFO_NUM_EVICTIONS, 8, &retval); + return retval; case RADEON_VRAM_USAGE: amdgpu_query_heap_info(ws->dev, AMDGPU_GEM_DOMAIN_VRAM, 0, &heap); return heap.heap_usage; @@ -365,14 +450,14 @@ static uint64_t amdgpu_query_value(struct radeon_winsys *rws, return 0; } -static void amdgpu_read_registers(struct radeon_winsys *rws, +static bool amdgpu_read_registers(struct radeon_winsys *rws, unsigned reg_offset, unsigned 
num_registers, uint32_t *out) { struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws; - amdgpu_read_mm_registers(ws->dev, reg_offset / 4, num_registers, - 0xffffffff, 0, out); + return amdgpu_read_mm_registers(ws->dev, reg_offset / 4, num_registers, + 0xffffffff, 0, out) == 0; } static unsigned hash_dev(void *key) @@ -389,9 +474,11 @@ static int compare_dev(void *key1, void *key2) return key1 != key2; } -static bool amdgpu_winsys_unref(struct radeon_winsys *ws) +DEBUG_GET_ONCE_BOOL_OPTION(thread, "RADEON_THREAD", true) + +static bool amdgpu_winsys_unref(struct radeon_winsys *rws) { - struct amdgpu_winsys *rws = (struct amdgpu_winsys*)ws; + struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws; bool destroy; /* When the reference counter drops to zero, remove the device pointer @@ -401,9 +488,9 @@ static bool amdgpu_winsys_unref(struct radeon_winsys *ws) * from the table when the counter drops to 0. */ pipe_mutex_lock(dev_tab_mutex); - destroy = pipe_reference(&rws->reference, NULL); + destroy = pipe_reference(&ws->reference, NULL); if (destroy && dev_tab) - util_hash_table_remove(dev_tab, rws->dev); + util_hash_table_remove(dev_tab, ws->dev); pipe_mutex_unlock(dev_tab_mutex); return destroy; @@ -448,26 +535,31 @@ amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create) /* Create a new winsys. */ ws = CALLOC_STRUCT(amdgpu_winsys); - if (!ws) { - pipe_mutex_unlock(dev_tab_mutex); - return NULL; - } + if (!ws) + goto fail; ws->dev = dev; ws->info.drm_major = drm_major; ws->info.drm_minor = drm_minor; - if (!do_winsys_init(ws)) - goto fail; + if (!do_winsys_init(ws, fd)) + goto fail_alloc; /* Create managers. */ - ws->kman = amdgpu_bomgr_create(ws); - if (!ws->kman) - goto fail; - ws->cman = pb_cache_manager_create(ws->kman, 500000, 2.0f, 0, - (ws->info.vram_size + ws->info.gart_size) / 8); - if (!ws->cman) - goto fail; + pb_cache_init(&ws->bo_cache, 500000, ws->check_vm ? 1.0f : 2.0f, 0, + (ws->info.vram_size + ws->info.gart_size) / 8, + amdgpu_bo_destroy, amdgpu_bo_can_reclaim); + + if (!pb_slabs_init(&ws->bo_slabs, + AMDGPU_SLAB_MIN_SIZE_LOG2, AMDGPU_SLAB_MAX_SIZE_LOG2, + 12, /* number of heaps (domain/flags combinations) */ + ws, + amdgpu_bo_can_reclaim_slab, + amdgpu_bo_slab_alloc, + amdgpu_bo_slab_free)) + goto fail_cache; + + ws->info.min_alloc_size = 1 << AMDGPU_SLAB_MIN_SIZE_LOG2; /* init reference */ pipe_reference_init(&ws->reference, 1); @@ -480,12 +572,17 @@ amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create) ws->base.query_value = amdgpu_query_value; ws->base.read_registers = amdgpu_read_registers; - amdgpu_bomgr_init_functions(ws); + amdgpu_bo_init_functions(ws); amdgpu_cs_init_functions(ws); amdgpu_surface_init_functions(ws); + LIST_INITHEAD(&ws->global_bo_list); + pipe_mutex_init(ws->global_bo_list_lock); pipe_mutex_init(ws->bo_fence_lock); + if (sysconf(_SC_NPROCESSORS_ONLN) > 1 && debug_get_option_thread()) + util_queue_init(&ws->cs_queue, "amdgpu_cs", 8, 1); + /* Create the screen at the end. The winsys must be initialized * completely. 
* @@ -507,12 +604,12 @@ amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create) return &ws->base; +fail_cache: + pb_cache_deinit(&ws->bo_cache); + do_winsys_deinit(ws); +fail_alloc: + FREE(ws); fail: pipe_mutex_unlock(dev_tab_mutex); - if (ws->cman) - ws->cman->destroy(ws->cman); - if (ws->kman) - ws->kman->destroy(ws->kman); - FREE(ws); return NULL; } diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h index 4d07644c9..69c663807 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h @@ -32,16 +32,23 @@ #ifndef AMDGPU_WINSYS_H #define AMDGPU_WINSYS_H +#include "pipebuffer/pb_cache.h" +#include "pipebuffer/pb_slab.h" #include "gallium/drivers/radeon/radeon_winsys.h" #include "addrlib/addrinterface.h" -#include "os/os_thread.h" +#include "util/u_queue.h" #include <amdgpu.h> struct amdgpu_cs; +#define AMDGPU_SLAB_MIN_SIZE_LOG2 9 +#define AMDGPU_SLAB_MAX_SIZE_LOG2 14 + struct amdgpu_winsys { struct radeon_winsys base; struct pipe_reference reference; + struct pb_cache bo_cache; + struct pb_slabs bo_slabs; amdgpu_device_handle dev; @@ -51,19 +58,27 @@ struct amdgpu_winsys { uint32_t next_bo_unique_id; uint64_t allocated_vram; uint64_t allocated_gtt; + uint64_t mapped_vram; + uint64_t mapped_gtt; uint64_t buffer_wait_time; /* time spent in buffer_wait in ns */ uint64_t num_cs_flushes; - unsigned gart_page_size; struct radeon_info info; - struct pb_manager *kman; - struct pb_manager *cman; + /* multithreaded IB submission */ + struct util_queue cs_queue; struct amdgpu_gpu_info amdinfo; ADDR_HANDLE addrlib; uint32_t rev_id; unsigned family; + + bool check_vm; + + /* List of all allocated buffers */ + pipe_mutex global_bo_list_lock; + struct list_head global_bo_list; + unsigned num_buffers; }; static inline struct amdgpu_winsys * |