author | Jonathan Gray <jsg@cvs.openbsd.org> | 2018-10-23 06:36:00 +0000
committer | Jonathan Gray <jsg@cvs.openbsd.org> | 2018-10-23 06:36:00 +0000
commit | b65fcab046d3a1b6b6ac315720df220925c5322e (patch)
tree | ff73dcc383ac0799c655ff6194cda9dacb75dde9 /lib/mesa/src/gallium/winsys/amdgpu
parent | 18d6381c51e253e4c41c62619f80d9ce745b95c8 (diff)
Merge Mesa 17.3.9
Mesa 18.x needs an ld with build-id support for at least the Intel code.
Mesa 18.2 assumes Linux-only memfd syscalls in the Intel code.
Tested by matthieu@, kettenis@ and myself on a variety of hardware and
architectures. ok kettenis@
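For context, memfd_create(2) is a Linux-specific syscall that creates anonymous, fd-backed memory, and OpenBSD does not provide it; that is what the second point above refers to. The fragment below is only an illustration of that kind of call, not the actual Mesa/Intel code path:

```c
/*
 * Illustrative only: the kind of Linux-only memfd call the commit
 * message refers to.  This is not the actual Mesa/Intel code.
 */
#define _GNU_SOURCE
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
#if defined(__linux__) && defined(SYS_memfd_create)
   /* memfd_create(2): anonymous, fd-backed memory; no OpenBSD equivalent. */
   int fd = (int)syscall(SYS_memfd_create, "demo", 0);
   if (fd < 0) {
      perror("memfd_create");
      return 1;
   }
   if (ftruncate(fd, 4096) != 0) {   /* size the region */
      perror("ftruncate");
      close(fd);
      return 1;
   }
   close(fd);
   return 0;
#else
   fprintf(stderr, "memfd_create(2) is not available on this platform\n");
   return 1;
#endif
}
```

On Linux the program creates and sizes an anonymous memory fd; anywhere else it only reports that the syscall is unavailable.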
Diffstat (limited to 'lib/mesa/src/gallium/winsys/amdgpu')
-rw-r--r-- | lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.am | 4
-rw-r--r-- | lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.in | 81
-rw-r--r-- | lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c | 732
-rw-r--r-- | lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h | 40
-rw-r--r-- | lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 895
-rw-r--r-- | lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h | 63
-rw-r--r-- | lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_public.h | 7
-rw-r--r-- | lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c | 569
-rw-r--r-- | lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c | 391
-rw-r--r-- | lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h | 22
10 files changed, 1471 insertions, 1333 deletions
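Most of the growth in amdgpu_bo.c below comes from sparse (PRT) buffer support: a large virtual address range whose pages are committed on demand out of smaller backing buffers, each of which tracks its free pages as a sorted list of [begin, end) chunks. The standalone sketch below mirrors that merge-on-free bookkeeping (compare sparse_backing_free() in the diff); the names and the fixed-size array are hypothetical, and none of the winsys locking or fence handling is included:

```c
/*
 * Standalone sketch of the free-range bookkeeping used for sparse backing
 * buffers: a sorted array of half-open [begin, end) page chunks, merged on
 * free.  Names are hypothetical; the diff's sparse_backing_free() does the
 * same thing on struct amdgpu_sparse_backing.
 */
#include <assert.h>
#include <stdio.h>
#include <string.h>

#define MAX_CHUNKS 16

struct chunk { unsigned begin, end; };

static struct chunk chunks[MAX_CHUNKS];
static unsigned num_chunks;

/* Return pages [start, start+num) to the free list, merging neighbours. */
static void free_range(unsigned start, unsigned num)
{
   unsigned end = start + num;
   unsigned low = 0, high = num_chunks;

   /* Binary search for the first chunk with begin >= start. */
   while (low < high) {
      unsigned mid = low + (high - low) / 2;
      if (chunks[mid].begin >= start)
         high = mid;
      else
         low = mid + 1;
   }

   if (low > 0 && chunks[low - 1].end == start) {
      /* Extend the previous chunk; maybe fuse it with the next one too. */
      chunks[low - 1].end = end;
      if (low < num_chunks && end == chunks[low].begin) {
         chunks[low - 1].end = chunks[low].end;
         memmove(&chunks[low], &chunks[low + 1],
                 sizeof(chunks[0]) * (num_chunks - low - 1));
         num_chunks--;
      }
   } else if (low < num_chunks && end == chunks[low].begin) {
      /* Grow the following chunk downwards. */
      chunks[low].begin = start;
   } else {
      /* Insert a new chunk, keeping the array sorted. */
      assert(num_chunks < MAX_CHUNKS);
      memmove(&chunks[low + 1], &chunks[low],
              sizeof(chunks[0]) * (num_chunks - low));
      chunks[low].begin = start;
      chunks[low].end = end;
      num_chunks++;
   }
}

int main(void)
{
   free_range(0, 4);    /* [0,4)            */
   free_range(8, 4);    /* [0,4) [8,12)     */
   free_range(4, 4);    /* merges to [0,12) */
   for (unsigned i = 0; i < num_chunks; i++)
      printf("[%u, %u)\n", chunks[i].begin, chunks[i].end);
   return 0;
}
```

Freeing [4, 8) between two existing chunks collapses the list back to a single [0, 12) range; in the real code, a single chunk covering the whole backing buffer is the condition for releasing that backing buffer entirely.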
diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.am b/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.am index 543325cc2..0889591f2 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.am +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.am @@ -10,5 +10,7 @@ AM_CXXFLAGS = $(AM_CFLAGS) noinst_LTLIBRARIES = libamdgpuwinsys.la -libamdgpuwinsys_la_LIBADD = $(top_builddir)/src/amd/addrlib/libamdgpu_addrlib.la +libamdgpuwinsys_la_LIBADD = \ + $(top_builddir)/src/amd/addrlib/libamdgpu_addrlib.la + libamdgpuwinsys_la_SOURCES = $(C_SOURCES) diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.in b/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.in index 5e197a855..80be6b08b 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.in +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.in @@ -54,16 +54,20 @@ target_triplet = @target@ DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \ $(srcdir)/Makefile.sources $(top_srcdir)/bin/depcomp \ $(top_srcdir)/src/gallium/Automake.inc -@HAVE_DRISW_TRUE@am__append_1 = \ +@HAVE_LIBDRM_TRUE@am__append_1 = \ +@HAVE_LIBDRM_TRUE@ $(LIBDRM_LIBS) + +@HAVE_DRISW_TRUE@am__append_2 = \ @HAVE_DRISW_TRUE@ $(top_builddir)/src/gallium/winsys/sw/dri/libswdri.la -@HAVE_DRISW_KMS_TRUE@am__append_2 = \ +@HAVE_DRISW_KMS_TRUE@am__append_3 = \ @HAVE_DRISW_KMS_TRUE@ $(top_builddir)/src/gallium/winsys/sw/kms-dri/libswkmsdri.la \ @HAVE_DRISW_KMS_TRUE@ $(LIBDRM_LIBS) subdir = src/gallium/winsys/amdgpu/drm ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/m4/ax_check_gnu_make.m4 \ +am__aclocal_m4_deps = $(top_srcdir)/m4/ax_check_compile_flag.m4 \ + $(top_srcdir)/m4/ax_check_gnu_make.m4 \ $(top_srcdir)/m4/ax_check_python_mako_module.m4 \ $(top_srcdir)/m4/ax_gcc_builtin.m4 \ $(top_srcdir)/m4/ax_gcc_func_attribute.m4 \ @@ -138,6 +142,8 @@ AMDGPU_CFLAGS = @AMDGPU_CFLAGS@ AMDGPU_LIBS = @AMDGPU_LIBS@ AMTAR = @AMTAR@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ +ANDROID_CFLAGS = @ANDROID_CFLAGS@ +ANDROID_LIBS = @ANDROID_LIBS@ AR = @AR@ AUTOCONF = @AUTOCONF@ AUTOHEADER = @AUTOHEADER@ @@ -168,8 +174,6 @@ DLLTOOL = @DLLTOOL@ DLOPEN_LIBS = @DLOPEN_LIBS@ DRI2PROTO_CFLAGS = @DRI2PROTO_CFLAGS@ DRI2PROTO_LIBS = @DRI2PROTO_LIBS@ -DRI3PROTO_CFLAGS = @DRI3PROTO_CFLAGS@ -DRI3PROTO_LIBS = @DRI3PROTO_LIBS@ DRIGL_CFLAGS = @DRIGL_CFLAGS@ DRIGL_LIBS = @DRIGL_LIBS@ DRI_DRIVER_INSTALL_DIR = @DRI_DRIVER_INSTALL_DIR@ @@ -182,10 +186,11 @@ ECHO_C = @ECHO_C@ ECHO_N = @ECHO_N@ ECHO_T = @ECHO_T@ EGL_CFLAGS = @EGL_CFLAGS@ -EGL_CLIENT_APIS = @EGL_CLIENT_APIS@ EGL_LIB_DEPS = @EGL_LIB_DEPS@ EGL_NATIVE_PLATFORM = @EGL_NATIVE_PLATFORM@ EGREP = @EGREP@ +ETNAVIV_CFLAGS = @ETNAVIV_CFLAGS@ +ETNAVIV_LIBS = @ETNAVIV_LIBS@ EXEEXT = @EXEEXT@ EXPAT_CFLAGS = @EXPAT_CFLAGS@ EXPAT_LIBS = @EXPAT_LIBS@ @@ -212,6 +217,8 @@ GL_PC_LIB_PRIV = @GL_PC_LIB_PRIV@ GL_PC_REQ_PRIV = @GL_PC_REQ_PRIV@ GREP = @GREP@ HAVE_XF86VIDMODE = @HAVE_XF86VIDMODE@ +I915_CFLAGS = @I915_CFLAGS@ +I915_LIBS = @I915_LIBS@ INDENT = @INDENT@ INDENT_FLAGS = @INDENT_FLAGS@ INSTALL = @INSTALL@ @@ -219,45 +226,40 @@ INSTALL_DATA = @INSTALL_DATA@ INSTALL_PROGRAM = @INSTALL_PROGRAM@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ -INTEL_CFLAGS = @INTEL_CFLAGS@ -INTEL_LIBS = @INTEL_LIBS@ LD = @LD@ LDFLAGS = @LDFLAGS@ LD_NO_UNDEFINED = @LD_NO_UNDEFINED@ LEX = @LEX@ LEXLIB = @LEXLIB@ LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LIBATOMIC_LIBS = @LIBATOMIC_LIBS@ LIBCLC_INCLUDEDIR = @LIBCLC_INCLUDEDIR@ LIBCLC_LIBEXECDIR = @LIBCLC_LIBEXECDIR@ LIBDRM_CFLAGS = @LIBDRM_CFLAGS@ 
LIBDRM_LIBS = @LIBDRM_LIBS@ LIBELF_CFLAGS = @LIBELF_CFLAGS@ LIBELF_LIBS = @LIBELF_LIBS@ +LIBGLVND_DATADIR = @LIBGLVND_DATADIR@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ -LIBSENSORS_LDFLAGS = @LIBSENSORS_LDFLAGS@ -LIBSHA1_CFLAGS = @LIBSHA1_CFLAGS@ -LIBSHA1_LIBS = @LIBSHA1_LIBS@ +LIBSENSORS_LIBS = @LIBSENSORS_LIBS@ LIBTOOL = @LIBTOOL@ +LIBUNWIND_CFLAGS = @LIBUNWIND_CFLAGS@ +LIBUNWIND_LIBS = @LIBUNWIND_LIBS@ LIB_DIR = @LIB_DIR@ LIB_EXT = @LIB_EXT@ LIPO = @LIPO@ -LLVM_BINDIR = @LLVM_BINDIR@ LLVM_CFLAGS = @LLVM_CFLAGS@ LLVM_CONFIG = @LLVM_CONFIG@ -LLVM_CPPFLAGS = @LLVM_CPPFLAGS@ LLVM_CXXFLAGS = @LLVM_CXXFLAGS@ LLVM_INCLUDEDIR = @LLVM_INCLUDEDIR@ LLVM_LDFLAGS = @LLVM_LDFLAGS@ -LLVM_LIBDIR = @LLVM_LIBDIR@ LLVM_LIBS = @LLVM_LIBS@ -LLVM_VERSION = @LLVM_VERSION@ LN_S = @LN_S@ LTLIBOBJS = @LTLIBOBJS@ MAINT = @MAINT@ MAKEINFO = @MAKEINFO@ MANIFEST_TOOL = @MANIFEST_TOOL@ -MESA_LLVM = @MESA_LLVM@ MKDIR_P = @MKDIR_P@ MSVC2013_COMPAT_CFLAGS = @MSVC2013_COMPAT_CFLAGS@ MSVC2013_COMPAT_CXXFLAGS = @MSVC2013_COMPAT_CXXFLAGS@ @@ -273,13 +275,11 @@ NVVIEUX_CFLAGS = @NVVIEUX_CFLAGS@ NVVIEUX_LIBS = @NVVIEUX_LIBS@ OBJDUMP = @OBJDUMP@ OBJEXT = @OBJEXT@ -OMX_CFLAGS = @OMX_CFLAGS@ -OMX_LIBS = @OMX_LIBS@ -OMX_LIB_INSTALL_DIR = @OMX_LIB_INSTALL_DIR@ +OMX_BELLAGIO_CFLAGS = @OMX_BELLAGIO_CFLAGS@ +OMX_BELLAGIO_LIBS = @OMX_BELLAGIO_LIBS@ +OMX_BELLAGIO_LIB_INSTALL_DIR = @OMX_BELLAGIO_LIB_INSTALL_DIR@ OPENCL_LIBNAME = @OPENCL_LIBNAME@ OPENCL_VERSION = @OPENCL_VERSION@ -OPENSSL_CFLAGS = @OPENSSL_CFLAGS@ -OPENSSL_LIBS = @OPENSSL_LIBS@ OSMESA_LIB = @OSMESA_LIB@ OSMESA_LIB_DEPS = @OSMESA_LIB_DEPS@ OSMESA_PC_LIB_PRIV = @OSMESA_PC_LIB_PRIV@ @@ -299,8 +299,6 @@ PKG_CONFIG = @PKG_CONFIG@ PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@ PKG_CONFIG_PATH = @PKG_CONFIG_PATH@ POSIX_SHELL = @POSIX_SHELL@ -PRESENTPROTO_CFLAGS = @PRESENTPROTO_CFLAGS@ -PRESENTPROTO_LIBS = @PRESENTPROTO_LIBS@ PTHREADSTUBS_CFLAGS = @PTHREADSTUBS_CFLAGS@ PTHREADSTUBS_LIBS = @PTHREADSTUBS_LIBS@ PTHREAD_CC = @PTHREAD_CC@ @@ -316,8 +314,6 @@ SED = @SED@ SELINUX_CFLAGS = @SELINUX_CFLAGS@ SELINUX_LIBS = @SELINUX_LIBS@ SET_MAKE = @SET_MAKE@ -SHA1_CFLAGS = @SHA1_CFLAGS@ -SHA1_LIBS = @SHA1_LIBS@ SHELL = @SHELL@ SIMPENROSE_CFLAGS = @SIMPENROSE_CFLAGS@ SIMPENROSE_LIBS = @SIMPENROSE_LIBS@ @@ -326,7 +322,8 @@ STRIP = @STRIP@ SWR_AVX2_CXXFLAGS = @SWR_AVX2_CXXFLAGS@ SWR_AVX_CXXFLAGS = @SWR_AVX_CXXFLAGS@ SWR_CXX11_CXXFLAGS = @SWR_CXX11_CXXFLAGS@ -TIMESTAMP_CMD = @TIMESTAMP_CMD@ +SWR_KNL_CXXFLAGS = @SWR_KNL_CXXFLAGS@ +SWR_SKX_CXXFLAGS = @SWR_SKX_CXXFLAGS@ VALGRIND_CFLAGS = @VALGRIND_CFLAGS@ VALGRIND_LIBS = @VALGRIND_LIBS@ VA_CFLAGS = @VA_CFLAGS@ @@ -334,25 +331,28 @@ VA_LIBS = @VA_LIBS@ VA_LIB_INSTALL_DIR = @VA_LIB_INSTALL_DIR@ VA_MAJOR = @VA_MAJOR@ VA_MINOR = @VA_MINOR@ -VC4_CFLAGS = @VC4_CFLAGS@ -VC4_LIBS = @VC4_LIBS@ +VC5_SIMULATOR_CFLAGS = @VC5_SIMULATOR_CFLAGS@ +VC5_SIMULATOR_LIBS = @VC5_SIMULATOR_LIBS@ VDPAU_CFLAGS = @VDPAU_CFLAGS@ VDPAU_LIBS = @VDPAU_LIBS@ VDPAU_LIB_INSTALL_DIR = @VDPAU_LIB_INSTALL_DIR@ VDPAU_MAJOR = @VDPAU_MAJOR@ VDPAU_MINOR = @VDPAU_MINOR@ VERSION = @VERSION@ -VG_LIB_DEPS = @VG_LIB_DEPS@ VISIBILITY_CFLAGS = @VISIBILITY_CFLAGS@ VISIBILITY_CXXFLAGS = @VISIBILITY_CXXFLAGS@ VL_CFLAGS = @VL_CFLAGS@ VL_LIBS = @VL_LIBS@ VULKAN_ICD_INSTALL_DIR = @VULKAN_ICD_INSTALL_DIR@ -WAYLAND_CFLAGS = @WAYLAND_CFLAGS@ -WAYLAND_LIBS = @WAYLAND_LIBS@ +WAYLAND_CLIENT_CFLAGS = @WAYLAND_CLIENT_CFLAGS@ +WAYLAND_CLIENT_LIBS = @WAYLAND_CLIENT_LIBS@ +WAYLAND_PROTOCOLS_DATADIR = @WAYLAND_PROTOCOLS_DATADIR@ WAYLAND_SCANNER = @WAYLAND_SCANNER@ WAYLAND_SCANNER_CFLAGS = @WAYLAND_SCANNER_CFLAGS@ 
WAYLAND_SCANNER_LIBS = @WAYLAND_SCANNER_LIBS@ +WAYLAND_SERVER_CFLAGS = @WAYLAND_SERVER_CFLAGS@ +WAYLAND_SERVER_LIBS = @WAYLAND_SERVER_LIBS@ +WNO_OVERRIDE_INIT = @WNO_OVERRIDE_INIT@ X11_INCLUDES = @X11_INCLUDES@ XA_MAJOR = @XA_MAJOR@ XA_MINOR = @XA_MINOR@ @@ -371,9 +371,10 @@ XVMC_LIBS = @XVMC_LIBS@ XVMC_LIB_INSTALL_DIR = @XVMC_LIB_INSTALL_DIR@ XVMC_MAJOR = @XVMC_MAJOR@ XVMC_MINOR = @XVMC_MINOR@ -XXD = @XXD@ YACC = @YACC@ YFLAGS = @YFLAGS@ +ZLIB_CFLAGS = @ZLIB_CFLAGS@ +ZLIB_LIBS = @ZLIB_LIBS@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ @@ -486,17 +487,15 @@ GALLIUM_TARGET_CFLAGS = \ -I$(top_srcdir)/src/gallium/auxiliary \ -I$(top_srcdir)/src/gallium/drivers \ -I$(top_srcdir)/src/gallium/winsys \ + -I$(top_builddir)/src/util/ \ + -I$(top_builddir)/src/gallium/drivers/ \ $(DEFINES) \ $(PTHREAD_CFLAGS) \ $(LIBDRM_CFLAGS) \ $(VISIBILITY_CFLAGS) -GALLIUM_COMMON_LIB_DEPS = \ - -lm \ - $(CLOCK_LIB) \ - $(PTHREAD_LIBS) \ - $(DLOPEN_LIBS) - +GALLIUM_COMMON_LIB_DEPS = -lm $(LIBUNWIND_LIBS) $(LIBSENSORS_LIBS) \ + $(CLOCK_LIB) $(PTHREAD_LIBS) $(DLOPEN_LIBS) $(am__append_1) GALLIUM_WINSYS_CFLAGS = \ -I$(top_srcdir)/src \ -I$(top_srcdir)/include \ @@ -508,7 +507,7 @@ GALLIUM_WINSYS_CFLAGS = \ GALLIUM_PIPE_LOADER_WINSYS_LIBS = \ $(top_builddir)/src/gallium/winsys/sw/null/libws_null.la \ $(top_builddir)/src/gallium/winsys/sw/wrapper/libwsw.la \ - $(am__append_1) $(am__append_2) + $(am__append_2) $(am__append_3) AM_CFLAGS = \ $(GALLIUM_WINSYS_CFLAGS) \ $(AMDGPU_CFLAGS) \ @@ -516,7 +515,9 @@ AM_CFLAGS = \ AM_CXXFLAGS = $(AM_CFLAGS) noinst_LTLIBRARIES = libamdgpuwinsys.la -libamdgpuwinsys_la_LIBADD = $(top_builddir)/src/amd/addrlib/libamdgpu_addrlib.la +libamdgpuwinsys_la_LIBADD = \ + $(top_builddir)/src/amd/addrlib/libamdgpu_addrlib.la + libamdgpuwinsys_la_SOURCES = $(C_SOURCES) all: all-am diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c index e7ea51978..2a2fe25a8 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c @@ -38,6 +38,17 @@ #include <stdio.h> #include <inttypes.h> +#ifndef AMDGPU_GEM_CREATE_VM_ALWAYS_VALID +#define AMDGPU_GEM_CREATE_VM_ALWAYS_VALID (1 << 6) +#endif + +/* Set to 1 for verbose output showing committed sparse buffer ranges. */ +#define DEBUG_SPARSE_COMMITS 0 + +struct amdgpu_sparse_backing_chunk { + uint32_t begin, end; +}; + static struct pb_buffer * amdgpu_bo_create(struct radeon_winsys *rws, uint64_t size, @@ -83,7 +94,7 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout, unsigned idle_fences; bool buffer_idle; - pipe_mutex_lock(ws->bo_fence_lock); + mtx_lock(&ws->bo_fence_lock); for (idle_fences = 0; idle_fences < bo->num_fences; ++idle_fences) { if (!amdgpu_fence_wait(bo->fences[idle_fences], 0, false)) @@ -99,13 +110,13 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout, bo->num_fences -= idle_fences; buffer_idle = !bo->num_fences; - pipe_mutex_unlock(ws->bo_fence_lock); + mtx_unlock(&ws->bo_fence_lock); return buffer_idle; } else { bool buffer_idle = true; - pipe_mutex_lock(ws->bo_fence_lock); + mtx_lock(&ws->bo_fence_lock); while (bo->num_fences && buffer_idle) { struct pipe_fence_handle *fence = NULL; bool fence_idle = false; @@ -113,12 +124,12 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout, amdgpu_fence_reference(&fence, bo->fences[0]); /* Wait for the fence. 
*/ - pipe_mutex_unlock(ws->bo_fence_lock); + mtx_unlock(&ws->bo_fence_lock); if (amdgpu_fence_wait(fence, abs_timeout, true)) fence_idle = true; else buffer_idle = false; - pipe_mutex_lock(ws->bo_fence_lock); + mtx_lock(&ws->bo_fence_lock); /* Release an idle fence to avoid checking it again later, keeping in * mind that the fence array may have been modified by other threads. @@ -132,7 +143,7 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout, amdgpu_fence_reference(&fence, NULL); } - pipe_mutex_unlock(ws->bo_fence_lock); + mtx_unlock(&ws->bo_fence_lock); return buffer_idle; } @@ -160,10 +171,12 @@ void amdgpu_bo_destroy(struct pb_buffer *_buf) assert(bo->bo && "must not be called for slab entries"); - pipe_mutex_lock(bo->ws->global_bo_list_lock); - LIST_DEL(&bo->u.real.global_list_item); - bo->ws->num_buffers--; - pipe_mutex_unlock(bo->ws->global_bo_list_lock); + if (bo->ws->debug_all_bos) { + mtx_lock(&bo->ws->global_bo_list_lock); + LIST_DEL(&bo->u.real.global_list_item); + bo->ws->num_buffers--; + mtx_unlock(&bo->ws->global_bo_list_lock); + } amdgpu_bo_va_op(bo->bo, 0, bo->base.size, bo->va, 0, AMDGPU_VA_OP_UNMAP); amdgpu_va_range_free(bo->u.real.va_handle); @@ -181,6 +194,7 @@ void amdgpu_bo_destroy(struct pb_buffer *_buf) bo->ws->mapped_vram -= bo->base.size; else if (bo->initial_domain & RADEON_DOMAIN_GTT) bo->ws->mapped_gtt -= bo->base.size; + bo->ws->num_mapped_buffers--; } FREE(bo); @@ -209,6 +223,8 @@ static void *amdgpu_bo_map(struct pb_buffer *buf, void *cpu = NULL; uint64_t offset = 0; + assert(!bo->sparse); + /* If it's not unsynchronized bo_map, flush CS if needed and then wait. */ if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) { /* DONTBLOCK doesn't make sense with UNSYNCHRONIZED. */ @@ -311,6 +327,7 @@ static void *amdgpu_bo_map(struct pb_buffer *buf, real->ws->mapped_vram += real->base.size; else if (real->initial_domain & RADEON_DOMAIN_GTT) real->ws->mapped_gtt += real->base.size; + real->ws->num_mapped_buffers++; } return (uint8_t*)cpu + offset; } @@ -320,6 +337,8 @@ static void amdgpu_bo_unmap(struct pb_buffer *buf) struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf; struct amdgpu_winsys_bo *real; + assert(!bo->sparse); + if (bo->user_ptr) return; @@ -330,6 +349,7 @@ static void amdgpu_bo_unmap(struct pb_buffer *buf) real->ws->mapped_vram -= real->base.size; else if (real->initial_domain & RADEON_DOMAIN_GTT) real->ws->mapped_gtt -= real->base.size; + real->ws->num_mapped_buffers--; } amdgpu_bo_cpu_unmap(real->bo); @@ -346,10 +366,12 @@ static void amdgpu_add_buffer_to_global_list(struct amdgpu_winsys_bo *bo) assert(bo->bo); - pipe_mutex_lock(ws->global_bo_list_lock); - LIST_ADDTAIL(&bo->u.real.global_list_item, &ws->global_bo_list); - ws->num_buffers++; - pipe_mutex_unlock(ws->global_bo_list_lock); + if (ws->debug_all_bos) { + mtx_lock(&ws->global_bo_list_lock); + LIST_ADDTAIL(&bo->u.real.global_list_item, &ws->global_bo_list); + ws->num_buffers++; + mtx_unlock(&ws->global_bo_list_lock); + } } static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws, @@ -384,12 +406,14 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws, if (initial_domain & RADEON_DOMAIN_GTT) request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT; - if (flags & RADEON_FLAG_CPU_ACCESS) - request.flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; if (flags & RADEON_FLAG_NO_CPU_ACCESS) request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS; if (flags & RADEON_FLAG_GTT_WC) request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC; + /* TODO: Enable this once 
the kernel handles it efficiently. */ + /*if (flags & RADEON_FLAG_NO_INTERPROCESS_SHARING && + ws->info.drm_minor >= 20) + request.flags |= AMDGPU_GEM_CREATE_VM_ALWAYS_VALID;*/ r = amdgpu_bo_alloc(ws->dev, &request, &buf_handle); if (r) { @@ -401,6 +425,8 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws, } va_gap_size = ws->check_vm ? MAX2(4 * alignment, 64 * 1024) : 0; + if (size > ws->info.pte_fragment_size) + alignment = MAX2(alignment, ws->info.pte_fragment_size); r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general, size + va_gap_size, alignment, 0, &va, &va_handle, 0); if (r) @@ -421,6 +447,7 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws, bo->u.real.va_handle = va_handle; bo->initial_domain = initial_domain; bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1); + bo->is_local = !!(request.flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID); if (initial_domain & RADEON_DOMAIN_VRAM) ws->allocated_vram += align64(size, ws->info.gart_page_size); @@ -481,33 +508,16 @@ struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap, { struct amdgpu_winsys *ws = priv; struct amdgpu_slab *slab = CALLOC_STRUCT(amdgpu_slab); - enum radeon_bo_domain domains; - enum radeon_bo_flag flags = 0; + enum radeon_bo_domain domains = radeon_domain_from_heap(heap); + enum radeon_bo_flag flags = radeon_flags_from_heap(heap); uint32_t base_id; if (!slab) return NULL; - if (heap & 1) - flags |= RADEON_FLAG_GTT_WC; - if (heap & 2) - flags |= RADEON_FLAG_CPU_ACCESS; - - switch (heap >> 2) { - case 0: - domains = RADEON_DOMAIN_VRAM; - break; - default: - case 1: - domains = RADEON_DOMAIN_VRAM_GTT; - break; - case 2: - domains = RADEON_DOMAIN_GTT; - break; - } - + unsigned slab_size = 1 << AMDGPU_SLAB_BO_SIZE_LOG2; slab->buffer = amdgpu_winsys_bo(amdgpu_bo_create(&ws->base, - 64 * 1024, 64 * 1024, + slab_size, slab_size, domains, flags)); if (!slab->buffer) goto fail; @@ -563,6 +573,462 @@ void amdgpu_bo_slab_free(void *priv, struct pb_slab *pslab) FREE(slab); } +#if DEBUG_SPARSE_COMMITS +static void +sparse_dump(struct amdgpu_winsys_bo *bo, const char *func) +{ + fprintf(stderr, "%s: %p (size=%"PRIu64", num_va_pages=%u) @ %s\n" + "Commitments:\n", + __func__, bo, bo->base.size, bo->u.sparse.num_va_pages, func); + + struct amdgpu_sparse_backing *span_backing = NULL; + uint32_t span_first_backing_page = 0; + uint32_t span_first_va_page = 0; + uint32_t va_page = 0; + + for (;;) { + struct amdgpu_sparse_backing *backing = 0; + uint32_t backing_page = 0; + + if (va_page < bo->u.sparse.num_va_pages) { + backing = bo->u.sparse.commitments[va_page].backing; + backing_page = bo->u.sparse.commitments[va_page].page; + } + + if (span_backing && + (backing != span_backing || + backing_page != span_first_backing_page + (va_page - span_first_va_page))) { + fprintf(stderr, " %u..%u: backing=%p:%u..%u\n", + span_first_va_page, va_page - 1, span_backing, + span_first_backing_page, + span_first_backing_page + (va_page - span_first_va_page) - 1); + + span_backing = NULL; + } + + if (va_page >= bo->u.sparse.num_va_pages) + break; + + if (backing && !span_backing) { + span_backing = backing; + span_first_backing_page = backing_page; + span_first_va_page = va_page; + } + + va_page++; + } + + fprintf(stderr, "Backing:\n"); + + list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) { + fprintf(stderr, " %p (size=%"PRIu64")\n", backing, backing->bo->base.size); + for (unsigned i = 0; i < backing->num_chunks; ++i) + fprintf(stderr, " 
%u..%u\n", backing->chunks[i].begin, backing->chunks[i].end); + } +} +#endif + +/* + * Attempt to allocate the given number of backing pages. Fewer pages may be + * allocated (depending on the fragmentation of existing backing buffers), + * which will be reflected by a change to *pnum_pages. + */ +static struct amdgpu_sparse_backing * +sparse_backing_alloc(struct amdgpu_winsys_bo *bo, uint32_t *pstart_page, uint32_t *pnum_pages) +{ + struct amdgpu_sparse_backing *best_backing; + unsigned best_idx; + uint32_t best_num_pages; + + best_backing = NULL; + best_idx = 0; + best_num_pages = 0; + + /* This is a very simple and inefficient best-fit algorithm. */ + list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) { + for (unsigned idx = 0; idx < backing->num_chunks; ++idx) { + uint32_t cur_num_pages = backing->chunks[idx].end - backing->chunks[idx].begin; + if ((best_num_pages < *pnum_pages && cur_num_pages > best_num_pages) || + (best_num_pages > *pnum_pages && cur_num_pages < best_num_pages)) { + best_backing = backing; + best_idx = idx; + best_num_pages = cur_num_pages; + } + } + } + + /* Allocate a new backing buffer if necessary. */ + if (!best_backing) { + struct pb_buffer *buf; + uint64_t size; + uint32_t pages; + + best_backing = CALLOC_STRUCT(amdgpu_sparse_backing); + if (!best_backing) + return NULL; + + best_backing->max_chunks = 4; + best_backing->chunks = CALLOC(best_backing->max_chunks, + sizeof(*best_backing->chunks)); + if (!best_backing->chunks) { + FREE(best_backing); + return NULL; + } + + assert(bo->u.sparse.num_backing_pages < DIV_ROUND_UP(bo->base.size, RADEON_SPARSE_PAGE_SIZE)); + + size = MIN3(bo->base.size / 16, + 8 * 1024 * 1024, + bo->base.size - (uint64_t)bo->u.sparse.num_backing_pages * RADEON_SPARSE_PAGE_SIZE); + size = MAX2(size, RADEON_SPARSE_PAGE_SIZE); + + buf = amdgpu_bo_create(&bo->ws->base, size, RADEON_SPARSE_PAGE_SIZE, + bo->initial_domain, + bo->u.sparse.flags | RADEON_FLAG_NO_SUBALLOC); + if (!buf) { + FREE(best_backing->chunks); + FREE(best_backing); + return NULL; + } + + /* We might have gotten a bigger buffer than requested via caching. 
*/ + pages = buf->size / RADEON_SPARSE_PAGE_SIZE; + + best_backing->bo = amdgpu_winsys_bo(buf); + best_backing->num_chunks = 1; + best_backing->chunks[0].begin = 0; + best_backing->chunks[0].end = pages; + + list_add(&best_backing->list, &bo->u.sparse.backing); + bo->u.sparse.num_backing_pages += pages; + + best_idx = 0; + best_num_pages = pages; + } + + *pnum_pages = MIN2(*pnum_pages, best_num_pages); + *pstart_page = best_backing->chunks[best_idx].begin; + best_backing->chunks[best_idx].begin += *pnum_pages; + + if (best_backing->chunks[best_idx].begin >= best_backing->chunks[best_idx].end) { + memmove(&best_backing->chunks[best_idx], &best_backing->chunks[best_idx + 1], + sizeof(*best_backing->chunks) * (best_backing->num_chunks - best_idx - 1)); + best_backing->num_chunks--; + } + + return best_backing; +} + +static void +sparse_free_backing_buffer(struct amdgpu_winsys_bo *bo, + struct amdgpu_sparse_backing *backing) +{ + struct amdgpu_winsys *ws = backing->bo->ws; + + bo->u.sparse.num_backing_pages -= backing->bo->base.size / RADEON_SPARSE_PAGE_SIZE; + + mtx_lock(&ws->bo_fence_lock); + amdgpu_add_fences(backing->bo, bo->num_fences, bo->fences); + mtx_unlock(&ws->bo_fence_lock); + + list_del(&backing->list); + amdgpu_winsys_bo_reference(&backing->bo, NULL); + FREE(backing->chunks); + FREE(backing); +} + +/* + * Return a range of pages from the given backing buffer back into the + * free structure. + */ +static bool +sparse_backing_free(struct amdgpu_winsys_bo *bo, + struct amdgpu_sparse_backing *backing, + uint32_t start_page, uint32_t num_pages) +{ + uint32_t end_page = start_page + num_pages; + unsigned low = 0; + unsigned high = backing->num_chunks; + + /* Find the first chunk with begin >= start_page. */ + while (low < high) { + unsigned mid = low + (high - low) / 2; + + if (backing->chunks[mid].begin >= start_page) + high = mid; + else + low = mid + 1; + } + + assert(low >= backing->num_chunks || end_page <= backing->chunks[low].begin); + assert(low == 0 || backing->chunks[low - 1].end <= start_page); + + if (low > 0 && backing->chunks[low - 1].end == start_page) { + backing->chunks[low - 1].end = end_page; + + if (low < backing->num_chunks && end_page == backing->chunks[low].begin) { + backing->chunks[low - 1].end = backing->chunks[low].end; + memmove(&backing->chunks[low], &backing->chunks[low + 1], + sizeof(*backing->chunks) * (backing->num_chunks - low - 1)); + backing->num_chunks--; + } + } else if (low < backing->num_chunks && end_page == backing->chunks[low].begin) { + backing->chunks[low].begin = start_page; + } else { + if (backing->num_chunks >= backing->max_chunks) { + unsigned new_max_chunks = 2 * backing->max_chunks; + struct amdgpu_sparse_backing_chunk *new_chunks = + REALLOC(backing->chunks, + sizeof(*backing->chunks) * backing->max_chunks, + sizeof(*backing->chunks) * new_max_chunks); + if (!new_chunks) + return false; + + backing->max_chunks = new_max_chunks; + backing->chunks = new_chunks; + } + + memmove(&backing->chunks[low + 1], &backing->chunks[low], + sizeof(*backing->chunks) * (backing->num_chunks - low)); + backing->chunks[low].begin = start_page; + backing->chunks[low].end = end_page; + backing->num_chunks++; + } + + if (backing->num_chunks == 1 && backing->chunks[0].begin == 0 && + backing->chunks[0].end == backing->bo->base.size / RADEON_SPARSE_PAGE_SIZE) + sparse_free_backing_buffer(bo, backing); + + return true; +} + +static void amdgpu_bo_sparse_destroy(struct pb_buffer *_buf) +{ + struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); + int r; + + 
assert(!bo->bo && bo->sparse); + + r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0, + (uint64_t)bo->u.sparse.num_va_pages * RADEON_SPARSE_PAGE_SIZE, + bo->va, 0, AMDGPU_VA_OP_CLEAR); + if (r) { + fprintf(stderr, "amdgpu: clearing PRT VA region on destroy failed (%d)\n", r); + } + + while (!list_empty(&bo->u.sparse.backing)) { + struct amdgpu_sparse_backing *dummy = NULL; + sparse_free_backing_buffer(bo, + container_of(bo->u.sparse.backing.next, + dummy, list)); + } + + amdgpu_va_range_free(bo->u.sparse.va_handle); + mtx_destroy(&bo->u.sparse.commit_lock); + FREE(bo->u.sparse.commitments); + FREE(bo); +} + +static const struct pb_vtbl amdgpu_winsys_bo_sparse_vtbl = { + amdgpu_bo_sparse_destroy + /* other functions are never called */ +}; + +static struct pb_buffer * +amdgpu_bo_sparse_create(struct amdgpu_winsys *ws, uint64_t size, + enum radeon_bo_domain domain, + enum radeon_bo_flag flags) +{ + struct amdgpu_winsys_bo *bo; + uint64_t map_size; + uint64_t va_gap_size; + int r; + + /* We use 32-bit page numbers; refuse to attempt allocating sparse buffers + * that exceed this limit. This is not really a restriction: we don't have + * that much virtual address space anyway. + */ + if (size > (uint64_t)INT32_MAX * RADEON_SPARSE_PAGE_SIZE) + return NULL; + + bo = CALLOC_STRUCT(amdgpu_winsys_bo); + if (!bo) + return NULL; + + pipe_reference_init(&bo->base.reference, 1); + bo->base.alignment = RADEON_SPARSE_PAGE_SIZE; + bo->base.size = size; + bo->base.vtbl = &amdgpu_winsys_bo_sparse_vtbl; + bo->ws = ws; + bo->initial_domain = domain; + bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1); + bo->sparse = true; + bo->u.sparse.flags = flags & ~RADEON_FLAG_SPARSE; + + bo->u.sparse.num_va_pages = DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE); + bo->u.sparse.commitments = CALLOC(bo->u.sparse.num_va_pages, + sizeof(*bo->u.sparse.commitments)); + if (!bo->u.sparse.commitments) + goto error_alloc_commitments; + + mtx_init(&bo->u.sparse.commit_lock, mtx_plain); + LIST_INITHEAD(&bo->u.sparse.backing); + + /* For simplicity, we always map a multiple of the page size. */ + map_size = align64(size, RADEON_SPARSE_PAGE_SIZE); + va_gap_size = ws->check_vm ? 
4 * RADEON_SPARSE_PAGE_SIZE : 0; + r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general, + map_size + va_gap_size, RADEON_SPARSE_PAGE_SIZE, + 0, &bo->va, &bo->u.sparse.va_handle, 0); + if (r) + goto error_va_alloc; + + r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0, size, bo->va, + AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_MAP); + if (r) + goto error_va_map; + + return &bo->base; + +error_va_map: + amdgpu_va_range_free(bo->u.sparse.va_handle); +error_va_alloc: + mtx_destroy(&bo->u.sparse.commit_lock); + FREE(bo->u.sparse.commitments); +error_alloc_commitments: + FREE(bo); + return NULL; +} + +static bool +amdgpu_bo_sparse_commit(struct pb_buffer *buf, uint64_t offset, uint64_t size, + bool commit) +{ + struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buf); + struct amdgpu_sparse_commitment *comm; + uint32_t va_page, end_va_page; + bool ok = true; + int r; + + assert(bo->sparse); + assert(offset % RADEON_SPARSE_PAGE_SIZE == 0); + assert(offset <= bo->base.size); + assert(size <= bo->base.size - offset); + assert(size % RADEON_SPARSE_PAGE_SIZE == 0 || offset + size == bo->base.size); + + comm = bo->u.sparse.commitments; + va_page = offset / RADEON_SPARSE_PAGE_SIZE; + end_va_page = va_page + DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE); + + mtx_lock(&bo->u.sparse.commit_lock); + +#if DEBUG_SPARSE_COMMITS + sparse_dump(bo, __func__); +#endif + + if (commit) { + while (va_page < end_va_page) { + uint32_t span_va_page; + + /* Skip pages that are already committed. */ + if (comm[va_page].backing) { + va_page++; + continue; + } + + /* Determine length of uncommitted span. */ + span_va_page = va_page; + while (va_page < end_va_page && !comm[va_page].backing) + va_page++; + + /* Fill the uncommitted span with chunks of backing memory. */ + while (span_va_page < va_page) { + struct amdgpu_sparse_backing *backing; + uint32_t backing_start, backing_size; + + backing_size = va_page - span_va_page; + backing = sparse_backing_alloc(bo, &backing_start, &backing_size); + if (!backing) { + ok = false; + goto out; + } + + r = amdgpu_bo_va_op_raw(bo->ws->dev, backing->bo->bo, + (uint64_t)backing_start * RADEON_SPARSE_PAGE_SIZE, + (uint64_t)backing_size * RADEON_SPARSE_PAGE_SIZE, + bo->va + (uint64_t)span_va_page * RADEON_SPARSE_PAGE_SIZE, + AMDGPU_VM_PAGE_READABLE | + AMDGPU_VM_PAGE_WRITEABLE | + AMDGPU_VM_PAGE_EXECUTABLE, + AMDGPU_VA_OP_REPLACE); + if (r) { + ok = sparse_backing_free(bo, backing, backing_start, backing_size); + assert(ok && "sufficient memory should already be allocated"); + + ok = false; + goto out; + } + + while (backing_size) { + comm[span_va_page].backing = backing; + comm[span_va_page].page = backing_start; + span_va_page++; + backing_start++; + backing_size--; + } + } + } + } else { + r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0, + (uint64_t)(end_va_page - va_page) * RADEON_SPARSE_PAGE_SIZE, + bo->va + (uint64_t)va_page * RADEON_SPARSE_PAGE_SIZE, + AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_REPLACE); + if (r) { + ok = false; + goto out; + } + + while (va_page < end_va_page) { + struct amdgpu_sparse_backing *backing; + uint32_t backing_start; + uint32_t span_pages; + + /* Skip pages that are already uncommitted. */ + if (!comm[va_page].backing) { + va_page++; + continue; + } + + /* Group contiguous spans of pages. 
*/ + backing = comm[va_page].backing; + backing_start = comm[va_page].page; + comm[va_page].backing = NULL; + + span_pages = 1; + va_page++; + + while (va_page < end_va_page && + comm[va_page].backing == backing && + comm[va_page].page == backing_start + span_pages) { + comm[va_page].backing = NULL; + va_page++; + span_pages++; + } + + if (!sparse_backing_free(bo, backing, backing_start, span_pages)) { + /* Couldn't allocate tracking data structures, so we have to leak */ + fprintf(stderr, "amdgpu: leaking PRT backing memory\n"); + ok = false; + } + } + } +out: + + mtx_unlock(&bo->u.sparse.commit_lock); + + return ok; +} + static unsigned eg_tile_split(unsigned tile_split) { switch (tile_split) { @@ -597,7 +1063,7 @@ static void amdgpu_buffer_get_metadata(struct pb_buffer *_buf, { struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); struct amdgpu_bo_info info = {0}; - uint32_t tiling_flags; + uint64_t tiling_flags; int r; assert(bo->bo && "must not be called for slab entries"); @@ -608,21 +1074,25 @@ static void amdgpu_buffer_get_metadata(struct pb_buffer *_buf, tiling_flags = info.metadata.tiling_info; - md->microtile = RADEON_LAYOUT_LINEAR; - md->macrotile = RADEON_LAYOUT_LINEAR; - - if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 4) /* 2D_TILED_THIN1 */ - md->macrotile = RADEON_LAYOUT_TILED; - else if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 2) /* 1D_TILED_THIN1 */ - md->microtile = RADEON_LAYOUT_TILED; - - md->pipe_config = AMDGPU_TILING_GET(tiling_flags, PIPE_CONFIG); - md->bankw = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_WIDTH); - md->bankh = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_HEIGHT); - md->tile_split = eg_tile_split(AMDGPU_TILING_GET(tiling_flags, TILE_SPLIT)); - md->mtilea = 1 << AMDGPU_TILING_GET(tiling_flags, MACRO_TILE_ASPECT); - md->num_banks = 2 << AMDGPU_TILING_GET(tiling_flags, NUM_BANKS); - md->scanout = AMDGPU_TILING_GET(tiling_flags, MICRO_TILE_MODE) == 0; /* DISPLAY */ + if (bo->ws->info.chip_class >= GFX9) { + md->u.gfx9.swizzle_mode = AMDGPU_TILING_GET(tiling_flags, SWIZZLE_MODE); + } else { + md->u.legacy.microtile = RADEON_LAYOUT_LINEAR; + md->u.legacy.macrotile = RADEON_LAYOUT_LINEAR; + + if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 4) /* 2D_TILED_THIN1 */ + md->u.legacy.macrotile = RADEON_LAYOUT_TILED; + else if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 2) /* 1D_TILED_THIN1 */ + md->u.legacy.microtile = RADEON_LAYOUT_TILED; + + md->u.legacy.pipe_config = AMDGPU_TILING_GET(tiling_flags, PIPE_CONFIG); + md->u.legacy.bankw = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_WIDTH); + md->u.legacy.bankh = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_HEIGHT); + md->u.legacy.tile_split = eg_tile_split(AMDGPU_TILING_GET(tiling_flags, TILE_SPLIT)); + md->u.legacy.mtilea = 1 << AMDGPU_TILING_GET(tiling_flags, MACRO_TILE_ASPECT); + md->u.legacy.num_banks = 2 << AMDGPU_TILING_GET(tiling_flags, NUM_BANKS); + md->u.legacy.scanout = AMDGPU_TILING_GET(tiling_flags, MICRO_TILE_MODE) == 0; /* DISPLAY */ + } md->size_metadata = info.metadata.size_metadata; memcpy(md->metadata, info.metadata.umd_metadata, sizeof(md->metadata)); @@ -633,29 +1103,33 @@ static void amdgpu_buffer_set_metadata(struct pb_buffer *_buf, { struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); struct amdgpu_bo_metadata metadata = {0}; - uint32_t tiling_flags = 0; + uint64_t tiling_flags = 0; assert(bo->bo && "must not be called for slab entries"); - if (md->macrotile == RADEON_LAYOUT_TILED) - tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 4); /* 2D_TILED_THIN1 */ - else if (md->microtile == 
RADEON_LAYOUT_TILED) - tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 2); /* 1D_TILED_THIN1 */ - else - tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 1); /* LINEAR_ALIGNED */ - - tiling_flags |= AMDGPU_TILING_SET(PIPE_CONFIG, md->pipe_config); - tiling_flags |= AMDGPU_TILING_SET(BANK_WIDTH, util_logbase2(md->bankw)); - tiling_flags |= AMDGPU_TILING_SET(BANK_HEIGHT, util_logbase2(md->bankh)); - if (md->tile_split) - tiling_flags |= AMDGPU_TILING_SET(TILE_SPLIT, eg_tile_split_rev(md->tile_split)); - tiling_flags |= AMDGPU_TILING_SET(MACRO_TILE_ASPECT, util_logbase2(md->mtilea)); - tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(md->num_banks)-1); - - if (md->scanout) - tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 0); /* DISPLAY_MICRO_TILING */ - else - tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 1); /* THIN_MICRO_TILING */ + if (bo->ws->info.chip_class >= GFX9) { + tiling_flags |= AMDGPU_TILING_SET(SWIZZLE_MODE, md->u.gfx9.swizzle_mode); + } else { + if (md->u.legacy.macrotile == RADEON_LAYOUT_TILED) + tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 4); /* 2D_TILED_THIN1 */ + else if (md->u.legacy.microtile == RADEON_LAYOUT_TILED) + tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 2); /* 1D_TILED_THIN1 */ + else + tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 1); /* LINEAR_ALIGNED */ + + tiling_flags |= AMDGPU_TILING_SET(PIPE_CONFIG, md->u.legacy.pipe_config); + tiling_flags |= AMDGPU_TILING_SET(BANK_WIDTH, util_logbase2(md->u.legacy.bankw)); + tiling_flags |= AMDGPU_TILING_SET(BANK_HEIGHT, util_logbase2(md->u.legacy.bankh)); + if (md->u.legacy.tile_split) + tiling_flags |= AMDGPU_TILING_SET(TILE_SPLIT, eg_tile_split_rev(md->u.legacy.tile_split)); + tiling_flags |= AMDGPU_TILING_SET(MACRO_TILE_ASPECT, util_logbase2(md->u.legacy.mtilea)); + tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(md->u.legacy.num_banks)-1); + + if (md->u.legacy.scanout) + tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 0); /* DISPLAY_MICRO_TILING */ + else + tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 1); /* THIN_MICRO_TILING */ + } metadata.tiling_info = tiling_flags; metadata.size_metadata = md->size_metadata; @@ -673,36 +1147,24 @@ amdgpu_bo_create(struct radeon_winsys *rws, { struct amdgpu_winsys *ws = amdgpu_winsys(rws); struct amdgpu_winsys_bo *bo; - unsigned usage = 0, pb_cache_bucket; + unsigned usage = 0, pb_cache_bucket = 0; + + /* VRAM implies WC. This is not optional. */ + assert(!(domain & RADEON_DOMAIN_VRAM) || flags & RADEON_FLAG_GTT_WC); + + /* NO_CPU_ACCESS is valid with VRAM only. */ + assert(domain == RADEON_DOMAIN_VRAM || !(flags & RADEON_FLAG_NO_CPU_ACCESS)); /* Sub-allocate small buffers from slabs. */ - if (!(flags & RADEON_FLAG_HANDLE) && + if (!(flags & (RADEON_FLAG_NO_SUBALLOC | RADEON_FLAG_SPARSE)) && size <= (1 << AMDGPU_SLAB_MAX_SIZE_LOG2) && alignment <= MAX2(1 << AMDGPU_SLAB_MIN_SIZE_LOG2, util_next_power_of_two(size))) { struct pb_slab_entry *entry; - unsigned heap = 0; + int heap = radeon_get_heap_index(domain, flags); - if (flags & RADEON_FLAG_GTT_WC) - heap |= 1; - if (flags & RADEON_FLAG_CPU_ACCESS) - heap |= 2; - if (flags & ~(RADEON_FLAG_GTT_WC | RADEON_FLAG_CPU_ACCESS)) + if (heap < 0 || heap >= RADEON_MAX_SLAB_HEAPS) goto no_slab; - switch (domain) { - case RADEON_DOMAIN_VRAM: - heap |= 0 * 4; - break; - case RADEON_DOMAIN_VRAM_GTT: - heap |= 1 * 4; - break; - case RADEON_DOMAIN_GTT: - heap |= 2 * 4; - break; - default: - goto no_slab; - } - entry = pb_slab_alloc(&ws->bo_slabs, size, heap); if (!entry) { /* Clear the cache and try again. 
*/ @@ -722,8 +1184,16 @@ amdgpu_bo_create(struct radeon_winsys *rws, } no_slab: + if (flags & RADEON_FLAG_SPARSE) { + assert(RADEON_SPARSE_PAGE_SIZE % alignment == 0); + + flags |= RADEON_FLAG_NO_CPU_ACCESS; + + return amdgpu_bo_sparse_create(ws, size, domain, flags); + } + /* This flag is irrelevant for the cache. */ - flags &= ~RADEON_FLAG_HANDLE; + flags &= ~RADEON_FLAG_NO_SUBALLOC; /* Align size to page size. This is the minimum alignment for normal * BOs. Aligning this here helps the cached bufmgr. Especially small BOs, @@ -732,30 +1202,23 @@ no_slab: size = align64(size, ws->info.gart_page_size); alignment = align(alignment, ws->info.gart_page_size); - /* Only set one usage bit each for domains and flags, or the cache manager - * might consider different sets of domains / flags compatible - */ - if (domain == RADEON_DOMAIN_VRAM_GTT) - usage = 1 << 2; - else - usage = domain >> 1; - assert(flags < sizeof(usage) * 8 - 3); - usage |= 1 << (flags + 3); - - /* Determine the pb_cache bucket for minimizing pb_cache misses. */ - pb_cache_bucket = 0; - if (domain & RADEON_DOMAIN_VRAM) /* VRAM or VRAM+GTT */ - pb_cache_bucket += 1; - if (flags == RADEON_FLAG_GTT_WC) /* WC */ - pb_cache_bucket += 2; - assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets)); - - /* Get a buffer from the cache. */ - bo = (struct amdgpu_winsys_bo*) - pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage, - pb_cache_bucket); - if (bo) - return &bo->base; + bool use_reusable_pool = flags & RADEON_FLAG_NO_INTERPROCESS_SHARING; + + if (use_reusable_pool) { + int heap = radeon_get_heap_index(domain, flags); + assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS); + usage = 1 << heap; /* Only set one usage bit for each heap. */ + + pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap); + assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets)); + + /* Get a buffer from the cache. */ + bo = (struct amdgpu_winsys_bo*) + pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage, + pb_cache_bucket); + if (bo) + return &bo->base; + } /* Create a new one. */ bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags, @@ -770,7 +1233,7 @@ no_slab: return NULL; } - bo->u.real.use_reusable_pool = true; + bo->u.real.use_reusable_pool = use_reusable_pool; return &bo->base; } @@ -876,10 +1339,9 @@ static bool amdgpu_bo_get_handle(struct pb_buffer *buffer, enum amdgpu_bo_handle_type type; int r; - if (!bo->bo) { - offset += bo->va - bo->u.slab.real->va; - bo = bo->u.slab.real; - } + /* Don't allow exports of slab entries and sparse buffers. */ + if (!bo->bo) + return false; bo->u.real.use_reusable_pool = false; @@ -916,19 +1378,22 @@ static struct pb_buffer *amdgpu_bo_from_ptr(struct radeon_winsys *rws, struct amdgpu_winsys_bo *bo; uint64_t va; amdgpu_va_handle va_handle; + /* Avoid failure when the size is not page aligned */ + uint64_t aligned_size = align64(size, ws->info.gart_page_size); bo = CALLOC_STRUCT(amdgpu_winsys_bo); if (!bo) return NULL; - if (amdgpu_create_bo_from_user_mem(ws->dev, pointer, size, &buf_handle)) + if (amdgpu_create_bo_from_user_mem(ws->dev, pointer, + aligned_size, &buf_handle)) goto error; if (amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general, - size, 1 << 12, 0, &va, &va_handle, 0)) + aligned_size, 1 << 12, 0, &va, &va_handle, 0)) goto error_va_alloc; - if (amdgpu_bo_va_op(buf_handle, 0, size, va, 0, AMDGPU_VA_OP_MAP)) + if (amdgpu_bo_va_op(buf_handle, 0, aligned_size, va, 0, AMDGPU_VA_OP_MAP)) goto error_va_map; /* Initialize it. 
*/ @@ -944,7 +1409,7 @@ static struct pb_buffer *amdgpu_bo_from_ptr(struct radeon_winsys *rws, bo->initial_domain = RADEON_DOMAIN_GTT; bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1); - ws->allocated_gtt += align64(bo->base.size, ws->info.gart_page_size); + ws->allocated_gtt += aligned_size; amdgpu_add_buffer_to_global_list(bo); @@ -966,6 +1431,13 @@ static bool amdgpu_bo_is_user_ptr(struct pb_buffer *buf) return ((struct amdgpu_winsys_bo*)buf)->user_ptr != NULL; } +static bool amdgpu_bo_is_suballocated(struct pb_buffer *buf) +{ + struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf; + + return !bo->bo && !bo->sparse; +} + static uint64_t amdgpu_bo_get_va(struct pb_buffer *buf) { return ((struct amdgpu_winsys_bo*)buf)->va; @@ -982,7 +1454,9 @@ void amdgpu_bo_init_functions(struct amdgpu_winsys *ws) ws->base.buffer_from_handle = amdgpu_bo_from_handle; ws->base.buffer_from_ptr = amdgpu_bo_from_ptr; ws->base.buffer_is_user_ptr = amdgpu_bo_is_user_ptr; + ws->base.buffer_is_suballocated = amdgpu_bo_is_suballocated; ws->base.buffer_get_handle = amdgpu_bo_get_handle; + ws->base.buffer_commit = amdgpu_bo_sparse_commit; ws->base.buffer_get_virtual_address = amdgpu_bo_get_va; ws->base.buffer_get_initial_domain = amdgpu_bo_get_initial_domain; } diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h index 1e25897b6..10b095d7a 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h @@ -37,6 +37,28 @@ #include "pipebuffer/pb_slab.h" +struct amdgpu_sparse_backing_chunk; + +/* + * Sub-allocation information for a real buffer used as backing memory of a + * sparse buffer. + */ +struct amdgpu_sparse_backing { + struct list_head list; + + struct amdgpu_winsys_bo *bo; + + /* Sorted list of free chunks. */ + struct amdgpu_sparse_backing_chunk *chunks; + uint32_t max_chunks; + uint32_t num_chunks; +}; + +struct amdgpu_sparse_commitment { + struct amdgpu_sparse_backing *backing; + uint32_t page; +}; + struct amdgpu_winsys_bo { struct pb_buffer base; union { @@ -53,12 +75,26 @@ struct amdgpu_winsys_bo { struct pb_slab_entry entry; struct amdgpu_winsys_bo *real; } slab; + struct { + mtx_t commit_lock; + amdgpu_va_handle va_handle; + enum radeon_bo_flag flags; + + uint32_t num_va_pages; + uint32_t num_backing_pages; + + struct list_head backing; + + /* Commitment information for each page of the virtual memory area. 
*/ + struct amdgpu_sparse_commitment *commitments; + } sparse; } u; struct amdgpu_winsys *ws; void *user_ptr; /* from buffer_from_ptr */ - amdgpu_bo_handle bo; /* NULL for slab entries */ + amdgpu_bo_handle bo; /* NULL for slab entries and sparse buffers */ + bool sparse; uint32_t unique_id; uint64_t va; enum radeon_bo_domain initial_domain; @@ -79,6 +115,8 @@ struct amdgpu_winsys_bo { unsigned num_fences; unsigned max_fences; struct pipe_fence_handle **fences; + + bool is_local; }; struct amdgpu_slab { diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c index 2b86827ff..e2555813e 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c @@ -33,7 +33,6 @@ #include "amdgpu_cs.h" #include "os/os_time.h" #include <stdio.h> -#include <amdgpu_drm.h> #include "amd/common/sid.h" @@ -48,6 +47,7 @@ amdgpu_fence_create(struct amdgpu_ctx *ctx, unsigned ip_type, struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence); fence->reference.count = 1; + fence->ws = ctx->ws; fence->ctx = ctx; fence->fence.context = ctx->ctx; fence->fence.ip_type = ip_type; @@ -58,13 +58,89 @@ amdgpu_fence_create(struct amdgpu_ctx *ctx, unsigned ip_type, return (struct pipe_fence_handle *)fence; } +static struct pipe_fence_handle * +amdgpu_fence_import_sync_file(struct radeon_winsys *rws, int fd) +{ + struct amdgpu_winsys *ws = amdgpu_winsys(rws); + struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence); + + if (!fence) + return NULL; + + pipe_reference_init(&fence->reference, 1); + fence->ws = ws; + /* fence->ctx == NULL means that the fence is syncobj-based. */ + + /* Convert sync_file into syncobj. */ + int r = amdgpu_cs_create_syncobj(ws->dev, &fence->syncobj); + if (r) { + FREE(fence); + return NULL; + } + + r = amdgpu_cs_syncobj_import_sync_file(ws->dev, fence->syncobj, fd); + if (r) { + amdgpu_cs_destroy_syncobj(ws->dev, fence->syncobj); + FREE(fence); + return NULL; + } + return (struct pipe_fence_handle*)fence; +} + +static int amdgpu_fence_export_sync_file(struct radeon_winsys *rws, + struct pipe_fence_handle *pfence) +{ + struct amdgpu_winsys *ws = amdgpu_winsys(rws); + struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence; + + if (amdgpu_fence_is_syncobj(fence)) { + int fd, r; + + /* Convert syncobj into sync_file. */ + r = amdgpu_cs_syncobj_export_sync_file(ws->dev, fence->syncobj, &fd); + return r ? -1 : fd; + } + + os_wait_until_zero(&fence->submission_in_progress, PIPE_TIMEOUT_INFINITE); + + /* Convert the amdgpu fence into a fence FD. 
*/ + int fd; + if (amdgpu_cs_fence_to_handle(ws->dev, &fence->fence, + AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD, + (uint32_t*)&fd)) + return -1; + + return fd; +} + +static int amdgpu_export_signalled_sync_file(struct radeon_winsys *rws) +{ + struct amdgpu_winsys *ws = amdgpu_winsys(rws); + uint32_t syncobj; + int fd = -1; + + int r = amdgpu_cs_create_syncobj2(ws->dev, DRM_SYNCOBJ_CREATE_SIGNALED, + &syncobj); + if (r) { + return -1; + } + + r = amdgpu_cs_syncobj_export_sync_file(ws->dev, syncobj, &fd); + if (r) { + fd = -1; + } + + amdgpu_cs_destroy_syncobj(ws->dev, syncobj); + return fd; +} + static void amdgpu_fence_submitted(struct pipe_fence_handle *fence, - struct amdgpu_cs_request* request, - uint64_t *user_fence_cpu_address) + uint64_t seq_no, + uint64_t *user_fence_cpu_address) { struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence; - rfence->fence.fence = request->seq_no; + rfence->fence.fence = seq_no; rfence->user_fence_cpu_address = user_fence_cpu_address; rfence->submission_in_progress = false; } @@ -89,6 +165,21 @@ bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout, if (rfence->signalled) return true; + /* Handle syncobjs. */ + if (amdgpu_fence_is_syncobj(rfence)) { + /* Absolute timeouts are only be used by BO fences, which aren't + * backed by syncobjs. + */ + assert(!absolute); + + if (amdgpu_cs_syncobj_wait(rfence->ws->dev, &rfence->syncobj, 1, + timeout, 0, NULL)) + return false; + + rfence->signalled = true; + return true; + } + if (absolute) abs_timeout = timeout; else @@ -154,9 +245,9 @@ amdgpu_cs_get_next_fence(struct radeon_winsys_cs *rcs) } fence = amdgpu_fence_create(cs->ctx, - cs->csc->request.ip_type, - cs->csc->request.ip_instance, - cs->csc->request.ring); + cs->csc->ib[IB_MAIN].ip_type, + cs->csc->ib[IB_MAIN].ip_instance, + cs->csc->ib[IB_MAIN].ring); if (!fence) return NULL; @@ -178,6 +269,7 @@ static struct radeon_winsys_ctx *amdgpu_ctx_create(struct radeon_winsys *ws) ctx->ws = amdgpu_winsys(ws); ctx->refcount = 1; + ctx->initial_num_total_rejected_cs = ctx->ws->num_total_rejected_cs; r = amdgpu_cs_ctx_create(ctx->ws->dev, &ctx->ctx); if (r) { @@ -227,6 +319,13 @@ amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx) uint32_t result, hangs; int r; + /* Return a failure due to a rejected command submission. */ + if (ctx->ws->num_total_rejected_cs > ctx->initial_num_total_rejected_cs) { + return ctx->num_rejected_cs ? PIPE_GUILTY_CONTEXT_RESET : + PIPE_INNOCENT_CONTEXT_RESET; + } + + /* Return a failure due to a GPU hang. */ r = amdgpu_cs_query_reset_state(ctx->ctx, &result, &hangs); if (r) { fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state failed. 
(%i)\n", r); @@ -250,8 +349,9 @@ amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx) static bool amdgpu_cs_has_user_fence(struct amdgpu_cs_context *cs) { - return cs->request.ip_type != AMDGPU_HW_IP_UVD && - cs->request.ip_type != AMDGPU_HW_IP_VCE; + return cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_UVD && + cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCE && + cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_DEC; } static bool amdgpu_cs_has_chaining(struct amdgpu_cs *cs) @@ -278,9 +378,12 @@ int amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo * if (bo->bo) { buffers = cs->real_buffers; num_buffers = cs->num_real_buffers; - } else { + } else if (!bo->sparse) { buffers = cs->slab_buffers; num_buffers = cs->num_slab_buffers; + } else { + buffers = cs->sparse_buffers; + num_buffers = cs->num_sparse_buffers; } /* not found or found */ @@ -307,48 +410,31 @@ int amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo * } static int -amdgpu_lookup_or_add_real_buffer(struct amdgpu_cs *acs, struct amdgpu_winsys_bo *bo) +amdgpu_do_add_real_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo) { - struct amdgpu_cs_context *cs = acs->csc; struct amdgpu_cs_buffer *buffer; - unsigned hash; - int idx = amdgpu_lookup_buffer(cs, bo); - - if (idx >= 0) - return idx; + int idx; /* New buffer, check if the backing array is large enough. */ if (cs->num_real_buffers >= cs->max_real_buffers) { unsigned new_max = MAX2(cs->max_real_buffers + 16, (unsigned)(cs->max_real_buffers * 1.3)); struct amdgpu_cs_buffer *new_buffers; - amdgpu_bo_handle *new_handles; - uint8_t *new_flags; new_buffers = MALLOC(new_max * sizeof(*new_buffers)); - new_handles = MALLOC(new_max * sizeof(*new_handles)); - new_flags = MALLOC(new_max * sizeof(*new_flags)); - if (!new_buffers || !new_handles || !new_flags) { - fprintf(stderr, "amdgpu_lookup_or_add_buffer: allocation failed\n"); + if (!new_buffers) { + fprintf(stderr, "amdgpu_do_add_buffer: allocation failed\n"); FREE(new_buffers); - FREE(new_handles); - FREE(new_flags); return -1; } memcpy(new_buffers, cs->real_buffers, cs->num_real_buffers * sizeof(*new_buffers)); - memcpy(new_handles, cs->handles, cs->num_real_buffers * sizeof(*new_handles)); - memcpy(new_flags, cs->flags, cs->num_real_buffers * sizeof(*new_flags)); FREE(cs->real_buffers); - FREE(cs->handles); - FREE(cs->flags); cs->max_real_buffers = new_max; cs->real_buffers = new_buffers; - cs->handles = new_handles; - cs->flags = new_flags; } idx = cs->num_real_buffers; @@ -356,11 +442,24 @@ amdgpu_lookup_or_add_real_buffer(struct amdgpu_cs *acs, struct amdgpu_winsys_bo memset(buffer, 0, sizeof(*buffer)); amdgpu_winsys_bo_reference(&buffer->bo, bo); - cs->handles[idx] = bo->bo; - cs->flags[idx] = 0; p_atomic_inc(&bo->num_cs_references); cs->num_real_buffers++; + return idx; +} + +static int +amdgpu_lookup_or_add_real_buffer(struct amdgpu_cs *acs, struct amdgpu_winsys_bo *bo) +{ + struct amdgpu_cs_context *cs = acs->csc; + unsigned hash; + int idx = amdgpu_lookup_buffer(cs, bo); + + if (idx >= 0) + return idx; + + idx = amdgpu_do_add_real_buffer(cs, bo); + hash = bo->unique_id & (ARRAY_SIZE(cs->buffer_indices_hashlist)-1); cs->buffer_indices_hashlist[hash] = idx; @@ -421,6 +520,63 @@ static int amdgpu_lookup_or_add_slab_buffer(struct amdgpu_cs *acs, return idx; } +static int amdgpu_lookup_or_add_sparse_buffer(struct amdgpu_cs *acs, + struct amdgpu_winsys_bo *bo) +{ + struct amdgpu_cs_context *cs = acs->csc; + struct amdgpu_cs_buffer *buffer; + unsigned hash; + int idx = 
amdgpu_lookup_buffer(cs, bo); + + if (idx >= 0) + return idx; + + /* New buffer, check if the backing array is large enough. */ + if (cs->num_sparse_buffers >= cs->max_sparse_buffers) { + unsigned new_max = + MAX2(cs->max_sparse_buffers + 16, (unsigned)(cs->max_sparse_buffers * 1.3)); + struct amdgpu_cs_buffer *new_buffers; + + new_buffers = REALLOC(cs->sparse_buffers, + cs->max_sparse_buffers * sizeof(*new_buffers), + new_max * sizeof(*new_buffers)); + if (!new_buffers) { + fprintf(stderr, "amdgpu_lookup_or_add_sparse_buffer: allocation failed\n"); + return -1; + } + + cs->max_sparse_buffers = new_max; + cs->sparse_buffers = new_buffers; + } + + idx = cs->num_sparse_buffers; + buffer = &cs->sparse_buffers[idx]; + + memset(buffer, 0, sizeof(*buffer)); + amdgpu_winsys_bo_reference(&buffer->bo, bo); + p_atomic_inc(&bo->num_cs_references); + cs->num_sparse_buffers++; + + hash = bo->unique_id & (ARRAY_SIZE(cs->buffer_indices_hashlist)-1); + cs->buffer_indices_hashlist[hash] = idx; + + /* We delay adding the backing buffers until we really have to. However, + * we cannot delay accounting for memory use. + */ + mtx_lock(&bo->u.sparse.commit_lock); + + list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) { + if (bo->initial_domain & RADEON_DOMAIN_VRAM) + acs->main.base.used_vram += backing->bo->base.size; + else if (bo->initial_domain & RADEON_DOMAIN_GTT) + acs->main.base.used_gart += backing->bo->base.size; + } + + mtx_unlock(&bo->u.sparse.commit_lock); + + return idx; +} + static unsigned amdgpu_cs_add_buffer(struct radeon_winsys_cs *rcs, struct pb_buffer *buf, enum radeon_bo_usage usage, @@ -436,30 +592,53 @@ static unsigned amdgpu_cs_add_buffer(struct radeon_winsys_cs *rcs, struct amdgpu_cs_buffer *buffer; int index; - if (!bo->bo) { - index = amdgpu_lookup_or_add_slab_buffer(acs, bo); - if (index < 0) - return 0; - - buffer = &cs->slab_buffers[index]; - buffer->usage |= usage; + /* Fast exit for no-op calls. + * This is very effective with suballocators and linear uploaders that + * are outside of the winsys. 
+ */ + if (bo == cs->last_added_bo && + (usage & cs->last_added_bo_usage) == usage && + (1ull << priority) & cs->last_added_bo_priority_usage) + return cs->last_added_bo_index; + + if (!bo->sparse) { + if (!bo->bo) { + index = amdgpu_lookup_or_add_slab_buffer(acs, bo); + if (index < 0) + return 0; + + buffer = &cs->slab_buffers[index]; + buffer->usage |= usage; + + usage &= ~RADEON_USAGE_SYNCHRONIZED; + index = buffer->u.slab.real_idx; + } else { + index = amdgpu_lookup_or_add_real_buffer(acs, bo); + if (index < 0) + return 0; + } - usage &= ~RADEON_USAGE_SYNCHRONIZED; - index = buffer->u.slab.real_idx; + buffer = &cs->real_buffers[index]; } else { - index = amdgpu_lookup_or_add_real_buffer(acs, bo); + index = amdgpu_lookup_or_add_sparse_buffer(acs, bo); if (index < 0) return 0; + + buffer = &cs->sparse_buffers[index]; } - buffer = &cs->real_buffers[index]; - buffer->u.real.priority_usage |= 1llu << priority; + buffer->u.real.priority_usage |= 1ull << priority; buffer->usage |= usage; - cs->flags[index] = MAX2(cs->flags[index], priority / 4); + + cs->last_added_bo = bo; + cs->last_added_bo_index = index; + cs->last_added_bo_usage = buffer->usage; + cs->last_added_bo_priority_usage = buffer->u.real.priority_usage; return index; } -static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws, struct amdgpu_ib *ib) +static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws, struct amdgpu_ib *ib, + enum ring_type ring_type) { struct pb_buffer *pb; uint8_t *mapped; @@ -479,12 +658,6 @@ static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws, struct amdgpu_ib *ib) buffer_size = MIN2(buffer_size, 4 * 512 * 1024); switch (ib->ib_type) { - case IB_CONST_PREAMBLE: - buffer_size = MAX2(buffer_size, 4 * 1024); - break; - case IB_CONST: - buffer_size = MAX2(buffer_size, 16 * 1024 * 4); - break; case IB_MAIN: buffer_size = MAX2(buffer_size, 8 * 1024 * 4); break; @@ -495,7 +668,11 @@ static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws, struct amdgpu_ib *ib) pb = ws->base.buffer_create(&ws->base, buffer_size, ws->info.gart_page_size, RADEON_DOMAIN_GTT, - RADEON_FLAG_CPU_ACCESS); + RADEON_FLAG_NO_INTERPROCESS_SHARING | + (ring_type == RING_GFX || + ring_type == RING_COMPUTE || + ring_type == RING_DMA ? + RADEON_FLAG_GTT_WC : 0)); if (!pb) return false; @@ -523,13 +700,6 @@ static unsigned amdgpu_ib_max_submit_dwords(enum ib_type ib_type) * http://www.phoronix.com/scan.php?page=article&item=mesa-111-si&num=1 */ return 20 * 1024; - case IB_CONST_PREAMBLE: - case IB_CONST: - /* There isn't really any reason to limit CE IB size beyond the natural - * limit implied by the main IB, except perhaps GTT size. Just return - * an extremely large value that we never get anywhere close to. - */ - return 16 * 1024 * 1024; default: unreachable("bad ib_type"); } @@ -544,18 +714,10 @@ static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_cs *cs, * http://www.phoronix.com/scan.php?page=article&item=mesa-111-si&num=1 */ struct amdgpu_ib *ib = NULL; - struct amdgpu_cs_ib_info *info = &cs->csc->ib[ib_type]; + struct drm_amdgpu_cs_chunk_ib *info = &cs->csc->ib[ib_type]; unsigned ib_size = 0; switch (ib_type) { - case IB_CONST_PREAMBLE: - ib = &cs->const_preamble_ib; - ib_size = 256 * 4; - break; - case IB_CONST: - ib = &cs->const_ib; - ib_size = 8 * 1024 * 4; - break; case IB_MAIN: ib = &cs->main; ib_size = 4 * 1024 * 4; @@ -580,14 +742,16 @@ static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_cs *cs, /* Allocate a new buffer for IBs if the current buffer is all used. 
*/ if (!ib->big_ib_buffer || ib->used_ib_space + ib_size > ib->big_ib_buffer->size) { - if (!amdgpu_ib_new_buffer(aws, ib)) + if (!amdgpu_ib_new_buffer(aws, ib, cs->ring_type)) return false; } - info->ib_mc_address = amdgpu_winsys_bo(ib->big_ib_buffer)->va + - ib->used_ib_space; - info->size = 0; - ib->ptr_ib_size = &info->size; + info->va_start = amdgpu_winsys_bo(ib->big_ib_buffer)->va + ib->used_ib_space; + info->ib_bytes = 0; + /* ib_bytes is in dwords and the conversion to bytes will be done before + * the CS ioctl. */ + ib->ptr_ib_size = &info->ib_bytes; + ib->ptr_ib_size_inside_ib = false; amdgpu_cs_add_buffer(&cs->main.base, ib->big_ib_buffer, RADEON_USAGE_READ, 0, RADEON_PRIO_IB1); @@ -599,52 +763,56 @@ static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_cs *cs, return true; } -static void amdgpu_ib_finalize(struct amdgpu_ib *ib) +static void amdgpu_set_ib_size(struct amdgpu_ib *ib) { - *ib->ptr_ib_size |= ib->base.current.cdw; + if (ib->ptr_ib_size_inside_ib) { + *ib->ptr_ib_size = ib->base.current.cdw | + S_3F2_CHAIN(1) | S_3F2_VALID(1); + } else { + *ib->ptr_ib_size = ib->base.current.cdw; + } +} + +static void amdgpu_ib_finalize(struct amdgpu_winsys *ws, struct amdgpu_ib *ib) +{ + amdgpu_set_ib_size(ib); ib->used_ib_space += ib->base.current.cdw * 4; + ib->used_ib_space = align(ib->used_ib_space, ws->info.ib_start_alignment); ib->max_ib_size = MAX2(ib->max_ib_size, ib->base.prev_dw + ib->base.current.cdw); } static bool amdgpu_init_cs_context(struct amdgpu_cs_context *cs, enum ring_type ring_type) { - int i; - switch (ring_type) { case RING_DMA: - cs->request.ip_type = AMDGPU_HW_IP_DMA; + cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_DMA; break; case RING_UVD: - cs->request.ip_type = AMDGPU_HW_IP_UVD; + cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_UVD; break; case RING_VCE: - cs->request.ip_type = AMDGPU_HW_IP_VCE; + cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_VCE; break; case RING_COMPUTE: - cs->request.ip_type = AMDGPU_HW_IP_COMPUTE; + cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_COMPUTE; + break; + + case RING_VCN_DEC: + cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_VCN_DEC; break; default: case RING_GFX: - cs->request.ip_type = AMDGPU_HW_IP_GFX; + cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_GFX; break; } - for (i = 0; i < ARRAY_SIZE(cs->buffer_indices_hashlist); i++) { - cs->buffer_indices_hashlist[i] = -1; - } - - cs->request.number_of_ibs = 1; - cs->request.ibs = &cs->ib[IB_MAIN]; - - cs->ib[IB_CONST].flags = AMDGPU_IB_FLAG_CE; - cs->ib[IB_CONST_PREAMBLE].flags = AMDGPU_IB_FLAG_CE | - AMDGPU_IB_FLAG_PREAMBLE; - + memset(cs->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist)); + cs->last_added_bo = NULL; return true; } @@ -660,14 +828,21 @@ static void amdgpu_cs_context_cleanup(struct amdgpu_cs_context *cs) p_atomic_dec(&cs->slab_buffers[i].bo->num_cs_references); amdgpu_winsys_bo_reference(&cs->slab_buffers[i].bo, NULL); } + for (i = 0; i < cs->num_sparse_buffers; i++) { + p_atomic_dec(&cs->sparse_buffers[i].bo->num_cs_references); + amdgpu_winsys_bo_reference(&cs->sparse_buffers[i].bo, NULL); + } + for (i = 0; i < cs->num_fence_dependencies; i++) + amdgpu_fence_reference(&cs->fence_dependencies[i], NULL); cs->num_real_buffers = 0; cs->num_slab_buffers = 0; + cs->num_sparse_buffers = 0; + cs->num_fence_dependencies = 0; amdgpu_fence_reference(&cs->fence, NULL); - for (i = 0; i < ARRAY_SIZE(cs->buffer_indices_hashlist); i++) { - cs->buffer_indices_hashlist[i] = -1; - } + memset(cs->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist)); + cs->last_added_bo = NULL; } 
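The amdgpu_set_ib_size() hunk above is the core of the reworked IB handling: an IB no longer stores its size up front, it only remembers a pointer to the slot where the final dword count must be patched, plus a flag saying whether that slot is the kernel chunk's ib_bytes field or the size dword of an INDIRECT_BUFFER chaining packet emitted into the previous IB, which additionally needs the chain and valid bits. A minimal stand-alone sketch of that pattern, with hypothetical CHAIN_BIT/VALID_BIT constants standing in for the real S_3F2_CHAIN()/S_3F2_VALID() macros:

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical stand-ins for S_3F2_CHAIN(1) / S_3F2_VALID(1). */
#define CHAIN_BIT (1u << 20)
#define VALID_BIT (1u << 23)

struct ib_sketch {
   uint32_t *ptr_ib_size;   /* slot where the dword count gets patched */
   bool size_inside_ib;     /* true: slot is the chaining packet in the old IB */
   unsigned cdw;            /* dwords emitted so far */
};

/* Patch the recorded slot at flush time, or just before chaining onward. */
static void ib_sketch_patch_size(struct ib_sketch *ib)
{
   if (ib->size_inside_ib)
      *ib->ptr_ib_size = ib->cdw | CHAIN_BIT | VALID_BIT;
   else
      *ib->ptr_ib_size = ib->cdw;   /* plain dword count for the kernel chunk */
}

Because nothing reads the value until submission, amdgpu_cs_submit_ib() can convert ib_bytes from dwords to bytes only right before the CS ioctl, as the comment in amdgpu_get_new_ib() notes.
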
static void amdgpu_destroy_cs_context(struct amdgpu_cs_context *cs) @@ -677,7 +852,8 @@ static void amdgpu_destroy_cs_context(struct amdgpu_cs_context *cs) FREE(cs->real_buffers); FREE(cs->handles); FREE(cs->slab_buffers); - FREE(cs->request.dependencies); + FREE(cs->sparse_buffers); + FREE(cs->fence_dependencies); } @@ -703,9 +879,12 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx, cs->flush_data = flush_ctx; cs->ring_type = ring_type; + struct amdgpu_cs_fence_info fence_info; + fence_info.handle = cs->ctx->user_fence_bo; + fence_info.offset = cs->ring_type; + amdgpu_cs_chunk_fence_info_to_data(&fence_info, (void*)&cs->fence_chunk); + cs->main.ib_type = IB_MAIN; - cs->const_ib.ib_type = IB_CONST; - cs->const_preamble_ib.ib_type = IB_CONST_PREAMBLE; if (!amdgpu_init_cs_context(&cs->csc1, ring_type)) { FREE(cs); @@ -733,52 +912,6 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx, return &cs->main.base; } -static struct radeon_winsys_cs * -amdgpu_cs_add_const_ib(struct radeon_winsys_cs *rcs) -{ - struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs; - struct amdgpu_winsys *ws = cs->ctx->ws; - - /* only one const IB can be added */ - if (cs->ring_type != RING_GFX || cs->const_ib.ib_mapped) - return NULL; - - if (!amdgpu_get_new_ib(&ws->base, cs, IB_CONST)) - return NULL; - - cs->csc->request.number_of_ibs = 2; - cs->csc->request.ibs = &cs->csc->ib[IB_CONST]; - - cs->cst->request.number_of_ibs = 2; - cs->cst->request.ibs = &cs->cst->ib[IB_CONST]; - - return &cs->const_ib.base; -} - -static struct radeon_winsys_cs * -amdgpu_cs_add_const_preamble_ib(struct radeon_winsys_cs *rcs) -{ - struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs; - struct amdgpu_winsys *ws = cs->ctx->ws; - - /* only one const preamble IB can be added and only when the const IB has - * also been mapped */ - if (cs->ring_type != RING_GFX || !cs->const_ib.ib_mapped || - cs->const_preamble_ib.ib_mapped) - return NULL; - - if (!amdgpu_get_new_ib(&ws->base, cs, IB_CONST_PREAMBLE)) - return NULL; - - cs->csc->request.number_of_ibs = 3; - cs->csc->request.ibs = &cs->csc->ib[IB_CONST_PREAMBLE]; - - cs->cst->request.number_of_ibs = 3; - cs->cst->request.ibs = &cs->cst->ib[IB_CONST_PREAMBLE]; - - return &cs->const_preamble_ib.base; -} - static bool amdgpu_cs_validate(struct radeon_winsys_cs *rcs) { return true; @@ -820,7 +953,7 @@ static bool amdgpu_cs_check_space(struct radeon_winsys_cs *rcs, unsigned dw) rcs->max_prev = new_max_prev; } - if (!amdgpu_ib_new_buffer(cs->ctx->ws, ib)) + if (!amdgpu_ib_new_buffer(cs->ctx->ws, ib, cs->ring_type)) return false; assert(ib->used_ib_space == 0); @@ -838,14 +971,14 @@ static bool amdgpu_cs_check_space(struct radeon_winsys_cs *rcs, unsigned dw) : PKT3_INDIRECT_BUFFER_CONST, 2, 0)); radeon_emit(rcs, va); radeon_emit(rcs, va >> 32); - new_ptr_ib_size = &rcs->current.buf[rcs->current.cdw]; - radeon_emit(rcs, S_3F2_CHAIN(1) | S_3F2_VALID(1)); + new_ptr_ib_size = &rcs->current.buf[rcs->current.cdw++]; assert((rcs->current.cdw & 7) == 0); assert(rcs->current.cdw <= rcs->current.max_dw); - *ib->ptr_ib_size |= rcs->current.cdw; + amdgpu_set_ib_size(ib); ib->ptr_ib_size = new_ptr_ib_size; + ib->ptr_ib_size_inside_ib = true; /* Hook up the new chunk */ rcs->prev[rcs->num_prev].buf = rcs->current.buf; @@ -881,27 +1014,65 @@ static unsigned amdgpu_cs_get_buffer_list(struct radeon_winsys_cs *rcs, return cs->num_real_buffers; } -DEBUG_GET_ONCE_BOOL_OPTION(all_bos, "RADEON_ALL_BOS", false) +static unsigned add_fence_dependency_entry(struct amdgpu_cs_context *cs) +{ + unsigned idx = cs->num_fence_dependencies++; + + 
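add_fence_dependency_entry(), which continues below, replaces the old request.dependencies bookkeeping with a growable array of pipe_fence_handle pointers: it hands out the next index, growing the backing storage by a fixed increment of eight and clearing the new tail so every slot starts out as a NULL handle. The same pattern in isolation, as a simplified sketch (dep_array is a hypothetical type, not part of the winsys, and unlike the patch this version checks and rolls back on realloc failure):

#include <stdlib.h>
#include <string.h>

/* Stand-alone sketch of the grow-by-fixed-increment array used for
 * fence dependencies. */
struct dep_array {
   void   **slots;
   unsigned num;
   unsigned max;
};

static int dep_array_add_slot(struct dep_array *a)
{
   unsigned idx = a->num++;

   if (idx >= a->max) {
      const unsigned increment = 8;
      unsigned new_max = idx + increment;
      void **grown = realloc(a->slots, new_max * sizeof(*a->slots));

      if (!grown) {
         a->num--;   /* roll back; the patch itself assumes realloc succeeds */
         return -1;
      }

      /* Zero the new tail so each slot starts out as a NULL handle. */
      memset(grown + idx, 0, increment * sizeof(*grown));
      a->slots = grown;
      a->max = new_max;
   }
   return (int)idx;
}

In the patch, amdgpu_cs_add_fence_dependency() and amdgpu_add_bo_fence_dependencies() store a referenced fence in the returned slot, and amdgpu_cs_context_cleanup() later drops those references again.
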
if (idx >= cs->max_fence_dependencies) { + unsigned size; + const unsigned increment = 8; + + cs->max_fence_dependencies = idx + increment; + size = cs->max_fence_dependencies * sizeof(cs->fence_dependencies[0]); + cs->fence_dependencies = realloc(cs->fence_dependencies, size); + /* Clear the newly-allocated elements. */ + memset(cs->fence_dependencies + idx, 0, + increment * sizeof(cs->fence_dependencies[0])); + } + return idx; +} -static void amdgpu_add_fence_dependency(struct amdgpu_cs *acs, - struct amdgpu_cs_buffer *buffer) +static bool is_noop_fence_dependency(struct amdgpu_cs *acs, + struct amdgpu_fence *fence) +{ + struct amdgpu_cs_context *cs = acs->csc; + + if (!amdgpu_fence_is_syncobj(fence) && + fence->ctx == acs->ctx && + fence->fence.ip_type == cs->ib[IB_MAIN].ip_type && + fence->fence.ip_instance == cs->ib[IB_MAIN].ip_instance && + fence->fence.ring == cs->ib[IB_MAIN].ring) + return true; + + return amdgpu_fence_wait((void *)fence, 0, false); +} + +static void amdgpu_cs_add_fence_dependency(struct radeon_winsys_cs *rws, + struct pipe_fence_handle *pfence) +{ + struct amdgpu_cs *acs = amdgpu_cs(rws); + struct amdgpu_cs_context *cs = acs->csc; + struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence; + + if (is_noop_fence_dependency(acs, fence)) + return; + + unsigned idx = add_fence_dependency_entry(cs); + amdgpu_fence_reference(&cs->fence_dependencies[idx], + (struct pipe_fence_handle*)fence); +} + +static void amdgpu_add_bo_fence_dependencies(struct amdgpu_cs *acs, + struct amdgpu_cs_buffer *buffer) { struct amdgpu_cs_context *cs = acs->csc; struct amdgpu_winsys_bo *bo = buffer->bo; - struct amdgpu_cs_fence *dep; unsigned new_num_fences = 0; for (unsigned j = 0; j < bo->num_fences; ++j) { struct amdgpu_fence *bo_fence = (void *)bo->fences[j]; - unsigned idx; - if (bo_fence->ctx == acs->ctx && - bo_fence->fence.ip_type == cs->request.ip_type && - bo_fence->fence.ip_instance == cs->request.ip_instance && - bo_fence->fence.ring == cs->request.ring) - continue; - - if (amdgpu_fence_wait((void *)bo_fence, 0, false)) + if (is_noop_fence_dependency(acs, bo_fence)) continue; amdgpu_fence_reference(&bo->fences[new_num_fences], bo->fences[j]); @@ -910,21 +1081,9 @@ static void amdgpu_add_fence_dependency(struct amdgpu_cs *acs, if (!(buffer->usage & RADEON_USAGE_SYNCHRONIZED)) continue; - if (bo_fence->submission_in_progress) - os_wait_until_zero(&bo_fence->submission_in_progress, - PIPE_TIMEOUT_INFINITE); - - idx = cs->request.number_of_dependencies++; - if (idx >= cs->max_dependencies) { - unsigned size; - - cs->max_dependencies = idx + 8; - size = cs->max_dependencies * sizeof(struct amdgpu_cs_fence); - cs->request.dependencies = realloc(cs->request.dependencies, size); - } - - dep = &cs->request.dependencies[idx]; - memcpy(dep, &bo_fence->fence, sizeof(*dep)); + unsigned idx = add_fence_dependency_entry(cs); + amdgpu_fence_reference(&cs->fence_dependencies[idx], + (struct pipe_fence_handle*)bo_fence); } for (unsigned j = new_num_fences; j < bo->num_fences; ++j) @@ -933,47 +1092,108 @@ static void amdgpu_add_fence_dependency(struct amdgpu_cs *acs, bo->num_fences = new_num_fences; } -/* Since the kernel driver doesn't synchronize execution between different - * rings automatically, we have to add fence dependencies manually. +/* Add the given list of fences to the buffer's fence list. + * + * Must be called with the winsys bo_fence_lock held. 
*/ -static void amdgpu_add_fence_dependencies(struct amdgpu_cs *acs) -{ - struct amdgpu_cs_context *cs = acs->csc; - int i; - - cs->request.number_of_dependencies = 0; - - for (i = 0; i < cs->num_real_buffers; i++) - amdgpu_add_fence_dependency(acs, &cs->real_buffers[i]); - for (i = 0; i < cs->num_slab_buffers; i++) - amdgpu_add_fence_dependency(acs, &cs->slab_buffers[i]); -} - -static void amdgpu_add_fence(struct amdgpu_winsys_bo *bo, - struct pipe_fence_handle *fence) +void amdgpu_add_fences(struct amdgpu_winsys_bo *bo, + unsigned num_fences, + struct pipe_fence_handle **fences) { - if (bo->num_fences >= bo->max_fences) { - unsigned new_max_fences = MAX2(1, bo->max_fences * 2); + if (bo->num_fences + num_fences > bo->max_fences) { + unsigned new_max_fences = MAX2(bo->num_fences + num_fences, bo->max_fences * 2); struct pipe_fence_handle **new_fences = REALLOC(bo->fences, bo->num_fences * sizeof(*new_fences), new_max_fences * sizeof(*new_fences)); - if (new_fences) { + if (likely(new_fences)) { bo->fences = new_fences; bo->max_fences = new_max_fences; } else { - fprintf(stderr, "amdgpu_add_fence: allocation failure, dropping fence\n"); + unsigned drop; + + fprintf(stderr, "amdgpu_add_fences: allocation failure, dropping fence(s)\n"); if (!bo->num_fences) return; - bo->num_fences--; /* prefer to keep a more recent fence if possible */ + bo->num_fences--; /* prefer to keep the most recent fence if possible */ amdgpu_fence_reference(&bo->fences[bo->num_fences], NULL); + + drop = bo->num_fences + num_fences - bo->max_fences; + num_fences -= drop; + fences += drop; } } - bo->fences[bo->num_fences] = NULL; - amdgpu_fence_reference(&bo->fences[bo->num_fences], fence); - bo->num_fences++; + for (unsigned i = 0; i < num_fences; ++i) { + bo->fences[bo->num_fences] = NULL; + amdgpu_fence_reference(&bo->fences[bo->num_fences], fences[i]); + bo->num_fences++; + } +} + +static void amdgpu_add_fence_dependencies_bo_list(struct amdgpu_cs *acs, + struct pipe_fence_handle *fence, + unsigned num_buffers, + struct amdgpu_cs_buffer *buffers) +{ + for (unsigned i = 0; i < num_buffers; i++) { + struct amdgpu_cs_buffer *buffer = &buffers[i]; + struct amdgpu_winsys_bo *bo = buffer->bo; + + amdgpu_add_bo_fence_dependencies(acs, buffer); + p_atomic_inc(&bo->num_active_ioctls); + amdgpu_add_fences(bo, 1, &fence); + } +} + +/* Since the kernel driver doesn't synchronize execution between different + * rings automatically, we have to add fence dependencies manually. + */ +static void amdgpu_add_fence_dependencies_bo_lists(struct amdgpu_cs *acs) +{ + struct amdgpu_cs_context *cs = acs->csc; + + cs->num_fence_dependencies = 0; + + amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_real_buffers, cs->real_buffers); + amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_slab_buffers, cs->slab_buffers); + amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_sparse_buffers, cs->sparse_buffers); +} + +/* Add backing of sparse buffers to the buffer list. + * + * This is done late, during submission, to keep the buffer list short before + * submit, and to avoid managing fences for the backing buffers. 
+ */ +static bool amdgpu_add_sparse_backing_buffers(struct amdgpu_cs_context *cs) +{ + for (unsigned i = 0; i < cs->num_sparse_buffers; ++i) { + struct amdgpu_cs_buffer *buffer = &cs->sparse_buffers[i]; + struct amdgpu_winsys_bo *bo = buffer->bo; + + mtx_lock(&bo->u.sparse.commit_lock); + + list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) { + /* We can directly add the buffer here, because we know that each + * backing buffer occurs only once. + */ + int idx = amdgpu_do_add_real_buffer(cs, backing->bo); + if (idx < 0) { + fprintf(stderr, "%s: failed to add buffer\n", __FUNCTION__); + mtx_unlock(&bo->u.sparse.commit_lock); + return false; + } + + cs->real_buffers[idx].usage = buffer->usage & ~RADEON_USAGE_SYNCHRONIZED; + cs->real_buffers[idx].u.real.priority_usage = buffer->u.real.priority_usage; + p_atomic_inc(&backing->bo->num_active_ioctls); + } + + mtx_unlock(&bo->u.sparse.commit_lock); + } + + return true; } void amdgpu_cs_submit_ib(void *job, int thread_index) @@ -982,26 +1202,23 @@ void amdgpu_cs_submit_ib(void *job, int thread_index) struct amdgpu_winsys *ws = acs->ctx->ws; struct amdgpu_cs_context *cs = acs->cst; int i, r; - - cs->request.fence_info.handle = NULL; - if (amdgpu_cs_has_user_fence(cs)) { - cs->request.fence_info.handle = acs->ctx->user_fence_bo; - cs->request.fence_info.offset = acs->ring_type; - } + amdgpu_bo_list_handle bo_list = NULL; + uint64_t seq_no = 0; + bool has_user_fence = amdgpu_cs_has_user_fence(cs); /* Create the buffer list. * Use a buffer list containing all allocated buffers if requested. */ - if (debug_get_option_all_bos()) { + if (ws->debug_all_bos) { struct amdgpu_winsys_bo *bo; amdgpu_bo_handle *handles; unsigned num = 0; - pipe_mutex_lock(ws->global_bo_list_lock); + mtx_lock(&ws->global_bo_list_lock); handles = malloc(sizeof(handles[0]) * ws->num_buffers); if (!handles) { - pipe_mutex_unlock(ws->global_bo_list_lock); + mtx_unlock(&ws->global_bo_list_lock); amdgpu_cs_context_cleanup(cs); cs->error_code = -ENOMEM; return; @@ -1013,52 +1230,178 @@ void amdgpu_cs_submit_ib(void *job, int thread_index) } r = amdgpu_bo_list_create(ws->dev, ws->num_buffers, - handles, NULL, - &cs->request.resources); + handles, NULL, &bo_list); free(handles); - pipe_mutex_unlock(ws->global_bo_list_lock); + mtx_unlock(&ws->global_bo_list_lock); } else { - r = amdgpu_bo_list_create(ws->dev, cs->num_real_buffers, - cs->handles, cs->flags, - &cs->request.resources); + unsigned num_handles; + + if (!amdgpu_add_sparse_backing_buffers(cs)) { + r = -ENOMEM; + goto bo_list_error; + } + + if (cs->max_real_submit < cs->num_real_buffers) { + FREE(cs->handles); + FREE(cs->flags); + + cs->handles = MALLOC(sizeof(*cs->handles) * cs->num_real_buffers); + cs->flags = MALLOC(sizeof(*cs->flags) * cs->num_real_buffers); + + if (!cs->handles || !cs->flags) { + cs->max_real_submit = 0; + r = -ENOMEM; + goto bo_list_error; + } + } + + num_handles = 0; + for (i = 0; i < cs->num_real_buffers; ++i) { + struct amdgpu_cs_buffer *buffer = &cs->real_buffers[i]; + + if (buffer->bo->is_local) + continue; + + assert(buffer->u.real.priority_usage != 0); + + cs->handles[num_handles] = buffer->bo->bo; + cs->flags[num_handles] = (util_last_bit64(buffer->u.real.priority_usage) - 1) / 4; + ++num_handles; + } + + if (acs->ring_type == RING_GFX) + ws->gfx_bo_list_counter += cs->num_real_buffers; + + if (num_handles) { + r = amdgpu_bo_list_create(ws->dev, num_handles, + cs->handles, cs->flags, &bo_list); + } else { + r = 0; + } } +bo_list_error: if (r) { fprintf(stderr, 
"amdgpu: buffer list creation failed (%d)\n", r); - cs->request.resources = NULL; amdgpu_fence_signalled(cs->fence); cs->error_code = r; goto cleanup; } - r = amdgpu_cs_submit(acs->ctx->ctx, 0, &cs->request, 1); + if (acs->ctx->num_rejected_cs) { + r = -ECANCELED; + } else { + struct drm_amdgpu_cs_chunk chunks[4]; + unsigned num_chunks = 0; + + /* Convert from dwords to bytes. */ + cs->ib[IB_MAIN].ib_bytes *= 4; + + /* IB */ + chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB; + chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4; + chunks[num_chunks].chunk_data = (uintptr_t)&cs->ib[IB_MAIN]; + num_chunks++; + + /* Fence */ + if (has_user_fence) { + chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_FENCE; + chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_fence) / 4; + chunks[num_chunks].chunk_data = (uintptr_t)&acs->fence_chunk; + num_chunks++; + } + + /* Dependencies */ + unsigned num_dependencies = cs->num_fence_dependencies; + unsigned num_syncobj_dependencies = 0; + + if (num_dependencies) { + struct drm_amdgpu_cs_chunk_dep *dep_chunk = + alloca(num_dependencies * sizeof(*dep_chunk)); + unsigned num = 0; + + for (unsigned i = 0; i < num_dependencies; i++) { + struct amdgpu_fence *fence = + (struct amdgpu_fence*)cs->fence_dependencies[i]; + + if (amdgpu_fence_is_syncobj(fence)) { + num_syncobj_dependencies++; + continue; + } + + assert(!fence->submission_in_progress); + amdgpu_cs_chunk_fence_to_dep(&fence->fence, &dep_chunk[num++]); + } + + chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_DEPENDENCIES; + chunks[num_chunks].length_dw = sizeof(dep_chunk[0]) / 4 * num; + chunks[num_chunks].chunk_data = (uintptr_t)dep_chunk; + num_chunks++; + } + + /* Syncobj dependencies. */ + if (num_syncobj_dependencies) { + struct drm_amdgpu_cs_chunk_sem *sem_chunk = + alloca(num_syncobj_dependencies * sizeof(sem_chunk[0])); + unsigned num = 0; + + for (unsigned i = 0; i < num_dependencies; i++) { + struct amdgpu_fence *fence = + (struct amdgpu_fence*)cs->fence_dependencies[i]; + + if (!amdgpu_fence_is_syncobj(fence)) + continue; + + assert(!fence->submission_in_progress); + sem_chunk[num++].handle = fence->syncobj; + } + + chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_IN; + chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num; + chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk; + num_chunks++; + } + + assert(num_chunks <= ARRAY_SIZE(chunks)); + + r = amdgpu_cs_submit_raw(ws->dev, acs->ctx->ctx, bo_list, + num_chunks, chunks, &seq_no); + } + cs->error_code = r; if (r) { if (r == -ENOMEM) fprintf(stderr, "amdgpu: Not enough memory for command submission.\n"); + else if (r == -ECANCELED) + fprintf(stderr, "amdgpu: The CS has been cancelled because the context is lost.\n"); else fprintf(stderr, "amdgpu: The CS has been rejected, " "see dmesg for more information (%i).\n", r); amdgpu_fence_signalled(cs->fence); + + acs->ctx->num_rejected_cs++; + ws->num_total_rejected_cs++; } else { /* Success. */ uint64_t *user_fence = NULL; - if (amdgpu_cs_has_user_fence(cs)) - user_fence = acs->ctx->user_fence_cpu_address_base + - cs->request.fence_info.offset; - amdgpu_fence_submitted(cs->fence, &cs->request, user_fence); + + if (has_user_fence) + user_fence = acs->ctx->user_fence_cpu_address_base + acs->ring_type; + amdgpu_fence_submitted(cs->fence, seq_no, user_fence); } /* Cleanup. 
*/ - if (cs->request.resources) - amdgpu_bo_list_destroy(cs->request.resources); + if (bo_list) + amdgpu_bo_list_destroy(bo_list); cleanup: for (i = 0; i < cs->num_real_buffers; i++) p_atomic_dec(&cs->real_buffers[i].bo->num_active_ioctls); for (i = 0; i < cs->num_slab_buffers; i++) p_atomic_dec(&cs->slab_buffers[i].bo->num_active_ioctls); + for (i = 0; i < cs->num_sparse_buffers; i++) + p_atomic_dec(&cs->sparse_buffers[i].bo->num_active_ioctls); amdgpu_cs_context_cleanup(cs); } @@ -1067,11 +1410,9 @@ cleanup: void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs) { struct amdgpu_cs *cs = amdgpu_cs(rcs); - struct amdgpu_winsys *ws = cs->ctx->ws; /* Wait for any pending ioctl of this CS to complete. */ - if (util_queue_is_initialized(&ws->cs_queue)) - util_queue_job_wait(&cs->flush_completed); + util_queue_fence_wait(&cs->flush_completed); } static int amdgpu_cs_flush(struct radeon_winsys_cs *rcs, @@ -1104,20 +1445,16 @@ static int amdgpu_cs_flush(struct radeon_winsys_cs *rcs, while (rcs->current.cdw & 7) radeon_emit(rcs, 0xffff1000); /* type3 nop packet */ } - - /* Also pad the const IB. */ - if (cs->const_ib.ib_mapped) - while (!cs->const_ib.base.current.cdw || (cs->const_ib.base.current.cdw & 7)) - radeon_emit(&cs->const_ib.base, 0xffff1000); /* type3 nop packet */ - - if (cs->const_preamble_ib.ib_mapped) - while (!cs->const_preamble_ib.base.current.cdw || (cs->const_preamble_ib.base.current.cdw & 7)) - radeon_emit(&cs->const_preamble_ib.base, 0xffff1000); + ws->gfx_ib_size_counter += (rcs->prev_dw + rcs->current.cdw) * 4; break; case RING_UVD: while (rcs->current.cdw & 15) radeon_emit(rcs, 0x80000000); /* type2 nop packet */ break; + case RING_VCN_DEC: + while (rcs->current.cdw & 15) + radeon_emit(rcs, 0x81ff); /* nop packet */ + break; default: break; } @@ -1127,20 +1464,13 @@ static int amdgpu_cs_flush(struct radeon_winsys_cs *rcs, } /* If the CS is not empty or overflowed.... */ - if (radeon_emitted(&cs->main.base, 0) && + if (likely(radeon_emitted(&cs->main.base, 0) && cs->main.base.current.cdw <= cs->main.base.current.max_dw && - !debug_get_option_noop()) { + !debug_get_option_noop())) { struct amdgpu_cs_context *cur = cs->csc; - unsigned i, num_buffers; /* Set IB sizes. */ - amdgpu_ib_finalize(&cs->main); - - if (cs->const_ib.ib_mapped) - amdgpu_ib_finalize(&cs->const_ib); - - if (cs->const_preamble_ib.ib_mapped) - amdgpu_ib_finalize(&cs->const_preamble_ib); + amdgpu_ib_finalize(ws, &cs->main); /* Create a fence. */ amdgpu_fence_reference(&cur->fence, NULL); @@ -1150,61 +1480,52 @@ static int amdgpu_cs_flush(struct radeon_winsys_cs *rcs, cs->next_fence = NULL; } else { cur->fence = amdgpu_fence_create(cs->ctx, - cur->request.ip_type, - cur->request.ip_instance, - cur->request.ring); + cur->ib[IB_MAIN].ip_type, + cur->ib[IB_MAIN].ip_instance, + cur->ib[IB_MAIN].ring); } if (fence) amdgpu_fence_reference(fence, cur->fence); - /* Prepare buffers. */ - pipe_mutex_lock(ws->bo_fence_lock); - amdgpu_add_fence_dependencies(cs); - - num_buffers = cur->num_real_buffers; - for (i = 0; i < num_buffers; i++) { - struct amdgpu_winsys_bo *bo = cur->real_buffers[i].bo; - p_atomic_inc(&bo->num_active_ioctls); - amdgpu_add_fence(bo, cur->fence); - } - - num_buffers = cur->num_slab_buffers; - for (i = 0; i < num_buffers; i++) { - struct amdgpu_winsys_bo *bo = cur->slab_buffers[i].bo; - p_atomic_inc(&bo->num_active_ioctls); - amdgpu_add_fence(bo, cur->fence); - } - pipe_mutex_unlock(ws->bo_fence_lock); - amdgpu_cs_sync_flush(rcs); + /* Prepare buffers. 
+ * + * This fence must be held until the submission is queued to ensure + * that the order of fence dependency updates matches the order of + * submissions. + */ + mtx_lock(&ws->bo_fence_lock); + amdgpu_add_fence_dependencies_bo_lists(cs); + /* Swap command streams. "cst" is going to be submitted. */ cs->csc = cs->cst; cs->cst = cur; /* Submit. */ - if ((flags & RADEON_FLUSH_ASYNC) && - util_queue_is_initialized(&ws->cs_queue)) { - util_queue_add_job(&ws->cs_queue, cs, &cs->flush_completed, - amdgpu_cs_submit_ib, NULL); - } else { - amdgpu_cs_submit_ib(cs, 0); - error_code = cs->cst->error_code; + util_queue_add_job(&ws->cs_queue, cs, &cs->flush_completed, + amdgpu_cs_submit_ib, NULL); + /* The submission has been queued, unlock the fence now. */ + mtx_unlock(&ws->bo_fence_lock); + + if (!(flags & RADEON_FLUSH_ASYNC)) { + amdgpu_cs_sync_flush(rcs); + error_code = cur->error_code; } } else { amdgpu_cs_context_cleanup(cs->csc); } amdgpu_get_new_ib(&ws->base, cs, IB_MAIN); - if (cs->const_ib.ib_mapped) - amdgpu_get_new_ib(&ws->base, cs, IB_CONST); - if (cs->const_preamble_ib.ib_mapped) - amdgpu_get_new_ib(&ws->base, cs, IB_CONST_PREAMBLE); cs->main.base.used_gart = 0; cs->main.base.used_vram = 0; - ws->num_cs_flushes++; + if (cs->ring_type == RING_GFX) + ws->num_gfx_IBs++; + else if (cs->ring_type == RING_DMA) + ws->num_sdma_IBs++; + return error_code; } @@ -1217,10 +1538,6 @@ static void amdgpu_cs_destroy(struct radeon_winsys_cs *rcs) p_atomic_dec(&cs->ctx->ws->num_cs); pb_reference(&cs->main.big_ib_buffer, NULL); FREE(cs->main.base.prev); - pb_reference(&cs->const_ib.big_ib_buffer, NULL); - FREE(cs->const_ib.base.prev); - pb_reference(&cs->const_preamble_ib.big_ib_buffer, NULL); - FREE(cs->const_preamble_ib.base.prev); amdgpu_destroy_cs_context(&cs->csc1); amdgpu_destroy_cs_context(&cs->csc2); amdgpu_fence_reference(&cs->next_fence, NULL); @@ -1243,8 +1560,6 @@ void amdgpu_cs_init_functions(struct amdgpu_winsys *ws) ws->base.ctx_destroy = amdgpu_ctx_destroy; ws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status; ws->base.cs_create = amdgpu_cs_create; - ws->base.cs_add_const_ib = amdgpu_cs_add_const_ib; - ws->base.cs_add_const_preamble_ib = amdgpu_cs_add_const_preamble_ib; ws->base.cs_destroy = amdgpu_cs_destroy; ws->base.cs_add_buffer = amdgpu_cs_add_buffer; ws->base.cs_validate = amdgpu_cs_validate; @@ -1254,6 +1569,10 @@ void amdgpu_cs_init_functions(struct amdgpu_winsys *ws) ws->base.cs_get_next_fence = amdgpu_cs_get_next_fence; ws->base.cs_is_buffer_referenced = amdgpu_bo_is_referenced; ws->base.cs_sync_flush = amdgpu_cs_sync_flush; + ws->base.cs_add_fence_dependency = amdgpu_cs_add_fence_dependency; ws->base.fence_wait = amdgpu_fence_wait_rel_timeout; ws->base.fence_reference = amdgpu_fence_reference; + ws->base.fence_import_sync_file = amdgpu_fence_import_sync_file; + ws->base.fence_export_sync_file = amdgpu_fence_export_sync_file; + ws->base.export_signalled_sync_file = amdgpu_export_signalled_sync_file; } diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h index 5f181a5da..1c3d0f0be 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h @@ -34,6 +34,7 @@ #include "amdgpu_bo.h" #include "util/u_memory.h" +#include <amdgpu_drm.h> struct amdgpu_ctx { struct amdgpu_winsys *ws; @@ -41,6 +42,8 @@ struct amdgpu_ctx { amdgpu_bo_handle user_fence_bo; uint64_t *user_fence_cpu_address_base; int refcount; + unsigned initial_num_total_rejected_cs; + 
unsigned num_rejected_cs; }; struct amdgpu_cs_buffer { @@ -57,10 +60,8 @@ struct amdgpu_cs_buffer { }; enum ib_type { - IB_CONST_PREAMBLE = 0, - IB_CONST = 1, /* the const IB must be first */ - IB_MAIN = 2, - IB_NUM + IB_MAIN, + IB_NUM, }; struct amdgpu_ib { @@ -72,27 +73,40 @@ struct amdgpu_ib { unsigned used_ib_space; unsigned max_ib_size; uint32_t *ptr_ib_size; + bool ptr_ib_size_inside_ib; enum ib_type ib_type; }; struct amdgpu_cs_context { - struct amdgpu_cs_request request; - struct amdgpu_cs_ib_info ib[IB_NUM]; + struct drm_amdgpu_cs_chunk_ib ib[IB_NUM]; /* Buffers. */ unsigned max_real_buffers; unsigned num_real_buffers; + struct amdgpu_cs_buffer *real_buffers; + + unsigned max_real_submit; amdgpu_bo_handle *handles; uint8_t *flags; - struct amdgpu_cs_buffer *real_buffers; unsigned num_slab_buffers; unsigned max_slab_buffers; struct amdgpu_cs_buffer *slab_buffers; + unsigned num_sparse_buffers; + unsigned max_sparse_buffers; + struct amdgpu_cs_buffer *sparse_buffers; + int buffer_indices_hashlist[4096]; - unsigned max_dependencies; + struct amdgpu_winsys_bo *last_added_bo; + unsigned last_added_bo_index; + unsigned last_added_bo_usage; + uint64_t last_added_bo_priority_usage; + + struct pipe_fence_handle **fence_dependencies; + unsigned num_fence_dependencies; + unsigned max_fence_dependencies; struct pipe_fence_handle *fence; @@ -102,10 +116,9 @@ struct amdgpu_cs_context { struct amdgpu_cs { struct amdgpu_ib main; /* must be first because this is inherited */ - struct amdgpu_ib const_ib; /* optional constant engine IB */ - struct amdgpu_ib const_preamble_ib; struct amdgpu_ctx *ctx; enum ring_type ring_type; + struct drm_amdgpu_cs_chunk_fence fence_chunk; /* We flip between these two CS. While one is being consumed * by the kernel in another thread, the other one is being filled @@ -127,7 +140,10 @@ struct amdgpu_cs { struct amdgpu_fence { struct pipe_reference reference; + /* If ctx == NULL, this fence is syncobj-based. */ + uint32_t syncobj; + struct amdgpu_winsys *ws; struct amdgpu_ctx *ctx; /* submission context */ struct amdgpu_cs_fence fence; uint64_t *user_fence_cpu_address; @@ -138,6 +154,11 @@ struct amdgpu_fence { volatile int signalled; /* bool (int for atomicity) */ }; +static inline bool amdgpu_fence_is_syncobj(struct amdgpu_fence *fence) +{ + return fence->ctx == NULL; +} + static inline void amdgpu_ctx_unref(struct amdgpu_ctx *ctx) { if (p_atomic_dec_zero(&ctx->refcount)) { @@ -154,8 +175,14 @@ static inline void amdgpu_fence_reference(struct pipe_fence_handle **dst, struct amdgpu_fence *rsrc = (struct amdgpu_fence *)src; if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) { - amdgpu_ctx_unref((*rdst)->ctx); - FREE(*rdst); + struct amdgpu_fence *fence = *rdst; + + if (amdgpu_fence_is_syncobj(fence)) + amdgpu_cs_destroy_syncobj(fence->ws->dev, fence->syncobj); + else + amdgpu_ctx_unref(fence->ctx); + + FREE(fence); } *rdst = rsrc; } @@ -184,10 +211,6 @@ amdgpu_cs_from_ib(struct amdgpu_ib *ib) switch (ib->ib_type) { case IB_MAIN: return get_container(ib, struct amdgpu_cs, main); - case IB_CONST: - return get_container(ib, struct amdgpu_cs, const_ib); - case IB_CONST_PREAMBLE: - return get_container(ib, struct amdgpu_cs, const_preamble_ib); default: unreachable("bad ib_type"); } @@ -217,8 +240,9 @@ amdgpu_bo_is_referenced_by_cs_with_usage(struct amdgpu_cs *cs, if (index == -1) return false; - buffer = bo->bo ? &cs->csc->real_buffers[index] - : &cs->csc->slab_buffers[index]; + buffer = bo->bo ? &cs->csc->real_buffers[index] : + bo->sparse ? 
&cs->csc->sparse_buffers[index] : + &cs->csc->slab_buffers[index]; return (buffer->usage & usage) != 0; } @@ -231,6 +255,9 @@ amdgpu_bo_is_referenced_by_any_cs(struct amdgpu_winsys_bo *bo) bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout, bool absolute); +void amdgpu_add_fences(struct amdgpu_winsys_bo *bo, + unsigned num_fences, + struct pipe_fence_handle **fences); void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs); void amdgpu_cs_init_functions(struct amdgpu_winsys *ws); void amdgpu_cs_submit_ib(void *job, int thread_index); diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_public.h b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_public.h index ad133b20b..8702e4f6e 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_public.h +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_public.h @@ -31,10 +31,13 @@ struct radeon_winsys; struct pipe_screen; +struct pipe_screen_config; -typedef struct pipe_screen *(*radeon_screen_create_t)(struct radeon_winsys *); +typedef struct pipe_screen *(*radeon_screen_create_t)(struct radeon_winsys *, + const struct pipe_screen_config *config); struct radeon_winsys * -amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create); +amdgpu_winsys_create(int fd, const struct pipe_screen_config *config, + radeon_screen_create_t screen_create); #endif diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c index c5462bc0e..99e4d778d 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c @@ -30,65 +30,32 @@ */ #include "amdgpu_winsys.h" +#include "util/u_format.h" -#ifndef NO_ENTRIES -#define NO_ENTRIES 32 -#endif - -#ifndef NO_MACRO_ENTRIES -#define NO_MACRO_ENTRIES 16 -#endif - -#ifndef CIASICIDGFXENGINE_SOUTHERNISLAND -#define CIASICIDGFXENGINE_SOUTHERNISLAND 0x0000000A -#endif - - -static int amdgpu_surface_sanity(const struct radeon_surf *surf) +static int amdgpu_surface_sanity(const struct pipe_resource *tex) { - unsigned type = RADEON_SURF_GET(surf->flags, TYPE); - - if (!(surf->flags & RADEON_SURF_HAS_TILE_MODE_INDEX)) - return -EINVAL; - - /* all dimension must be at least 1 ! 
*/ - if (!surf->npix_x || !surf->npix_y || !surf->npix_z || - !surf->array_size) - return -EINVAL; - - if (!surf->blk_w || !surf->blk_h || !surf->blk_d) - return -EINVAL; - - switch (surf->nsamples) { - case 1: - case 2: - case 4: - case 8: - break; - default: - return -EINVAL; - } - - switch (type) { - case RADEON_SURF_TYPE_1D: - if (surf->npix_y > 1) + switch (tex->target) { + case PIPE_TEXTURE_1D: + if (tex->height0 > 1) return -EINVAL; /* fall through */ - case RADEON_SURF_TYPE_2D: - case RADEON_SURF_TYPE_CUBEMAP: - if (surf->npix_z > 1 || surf->array_size > 1) + case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: + if (tex->depth0 > 1 || tex->array_size > 1) return -EINVAL; break; - case RADEON_SURF_TYPE_3D: - if (surf->array_size > 1) + case PIPE_TEXTURE_3D: + if (tex->array_size > 1) return -EINVAL; break; - case RADEON_SURF_TYPE_1D_ARRAY: - if (surf->npix_y > 1) + case PIPE_TEXTURE_1D_ARRAY: + if (tex->height0 > 1) return -EINVAL; /* fall through */ - case RADEON_SURF_TYPE_2D_ARRAY: - if (surf->npix_z > 1) + case PIPE_TEXTURE_CUBE: + case PIPE_TEXTURE_2D_ARRAY: + case PIPE_TEXTURE_CUBE_ARRAY: + if (tex->depth0 > 1) return -EINVAL; break; default: @@ -97,494 +64,50 @@ static int amdgpu_surface_sanity(const struct radeon_surf *surf) return 0; } -static void *ADDR_API allocSysMem(const ADDR_ALLOCSYSMEM_INPUT * pInput) -{ - return malloc(pInput->sizeInBytes); -} - -static ADDR_E_RETURNCODE ADDR_API freeSysMem(const ADDR_FREESYSMEM_INPUT * pInput) -{ - free(pInput->pVirtAddr); - return ADDR_OK; -} - -ADDR_HANDLE amdgpu_addr_create(struct amdgpu_winsys *ws) -{ - ADDR_CREATE_INPUT addrCreateInput = {0}; - ADDR_CREATE_OUTPUT addrCreateOutput = {0}; - ADDR_REGISTER_VALUE regValue = {0}; - ADDR_CREATE_FLAGS createFlags = {{0}}; - ADDR_E_RETURNCODE addrRet; - - addrCreateInput.size = sizeof(ADDR_CREATE_INPUT); - addrCreateOutput.size = sizeof(ADDR_CREATE_OUTPUT); - - regValue.noOfBanks = ws->amdinfo.mc_arb_ramcfg & 0x3; - regValue.gbAddrConfig = ws->amdinfo.gb_addr_cfg; - regValue.noOfRanks = (ws->amdinfo.mc_arb_ramcfg & 0x4) >> 2; - - regValue.backendDisables = ws->amdinfo.backend_disable[0]; - regValue.pTileConfig = ws->amdinfo.gb_tile_mode; - regValue.noOfEntries = ARRAY_SIZE(ws->amdinfo.gb_tile_mode); - if (ws->info.chip_class == SI) { - regValue.pMacroTileConfig = NULL; - regValue.noOfMacroEntries = 0; - } else { - regValue.pMacroTileConfig = ws->amdinfo.gb_macro_tile_mode; - regValue.noOfMacroEntries = ARRAY_SIZE(ws->amdinfo.gb_macro_tile_mode); - } - - createFlags.value = 0; - createFlags.useTileIndex = 1; - createFlags.degradeBaseLevel = 1; - createFlags.useHtileSliceAlign = 1; - - addrCreateInput.chipEngine = CIASICIDGFXENGINE_SOUTHERNISLAND; - addrCreateInput.chipFamily = ws->family; - addrCreateInput.chipRevision = ws->rev_id; - addrCreateInput.createFlags = createFlags; - addrCreateInput.callbacks.allocSysMem = allocSysMem; - addrCreateInput.callbacks.freeSysMem = freeSysMem; - addrCreateInput.callbacks.debugPrint = 0; - addrCreateInput.regValue = regValue; - - addrRet = AddrCreate(&addrCreateInput, &addrCreateOutput); - if (addrRet != ADDR_OK) - return NULL; - - return addrCreateOutput.hLib; -} - -static int compute_level(struct amdgpu_winsys *ws, - struct radeon_surf *surf, bool is_stencil, - unsigned level, unsigned type, bool compressed, - ADDR_COMPUTE_SURFACE_INFO_INPUT *AddrSurfInfoIn, - ADDR_COMPUTE_SURFACE_INFO_OUTPUT *AddrSurfInfoOut, - ADDR_COMPUTE_DCCINFO_INPUT *AddrDccIn, - ADDR_COMPUTE_DCCINFO_OUTPUT *AddrDccOut, - ADDR_COMPUTE_HTILE_INFO_INPUT *AddrHtileIn, - 
ADDR_COMPUTE_HTILE_INFO_OUTPUT *AddrHtileOut) -{ - struct radeon_surf_level *surf_level; - ADDR_E_RETURNCODE ret; - - AddrSurfInfoIn->mipLevel = level; - AddrSurfInfoIn->width = u_minify(surf->npix_x, level); - AddrSurfInfoIn->height = u_minify(surf->npix_y, level); - - if (type == RADEON_SURF_TYPE_3D) - AddrSurfInfoIn->numSlices = u_minify(surf->npix_z, level); - else if (type == RADEON_SURF_TYPE_CUBEMAP) - AddrSurfInfoIn->numSlices = 6; - else - AddrSurfInfoIn->numSlices = surf->array_size; - - if (level > 0) { - /* Set the base level pitch. This is needed for calculation - * of non-zero levels. */ - if (is_stencil) - AddrSurfInfoIn->basePitch = surf->stencil_level[0].nblk_x; - else - AddrSurfInfoIn->basePitch = surf->level[0].nblk_x; - - /* Convert blocks to pixels for compressed formats. */ - if (compressed) - AddrSurfInfoIn->basePitch *= surf->blk_w; - } - - ret = AddrComputeSurfaceInfo(ws->addrlib, - AddrSurfInfoIn, - AddrSurfInfoOut); - if (ret != ADDR_OK) { - return ret; - } - - surf_level = is_stencil ? &surf->stencil_level[level] : &surf->level[level]; - surf_level->offset = align64(surf->bo_size, AddrSurfInfoOut->baseAlign); - surf_level->slice_size = AddrSurfInfoOut->sliceSize; - surf_level->pitch_bytes = AddrSurfInfoOut->pitch * (is_stencil ? 1 : surf->bpe); - surf_level->npix_x = u_minify(surf->npix_x, level); - surf_level->npix_y = u_minify(surf->npix_y, level); - surf_level->npix_z = u_minify(surf->npix_z, level); - surf_level->nblk_x = AddrSurfInfoOut->pitch; - surf_level->nblk_y = AddrSurfInfoOut->height; - if (type == RADEON_SURF_TYPE_3D) - surf_level->nblk_z = AddrSurfInfoOut->depth; - else - surf_level->nblk_z = 1; - - switch (AddrSurfInfoOut->tileMode) { - case ADDR_TM_LINEAR_ALIGNED: - surf_level->mode = RADEON_SURF_MODE_LINEAR_ALIGNED; - break; - case ADDR_TM_1D_TILED_THIN1: - surf_level->mode = RADEON_SURF_MODE_1D; - break; - case ADDR_TM_2D_TILED_THIN1: - surf_level->mode = RADEON_SURF_MODE_2D; - break; - default: - assert(0); - } - - if (is_stencil) - surf->stencil_tiling_index[level] = AddrSurfInfoOut->tileIndex; - else - surf->tiling_index[level] = AddrSurfInfoOut->tileIndex; - - surf->bo_size = surf_level->offset + AddrSurfInfoOut->surfSize; - - /* Clear DCC fields at the beginning. */ - surf_level->dcc_offset = 0; - surf_level->dcc_enabled = false; - - /* The previous level's flag tells us if we can use DCC for this level. */ - if (AddrSurfInfoIn->flags.dccCompatible && - (level == 0 || AddrDccOut->subLvlCompressible)) { - AddrDccIn->colorSurfSize = AddrSurfInfoOut->surfSize; - AddrDccIn->tileMode = AddrSurfInfoOut->tileMode; - AddrDccIn->tileInfo = *AddrSurfInfoOut->pTileInfo; - AddrDccIn->tileIndex = AddrSurfInfoOut->tileIndex; - AddrDccIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex; - - ret = AddrComputeDccInfo(ws->addrlib, - AddrDccIn, - AddrDccOut); - - if (ret == ADDR_OK) { - surf_level->dcc_offset = surf->dcc_size; - surf_level->dcc_fast_clear_size = AddrDccOut->dccFastClearSize; - surf_level->dcc_enabled = true; - surf->dcc_size = surf_level->dcc_offset + AddrDccOut->dccRamSize; - surf->dcc_alignment = MAX2(surf->dcc_alignment, AddrDccOut->dccRamBaseAlign); - } - } - - /* TC-compatible HTILE. 
*/ - if (!is_stencil && - AddrSurfInfoIn->flags.depth && - AddrSurfInfoIn->flags.tcCompatible && - surf_level->mode == RADEON_SURF_MODE_2D && - level == 0) { - AddrHtileIn->flags.tcCompatible = 1; - AddrHtileIn->pitch = AddrSurfInfoOut->pitch; - AddrHtileIn->height = AddrSurfInfoOut->height; - AddrHtileIn->numSlices = AddrSurfInfoOut->depth; - AddrHtileIn->blockWidth = ADDR_HTILE_BLOCKSIZE_8; - AddrHtileIn->blockHeight = ADDR_HTILE_BLOCKSIZE_8; - AddrHtileIn->pTileInfo = AddrSurfInfoOut->pTileInfo; - AddrHtileIn->tileIndex = AddrSurfInfoOut->tileIndex; - AddrHtileIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex; - - ret = AddrComputeHtileInfo(ws->addrlib, - AddrHtileIn, - AddrHtileOut); - - if (ret == ADDR_OK) { - surf->htile_size = AddrHtileOut->htileBytes; - surf->htile_alignment = AddrHtileOut->baseAlign; - } - } - - return 0; -} - -#define G_009910_MICRO_TILE_MODE(x) (((x) >> 0) & 0x03) -#define G_009910_MICRO_TILE_MODE_NEW(x) (((x) >> 22) & 0x07) - -static void set_micro_tile_mode(struct radeon_surf *surf, - struct radeon_info *info) -{ - uint32_t tile_mode = info->si_tile_mode_array[surf->tiling_index[0]]; - - if (info->chip_class >= CIK) - surf->micro_tile_mode = G_009910_MICRO_TILE_MODE_NEW(tile_mode); - else - surf->micro_tile_mode = G_009910_MICRO_TILE_MODE(tile_mode); -} - -static unsigned cik_get_macro_tile_index(struct radeon_surf *surf) -{ - unsigned index, tileb; - - tileb = 8 * 8 * surf->bpe; - tileb = MIN2(surf->tile_split, tileb); - - for (index = 0; tileb > 64; index++) - tileb >>= 1; - - assert(index < 16); - return index; -} - static int amdgpu_surface_init(struct radeon_winsys *rws, + const struct pipe_resource *tex, + unsigned flags, unsigned bpe, + enum radeon_surf_mode mode, struct radeon_surf *surf) { struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws; - unsigned level, mode, type; - bool compressed; - ADDR_COMPUTE_SURFACE_INFO_INPUT AddrSurfInfoIn = {0}; - ADDR_COMPUTE_SURFACE_INFO_OUTPUT AddrSurfInfoOut = {0}; - ADDR_COMPUTE_DCCINFO_INPUT AddrDccIn = {0}; - ADDR_COMPUTE_DCCINFO_OUTPUT AddrDccOut = {0}; - ADDR_COMPUTE_HTILE_INFO_INPUT AddrHtileIn = {0}; - ADDR_COMPUTE_HTILE_INFO_OUTPUT AddrHtileOut = {0}; - ADDR_TILEINFO AddrTileInfoIn = {0}; - ADDR_TILEINFO AddrTileInfoOut = {0}; int r; - r = amdgpu_surface_sanity(surf); + r = amdgpu_surface_sanity(tex); if (r) return r; - AddrSurfInfoIn.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_INPUT); - AddrSurfInfoOut.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_OUTPUT); - AddrDccIn.size = sizeof(ADDR_COMPUTE_DCCINFO_INPUT); - AddrDccOut.size = sizeof(ADDR_COMPUTE_DCCINFO_OUTPUT); - AddrHtileIn.size = sizeof(ADDR_COMPUTE_HTILE_INFO_INPUT); - AddrHtileOut.size = sizeof(ADDR_COMPUTE_HTILE_INFO_OUTPUT); - AddrSurfInfoOut.pTileInfo = &AddrTileInfoOut; - - type = RADEON_SURF_GET(surf->flags, TYPE); - mode = RADEON_SURF_GET(surf->flags, MODE); - compressed = surf->blk_w == 4 && surf->blk_h == 4; - - /* MSAA and FMASK require 2D tiling. */ - if (surf->nsamples > 1 || - (surf->flags & RADEON_SURF_FMASK)) - mode = RADEON_SURF_MODE_2D; - - /* DB doesn't support linear layouts. */ - if (surf->flags & (RADEON_SURF_Z_OR_SBUFFER) && - mode < RADEON_SURF_MODE_1D) - mode = RADEON_SURF_MODE_1D; - - /* Set the requested tiling mode. 
*/ - switch (mode) { - case RADEON_SURF_MODE_LINEAR_ALIGNED: - AddrSurfInfoIn.tileMode = ADDR_TM_LINEAR_ALIGNED; - break; - case RADEON_SURF_MODE_1D: - AddrSurfInfoIn.tileMode = ADDR_TM_1D_TILED_THIN1; - break; - case RADEON_SURF_MODE_2D: - AddrSurfInfoIn.tileMode = ADDR_TM_2D_TILED_THIN1; - break; - default: - assert(0); - } - - /* The format must be set correctly for the allocation of compressed - * textures to work. In other cases, setting the bpp is sufficient. */ - if (compressed) { - switch (surf->bpe) { - case 8: - AddrSurfInfoIn.format = ADDR_FMT_BC1; - break; - case 16: - AddrSurfInfoIn.format = ADDR_FMT_BC3; - break; - default: - assert(0); - } - } - else { - AddrDccIn.bpp = AddrSurfInfoIn.bpp = surf->bpe * 8; - } - - AddrDccIn.numSamples = AddrSurfInfoIn.numSamples = surf->nsamples; - AddrSurfInfoIn.tileIndex = -1; - - /* Set the micro tile type. */ - if (surf->flags & RADEON_SURF_SCANOUT) - AddrSurfInfoIn.tileType = ADDR_DISPLAYABLE; - else if (surf->flags & RADEON_SURF_Z_OR_SBUFFER) - AddrSurfInfoIn.tileType = ADDR_DEPTH_SAMPLE_ORDER; - else - AddrSurfInfoIn.tileType = ADDR_NON_DISPLAYABLE; - - AddrSurfInfoIn.flags.color = !(surf->flags & RADEON_SURF_Z_OR_SBUFFER); - AddrSurfInfoIn.flags.depth = (surf->flags & RADEON_SURF_ZBUFFER) != 0; - AddrSurfInfoIn.flags.cube = type == RADEON_SURF_TYPE_CUBEMAP; - AddrSurfInfoIn.flags.display = (surf->flags & RADEON_SURF_SCANOUT) != 0; - AddrSurfInfoIn.flags.pow2Pad = surf->last_level > 0; - AddrSurfInfoIn.flags.tcCompatible = (surf->flags & RADEON_SURF_TC_COMPATIBLE_HTILE) != 0; - - /* Only degrade the tile mode for space if TC-compatible HTILE hasn't been - * requested, because TC-compatible HTILE requires 2D tiling. + surf->blk_w = util_format_get_blockwidth(tex->format); + surf->blk_h = util_format_get_blockheight(tex->format); + surf->bpe = bpe; + surf->flags = flags; + + struct ac_surf_config config; + + config.info.width = tex->width0; + config.info.height = tex->height0; + config.info.depth = tex->depth0; + config.info.array_size = tex->array_size; + config.info.samples = tex->nr_samples; + config.info.levels = tex->last_level + 1; + config.is_3d = !!(tex->target == PIPE_TEXTURE_3D); + config.is_cube = !!(tex->target == PIPE_TEXTURE_CUBE); + + /* Use different surface counters for color and FMASK, so that MSAA MRTs + * always use consecutive surface indices when FMASK is allocated between + * them. */ - AddrSurfInfoIn.flags.degrade4Space = !AddrSurfInfoIn.flags.tcCompatible; - - /* DCC notes: - * - If we add MSAA support, keep in mind that CB can't decompress 8bpp - * with samples >= 4. - * - Mipmapped array textures have low performance (discovered by a closed - * driver team). - */ - AddrSurfInfoIn.flags.dccCompatible = ws->info.chip_class >= VI && - !(surf->flags & RADEON_SURF_Z_OR_SBUFFER) && - !(surf->flags & RADEON_SURF_DISABLE_DCC) && - !compressed && AddrDccIn.numSamples <= 1 && - ((surf->array_size == 1 && surf->npix_z == 1) || - surf->last_level == 0); - - AddrSurfInfoIn.flags.noStencil = (surf->flags & RADEON_SURF_SBUFFER) == 0; - AddrSurfInfoIn.flags.compressZ = AddrSurfInfoIn.flags.depth; - - /* noStencil = 0 can result in a depth part that is incompatible with - * mipmapped texturing. So set noStencil = 1 when mipmaps are requested (in - * this case, we may end up setting stencil_adjusted). - * - * TODO: update addrlib to a newer version, remove this, and - * use flags.matchStencilTileCfg = 1 as an alternative fix. 
- */ - if (surf->last_level > 0) - AddrSurfInfoIn.flags.noStencil = 1; - - /* Set preferred macrotile parameters. This is usually required - * for shared resources. This is for 2D tiling only. */ - if (AddrSurfInfoIn.tileMode >= ADDR_TM_2D_TILED_THIN1 && - surf->bankw && surf->bankh && surf->mtilea && surf->tile_split) { - /* If any of these parameters are incorrect, the calculation - * will fail. */ - AddrTileInfoIn.banks = surf->num_banks; - AddrTileInfoIn.bankWidth = surf->bankw; - AddrTileInfoIn.bankHeight = surf->bankh; - AddrTileInfoIn.macroAspectRatio = surf->mtilea; - AddrTileInfoIn.tileSplitBytes = surf->tile_split; - AddrTileInfoIn.pipeConfig = surf->pipe_config + 1; /* +1 compared to GB_TILE_MODE */ - AddrSurfInfoIn.flags.degrade4Space = 0; - AddrSurfInfoIn.pTileInfo = &AddrTileInfoIn; - - /* If AddrSurfInfoIn.pTileInfo is set, Addrlib doesn't set - * the tile index, because we are expected to know it if - * we know the other parameters. - * - * This is something that can easily be fixed in Addrlib. - * For now, just figure it out here. - * Note that only 2D_TILE_THIN1 is handled here. - */ - assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER)); - assert(AddrSurfInfoIn.tileMode == ADDR_TM_2D_TILED_THIN1); - - if (ws->info.chip_class == SI) { - if (AddrSurfInfoIn.tileType == ADDR_DISPLAYABLE) { - if (surf->bpe == 2) - AddrSurfInfoIn.tileIndex = 11; /* 16bpp */ - else - AddrSurfInfoIn.tileIndex = 12; /* 32bpp */ - } else { - if (surf->bpe == 1) - AddrSurfInfoIn.tileIndex = 14; /* 8bpp */ - else if (surf->bpe == 2) - AddrSurfInfoIn.tileIndex = 15; /* 16bpp */ - else if (surf->bpe == 4) - AddrSurfInfoIn.tileIndex = 16; /* 32bpp */ - else - AddrSurfInfoIn.tileIndex = 17; /* 64bpp (and 128bpp) */ - } - } else { - /* CIK - VI */ - if (AddrSurfInfoIn.tileType == ADDR_DISPLAYABLE) - AddrSurfInfoIn.tileIndex = 10; /* 2D displayable */ - else - AddrSurfInfoIn.tileIndex = 14; /* 2D non-displayable */ - - /* Addrlib doesn't set this if tileIndex is forced like above. */ - AddrSurfInfoOut.macroModeIndex = cik_get_macro_tile_index(surf); - } - } - - surf->bo_size = 0; - surf->dcc_size = 0; - surf->dcc_alignment = 1; - surf->htile_size = 0; - surf->htile_alignment = 1; - - /* Calculate texture layout information. */ - for (level = 0; level <= surf->last_level; level++) { - r = compute_level(ws, surf, false, level, type, compressed, - &AddrSurfInfoIn, &AddrSurfInfoOut, - &AddrDccIn, &AddrDccOut, &AddrHtileIn, &AddrHtileOut); - if (r) - return r; - - if (level == 0) { - surf->bo_alignment = AddrSurfInfoOut.baseAlign; - surf->pipe_config = AddrSurfInfoOut.pTileInfo->pipeConfig - 1; - set_micro_tile_mode(surf, &ws->info); - - /* For 2D modes only. */ - if (AddrSurfInfoOut.tileMode >= ADDR_TM_2D_TILED_THIN1) { - surf->bankw = AddrSurfInfoOut.pTileInfo->bankWidth; - surf->bankh = AddrSurfInfoOut.pTileInfo->bankHeight; - surf->mtilea = AddrSurfInfoOut.pTileInfo->macroAspectRatio; - surf->tile_split = AddrSurfInfoOut.pTileInfo->tileSplitBytes; - surf->num_banks = AddrSurfInfoOut.pTileInfo->banks; - surf->macro_tile_index = AddrSurfInfoOut.macroModeIndex; - } else { - surf->macro_tile_index = 0; - } - } - } - - /* Calculate texture layout information for stencil. */ - if (surf->flags & RADEON_SURF_SBUFFER) { - AddrSurfInfoIn.bpp = 8; - AddrSurfInfoIn.flags.depth = 0; - AddrSurfInfoIn.flags.stencil = 1; - AddrSurfInfoIn.flags.tcCompatible = 0; - /* This will be ignored if AddrSurfInfoIn.pTileInfo is NULL. 
*/ - AddrTileInfoIn.tileSplitBytes = surf->stencil_tile_split; - - for (level = 0; level <= surf->last_level; level++) { - r = compute_level(ws, surf, true, level, type, compressed, - &AddrSurfInfoIn, &AddrSurfInfoOut, &AddrDccIn, &AddrDccOut, - NULL, NULL); - if (r) - return r; - - /* DB uses the depth pitch for both stencil and depth. */ - if (surf->stencil_level[level].nblk_x != surf->level[level].nblk_x) - surf->stencil_adjusted = true; - - if (level == 0) { - /* For 2D modes only. */ - if (AddrSurfInfoOut.tileMode >= ADDR_TM_2D_TILED_THIN1) { - surf->stencil_tile_split = - AddrSurfInfoOut.pTileInfo->tileSplitBytes; - } - } - } - } - - /* Recalculate the whole DCC miptree size including disabled levels. - * This is what addrlib does, but calling addrlib would be a lot more - * complicated. - */ - if (surf->dcc_size && surf->last_level > 0) { - surf->dcc_size = align64(surf->bo_size >> 8, - ws->info.pipe_interleave_bytes * - ws->info.num_tile_pipes); - } - - /* Make sure HTILE covers the whole miptree, because the shader reads - * TC-compatible HTILE even for levels where it's disabled by DB. - */ - if (surf->htile_size && surf->last_level) - surf->htile_size *= 2; - - return 0; -} + if (flags & RADEON_SURF_FMASK) + config.info.surf_index = &ws->surf_index_fmask; + else if (!(flags & RADEON_SURF_Z_OR_SBUFFER)) + config.info.surf_index = &ws->surf_index_color; + else + config.info.surf_index = NULL; -static int amdgpu_surface_best(struct radeon_winsys *rws, - struct radeon_surf *surf) -{ - return 0; + return ac_compute_surface(ws->addrlib, &ws->info, &config, mode, surf); } void amdgpu_surface_init_functions(struct amdgpu_winsys *ws) { ws->base.surface_init = amdgpu_surface_init; - ws->base.surface_best = amdgpu_surface_best; } diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c index d92c0bd83..a210a2704 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c @@ -40,331 +40,43 @@ #include <stdio.h> #include <sys/stat.h> #include "amd/common/amdgpu_id.h" +#include "amd/common/sid.h" +#include "amd/common/gfx9d.h" -#define CIK_TILE_MODE_COLOR_2D 14 - -#define CIK__GB_TILE_MODE__PIPE_CONFIG(x) (((x) >> 6) & 0x1f) -#define CIK__PIPE_CONFIG__ADDR_SURF_P2 0 -#define CIK__PIPE_CONFIG__ADDR_SURF_P4_8x16 4 -#define CIK__PIPE_CONFIG__ADDR_SURF_P4_16x16 5 -#define CIK__PIPE_CONFIG__ADDR_SURF_P4_16x32 6 -#define CIK__PIPE_CONFIG__ADDR_SURF_P4_32x32 7 -#define CIK__PIPE_CONFIG__ADDR_SURF_P8_16x16_8x16 8 -#define CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_8x16 9 -#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_8x16 10 -#define CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_16x16 11 -#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x16 12 -#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x32 13 -#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x64_32x32 14 -#define CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16 16 -#define CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16 17 - -#ifndef AMDGPU_INFO_NUM_EVICTIONS -#define AMDGPU_INFO_NUM_EVICTIONS 0x18 +#ifndef AMDGPU_INFO_NUM_VRAM_CPU_PAGE_FAULTS +#define AMDGPU_INFO_NUM_VRAM_CPU_PAGE_FAULTS 0x1E #endif static struct util_hash_table *dev_tab = NULL; -pipe_static_mutex(dev_tab_mutex); +static mtx_t dev_tab_mutex = _MTX_INITIALIZER_NP; -static unsigned cik_get_num_tile_pipes(struct amdgpu_gpu_info *info) -{ - unsigned mode2d = info->gb_tile_mode[CIK_TILE_MODE_COLOR_2D]; - - switch (CIK__GB_TILE_MODE__PIPE_CONFIG(mode2d)) { - case 
CIK__PIPE_CONFIG__ADDR_SURF_P2:
-      return 2;
-   case CIK__PIPE_CONFIG__ADDR_SURF_P4_8x16:
-   case CIK__PIPE_CONFIG__ADDR_SURF_P4_16x16:
-   case CIK__PIPE_CONFIG__ADDR_SURF_P4_16x32:
-   case CIK__PIPE_CONFIG__ADDR_SURF_P4_32x32:
-      return 4;
-   case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x16_8x16:
-   case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_8x16:
-   case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_8x16:
-   case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_16x16:
-   case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x16:
-   case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x32:
-   case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x64_32x32:
-      return 8;
-   case CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16:
-   case CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16:
-      return 16;
-   default:
-      fprintf(stderr, "Invalid CIK pipe configuration, assuming P2\n");
-      assert(!"this should never occur");
-      return 2;
-   }
-}
+DEBUG_GET_ONCE_BOOL_OPTION(all_bos, "RADEON_ALL_BOS", false)
 
 /* Helper function to do the ioctls needed for setup and init. */
 static bool do_winsys_init(struct amdgpu_winsys *ws, int fd)
 {
-   struct amdgpu_buffer_size_alignments alignment_info = {};
-   struct amdgpu_heap_info vram, gtt;
-   struct drm_amdgpu_info_hw_ip dma = {}, uvd = {}, vce = {};
-   uint32_t vce_version = 0, vce_feature = 0, uvd_version = 0, uvd_feature = 0;
-   uint32_t unused_feature;
-   int r, i, j;
-   drmDevicePtr devinfo;
-
-   /* Get PCI info. */
-   r = drmGetDevice(fd, &devinfo);
-   if (r) {
-      fprintf(stderr, "amdgpu: drmGetDevice failed.\n");
-      goto fail;
-   }
-   ws->info.pci_domain = devinfo->businfo.pci->domain;
-   ws->info.pci_bus = devinfo->businfo.pci->bus;
-   ws->info.pci_dev = devinfo->businfo.pci->dev;
-   ws->info.pci_func = devinfo->businfo.pci->func;
-   drmFreeDevice(&devinfo);
-
-   /* Query hardware and driver information. */
-   r = amdgpu_query_gpu_info(ws->dev, &ws->amdinfo);
-   if (r) {
-      fprintf(stderr, "amdgpu: amdgpu_query_gpu_info failed.\n");
-      goto fail;
-   }
-
-   r = amdgpu_query_buffer_size_alignment(ws->dev, &alignment_info);
-   if (r) {
-      fprintf(stderr, "amdgpu: amdgpu_query_buffer_size_alignment failed.\n");
-      goto fail;
-   }
-
-   r = amdgpu_query_heap_info(ws->dev, AMDGPU_GEM_DOMAIN_VRAM, 0, &vram);
-   if (r) {
-      fprintf(stderr, "amdgpu: amdgpu_query_heap_info(vram) failed.\n");
-      goto fail;
-   }
-
-   r = amdgpu_query_heap_info(ws->dev, AMDGPU_GEM_DOMAIN_GTT, 0, &gtt);
-   if (r) {
-      fprintf(stderr, "amdgpu: amdgpu_query_heap_info(gtt) failed.\n");
-      goto fail;
-   }
-
-   r = amdgpu_query_hw_ip_info(ws->dev, AMDGPU_HW_IP_DMA, 0, &dma);
-   if (r) {
-      fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(dma) failed.\n");
-      goto fail;
-   }
-
-   r = amdgpu_query_hw_ip_info(ws->dev, AMDGPU_HW_IP_UVD, 0, &uvd);
-   if (r) {
-      fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(uvd) failed.\n");
-      goto fail;
-   }
-
-   r = amdgpu_query_firmware_version(ws->dev, AMDGPU_INFO_FW_GFX_ME, 0, 0,
-                                     &ws->info.me_fw_version, &unused_feature);
-   if (r) {
-      fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(me) failed.\n");
-      goto fail;
-   }
-
-   r = amdgpu_query_firmware_version(ws->dev, AMDGPU_INFO_FW_GFX_PFP, 0, 0,
-                                     &ws->info.pfp_fw_version, &unused_feature);
-   if (r) {
-      fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(pfp) failed.\n");
-      goto fail;
-   }
-
-   r = amdgpu_query_firmware_version(ws->dev, AMDGPU_INFO_FW_GFX_CE, 0, 0,
-                                     &ws->info.ce_fw_version, &unused_feature);
-   if (r) {
-      fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(ce) failed.\n");
+   if (!ac_query_gpu_info(fd, ws->dev, &ws->info, &ws->amdinfo))
       goto fail;
-   }
-
-   r = amdgpu_query_firmware_version(ws->dev, AMDGPU_INFO_FW_UVD, 0, 0,
-                                     &uvd_version, &uvd_feature);
-   if (r) {
-      fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(uvd) failed.\n");
-      goto fail;
-   }
-
-   r = amdgpu_query_hw_ip_info(ws->dev, AMDGPU_HW_IP_VCE, 0, &vce);
-   if (r) {
-      fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vce) failed.\n");
-      goto fail;
-   }
-
-   r = amdgpu_query_firmware_version(ws->dev, AMDGPU_INFO_FW_VCE, 0, 0,
-                                     &vce_version, &vce_feature);
-   if (r) {
-      fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(vce) failed.\n");
-      goto fail;
-   }
-
-   /* Set chip identification. */
-   ws->info.pci_id = ws->amdinfo.asic_id; /* TODO: is this correct? */
-   ws->info.vce_harvest_config = ws->amdinfo.vce_harvest_config;
-
-   switch (ws->info.pci_id) {
-#define CHIPSET(pci_id, name, cfamily) case pci_id: ws->info.family = CHIP_##cfamily; break;
-#include "pci_ids/radeonsi_pci_ids.h"
-#undef CHIPSET
-   default:
-      fprintf(stderr, "amdgpu: Invalid PCI ID.\n");
+   /* LLVM 5.0 is required for GFX9. */
+   if (ws->info.chip_class >= GFX9 && HAVE_LLVM < 0x0500) {
+      fprintf(stderr, "amdgpu: LLVM 5.0 is required, got LLVM %i.%i\n",
+              HAVE_LLVM >> 8, HAVE_LLVM & 255);
       goto fail;
    }
-   if (ws->info.family >= CHIP_TONGA)
-      ws->info.chip_class = VI;
-   else if (ws->info.family >= CHIP_BONAIRE)
-      ws->info.chip_class = CIK;
-   else if (ws->info.family >= CHIP_TAHITI)
-      ws->info.chip_class = SI;
-   else {
-      fprintf(stderr, "amdgpu: Unknown family.\n");
-      goto fail;
-   }
-
-   /* LLVM 3.6.1 is required for VI. */
-   if (ws->info.chip_class >= VI &&
-       HAVE_LLVM == 0x0306 && MESA_LLVM_VERSION_PATCH < 1) {
-      fprintf(stderr, "amdgpu: LLVM 3.6.1 is required, got LLVM %i.%i.%i\n",
-              HAVE_LLVM >> 8, HAVE_LLVM & 255, MESA_LLVM_VERSION_PATCH);
-      goto fail;
-   }
-
-   /* family and rev_id are for addrlib */
-   switch (ws->info.family) {
-   case CHIP_TAHITI:
-      ws->family = FAMILY_SI;
-      ws->rev_id = SI_TAHITI_P_A0;
-      break;
-   case CHIP_PITCAIRN:
-      ws->family = FAMILY_SI;
-      ws->rev_id = SI_PITCAIRN_PM_A0;
-      break;
-   case CHIP_VERDE:
-      ws->family = FAMILY_SI;
-      ws->rev_id = SI_CAPEVERDE_M_A0;
-      break;
-   case CHIP_OLAND:
-      ws->family = FAMILY_SI;
-      ws->rev_id = SI_OLAND_M_A0;
-      break;
-   case CHIP_HAINAN:
-      ws->family = FAMILY_SI;
-      ws->rev_id = SI_HAINAN_V_A0;
-      break;
-   case CHIP_BONAIRE:
-      ws->family = FAMILY_CI;
-      ws->rev_id = CI_BONAIRE_M_A0;
-      break;
-   case CHIP_KAVERI:
-      ws->family = FAMILY_KV;
-      ws->rev_id = KV_SPECTRE_A0;
-      break;
-   case CHIP_KABINI:
-      ws->family = FAMILY_KV;
-      ws->rev_id = KB_KALINDI_A0;
-      break;
-   case CHIP_HAWAII:
-      ws->family = FAMILY_CI;
-      ws->rev_id = CI_HAWAII_P_A0;
-      break;
-   case CHIP_MULLINS:
-      ws->family = FAMILY_KV;
-      ws->rev_id = ML_GODAVARI_A0;
-      break;
-   case CHIP_TONGA:
-      ws->family = FAMILY_VI;
-      ws->rev_id = VI_TONGA_P_A0;
-      break;
-   case CHIP_ICELAND:
-      ws->family = FAMILY_VI;
-      ws->rev_id = VI_ICELAND_M_A0;
-      break;
-   case CHIP_CARRIZO:
-      ws->family = FAMILY_CZ;
-      ws->rev_id = CARRIZO_A0;
-      break;
-   case CHIP_STONEY:
-      ws->family = FAMILY_CZ;
-      ws->rev_id = STONEY_A0;
-      break;
-   case CHIP_FIJI:
-      ws->family = FAMILY_VI;
-      ws->rev_id = VI_FIJI_P_A0;
-      break;
-   case CHIP_POLARIS10:
-      ws->family = FAMILY_VI;
-      ws->rev_id = VI_POLARIS10_P_A0;
-      break;
-   case CHIP_POLARIS11:
-      ws->family = FAMILY_VI;
-      ws->rev_id = VI_POLARIS11_M_A0;
-      break;
-   default:
-      fprintf(stderr, "amdgpu: Unknown family.\n");
-      goto fail;
-   }
-
-   ws->addrlib = amdgpu_addr_create(ws);
+   ws->addrlib = amdgpu_addr_create(&ws->info, &ws->amdinfo, &ws->info.max_alignment);
    if (!ws->addrlib) {
       fprintf(stderr, "amdgpu: Cannot create addrlib.\n");
       goto fail;
    }
-   /* Set which chips have dedicated VRAM. */
-   ws->info.has_dedicated_vram =
-      !(ws->amdinfo.ids_flags & AMDGPU_IDS_FLAGS_FUSION);
-
-   /* Set hardware information. */
-   ws->info.gart_size = gtt.heap_size;
-   ws->info.vram_size = vram.heap_size;
-   /* The kernel can split large buffers in VRAM but not in GTT, so large
-    * allocations can fail or cause buffer movement failures in the kernel.
-    */
-   ws->info.max_alloc_size = MIN2(ws->info.vram_size * 0.9, ws->info.gart_size * 0.7);
-   /* convert the shader clock from KHz to MHz */
-   ws->info.max_shader_clock = ws->amdinfo.max_engine_clk / 1000;
-   ws->info.max_se = ws->amdinfo.num_shader_engines;
-   ws->info.max_sh_per_se = ws->amdinfo.num_shader_arrays_per_engine;
-   ws->info.has_uvd = uvd.available_rings != 0;
-   ws->info.uvd_fw_version =
-      uvd.available_rings ? uvd_version : 0;
-   ws->info.vce_fw_version =
-      vce.available_rings ? vce_version : 0;
-   ws->info.has_userptr = true;
-   ws->info.num_render_backends = ws->amdinfo.rb_pipes;
-   ws->info.clock_crystal_freq = ws->amdinfo.gpu_counter_freq;
-   ws->info.num_tile_pipes = cik_get_num_tile_pipes(&ws->amdinfo);
-   ws->info.pipe_interleave_bytes = 256 << ((ws->amdinfo.gb_addr_cfg >> 4) & 0x7);
-   ws->info.has_virtual_memory = true;
-   ws->info.has_sdma = dma.available_rings != 0;
-
-   /* Get the number of good compute units. */
-   ws->info.num_good_compute_units = 0;
-   for (i = 0; i < ws->info.max_se; i++)
-      for (j = 0; j < ws->info.max_sh_per_se; j++)
-         ws->info.num_good_compute_units +=
-            util_bitcount(ws->amdinfo.cu_bitmap[i][j]);
-
-   memcpy(ws->info.si_tile_mode_array, ws->amdinfo.gb_tile_mode,
-          sizeof(ws->amdinfo.gb_tile_mode));
-   ws->info.enabled_rb_mask = ws->amdinfo.enabled_rb_pipes_mask;
-
-   memcpy(ws->info.cik_macrotile_mode_array, ws->amdinfo.gb_macro_tile_mode,
-          sizeof(ws->amdinfo.gb_macro_tile_mode));
-
-   ws->info.gart_page_size = alignment_info.size_remote;
-
-   if (ws->info.chip_class == SI)
-      ws->info.gfx_ib_pad_with_type2 = TRUE;
-
    ws->check_vm = strstr(debug_get_option("R600_DEBUG", ""), "check_vm") != NULL;
+   ws->debug_all_bos = debug_get_option_all_bos();
 
    return true;
 
 fail:
-   if (ws->addrlib)
-      AddrDestroy(ws->addrlib);
    amdgpu_device_deinitialize(ws->dev);
    ws->dev = NULL;
    return false;
@@ -383,10 +95,10 @@ static void amdgpu_winsys_destroy(struct radeon_winsys *rws)
    if (util_queue_is_initialized(&ws->cs_queue))
       util_queue_destroy(&ws->cs_queue);
 
-   pipe_mutex_destroy(ws->bo_fence_lock);
+   mtx_destroy(&ws->bo_fence_lock);
    pb_slabs_deinit(&ws->bo_slabs);
    pb_cache_deinit(&ws->bo_cache);
-   pipe_mutex_destroy(ws->global_bo_list_lock);
+   mtx_destroy(&ws->global_bo_list_lock);
    do_winsys_deinit(ws);
    FREE(rws);
 }
@@ -422,30 +134,52 @@ static uint64_t amdgpu_query_value(struct radeon_winsys *rws,
       return ws->mapped_gtt;
    case RADEON_BUFFER_WAIT_TIME_NS:
       return ws->buffer_wait_time;
+   case RADEON_NUM_MAPPED_BUFFERS:
+      return ws->num_mapped_buffers;
    case RADEON_TIMESTAMP:
       amdgpu_query_info(ws->dev, AMDGPU_INFO_TIMESTAMP, 8, &retval);
       return retval;
-   case RADEON_NUM_CS_FLUSHES:
-      return ws->num_cs_flushes;
+   case RADEON_NUM_GFX_IBS:
+      return ws->num_gfx_IBs;
+   case RADEON_NUM_SDMA_IBS:
+      return ws->num_sdma_IBs;
+   case RADEON_GFX_BO_LIST_COUNTER:
+      return ws->gfx_bo_list_counter;
+   case RADEON_GFX_IB_SIZE_COUNTER:
+      return ws->gfx_ib_size_counter;
    case RADEON_NUM_BYTES_MOVED:
      amdgpu_query_info(ws->dev, AMDGPU_INFO_NUM_BYTES_MOVED, 8, &retval);
      return retval;
    case RADEON_NUM_EVICTIONS:
      amdgpu_query_info(ws->dev, AMDGPU_INFO_NUM_EVICTIONS, 8, &retval);
      return retval;
+   case RADEON_NUM_VRAM_CPU_PAGE_FAULTS:
+      amdgpu_query_info(ws->dev, AMDGPU_INFO_NUM_VRAM_CPU_PAGE_FAULTS, 8, &retval);
+      return retval;
    case RADEON_VRAM_USAGE:
       amdgpu_query_heap_info(ws->dev, AMDGPU_GEM_DOMAIN_VRAM, 0, &heap);
       return heap.heap_usage;
+   case RADEON_VRAM_VIS_USAGE:
+      amdgpu_query_heap_info(ws->dev, AMDGPU_GEM_DOMAIN_VRAM,
+                             AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED, &heap);
+      return heap.heap_usage;
    case RADEON_GTT_USAGE:
       amdgpu_query_heap_info(ws->dev, AMDGPU_GEM_DOMAIN_GTT, 0, &heap);
       return heap.heap_usage;
    case RADEON_GPU_TEMPERATURE:
+      amdgpu_query_sensor_info(ws->dev, AMDGPU_INFO_SENSOR_GPU_TEMP, 4, &retval);
+      return retval;
    case RADEON_CURRENT_SCLK:
+      amdgpu_query_sensor_info(ws->dev, AMDGPU_INFO_SENSOR_GFX_SCLK, 4, &retval);
+      return retval;
    case RADEON_CURRENT_MCLK:
-      return 0;
+      amdgpu_query_sensor_info(ws->dev, AMDGPU_INFO_SENSOR_GFX_MCLK, 4, &retval);
+      return retval;
    case RADEON_GPU_RESET_COUNTER:
       assert(0);
       return 0;
+   case RADEON_CS_THREAD_TIME:
+      return util_queue_get_thread_time_nano(&ws->cs_queue, 0);
    }
    return 0;
 }
@@ -474,8 +208,6 @@ static int compare_dev(void *key1, void *key2)
    return key1 != key2;
 }
 
-DEBUG_GET_ONCE_BOOL_OPTION(thread, "RADEON_THREAD", true)
-
 static bool amdgpu_winsys_unref(struct radeon_winsys *rws)
 {
    struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws;
@@ -486,18 +218,26 @@ static bool amdgpu_winsys_unref(struct radeon_winsys *rws)
     * This must happen while the mutex is locked, so that
     * amdgpu_winsys_create in another thread doesn't get the winsys
     * from the table when the counter drops to 0. */
-   pipe_mutex_lock(dev_tab_mutex);
+   mtx_lock(&dev_tab_mutex);
 
    destroy = pipe_reference(&ws->reference, NULL);
    if (destroy && dev_tab)
       util_hash_table_remove(dev_tab, ws->dev);
-   pipe_mutex_unlock(dev_tab_mutex);
+   mtx_unlock(&dev_tab_mutex);
 
    return destroy;
 }
 
+static const char* amdgpu_get_chip_name(struct radeon_winsys *ws)
+{
+   amdgpu_device_handle dev = ((struct amdgpu_winsys *)ws)->dev;
+   return amdgpu_get_marketing_name(dev);
+}
+
+
 PUBLIC struct radeon_winsys *
-amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create)
+amdgpu_winsys_create(int fd, const struct pipe_screen_config *config,
+                     radeon_screen_create_t screen_create)
 {
    struct amdgpu_winsys *ws;
    drmVersionPtr version = drmGetVersion(fd);
@@ -512,7 +252,7 @@ amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create)
    drmFreeVersion(version);
 
    /* Look up the winsys from the dev table. */
-   pipe_mutex_lock(dev_tab_mutex);
+   mtx_lock(&dev_tab_mutex);
    if (!dev_tab)
       dev_tab = util_hash_table_create(hash_dev, compare_dev);
 
@@ -520,7 +260,7 @@ amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create)
     * for the same fd. */
    r = amdgpu_device_initialize(fd, &drm_major, &drm_minor, &dev);
    if (r) {
-      pipe_mutex_unlock(dev_tab_mutex);
+      mtx_unlock(&dev_tab_mutex);
       fprintf(stderr, "amdgpu: amdgpu_device_initialize failed.\n");
       return NULL;
    }
 
@@ -529,7 +269,7 @@ amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create)
    ws = util_hash_table_get(dev_tab, dev);
    if (ws) {
       pipe_reference(NULL, &ws->reference);
-      pipe_mutex_unlock(dev_tab_mutex);
+      mtx_unlock(&dev_tab_mutex);
       return &ws->base;
    }
 
@@ -552,7 +292,7 @@ amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create)
    if (!pb_slabs_init(&ws->bo_slabs,
                       AMDGPU_SLAB_MIN_SIZE_LOG2, AMDGPU_SLAB_MAX_SIZE_LOG2,
-                      12, /* number of heaps (domain/flags combinations) */
+                      RADEON_MAX_SLAB_HEAPS,
                       ws,
                       amdgpu_bo_can_reclaim_slab,
                       amdgpu_bo_slab_alloc,
@@ -571,27 +311,32 @@ amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create)
    ws->base.cs_request_feature = amdgpu_cs_request_feature;
    ws->base.query_value = amdgpu_query_value;
    ws->base.read_registers = amdgpu_read_registers;
+   ws->base.get_chip_name = amdgpu_get_chip_name;
 
    amdgpu_bo_init_functions(ws);
    amdgpu_cs_init_functions(ws);
    amdgpu_surface_init_functions(ws);
 
    LIST_INITHEAD(&ws->global_bo_list);
-   pipe_mutex_init(ws->global_bo_list_lock);
-   pipe_mutex_init(ws->bo_fence_lock);
+   (void) mtx_init(&ws->global_bo_list_lock, mtx_plain);
+   (void) mtx_init(&ws->bo_fence_lock, mtx_plain);
 
-   if (sysconf(_SC_NPROCESSORS_ONLN) > 1 && debug_get_option_thread())
-      util_queue_init(&ws->cs_queue, "amdgpu_cs", 8, 1);
+   if (!util_queue_init(&ws->cs_queue, "amdgpu_cs", 8, 1,
+                        UTIL_QUEUE_INIT_RESIZE_IF_FULL)) {
+      amdgpu_winsys_destroy(&ws->base);
+      mtx_unlock(&dev_tab_mutex);
+      return NULL;
+   }
 
    /* Create the screen at the end. The winsys must be initialized
    * completely.
    *
    * Alternatively, we could create the screen based on "ws->gen"
    * and link all drivers into one binary blob. */
-   ws->base.screen = screen_create(&ws->base);
+   ws->base.screen = screen_create(&ws->base, config);
    if (!ws->base.screen) {
       amdgpu_winsys_destroy(&ws->base);
-      pipe_mutex_unlock(dev_tab_mutex);
+      mtx_unlock(&dev_tab_mutex);
       return NULL;
    }
 
@@ -600,7 +345,7 @@ amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create)
    /* We must unlock the mutex once the winsys is fully initialized, so that
    * other threads attempting to create the winsys from the same fd will
    * get a fully initialized winsys and not just half-way initialized. */
-   pipe_mutex_unlock(dev_tab_mutex);
+   mtx_unlock(&dev_tab_mutex);
 
    return &ws->base;
 
@@ -610,6 +355,6 @@ fail_cache:
 fail_alloc:
    FREE(ws);
 fail:
-   pipe_mutex_unlock(dev_tab_mutex);
+   mtx_unlock(&dev_tab_mutex);
    return NULL;
 }
diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
index 69c663807..8b62e2dbe 100644
--- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
+++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
@@ -41,8 +41,9 @@ struct amdgpu_cs;
 
-#define AMDGPU_SLAB_MIN_SIZE_LOG2 9
-#define AMDGPU_SLAB_MAX_SIZE_LOG2 14
+#define AMDGPU_SLAB_MIN_SIZE_LOG2 9 /* 512 bytes */
+#define AMDGPU_SLAB_MAX_SIZE_LOG2 16 /* 64 KB */
+#define AMDGPU_SLAB_BO_SIZE_LOG2 17 /* 128 KB */
 
 struct amdgpu_winsys {
    struct radeon_winsys base;
@@ -52,16 +53,23 @@ struct amdgpu_winsys {
 
    amdgpu_device_handle dev;
 
-   pipe_mutex bo_fence_lock;
+   mtx_t bo_fence_lock;
 
    int num_cs; /* The number of command streams created. */
+   unsigned num_total_rejected_cs;
+   uint32_t surf_index_color;
+   uint32_t surf_index_fmask;
    uint32_t next_bo_unique_id;
    uint64_t allocated_vram;
    uint64_t allocated_gtt;
    uint64_t mapped_vram;
    uint64_t mapped_gtt;
    uint64_t buffer_wait_time; /* time spent in buffer_wait in ns */
-   uint64_t num_cs_flushes;
+   uint64_t num_gfx_IBs;
+   uint64_t num_sdma_IBs;
+   uint64_t num_mapped_buffers;
+   uint64_t gfx_bo_list_counter;
+   uint64_t gfx_ib_size_counter;
 
    struct radeon_info info;
@@ -70,13 +78,12 @@ struct amdgpu_winsys {
    struct amdgpu_gpu_info amdinfo;
    ADDR_HANDLE addrlib;
 
-   uint32_t rev_id;
-   unsigned family;
    bool check_vm;
+   bool debug_all_bos;
 
    /* List of all allocated buffers */
-   pipe_mutex global_bo_list_lock;
+   mtx_t global_bo_list_lock;
    struct list_head global_bo_list;
    unsigned num_buffers;
 };
@@ -88,6 +95,5 @@ amdgpu_winsys(struct radeon_winsys *base)
 }
 
 void amdgpu_surface_init_functions(struct amdgpu_winsys *ws);
-ADDR_HANDLE amdgpu_addr_create(struct amdgpu_winsys *ws);
 
 #endif