diff options
author | Jonathan Gray <jsg@cvs.openbsd.org> | 2018-01-08 05:41:34 +0000 |
---|---|---|
committer | Jonathan Gray <jsg@cvs.openbsd.org> | 2018-01-08 05:41:34 +0000 |
commit | c00801de923e125863aaf8180439d59d610b2517 (patch) | |
tree | e2896aa2785f3cf2151aeeb3c95fb5cc09a2fe02 /lib/mesa/src/gallium/winsys/amdgpu | |
parent | be30e6efb92db21299b936c0e068e7088941e9c9 (diff) |
Revert to Mesa 13.0.6 again.
Corruption has again been reported on Intel hardware running Xorg with
the modesetting driver (which uses OpenGL-based acceleration instead of
the SNA acceleration that the intel driver defaults to).
Reported in various forms on Sandy Bridge (X220), Ivy Bridge (X230) and
Haswell (X240). Confirmed to not occur with the intel driver but the
xserver was changed to default to the modesetting driver on >= gen4
hardware (except Ironlake).
One means of triggering this is to open a large pdf with xpdf on an
idle machine and highlight a section of the document.
There have also been reports of GPU hangs on gen4 Intel hardware
(T500 with GM45, X61 with 965GM) when starting Xorg.
Diffstat (limited to 'lib/mesa/src/gallium/winsys/amdgpu')
-rw-r--r-- | lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.am | 7 | ||||
-rw-r--r-- | lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.in | 61 | ||||
-rw-r--r-- | lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c | 676 | ||||
-rw-r--r-- | lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h | 38 | ||||
-rw-r--r-- | lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 465 | ||||
-rw-r--r-- | lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h | 27 | ||||
-rw-r--r-- | lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c | 556 | ||||
-rw-r--r-- | lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c | 388 | ||||
-rw-r--r-- | lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h | 18 |
9 files changed, 1130 insertions, 1106 deletions
diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.am b/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.am index a719913b1..543325cc2 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.am +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.am @@ -4,14 +4,11 @@ include $(top_srcdir)/src/gallium/Automake.inc AM_CFLAGS = \ $(GALLIUM_WINSYS_CFLAGS) \ $(AMDGPU_CFLAGS) \ - -I$(srcdir)/addrlib \ - -I$(srcdir)/addrlib/core \ - -I$(srcdir)/addrlib/inc/chip/r800 \ - -I$(srcdir)/addrlib/r800/chip \ - -DBRAHMA_BUILD=1 + -I$(top_srcdir)/src/amd/ AM_CXXFLAGS = $(AM_CFLAGS) noinst_LTLIBRARIES = libamdgpuwinsys.la +libamdgpuwinsys_la_LIBADD = $(top_builddir)/src/amd/addrlib/libamdgpu_addrlib.la libamdgpuwinsys_la_SOURCES = $(C_SOURCES) diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.in b/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.in index 5b326ad82..5e197a855 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.in +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/Makefile.in @@ -54,13 +54,10 @@ target_triplet = @target@ DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \ $(srcdir)/Makefile.sources $(top_srcdir)/bin/depcomp \ $(top_srcdir)/src/gallium/Automake.inc -@HAVE_LIBDRM_TRUE@am__append_1 = \ -@HAVE_LIBDRM_TRUE@ $(LIBDRM_LIBS) - -@HAVE_DRISW_TRUE@am__append_2 = \ +@HAVE_DRISW_TRUE@am__append_1 = \ @HAVE_DRISW_TRUE@ $(top_builddir)/src/gallium/winsys/sw/dri/libswdri.la -@HAVE_DRISW_KMS_TRUE@am__append_3 = \ +@HAVE_DRISW_KMS_TRUE@am__append_2 = \ @HAVE_DRISW_KMS_TRUE@ $(top_builddir)/src/gallium/winsys/sw/kms-dri/libswkmsdri.la \ @HAVE_DRISW_KMS_TRUE@ $(LIBDRM_LIBS) @@ -141,8 +138,6 @@ AMDGPU_CFLAGS = @AMDGPU_CFLAGS@ AMDGPU_LIBS = @AMDGPU_LIBS@ AMTAR = @AMTAR@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ -ANDROID_CFLAGS = @ANDROID_CFLAGS@ -ANDROID_LIBS = @ANDROID_LIBS@ AR = @AR@ AUTOCONF = @AUTOCONF@ AUTOHEADER = @AUTOHEADER@ @@ -173,6 +168,8 @@ DLLTOOL = @DLLTOOL@ DLOPEN_LIBS = @DLOPEN_LIBS@ DRI2PROTO_CFLAGS = 
@DRI2PROTO_CFLAGS@ DRI2PROTO_LIBS = @DRI2PROTO_LIBS@ +DRI3PROTO_CFLAGS = @DRI3PROTO_CFLAGS@ +DRI3PROTO_LIBS = @DRI3PROTO_LIBS@ DRIGL_CFLAGS = @DRIGL_CFLAGS@ DRIGL_LIBS = @DRIGL_LIBS@ DRI_DRIVER_INSTALL_DIR = @DRI_DRIVER_INSTALL_DIR@ @@ -185,11 +182,10 @@ ECHO_C = @ECHO_C@ ECHO_N = @ECHO_N@ ECHO_T = @ECHO_T@ EGL_CFLAGS = @EGL_CFLAGS@ +EGL_CLIENT_APIS = @EGL_CLIENT_APIS@ EGL_LIB_DEPS = @EGL_LIB_DEPS@ EGL_NATIVE_PLATFORM = @EGL_NATIVE_PLATFORM@ EGREP = @EGREP@ -ETNAVIV_CFLAGS = @ETNAVIV_CFLAGS@ -ETNAVIV_LIBS = @ETNAVIV_LIBS@ EXEEXT = @EXEEXT@ EXPAT_CFLAGS = @EXPAT_CFLAGS@ EXPAT_LIBS = @EXPAT_LIBS@ @@ -216,8 +212,6 @@ GL_PC_LIB_PRIV = @GL_PC_LIB_PRIV@ GL_PC_REQ_PRIV = @GL_PC_REQ_PRIV@ GREP = @GREP@ HAVE_XF86VIDMODE = @HAVE_XF86VIDMODE@ -I915_CFLAGS = @I915_CFLAGS@ -I915_LIBS = @I915_LIBS@ INDENT = @INDENT@ INDENT_FLAGS = @INDENT_FLAGS@ INSTALL = @INSTALL@ @@ -225,40 +219,45 @@ INSTALL_DATA = @INSTALL_DATA@ INSTALL_PROGRAM = @INSTALL_PROGRAM@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +INTEL_CFLAGS = @INTEL_CFLAGS@ +INTEL_LIBS = @INTEL_LIBS@ LD = @LD@ LDFLAGS = @LDFLAGS@ LD_NO_UNDEFINED = @LD_NO_UNDEFINED@ LEX = @LEX@ LEXLIB = @LEXLIB@ LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ -LIBATOMIC_LIBS = @LIBATOMIC_LIBS@ LIBCLC_INCLUDEDIR = @LIBCLC_INCLUDEDIR@ LIBCLC_LIBEXECDIR = @LIBCLC_LIBEXECDIR@ LIBDRM_CFLAGS = @LIBDRM_CFLAGS@ LIBDRM_LIBS = @LIBDRM_LIBS@ LIBELF_CFLAGS = @LIBELF_CFLAGS@ LIBELF_LIBS = @LIBELF_LIBS@ -LIBGLVND_DATADIR = @LIBGLVND_DATADIR@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ -LIBSENSORS_LIBS = @LIBSENSORS_LIBS@ +LIBSENSORS_LDFLAGS = @LIBSENSORS_LDFLAGS@ +LIBSHA1_CFLAGS = @LIBSHA1_CFLAGS@ +LIBSHA1_LIBS = @LIBSHA1_LIBS@ LIBTOOL = @LIBTOOL@ -LIBUNWIND_CFLAGS = @LIBUNWIND_CFLAGS@ -LIBUNWIND_LIBS = @LIBUNWIND_LIBS@ LIB_DIR = @LIB_DIR@ LIB_EXT = @LIB_EXT@ LIPO = @LIPO@ +LLVM_BINDIR = @LLVM_BINDIR@ LLVM_CFLAGS = @LLVM_CFLAGS@ LLVM_CONFIG = @LLVM_CONFIG@ +LLVM_CPPFLAGS = @LLVM_CPPFLAGS@ LLVM_CXXFLAGS = @LLVM_CXXFLAGS@ LLVM_INCLUDEDIR = 
@LLVM_INCLUDEDIR@ LLVM_LDFLAGS = @LLVM_LDFLAGS@ +LLVM_LIBDIR = @LLVM_LIBDIR@ LLVM_LIBS = @LLVM_LIBS@ +LLVM_VERSION = @LLVM_VERSION@ LN_S = @LN_S@ LTLIBOBJS = @LTLIBOBJS@ MAINT = @MAINT@ MAKEINFO = @MAKEINFO@ MANIFEST_TOOL = @MANIFEST_TOOL@ +MESA_LLVM = @MESA_LLVM@ MKDIR_P = @MKDIR_P@ MSVC2013_COMPAT_CFLAGS = @MSVC2013_COMPAT_CFLAGS@ MSVC2013_COMPAT_CXXFLAGS = @MSVC2013_COMPAT_CXXFLAGS@ @@ -279,6 +278,8 @@ OMX_LIBS = @OMX_LIBS@ OMX_LIB_INSTALL_DIR = @OMX_LIB_INSTALL_DIR@ OPENCL_LIBNAME = @OPENCL_LIBNAME@ OPENCL_VERSION = @OPENCL_VERSION@ +OPENSSL_CFLAGS = @OPENSSL_CFLAGS@ +OPENSSL_LIBS = @OPENSSL_LIBS@ OSMESA_LIB = @OSMESA_LIB@ OSMESA_LIB_DEPS = @OSMESA_LIB_DEPS@ OSMESA_PC_LIB_PRIV = @OSMESA_PC_LIB_PRIV@ @@ -298,6 +299,8 @@ PKG_CONFIG = @PKG_CONFIG@ PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@ PKG_CONFIG_PATH = @PKG_CONFIG_PATH@ POSIX_SHELL = @POSIX_SHELL@ +PRESENTPROTO_CFLAGS = @PRESENTPROTO_CFLAGS@ +PRESENTPROTO_LIBS = @PRESENTPROTO_LIBS@ PTHREADSTUBS_CFLAGS = @PTHREADSTUBS_CFLAGS@ PTHREADSTUBS_LIBS = @PTHREADSTUBS_LIBS@ PTHREAD_CC = @PTHREAD_CC@ @@ -313,6 +316,8 @@ SED = @SED@ SELINUX_CFLAGS = @SELINUX_CFLAGS@ SELINUX_LIBS = @SELINUX_LIBS@ SET_MAKE = @SET_MAKE@ +SHA1_CFLAGS = @SHA1_CFLAGS@ +SHA1_LIBS = @SHA1_LIBS@ SHELL = @SHELL@ SIMPENROSE_CFLAGS = @SIMPENROSE_CFLAGS@ SIMPENROSE_LIBS = @SIMPENROSE_LIBS@ @@ -321,8 +326,7 @@ STRIP = @STRIP@ SWR_AVX2_CXXFLAGS = @SWR_AVX2_CXXFLAGS@ SWR_AVX_CXXFLAGS = @SWR_AVX_CXXFLAGS@ SWR_CXX11_CXXFLAGS = @SWR_CXX11_CXXFLAGS@ -SWR_KNL_CXXFLAGS = @SWR_KNL_CXXFLAGS@ -SWR_SKX_CXXFLAGS = @SWR_SKX_CXXFLAGS@ +TIMESTAMP_CMD = @TIMESTAMP_CMD@ VALGRIND_CFLAGS = @VALGRIND_CFLAGS@ VALGRIND_LIBS = @VALGRIND_LIBS@ VA_CFLAGS = @VA_CFLAGS@ @@ -330,12 +334,15 @@ VA_LIBS = @VA_LIBS@ VA_LIB_INSTALL_DIR = @VA_LIB_INSTALL_DIR@ VA_MAJOR = @VA_MAJOR@ VA_MINOR = @VA_MINOR@ +VC4_CFLAGS = @VC4_CFLAGS@ +VC4_LIBS = @VC4_LIBS@ VDPAU_CFLAGS = @VDPAU_CFLAGS@ VDPAU_LIBS = @VDPAU_LIBS@ VDPAU_LIB_INSTALL_DIR = @VDPAU_LIB_INSTALL_DIR@ VDPAU_MAJOR = @VDPAU_MAJOR@ 
VDPAU_MINOR = @VDPAU_MINOR@ VERSION = @VERSION@ +VG_LIB_DEPS = @VG_LIB_DEPS@ VISIBILITY_CFLAGS = @VISIBILITY_CFLAGS@ VISIBILITY_CXXFLAGS = @VISIBILITY_CXXFLAGS@ VL_CFLAGS = @VL_CFLAGS@ @@ -343,7 +350,6 @@ VL_LIBS = @VL_LIBS@ VULKAN_ICD_INSTALL_DIR = @VULKAN_ICD_INSTALL_DIR@ WAYLAND_CFLAGS = @WAYLAND_CFLAGS@ WAYLAND_LIBS = @WAYLAND_LIBS@ -WAYLAND_PROTOCOLS_DATADIR = @WAYLAND_PROTOCOLS_DATADIR@ WAYLAND_SCANNER = @WAYLAND_SCANNER@ WAYLAND_SCANNER_CFLAGS = @WAYLAND_SCANNER_CFLAGS@ WAYLAND_SCANNER_LIBS = @WAYLAND_SCANNER_LIBS@ @@ -365,10 +371,9 @@ XVMC_LIBS = @XVMC_LIBS@ XVMC_LIB_INSTALL_DIR = @XVMC_LIB_INSTALL_DIR@ XVMC_MAJOR = @XVMC_MAJOR@ XVMC_MINOR = @XVMC_MINOR@ +XXD = @XXD@ YACC = @YACC@ YFLAGS = @YFLAGS@ -ZLIB_CFLAGS = @ZLIB_CFLAGS@ -ZLIB_LIBS = @ZLIB_LIBS@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ @@ -486,8 +491,12 @@ GALLIUM_TARGET_CFLAGS = \ $(LIBDRM_CFLAGS) \ $(VISIBILITY_CFLAGS) -GALLIUM_COMMON_LIB_DEPS = -lm $(LIBUNWIND_LIBS) $(LIBSENSORS_LIBS) \ - $(CLOCK_LIB) $(PTHREAD_LIBS) $(DLOPEN_LIBS) $(am__append_1) +GALLIUM_COMMON_LIB_DEPS = \ + -lm \ + $(CLOCK_LIB) \ + $(PTHREAD_LIBS) \ + $(DLOPEN_LIBS) + GALLIUM_WINSYS_CFLAGS = \ -I$(top_srcdir)/src \ -I$(top_srcdir)/include \ @@ -499,7 +508,7 @@ GALLIUM_WINSYS_CFLAGS = \ GALLIUM_PIPE_LOADER_WINSYS_LIBS = \ $(top_builddir)/src/gallium/winsys/sw/null/libws_null.la \ $(top_builddir)/src/gallium/winsys/sw/wrapper/libwsw.la \ - $(am__append_2) $(am__append_3) + $(am__append_1) $(am__append_2) AM_CFLAGS = \ $(GALLIUM_WINSYS_CFLAGS) \ $(AMDGPU_CFLAGS) \ @@ -507,9 +516,7 @@ AM_CFLAGS = \ AM_CXXFLAGS = $(AM_CFLAGS) noinst_LTLIBRARIES = libamdgpuwinsys.la -libamdgpuwinsys_la_LIBADD = \ - $(top_builddir)/src/amd/addrlib/libamdgpu_addrlib.la - +libamdgpuwinsys_la_LIBADD = $(top_builddir)/src/amd/addrlib/libamdgpu_addrlib.la libamdgpuwinsys_la_SOURCES = $(C_SOURCES) all: all-am diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c 
b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c index 97bbe235a..e7ea51978 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c @@ -38,13 +38,6 @@ #include <stdio.h> #include <inttypes.h> -/* Set to 1 for verbose output showing committed sparse buffer ranges. */ -#define DEBUG_SPARSE_COMMITS 0 - -struct amdgpu_sparse_backing_chunk { - uint32_t begin, end; -}; - static struct pb_buffer * amdgpu_bo_create(struct radeon_winsys *rws, uint64_t size, @@ -90,7 +83,7 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout, unsigned idle_fences; bool buffer_idle; - mtx_lock(&ws->bo_fence_lock); + pipe_mutex_lock(ws->bo_fence_lock); for (idle_fences = 0; idle_fences < bo->num_fences; ++idle_fences) { if (!amdgpu_fence_wait(bo->fences[idle_fences], 0, false)) @@ -106,13 +99,13 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout, bo->num_fences -= idle_fences; buffer_idle = !bo->num_fences; - mtx_unlock(&ws->bo_fence_lock); + pipe_mutex_unlock(ws->bo_fence_lock); return buffer_idle; } else { bool buffer_idle = true; - mtx_lock(&ws->bo_fence_lock); + pipe_mutex_lock(ws->bo_fence_lock); while (bo->num_fences && buffer_idle) { struct pipe_fence_handle *fence = NULL; bool fence_idle = false; @@ -120,12 +113,12 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout, amdgpu_fence_reference(&fence, bo->fences[0]); /* Wait for the fence. */ - mtx_unlock(&ws->bo_fence_lock); + pipe_mutex_unlock(ws->bo_fence_lock); if (amdgpu_fence_wait(fence, abs_timeout, true)) fence_idle = true; else buffer_idle = false; - mtx_lock(&ws->bo_fence_lock); + pipe_mutex_lock(ws->bo_fence_lock); /* Release an idle fence to avoid checking it again later, keeping in * mind that the fence array may have been modified by other threads. 
@@ -139,7 +132,7 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout, amdgpu_fence_reference(&fence, NULL); } - mtx_unlock(&ws->bo_fence_lock); + pipe_mutex_unlock(ws->bo_fence_lock); return buffer_idle; } @@ -167,10 +160,10 @@ void amdgpu_bo_destroy(struct pb_buffer *_buf) assert(bo->bo && "must not be called for slab entries"); - mtx_lock(&bo->ws->global_bo_list_lock); + pipe_mutex_lock(bo->ws->global_bo_list_lock); LIST_DEL(&bo->u.real.global_list_item); bo->ws->num_buffers--; - mtx_unlock(&bo->ws->global_bo_list_lock); + pipe_mutex_unlock(bo->ws->global_bo_list_lock); amdgpu_bo_va_op(bo->bo, 0, bo->base.size, bo->va, 0, AMDGPU_VA_OP_UNMAP); amdgpu_va_range_free(bo->u.real.va_handle); @@ -188,7 +181,6 @@ void amdgpu_bo_destroy(struct pb_buffer *_buf) bo->ws->mapped_vram -= bo->base.size; else if (bo->initial_domain & RADEON_DOMAIN_GTT) bo->ws->mapped_gtt -= bo->base.size; - bo->ws->num_mapped_buffers--; } FREE(bo); @@ -217,8 +209,6 @@ static void *amdgpu_bo_map(struct pb_buffer *buf, void *cpu = NULL; uint64_t offset = 0; - assert(!bo->sparse); - /* If it's not unsynchronized bo_map, flush CS if needed and then wait. */ if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) { /* DONTBLOCK doesn't make sense with UNSYNCHRONIZED. 
*/ @@ -321,7 +311,6 @@ static void *amdgpu_bo_map(struct pb_buffer *buf, real->ws->mapped_vram += real->base.size; else if (real->initial_domain & RADEON_DOMAIN_GTT) real->ws->mapped_gtt += real->base.size; - real->ws->num_mapped_buffers++; } return (uint8_t*)cpu + offset; } @@ -331,8 +320,6 @@ static void amdgpu_bo_unmap(struct pb_buffer *buf) struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf; struct amdgpu_winsys_bo *real; - assert(!bo->sparse); - if (bo->user_ptr) return; @@ -343,7 +330,6 @@ static void amdgpu_bo_unmap(struct pb_buffer *buf) real->ws->mapped_vram -= real->base.size; else if (real->initial_domain & RADEON_DOMAIN_GTT) real->ws->mapped_gtt -= real->base.size; - real->ws->num_mapped_buffers--; } amdgpu_bo_cpu_unmap(real->bo); @@ -360,10 +346,10 @@ static void amdgpu_add_buffer_to_global_list(struct amdgpu_winsys_bo *bo) assert(bo->bo); - mtx_lock(&ws->global_bo_list_lock); + pipe_mutex_lock(ws->global_bo_list_lock); LIST_ADDTAIL(&bo->u.real.global_list_item, &ws->global_bo_list); ws->num_buffers++; - mtx_unlock(&ws->global_bo_list_lock); + pipe_mutex_unlock(ws->global_bo_list_lock); } static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws, @@ -398,6 +384,8 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws, if (initial_domain & RADEON_DOMAIN_GTT) request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT; + if (flags & RADEON_FLAG_CPU_ACCESS) + request.flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; if (flags & RADEON_FLAG_NO_CPU_ACCESS) request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS; if (flags & RADEON_FLAG_GTT_WC) @@ -413,8 +401,6 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws, } va_gap_size = ws->check_vm ? 
MAX2(4 * alignment, 64 * 1024) : 0; - if (size > ws->info.pte_fragment_size) - alignment = MAX2(alignment, ws->info.pte_fragment_size); r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general, size + va_gap_size, alignment, 0, &va, &va_handle, 0); if (r) @@ -495,16 +481,33 @@ struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap, { struct amdgpu_winsys *ws = priv; struct amdgpu_slab *slab = CALLOC_STRUCT(amdgpu_slab); - enum radeon_bo_domain domains = radeon_domain_from_heap(heap); - enum radeon_bo_flag flags = radeon_flags_from_heap(heap); + enum radeon_bo_domain domains; + enum radeon_bo_flag flags = 0; uint32_t base_id; if (!slab) return NULL; - unsigned slab_size = 1 << AMDGPU_SLAB_BO_SIZE_LOG2; + if (heap & 1) + flags |= RADEON_FLAG_GTT_WC; + if (heap & 2) + flags |= RADEON_FLAG_CPU_ACCESS; + + switch (heap >> 2) { + case 0: + domains = RADEON_DOMAIN_VRAM; + break; + default: + case 1: + domains = RADEON_DOMAIN_VRAM_GTT; + break; + case 2: + domains = RADEON_DOMAIN_GTT; + break; + } + slab->buffer = amdgpu_winsys_bo(amdgpu_bo_create(&ws->base, - slab_size, slab_size, + 64 * 1024, 64 * 1024, domains, flags)); if (!slab->buffer) goto fail; @@ -560,462 +563,6 @@ void amdgpu_bo_slab_free(void *priv, struct pb_slab *pslab) FREE(slab); } -#if DEBUG_SPARSE_COMMITS -static void -sparse_dump(struct amdgpu_winsys_bo *bo, const char *func) -{ - fprintf(stderr, "%s: %p (size=%"PRIu64", num_va_pages=%u) @ %s\n" - "Commitments:\n", - __func__, bo, bo->base.size, bo->u.sparse.num_va_pages, func); - - struct amdgpu_sparse_backing *span_backing = NULL; - uint32_t span_first_backing_page = 0; - uint32_t span_first_va_page = 0; - uint32_t va_page = 0; - - for (;;) { - struct amdgpu_sparse_backing *backing = 0; - uint32_t backing_page = 0; - - if (va_page < bo->u.sparse.num_va_pages) { - backing = bo->u.sparse.commitments[va_page].backing; - backing_page = bo->u.sparse.commitments[va_page].page; - } - - if (span_backing && - (backing != span_backing || - 
backing_page != span_first_backing_page + (va_page - span_first_va_page))) { - fprintf(stderr, " %u..%u: backing=%p:%u..%u\n", - span_first_va_page, va_page - 1, span_backing, - span_first_backing_page, - span_first_backing_page + (va_page - span_first_va_page) - 1); - - span_backing = NULL; - } - - if (va_page >= bo->u.sparse.num_va_pages) - break; - - if (backing && !span_backing) { - span_backing = backing; - span_first_backing_page = backing_page; - span_first_va_page = va_page; - } - - va_page++; - } - - fprintf(stderr, "Backing:\n"); - - list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) { - fprintf(stderr, " %p (size=%"PRIu64")\n", backing, backing->bo->base.size); - for (unsigned i = 0; i < backing->num_chunks; ++i) - fprintf(stderr, " %u..%u\n", backing->chunks[i].begin, backing->chunks[i].end); - } -} -#endif - -/* - * Attempt to allocate the given number of backing pages. Fewer pages may be - * allocated (depending on the fragmentation of existing backing buffers), - * which will be reflected by a change to *pnum_pages. - */ -static struct amdgpu_sparse_backing * -sparse_backing_alloc(struct amdgpu_winsys_bo *bo, uint32_t *pstart_page, uint32_t *pnum_pages) -{ - struct amdgpu_sparse_backing *best_backing; - unsigned best_idx; - uint32_t best_num_pages; - - best_backing = NULL; - best_idx = 0; - best_num_pages = 0; - - /* This is a very simple and inefficient best-fit algorithm. */ - list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) { - for (unsigned idx = 0; idx < backing->num_chunks; ++idx) { - uint32_t cur_num_pages = backing->chunks[idx].end - backing->chunks[idx].begin; - if ((best_num_pages < *pnum_pages && cur_num_pages > best_num_pages) || - (best_num_pages > *pnum_pages && cur_num_pages < best_num_pages)) { - best_backing = backing; - best_idx = idx; - best_num_pages = cur_num_pages; - } - } - } - - /* Allocate a new backing buffer if necessary. 
*/ - if (!best_backing) { - struct pb_buffer *buf; - uint64_t size; - uint32_t pages; - - best_backing = CALLOC_STRUCT(amdgpu_sparse_backing); - if (!best_backing) - return NULL; - - best_backing->max_chunks = 4; - best_backing->chunks = CALLOC(best_backing->max_chunks, - sizeof(*best_backing->chunks)); - if (!best_backing->chunks) { - FREE(best_backing); - return NULL; - } - - assert(bo->u.sparse.num_backing_pages < DIV_ROUND_UP(bo->base.size, RADEON_SPARSE_PAGE_SIZE)); - - size = MIN3(bo->base.size / 16, - 8 * 1024 * 1024, - bo->base.size - (uint64_t)bo->u.sparse.num_backing_pages * RADEON_SPARSE_PAGE_SIZE); - size = MAX2(size, RADEON_SPARSE_PAGE_SIZE); - - buf = amdgpu_bo_create(&bo->ws->base, size, RADEON_SPARSE_PAGE_SIZE, - bo->initial_domain, - bo->u.sparse.flags | RADEON_FLAG_NO_SUBALLOC); - if (!buf) { - FREE(best_backing->chunks); - FREE(best_backing); - return NULL; - } - - /* We might have gotten a bigger buffer than requested via caching. */ - pages = buf->size / RADEON_SPARSE_PAGE_SIZE; - - best_backing->bo = amdgpu_winsys_bo(buf); - best_backing->num_chunks = 1; - best_backing->chunks[0].begin = 0; - best_backing->chunks[0].end = pages; - - list_add(&best_backing->list, &bo->u.sparse.backing); - bo->u.sparse.num_backing_pages += pages; - - best_idx = 0; - best_num_pages = pages; - } - - *pnum_pages = MIN2(*pnum_pages, best_num_pages); - *pstart_page = best_backing->chunks[best_idx].begin; - best_backing->chunks[best_idx].begin += *pnum_pages; - - if (best_backing->chunks[best_idx].begin >= best_backing->chunks[best_idx].end) { - memmove(&best_backing->chunks[best_idx], &best_backing->chunks[best_idx + 1], - sizeof(*best_backing->chunks) * (best_backing->num_chunks - best_idx - 1)); - best_backing->num_chunks--; - } - - return best_backing; -} - -static void -sparse_free_backing_buffer(struct amdgpu_winsys_bo *bo, - struct amdgpu_sparse_backing *backing) -{ - struct amdgpu_winsys *ws = backing->bo->ws; - - bo->u.sparse.num_backing_pages -= 
backing->bo->base.size / RADEON_SPARSE_PAGE_SIZE; - - mtx_lock(&ws->bo_fence_lock); - amdgpu_add_fences(backing->bo, bo->num_fences, bo->fences); - mtx_unlock(&ws->bo_fence_lock); - - list_del(&backing->list); - amdgpu_winsys_bo_reference(&backing->bo, NULL); - FREE(backing->chunks); - FREE(backing); -} - -/* - * Return a range of pages from the given backing buffer back into the - * free structure. - */ -static bool -sparse_backing_free(struct amdgpu_winsys_bo *bo, - struct amdgpu_sparse_backing *backing, - uint32_t start_page, uint32_t num_pages) -{ - uint32_t end_page = start_page + num_pages; - unsigned low = 0; - unsigned high = backing->num_chunks; - - /* Find the first chunk with begin >= start_page. */ - while (low < high) { - unsigned mid = low + (high - low) / 2; - - if (backing->chunks[mid].begin >= start_page) - high = mid; - else - low = mid + 1; - } - - assert(low >= backing->num_chunks || end_page <= backing->chunks[low].begin); - assert(low == 0 || backing->chunks[low - 1].end <= start_page); - - if (low > 0 && backing->chunks[low - 1].end == start_page) { - backing->chunks[low - 1].end = end_page; - - if (low < backing->num_chunks && end_page == backing->chunks[low].begin) { - backing->chunks[low - 1].end = backing->chunks[low].end; - memmove(&backing->chunks[low], &backing->chunks[low + 1], - sizeof(*backing->chunks) * (backing->num_chunks - low - 1)); - backing->num_chunks--; - } - } else if (low < backing->num_chunks && end_page == backing->chunks[low].begin) { - backing->chunks[low].begin = start_page; - } else { - if (backing->num_chunks >= backing->max_chunks) { - unsigned new_max_chunks = 2 * backing->max_chunks; - struct amdgpu_sparse_backing_chunk *new_chunks = - REALLOC(backing->chunks, - sizeof(*backing->chunks) * backing->max_chunks, - sizeof(*backing->chunks) * new_max_chunks); - if (!new_chunks) - return false; - - backing->max_chunks = new_max_chunks; - backing->chunks = new_chunks; - } - - memmove(&backing->chunks[low + 1], 
&backing->chunks[low], - sizeof(*backing->chunks) * (backing->num_chunks - low)); - backing->chunks[low].begin = start_page; - backing->chunks[low].end = end_page; - backing->num_chunks++; - } - - if (backing->num_chunks == 1 && backing->chunks[0].begin == 0 && - backing->chunks[0].end == backing->bo->base.size / RADEON_SPARSE_PAGE_SIZE) - sparse_free_backing_buffer(bo, backing); - - return true; -} - -static void amdgpu_bo_sparse_destroy(struct pb_buffer *_buf) -{ - struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); - int r; - - assert(!bo->bo && bo->sparse); - - r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0, - (uint64_t)bo->u.sparse.num_va_pages * RADEON_SPARSE_PAGE_SIZE, - bo->va, 0, AMDGPU_VA_OP_CLEAR); - if (r) { - fprintf(stderr, "amdgpu: clearing PRT VA region on destroy failed (%d)\n", r); - } - - while (!list_empty(&bo->u.sparse.backing)) { - struct amdgpu_sparse_backing *dummy = NULL; - sparse_free_backing_buffer(bo, - container_of(bo->u.sparse.backing.next, - dummy, list)); - } - - amdgpu_va_range_free(bo->u.sparse.va_handle); - mtx_destroy(&bo->u.sparse.commit_lock); - FREE(bo->u.sparse.commitments); - FREE(bo); -} - -static const struct pb_vtbl amdgpu_winsys_bo_sparse_vtbl = { - amdgpu_bo_sparse_destroy - /* other functions are never called */ -}; - -static struct pb_buffer * -amdgpu_bo_sparse_create(struct amdgpu_winsys *ws, uint64_t size, - enum radeon_bo_domain domain, - enum radeon_bo_flag flags) -{ - struct amdgpu_winsys_bo *bo; - uint64_t map_size; - uint64_t va_gap_size; - int r; - - /* We use 32-bit page numbers; refuse to attempt allocating sparse buffers - * that exceed this limit. This is not really a restriction: we don't have - * that much virtual address space anyway. 
- */ - if (size > (uint64_t)INT32_MAX * RADEON_SPARSE_PAGE_SIZE) - return NULL; - - bo = CALLOC_STRUCT(amdgpu_winsys_bo); - if (!bo) - return NULL; - - pipe_reference_init(&bo->base.reference, 1); - bo->base.alignment = RADEON_SPARSE_PAGE_SIZE; - bo->base.size = size; - bo->base.vtbl = &amdgpu_winsys_bo_sparse_vtbl; - bo->ws = ws; - bo->initial_domain = domain; - bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1); - bo->sparse = true; - bo->u.sparse.flags = flags & ~RADEON_FLAG_SPARSE; - - bo->u.sparse.num_va_pages = DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE); - bo->u.sparse.commitments = CALLOC(bo->u.sparse.num_va_pages, - sizeof(*bo->u.sparse.commitments)); - if (!bo->u.sparse.commitments) - goto error_alloc_commitments; - - mtx_init(&bo->u.sparse.commit_lock, mtx_plain); - LIST_INITHEAD(&bo->u.sparse.backing); - - /* For simplicity, we always map a multiple of the page size. */ - map_size = align64(size, RADEON_SPARSE_PAGE_SIZE); - va_gap_size = ws->check_vm ? 4 * RADEON_SPARSE_PAGE_SIZE : 0; - r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general, - map_size + va_gap_size, RADEON_SPARSE_PAGE_SIZE, - 0, &bo->va, &bo->u.sparse.va_handle, 0); - if (r) - goto error_va_alloc; - - r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0, size, bo->va, - AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_MAP); - if (r) - goto error_va_map; - - return &bo->base; - -error_va_map: - amdgpu_va_range_free(bo->u.sparse.va_handle); -error_va_alloc: - mtx_destroy(&bo->u.sparse.commit_lock); - FREE(bo->u.sparse.commitments); -error_alloc_commitments: - FREE(bo); - return NULL; -} - -static bool -amdgpu_bo_sparse_commit(struct pb_buffer *buf, uint64_t offset, uint64_t size, - bool commit) -{ - struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buf); - struct amdgpu_sparse_commitment *comm; - uint32_t va_page, end_va_page; - bool ok = true; - int r; - - assert(bo->sparse); - assert(offset % RADEON_SPARSE_PAGE_SIZE == 0); - assert(offset <= bo->base.size); - assert(size <= bo->base.size - 
offset); - assert(size % RADEON_SPARSE_PAGE_SIZE == 0 || offset + size == bo->base.size); - - comm = bo->u.sparse.commitments; - va_page = offset / RADEON_SPARSE_PAGE_SIZE; - end_va_page = va_page + DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE); - - mtx_lock(&bo->u.sparse.commit_lock); - -#if DEBUG_SPARSE_COMMITS - sparse_dump(bo, __func__); -#endif - - if (commit) { - while (va_page < end_va_page) { - uint32_t span_va_page; - - /* Skip pages that are already committed. */ - if (comm[va_page].backing) { - va_page++; - continue; - } - - /* Determine length of uncommitted span. */ - span_va_page = va_page; - while (va_page < end_va_page && !comm[va_page].backing) - va_page++; - - /* Fill the uncommitted span with chunks of backing memory. */ - while (span_va_page < va_page) { - struct amdgpu_sparse_backing *backing; - uint32_t backing_start, backing_size; - - backing_size = va_page - span_va_page; - backing = sparse_backing_alloc(bo, &backing_start, &backing_size); - if (!backing) { - ok = false; - goto out; - } - - r = amdgpu_bo_va_op_raw(bo->ws->dev, backing->bo->bo, - (uint64_t)backing_start * RADEON_SPARSE_PAGE_SIZE, - (uint64_t)backing_size * RADEON_SPARSE_PAGE_SIZE, - bo->va + (uint64_t)span_va_page * RADEON_SPARSE_PAGE_SIZE, - AMDGPU_VM_PAGE_READABLE | - AMDGPU_VM_PAGE_WRITEABLE | - AMDGPU_VM_PAGE_EXECUTABLE, - AMDGPU_VA_OP_REPLACE); - if (r) { - ok = sparse_backing_free(bo, backing, backing_start, backing_size); - assert(ok && "sufficient memory should already be allocated"); - - ok = false; - goto out; - } - - while (backing_size) { - comm[span_va_page].backing = backing; - comm[span_va_page].page = backing_start; - span_va_page++; - backing_start++; - backing_size--; - } - } - } - } else { - r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0, - (uint64_t)(end_va_page - va_page) * RADEON_SPARSE_PAGE_SIZE, - bo->va + (uint64_t)va_page * RADEON_SPARSE_PAGE_SIZE, - AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_REPLACE); - if (r) { - ok = false; - goto out; - } - - while (va_page < 
end_va_page) { - struct amdgpu_sparse_backing *backing; - uint32_t backing_start; - uint32_t span_pages; - - /* Skip pages that are already uncommitted. */ - if (!comm[va_page].backing) { - va_page++; - continue; - } - - /* Group contiguous spans of pages. */ - backing = comm[va_page].backing; - backing_start = comm[va_page].page; - comm[va_page].backing = NULL; - - span_pages = 1; - va_page++; - - while (va_page < end_va_page && - comm[va_page].backing == backing && - comm[va_page].page == backing_start + span_pages) { - comm[va_page].backing = NULL; - va_page++; - span_pages++; - } - - if (!sparse_backing_free(bo, backing, backing_start, span_pages)) { - /* Couldn't allocate tracking data structures, so we have to leak */ - fprintf(stderr, "amdgpu: leaking PRT backing memory\n"); - ok = false; - } - } - } -out: - - mtx_unlock(&bo->u.sparse.commit_lock); - - return ok; -} - static unsigned eg_tile_split(unsigned tile_split) { switch (tile_split) { @@ -1050,7 +597,7 @@ static void amdgpu_buffer_get_metadata(struct pb_buffer *_buf, { struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); struct amdgpu_bo_info info = {0}; - uint64_t tiling_flags; + uint32_t tiling_flags; int r; assert(bo->bo && "must not be called for slab entries"); @@ -1061,25 +608,21 @@ static void amdgpu_buffer_get_metadata(struct pb_buffer *_buf, tiling_flags = info.metadata.tiling_info; - if (bo->ws->info.chip_class >= GFX9) { - md->u.gfx9.swizzle_mode = AMDGPU_TILING_GET(tiling_flags, SWIZZLE_MODE); - } else { - md->u.legacy.microtile = RADEON_LAYOUT_LINEAR; - md->u.legacy.macrotile = RADEON_LAYOUT_LINEAR; - - if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 4) /* 2D_TILED_THIN1 */ - md->u.legacy.macrotile = RADEON_LAYOUT_TILED; - else if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 2) /* 1D_TILED_THIN1 */ - md->u.legacy.microtile = RADEON_LAYOUT_TILED; - - md->u.legacy.pipe_config = AMDGPU_TILING_GET(tiling_flags, PIPE_CONFIG); - md->u.legacy.bankw = 1 << AMDGPU_TILING_GET(tiling_flags, 
BANK_WIDTH); - md->u.legacy.bankh = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_HEIGHT); - md->u.legacy.tile_split = eg_tile_split(AMDGPU_TILING_GET(tiling_flags, TILE_SPLIT)); - md->u.legacy.mtilea = 1 << AMDGPU_TILING_GET(tiling_flags, MACRO_TILE_ASPECT); - md->u.legacy.num_banks = 2 << AMDGPU_TILING_GET(tiling_flags, NUM_BANKS); - md->u.legacy.scanout = AMDGPU_TILING_GET(tiling_flags, MICRO_TILE_MODE) == 0; /* DISPLAY */ - } + md->microtile = RADEON_LAYOUT_LINEAR; + md->macrotile = RADEON_LAYOUT_LINEAR; + + if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 4) /* 2D_TILED_THIN1 */ + md->macrotile = RADEON_LAYOUT_TILED; + else if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 2) /* 1D_TILED_THIN1 */ + md->microtile = RADEON_LAYOUT_TILED; + + md->pipe_config = AMDGPU_TILING_GET(tiling_flags, PIPE_CONFIG); + md->bankw = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_WIDTH); + md->bankh = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_HEIGHT); + md->tile_split = eg_tile_split(AMDGPU_TILING_GET(tiling_flags, TILE_SPLIT)); + md->mtilea = 1 << AMDGPU_TILING_GET(tiling_flags, MACRO_TILE_ASPECT); + md->num_banks = 2 << AMDGPU_TILING_GET(tiling_flags, NUM_BANKS); + md->scanout = AMDGPU_TILING_GET(tiling_flags, MICRO_TILE_MODE) == 0; /* DISPLAY */ md->size_metadata = info.metadata.size_metadata; memcpy(md->metadata, info.metadata.umd_metadata, sizeof(md->metadata)); @@ -1090,33 +633,29 @@ static void amdgpu_buffer_set_metadata(struct pb_buffer *_buf, { struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); struct amdgpu_bo_metadata metadata = {0}; - uint64_t tiling_flags = 0; + uint32_t tiling_flags = 0; assert(bo->bo && "must not be called for slab entries"); - if (bo->ws->info.chip_class >= GFX9) { - tiling_flags |= AMDGPU_TILING_SET(SWIZZLE_MODE, md->u.gfx9.swizzle_mode); - } else { - if (md->u.legacy.macrotile == RADEON_LAYOUT_TILED) - tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 4); /* 2D_TILED_THIN1 */ - else if (md->u.legacy.microtile == RADEON_LAYOUT_TILED) - tiling_flags |= 
AMDGPU_TILING_SET(ARRAY_MODE, 2); /* 1D_TILED_THIN1 */ - else - tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 1); /* LINEAR_ALIGNED */ - - tiling_flags |= AMDGPU_TILING_SET(PIPE_CONFIG, md->u.legacy.pipe_config); - tiling_flags |= AMDGPU_TILING_SET(BANK_WIDTH, util_logbase2(md->u.legacy.bankw)); - tiling_flags |= AMDGPU_TILING_SET(BANK_HEIGHT, util_logbase2(md->u.legacy.bankh)); - if (md->u.legacy.tile_split) - tiling_flags |= AMDGPU_TILING_SET(TILE_SPLIT, eg_tile_split_rev(md->u.legacy.tile_split)); - tiling_flags |= AMDGPU_TILING_SET(MACRO_TILE_ASPECT, util_logbase2(md->u.legacy.mtilea)); - tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(md->u.legacy.num_banks)-1); - - if (md->u.legacy.scanout) - tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 0); /* DISPLAY_MICRO_TILING */ - else - tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 1); /* THIN_MICRO_TILING */ - } + if (md->macrotile == RADEON_LAYOUT_TILED) + tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 4); /* 2D_TILED_THIN1 */ + else if (md->microtile == RADEON_LAYOUT_TILED) + tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 2); /* 1D_TILED_THIN1 */ + else + tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 1); /* LINEAR_ALIGNED */ + + tiling_flags |= AMDGPU_TILING_SET(PIPE_CONFIG, md->pipe_config); + tiling_flags |= AMDGPU_TILING_SET(BANK_WIDTH, util_logbase2(md->bankw)); + tiling_flags |= AMDGPU_TILING_SET(BANK_HEIGHT, util_logbase2(md->bankh)); + if (md->tile_split) + tiling_flags |= AMDGPU_TILING_SET(TILE_SPLIT, eg_tile_split_rev(md->tile_split)); + tiling_flags |= AMDGPU_TILING_SET(MACRO_TILE_ASPECT, util_logbase2(md->mtilea)); + tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(md->num_banks)-1); + + if (md->scanout) + tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 0); /* DISPLAY_MICRO_TILING */ + else + tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 1); /* THIN_MICRO_TILING */ metadata.tiling_info = tiling_flags; metadata.size_metadata = md->size_metadata; @@ -1136,21 +675,33 @@ 
amdgpu_bo_create(struct radeon_winsys *rws, struct amdgpu_winsys_bo *bo; unsigned usage = 0, pb_cache_bucket; - /* VRAM implies WC. This is not optional. */ - assert(!(domain & RADEON_DOMAIN_VRAM) || flags & RADEON_FLAG_GTT_WC); - - /* NO_CPU_ACCESS is valid with VRAM only. */ - assert(domain == RADEON_DOMAIN_VRAM || !(flags & RADEON_FLAG_NO_CPU_ACCESS)); - /* Sub-allocate small buffers from slabs. */ - if (!(flags & (RADEON_FLAG_NO_SUBALLOC | RADEON_FLAG_SPARSE)) && + if (!(flags & RADEON_FLAG_HANDLE) && size <= (1 << AMDGPU_SLAB_MAX_SIZE_LOG2) && alignment <= MAX2(1 << AMDGPU_SLAB_MIN_SIZE_LOG2, util_next_power_of_two(size))) { struct pb_slab_entry *entry; - int heap = radeon_get_heap_index(domain, flags); + unsigned heap = 0; - if (heap < 0 || heap >= RADEON_MAX_SLAB_HEAPS) + if (flags & RADEON_FLAG_GTT_WC) + heap |= 1; + if (flags & RADEON_FLAG_CPU_ACCESS) + heap |= 2; + if (flags & ~(RADEON_FLAG_GTT_WC | RADEON_FLAG_CPU_ACCESS)) + goto no_slab; + + switch (domain) { + case RADEON_DOMAIN_VRAM: + heap |= 0 * 4; + break; + case RADEON_DOMAIN_VRAM_GTT: + heap |= 1 * 4; + break; + case RADEON_DOMAIN_GTT: + heap |= 2 * 4; + break; + default: goto no_slab; + } entry = pb_slab_alloc(&ws->bo_slabs, size, heap); if (!entry) { @@ -1171,16 +722,8 @@ amdgpu_bo_create(struct radeon_winsys *rws, } no_slab: - if (flags & RADEON_FLAG_SPARSE) { - assert(RADEON_SPARSE_PAGE_SIZE % alignment == 0); - - flags |= RADEON_FLAG_NO_CPU_ACCESS; - - return amdgpu_bo_sparse_create(ws, size, domain, flags); - } - /* This flag is irrelevant for the cache. */ - flags &= ~RADEON_FLAG_NO_SUBALLOC; + flags &= ~RADEON_FLAG_HANDLE; /* Align size to page size. This is the minimum alignment for normal * BOs. Aligning this here helps the cached bufmgr. 
Especially small BOs, @@ -1189,11 +732,22 @@ no_slab: size = align64(size, ws->info.gart_page_size); alignment = align(alignment, ws->info.gart_page_size); - int heap = radeon_get_heap_index(domain, flags); - assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS); - usage = 1 << heap; /* Only set one usage bit for each heap. */ - - pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap); + /* Only set one usage bit each for domains and flags, or the cache manager + * might consider different sets of domains / flags compatible + */ + if (domain == RADEON_DOMAIN_VRAM_GTT) + usage = 1 << 2; + else + usage = domain >> 1; + assert(flags < sizeof(usage) * 8 - 3); + usage |= 1 << (flags + 3); + + /* Determine the pb_cache bucket for minimizing pb_cache misses. */ + pb_cache_bucket = 0; + if (domain & RADEON_DOMAIN_VRAM) /* VRAM or VRAM+GTT */ + pb_cache_bucket += 1; + if (flags == RADEON_FLAG_GTT_WC) /* WC */ + pb_cache_bucket += 2; assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets)); /* Get a buffer from the cache. */ @@ -1322,9 +876,10 @@ static bool amdgpu_bo_get_handle(struct pb_buffer *buffer, enum amdgpu_bo_handle_type type; int r; - /* Don't allow exports of slab entries and sparse buffers. 
*/ - if (!bo->bo) - return false; + if (!bo->bo) { + offset += bo->va - bo->u.slab.real->va; + bo = bo->u.slab.real; + } bo->u.real.use_reusable_pool = false; @@ -1411,13 +966,6 @@ static bool amdgpu_bo_is_user_ptr(struct pb_buffer *buf) return ((struct amdgpu_winsys_bo*)buf)->user_ptr != NULL; } -static bool amdgpu_bo_is_suballocated(struct pb_buffer *buf) -{ - struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf; - - return !bo->bo && !bo->sparse; -} - static uint64_t amdgpu_bo_get_va(struct pb_buffer *buf) { return ((struct amdgpu_winsys_bo*)buf)->va; @@ -1434,9 +982,7 @@ void amdgpu_bo_init_functions(struct amdgpu_winsys *ws) ws->base.buffer_from_handle = amdgpu_bo_from_handle; ws->base.buffer_from_ptr = amdgpu_bo_from_ptr; ws->base.buffer_is_user_ptr = amdgpu_bo_is_user_ptr; - ws->base.buffer_is_suballocated = amdgpu_bo_is_suballocated; ws->base.buffer_get_handle = amdgpu_bo_get_handle; - ws->base.buffer_commit = amdgpu_bo_sparse_commit; ws->base.buffer_get_virtual_address = amdgpu_bo_get_va; ws->base.buffer_get_initial_domain = amdgpu_bo_get_initial_domain; } diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h index 1311344b8..1e25897b6 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h @@ -37,28 +37,6 @@ #include "pipebuffer/pb_slab.h" -struct amdgpu_sparse_backing_chunk; - -/* - * Sub-allocation information for a real buffer used as backing memory of a - * sparse buffer. - */ -struct amdgpu_sparse_backing { - struct list_head list; - - struct amdgpu_winsys_bo *bo; - - /* Sorted list of free chunks. 
*/ - struct amdgpu_sparse_backing_chunk *chunks; - uint32_t max_chunks; - uint32_t num_chunks; -}; - -struct amdgpu_sparse_commitment { - struct amdgpu_sparse_backing *backing; - uint32_t page; -}; - struct amdgpu_winsys_bo { struct pb_buffer base; union { @@ -75,26 +53,12 @@ struct amdgpu_winsys_bo { struct pb_slab_entry entry; struct amdgpu_winsys_bo *real; } slab; - struct { - mtx_t commit_lock; - amdgpu_va_handle va_handle; - enum radeon_bo_flag flags; - - uint32_t num_va_pages; - uint32_t num_backing_pages; - - struct list_head backing; - - /* Commitment information for each page of the virtual memory area. */ - struct amdgpu_sparse_commitment *commitments; - } sparse; } u; struct amdgpu_winsys *ws; void *user_ptr; /* from buffer_from_ptr */ - amdgpu_bo_handle bo; /* NULL for slab entries and sparse buffers */ - bool sparse; + amdgpu_bo_handle bo; /* NULL for slab entries */ uint32_t unique_id; uint64_t va; enum radeon_bo_domain initial_domain; diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c index d26625388..2b86827ff 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c @@ -178,7 +178,6 @@ static struct radeon_winsys_ctx *amdgpu_ctx_create(struct radeon_winsys *ws) ctx->ws = amdgpu_winsys(ws); ctx->refcount = 1; - ctx->initial_num_total_rejected_cs = ctx->ws->num_total_rejected_cs; r = amdgpu_cs_ctx_create(ctx->ws->dev, &ctx->ctx); if (r) { @@ -228,13 +227,6 @@ amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx) uint32_t result, hangs; int r; - /* Return a failure due to a rejected command submission. */ - if (ctx->ws->num_total_rejected_cs > ctx->initial_num_total_rejected_cs) { - return ctx->num_rejected_cs ? PIPE_GUILTY_CONTEXT_RESET : - PIPE_INNOCENT_CONTEXT_RESET; - } - - /* Return a failure due to a GPU hang. 
*/ r = amdgpu_cs_query_reset_state(ctx->ctx, &result, &hangs); if (r) { fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state failed. (%i)\n", r); @@ -259,8 +251,7 @@ amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx) static bool amdgpu_cs_has_user_fence(struct amdgpu_cs_context *cs) { return cs->request.ip_type != AMDGPU_HW_IP_UVD && - cs->request.ip_type != AMDGPU_HW_IP_VCE && - cs->request.ip_type != AMDGPU_HW_IP_VCN_DEC; + cs->request.ip_type != AMDGPU_HW_IP_VCE; } static bool amdgpu_cs_has_chaining(struct amdgpu_cs *cs) @@ -287,12 +278,9 @@ int amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo * if (bo->bo) { buffers = cs->real_buffers; num_buffers = cs->num_real_buffers; - } else if (!bo->sparse) { + } else { buffers = cs->slab_buffers; num_buffers = cs->num_slab_buffers; - } else { - buffers = cs->sparse_buffers; - num_buffers = cs->num_sparse_buffers; } /* not found or found */ @@ -319,31 +307,48 @@ int amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo * } static int -amdgpu_do_add_real_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo) +amdgpu_lookup_or_add_real_buffer(struct amdgpu_cs *acs, struct amdgpu_winsys_bo *bo) { + struct amdgpu_cs_context *cs = acs->csc; struct amdgpu_cs_buffer *buffer; - int idx; + unsigned hash; + int idx = amdgpu_lookup_buffer(cs, bo); + + if (idx >= 0) + return idx; /* New buffer, check if the backing array is large enough. 
*/ if (cs->num_real_buffers >= cs->max_real_buffers) { unsigned new_max = MAX2(cs->max_real_buffers + 16, (unsigned)(cs->max_real_buffers * 1.3)); struct amdgpu_cs_buffer *new_buffers; + amdgpu_bo_handle *new_handles; + uint8_t *new_flags; new_buffers = MALLOC(new_max * sizeof(*new_buffers)); + new_handles = MALLOC(new_max * sizeof(*new_handles)); + new_flags = MALLOC(new_max * sizeof(*new_flags)); - if (!new_buffers) { - fprintf(stderr, "amdgpu_do_add_buffer: allocation failed\n"); + if (!new_buffers || !new_handles || !new_flags) { + fprintf(stderr, "amdgpu_lookup_or_add_buffer: allocation failed\n"); FREE(new_buffers); + FREE(new_handles); + FREE(new_flags); return -1; } memcpy(new_buffers, cs->real_buffers, cs->num_real_buffers * sizeof(*new_buffers)); + memcpy(new_handles, cs->handles, cs->num_real_buffers * sizeof(*new_handles)); + memcpy(new_flags, cs->flags, cs->num_real_buffers * sizeof(*new_flags)); FREE(cs->real_buffers); + FREE(cs->handles); + FREE(cs->flags); cs->max_real_buffers = new_max; cs->real_buffers = new_buffers; + cs->handles = new_handles; + cs->flags = new_flags; } idx = cs->num_real_buffers; @@ -351,24 +356,11 @@ amdgpu_do_add_real_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo memset(buffer, 0, sizeof(*buffer)); amdgpu_winsys_bo_reference(&buffer->bo, bo); + cs->handles[idx] = bo->bo; + cs->flags[idx] = 0; p_atomic_inc(&bo->num_cs_references); cs->num_real_buffers++; - return idx; -} - -static int -amdgpu_lookup_or_add_real_buffer(struct amdgpu_cs *acs, struct amdgpu_winsys_bo *bo) -{ - struct amdgpu_cs_context *cs = acs->csc; - unsigned hash; - int idx = amdgpu_lookup_buffer(cs, bo); - - if (idx >= 0) - return idx; - - idx = amdgpu_do_add_real_buffer(cs, bo); - hash = bo->unique_id & (ARRAY_SIZE(cs->buffer_indices_hashlist)-1); cs->buffer_indices_hashlist[hash] = idx; @@ -429,63 +421,6 @@ static int amdgpu_lookup_or_add_slab_buffer(struct amdgpu_cs *acs, return idx; } -static int amdgpu_lookup_or_add_sparse_buffer(struct 
amdgpu_cs *acs, - struct amdgpu_winsys_bo *bo) -{ - struct amdgpu_cs_context *cs = acs->csc; - struct amdgpu_cs_buffer *buffer; - unsigned hash; - int idx = amdgpu_lookup_buffer(cs, bo); - - if (idx >= 0) - return idx; - - /* New buffer, check if the backing array is large enough. */ - if (cs->num_sparse_buffers >= cs->max_sparse_buffers) { - unsigned new_max = - MAX2(cs->max_sparse_buffers + 16, (unsigned)(cs->max_sparse_buffers * 1.3)); - struct amdgpu_cs_buffer *new_buffers; - - new_buffers = REALLOC(cs->sparse_buffers, - cs->max_sparse_buffers * sizeof(*new_buffers), - new_max * sizeof(*new_buffers)); - if (!new_buffers) { - fprintf(stderr, "amdgpu_lookup_or_add_sparse_buffer: allocation failed\n"); - return -1; - } - - cs->max_sparse_buffers = new_max; - cs->sparse_buffers = new_buffers; - } - - idx = cs->num_sparse_buffers; - buffer = &cs->sparse_buffers[idx]; - - memset(buffer, 0, sizeof(*buffer)); - amdgpu_winsys_bo_reference(&buffer->bo, bo); - p_atomic_inc(&bo->num_cs_references); - cs->num_sparse_buffers++; - - hash = bo->unique_id & (ARRAY_SIZE(cs->buffer_indices_hashlist)-1); - cs->buffer_indices_hashlist[hash] = idx; - - /* We delay adding the backing buffers until we really have to. However, - * we cannot delay accounting for memory use. - */ - mtx_lock(&bo->u.sparse.commit_lock); - - list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) { - if (bo->initial_domain & RADEON_DOMAIN_VRAM) - acs->main.base.used_vram += backing->bo->base.size; - else if (bo->initial_domain & RADEON_DOMAIN_GTT) - acs->main.base.used_gart += backing->bo->base.size; - } - - mtx_unlock(&bo->u.sparse.commit_lock); - - return idx; -} - static unsigned amdgpu_cs_add_buffer(struct radeon_winsys_cs *rcs, struct pb_buffer *buf, enum radeon_bo_usage usage, @@ -501,48 +436,26 @@ static unsigned amdgpu_cs_add_buffer(struct radeon_winsys_cs *rcs, struct amdgpu_cs_buffer *buffer; int index; - /* Fast exit for no-op calls. 
- * This is very effective with suballocators and linear uploaders that - * are outside of the winsys. - */ - if (bo == cs->last_added_bo && - (usage & cs->last_added_bo_usage) == usage && - (1ull << priority) & cs->last_added_bo_priority_usage) - return cs->last_added_bo_index; - - if (!bo->sparse) { - if (!bo->bo) { - index = amdgpu_lookup_or_add_slab_buffer(acs, bo); - if (index < 0) - return 0; - - buffer = &cs->slab_buffers[index]; - buffer->usage |= usage; - - usage &= ~RADEON_USAGE_SYNCHRONIZED; - index = buffer->u.slab.real_idx; - } else { - index = amdgpu_lookup_or_add_real_buffer(acs, bo); - if (index < 0) - return 0; - } + if (!bo->bo) { + index = amdgpu_lookup_or_add_slab_buffer(acs, bo); + if (index < 0) + return 0; + + buffer = &cs->slab_buffers[index]; + buffer->usage |= usage; - buffer = &cs->real_buffers[index]; + usage &= ~RADEON_USAGE_SYNCHRONIZED; + index = buffer->u.slab.real_idx; } else { - index = amdgpu_lookup_or_add_sparse_buffer(acs, bo); + index = amdgpu_lookup_or_add_real_buffer(acs, bo); if (index < 0) return 0; - - buffer = &cs->sparse_buffers[index]; } - buffer->u.real.priority_usage |= 1ull << priority; + buffer = &cs->real_buffers[index]; + buffer->u.real.priority_usage |= 1llu << priority; buffer->usage |= usage; - - cs->last_added_bo = bo; - cs->last_added_bo_index = index; - cs->last_added_bo_usage = buffer->usage; - cs->last_added_bo_priority_usage = buffer->u.real.priority_usage; + cs->flags[index] = MAX2(cs->flags[index], priority / 4); return index; } @@ -581,7 +494,8 @@ static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws, struct amdgpu_ib *ib) pb = ws->base.buffer_create(&ws->base, buffer_size, ws->info.gart_page_size, - RADEON_DOMAIN_GTT, 0); + RADEON_DOMAIN_GTT, + RADEON_FLAG_CPU_ACCESS); if (!pb) return false; @@ -695,6 +609,8 @@ static void amdgpu_ib_finalize(struct amdgpu_ib *ib) static bool amdgpu_init_cs_context(struct amdgpu_cs_context *cs, enum ring_type ring_type) { + int i; + switch (ring_type) { case 
RING_DMA: cs->request.ip_type = AMDGPU_HW_IP_DMA; @@ -712,18 +628,15 @@ static bool amdgpu_init_cs_context(struct amdgpu_cs_context *cs, cs->request.ip_type = AMDGPU_HW_IP_COMPUTE; break; - case RING_VCN_DEC: - cs->request.ip_type = AMDGPU_HW_IP_VCN_DEC; - break; - default: case RING_GFX: cs->request.ip_type = AMDGPU_HW_IP_GFX; break; } - memset(cs->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist)); - cs->last_added_bo = NULL; + for (i = 0; i < ARRAY_SIZE(cs->buffer_indices_hashlist); i++) { + cs->buffer_indices_hashlist[i] = -1; + } cs->request.number_of_ibs = 1; cs->request.ibs = &cs->ib[IB_MAIN]; @@ -747,21 +660,14 @@ static void amdgpu_cs_context_cleanup(struct amdgpu_cs_context *cs) p_atomic_dec(&cs->slab_buffers[i].bo->num_cs_references); amdgpu_winsys_bo_reference(&cs->slab_buffers[i].bo, NULL); } - for (i = 0; i < cs->num_sparse_buffers; i++) { - p_atomic_dec(&cs->sparse_buffers[i].bo->num_cs_references); - amdgpu_winsys_bo_reference(&cs->sparse_buffers[i].bo, NULL); - } - for (i = 0; i < cs->num_fence_dependencies; i++) - amdgpu_fence_reference(&cs->fence_dependencies[i], NULL); cs->num_real_buffers = 0; cs->num_slab_buffers = 0; - cs->num_sparse_buffers = 0; - cs->num_fence_dependencies = 0; amdgpu_fence_reference(&cs->fence, NULL); - memset(cs->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist)); - cs->last_added_bo = NULL; + for (i = 0; i < ARRAY_SIZE(cs->buffer_indices_hashlist); i++) { + cs->buffer_indices_hashlist[i] = -1; + } } static void amdgpu_destroy_cs_context(struct amdgpu_cs_context *cs) @@ -771,8 +677,7 @@ static void amdgpu_destroy_cs_context(struct amdgpu_cs_context *cs) FREE(cs->real_buffers); FREE(cs->handles); FREE(cs->slab_buffers); - FREE(cs->sparse_buffers); - FREE(cs->fence_dependencies); + FREE(cs->request.dependencies); } @@ -983,6 +888,7 @@ static void amdgpu_add_fence_dependency(struct amdgpu_cs *acs, { struct amdgpu_cs_context *cs = acs->csc; struct amdgpu_winsys_bo *bo = buffer->bo; + struct 
amdgpu_cs_fence *dep; unsigned new_num_fences = 0; for (unsigned j = 0; j < bo->num_fences; ++j) { @@ -1004,21 +910,21 @@ static void amdgpu_add_fence_dependency(struct amdgpu_cs *acs, if (!(buffer->usage & RADEON_USAGE_SYNCHRONIZED)) continue; - idx = cs->num_fence_dependencies++; - if (idx >= cs->max_fence_dependencies) { + if (bo_fence->submission_in_progress) + os_wait_until_zero(&bo_fence->submission_in_progress, + PIPE_TIMEOUT_INFINITE); + + idx = cs->request.number_of_dependencies++; + if (idx >= cs->max_dependencies) { unsigned size; - const unsigned increment = 8; - - cs->max_fence_dependencies = idx + increment; - size = cs->max_fence_dependencies * sizeof(cs->fence_dependencies[0]); - cs->fence_dependencies = realloc(cs->fence_dependencies, size); - /* Clear the newly-allocated elements. */ - memset(cs->fence_dependencies + idx, 0, - increment * sizeof(cs->fence_dependencies[0])); + + cs->max_dependencies = idx + 8; + size = cs->max_dependencies * sizeof(struct amdgpu_cs_fence); + cs->request.dependencies = realloc(cs->request.dependencies, size); } - amdgpu_fence_reference(&cs->fence_dependencies[idx], - (struct pipe_fence_handle*)bo_fence); + dep = &cs->request.dependencies[idx]; + memcpy(dep, &bo_fence->fence, sizeof(*dep)); } for (unsigned j = new_num_fences; j < bo->num_fences; ++j) @@ -1027,108 +933,47 @@ static void amdgpu_add_fence_dependency(struct amdgpu_cs *acs, bo->num_fences = new_num_fences; } -/* Add the given list of fences to the buffer's fence list. - * - * Must be called with the winsys bo_fence_lock held. +/* Since the kernel driver doesn't synchronize execution between different + * rings automatically, we have to add fence dependencies manually. 
*/ -void amdgpu_add_fences(struct amdgpu_winsys_bo *bo, - unsigned num_fences, - struct pipe_fence_handle **fences) +static void amdgpu_add_fence_dependencies(struct amdgpu_cs *acs) { - if (bo->num_fences + num_fences > bo->max_fences) { - unsigned new_max_fences = MAX2(bo->num_fences + num_fences, bo->max_fences * 2); + struct amdgpu_cs_context *cs = acs->csc; + int i; + + cs->request.number_of_dependencies = 0; + + for (i = 0; i < cs->num_real_buffers; i++) + amdgpu_add_fence_dependency(acs, &cs->real_buffers[i]); + for (i = 0; i < cs->num_slab_buffers; i++) + amdgpu_add_fence_dependency(acs, &cs->slab_buffers[i]); +} + +static void amdgpu_add_fence(struct amdgpu_winsys_bo *bo, + struct pipe_fence_handle *fence) +{ + if (bo->num_fences >= bo->max_fences) { + unsigned new_max_fences = MAX2(1, bo->max_fences * 2); struct pipe_fence_handle **new_fences = REALLOC(bo->fences, bo->num_fences * sizeof(*new_fences), new_max_fences * sizeof(*new_fences)); - if (likely(new_fences)) { + if (new_fences) { bo->fences = new_fences; bo->max_fences = new_max_fences; } else { - unsigned drop; - - fprintf(stderr, "amdgpu_add_fences: allocation failure, dropping fence(s)\n"); + fprintf(stderr, "amdgpu_add_fence: allocation failure, dropping fence\n"); if (!bo->num_fences) return; - bo->num_fences--; /* prefer to keep the most recent fence if possible */ + bo->num_fences--; /* prefer to keep a more recent fence if possible */ amdgpu_fence_reference(&bo->fences[bo->num_fences], NULL); - - drop = bo->num_fences + num_fences - bo->max_fences; - num_fences -= drop; - fences += drop; - } - } - - for (unsigned i = 0; i < num_fences; ++i) { - bo->fences[bo->num_fences] = NULL; - amdgpu_fence_reference(&bo->fences[bo->num_fences], fences[i]); - bo->num_fences++; - } -} - -static void amdgpu_add_fence_dependencies_list(struct amdgpu_cs *acs, - struct pipe_fence_handle *fence, - unsigned num_buffers, - struct amdgpu_cs_buffer *buffers) -{ - for (unsigned i = 0; i < num_buffers; i++) { - 
struct amdgpu_cs_buffer *buffer = &buffers[i]; - struct amdgpu_winsys_bo *bo = buffer->bo; - - amdgpu_add_fence_dependency(acs, buffer); - p_atomic_inc(&bo->num_active_ioctls); - amdgpu_add_fences(bo, 1, &fence); - } -} - -/* Since the kernel driver doesn't synchronize execution between different - * rings automatically, we have to add fence dependencies manually. - */ -static void amdgpu_add_fence_dependencies(struct amdgpu_cs *acs) -{ - struct amdgpu_cs_context *cs = acs->csc; - - cs->num_fence_dependencies = 0; - - amdgpu_add_fence_dependencies_list(acs, cs->fence, cs->num_real_buffers, cs->real_buffers); - amdgpu_add_fence_dependencies_list(acs, cs->fence, cs->num_slab_buffers, cs->slab_buffers); - amdgpu_add_fence_dependencies_list(acs, cs->fence, cs->num_sparse_buffers, cs->sparse_buffers); -} - -/* Add backing of sparse buffers to the buffer list. - * - * This is done late, during submission, to keep the buffer list short before - * submit, and to avoid managing fences for the backing buffers. - */ -static bool amdgpu_add_sparse_backing_buffers(struct amdgpu_cs_context *cs) -{ - for (unsigned i = 0; i < cs->num_sparse_buffers; ++i) { - struct amdgpu_cs_buffer *buffer = &cs->sparse_buffers[i]; - struct amdgpu_winsys_bo *bo = buffer->bo; - - mtx_lock(&bo->u.sparse.commit_lock); - - list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) { - /* We can directly add the buffer here, because we know that each - * backing buffer occurs only once. 
- */ - int idx = amdgpu_do_add_real_buffer(cs, backing->bo); - if (idx < 0) { - fprintf(stderr, "%s: failed to add buffer\n", __FUNCTION__); - mtx_unlock(&bo->u.sparse.commit_lock); - return false; - } - - cs->real_buffers[idx].usage = buffer->usage & ~RADEON_USAGE_SYNCHRONIZED; - cs->real_buffers[idx].u.real.priority_usage = buffer->u.real.priority_usage; - p_atomic_inc(&backing->bo->num_active_ioctls); } - - mtx_unlock(&bo->u.sparse.commit_lock); } - return true; + bo->fences[bo->num_fences] = NULL; + amdgpu_fence_reference(&bo->fences[bo->num_fences], fence); + bo->num_fences++; } void amdgpu_cs_submit_ib(void *job, int thread_index) @@ -1137,30 +982,7 @@ void amdgpu_cs_submit_ib(void *job, int thread_index) struct amdgpu_winsys *ws = acs->ctx->ws; struct amdgpu_cs_context *cs = acs->cst; int i, r; - struct amdgpu_cs_fence *dependencies = NULL; - - /* Set dependencies (input fences). */ - if (cs->num_fence_dependencies) { - dependencies = alloca(sizeof(dependencies[0]) * - cs->num_fence_dependencies); - unsigned num = 0; - - for (i = 0; i < cs->num_fence_dependencies; i++) { - struct amdgpu_fence *fence = - (struct amdgpu_fence*)cs->fence_dependencies[i]; - - /* Past fences can't be unsubmitted because we have only 1 CS thread. */ - assert(!fence->submission_in_progress); - memcpy(&dependencies[num++], &fence->fence, sizeof(dependencies[0])); - } - cs->request.dependencies = dependencies; - cs->request.number_of_dependencies = num; - } else { - cs->request.dependencies = NULL; - cs->request.number_of_dependencies = 0; - } - /* Set the output fence. 
*/ cs->request.fence_info.handle = NULL; if (amdgpu_cs_has_user_fence(cs)) { cs->request.fence_info.handle = acs->ctx->user_fence_bo; @@ -1175,11 +997,11 @@ void amdgpu_cs_submit_ib(void *job, int thread_index) amdgpu_bo_handle *handles; unsigned num = 0; - mtx_lock(&ws->global_bo_list_lock); + pipe_mutex_lock(ws->global_bo_list_lock); handles = malloc(sizeof(handles[0]) * ws->num_buffers); if (!handles) { - mtx_unlock(&ws->global_bo_list_lock); + pipe_mutex_unlock(ws->global_bo_list_lock); amdgpu_cs_context_cleanup(cs); cs->error_code = -ENOMEM; return; @@ -1194,44 +1016,12 @@ void amdgpu_cs_submit_ib(void *job, int thread_index) handles, NULL, &cs->request.resources); free(handles); - mtx_unlock(&ws->global_bo_list_lock); + pipe_mutex_unlock(ws->global_bo_list_lock); } else { - if (!amdgpu_add_sparse_backing_buffers(cs)) { - r = -ENOMEM; - goto bo_list_error; - } - - if (cs->max_real_submit < cs->num_real_buffers) { - FREE(cs->handles); - FREE(cs->flags); - - cs->handles = MALLOC(sizeof(*cs->handles) * cs->num_real_buffers); - cs->flags = MALLOC(sizeof(*cs->flags) * cs->num_real_buffers); - - if (!cs->handles || !cs->flags) { - cs->max_real_submit = 0; - r = -ENOMEM; - goto bo_list_error; - } - } - - for (i = 0; i < cs->num_real_buffers; ++i) { - struct amdgpu_cs_buffer *buffer = &cs->real_buffers[i]; - - assert(buffer->u.real.priority_usage != 0); - - cs->handles[i] = buffer->bo->bo; - cs->flags[i] = (util_last_bit64(buffer->u.real.priority_usage) - 1) / 4; - } - - if (acs->ring_type == RING_GFX) - ws->gfx_bo_list_counter += cs->num_real_buffers; - r = amdgpu_bo_list_create(ws->dev, cs->num_real_buffers, cs->handles, cs->flags, &cs->request.resources); } -bo_list_error: if (r) { fprintf(stderr, "amdgpu: buffer list creation failed (%d)\n", r); @@ -1241,25 +1031,16 @@ bo_list_error: goto cleanup; } - if (acs->ctx->num_rejected_cs) - r = -ECANCELED; - else - r = amdgpu_cs_submit(acs->ctx->ctx, 0, &cs->request, 1); - + r = amdgpu_cs_submit(acs->ctx->ctx, 0, 
&cs->request, 1); cs->error_code = r; if (r) { if (r == -ENOMEM) fprintf(stderr, "amdgpu: Not enough memory for command submission.\n"); - else if (r == -ECANCELED) - fprintf(stderr, "amdgpu: The CS has been cancelled because the context is lost.\n"); else fprintf(stderr, "amdgpu: The CS has been rejected, " "see dmesg for more information (%i).\n", r); amdgpu_fence_signalled(cs->fence); - - acs->ctx->num_rejected_cs++; - ws->num_total_rejected_cs++; } else { /* Success. */ uint64_t *user_fence = NULL; @@ -1278,8 +1059,6 @@ cleanup: p_atomic_dec(&cs->real_buffers[i].bo->num_active_ioctls); for (i = 0; i < cs->num_slab_buffers; i++) p_atomic_dec(&cs->slab_buffers[i].bo->num_active_ioctls); - for (i = 0; i < cs->num_sparse_buffers; i++) - p_atomic_dec(&cs->sparse_buffers[i].bo->num_active_ioctls); amdgpu_cs_context_cleanup(cs); } @@ -1288,9 +1067,11 @@ cleanup: void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs) { struct amdgpu_cs *cs = amdgpu_cs(rcs); + struct amdgpu_winsys *ws = cs->ctx->ws; /* Wait for any pending ioctl of this CS to complete. */ - util_queue_fence_wait(&cs->flush_completed); + if (util_queue_is_initialized(&ws->cs_queue)) + util_queue_job_wait(&cs->flush_completed); } static int amdgpu_cs_flush(struct radeon_winsys_cs *rcs, @@ -1337,10 +1118,6 @@ static int amdgpu_cs_flush(struct radeon_winsys_cs *rcs, while (rcs->current.cdw & 15) radeon_emit(rcs, 0x80000000); /* type2 nop packet */ break; - case RING_VCN_DEC: - while (rcs->current.cdw & 15) - radeon_emit(rcs, 0x81ff); /* nop packet */ - break; default: break; } @@ -1350,10 +1127,11 @@ static int amdgpu_cs_flush(struct radeon_winsys_cs *rcs, } /* If the CS is not empty or overflowed.... */ - if (likely(radeon_emitted(&cs->main.base, 0) && + if (radeon_emitted(&cs->main.base, 0) && cs->main.base.current.cdw <= cs->main.base.current.max_dw && - !debug_get_option_noop())) { + !debug_get_option_noop()) { struct amdgpu_cs_context *cur = cs->csc; + unsigned i, num_buffers; /* Set IB sizes. 
*/ amdgpu_ib_finalize(&cs->main); @@ -1379,30 +1157,39 @@ static int amdgpu_cs_flush(struct radeon_winsys_cs *rcs, if (fence) amdgpu_fence_reference(fence, cur->fence); - amdgpu_cs_sync_flush(rcs); - - /* Prepare buffers. - * - * This fence must be held until the submission is queued to ensure - * that the order of fence dependency updates matches the order of - * submissions. - */ - mtx_lock(&ws->bo_fence_lock); + /* Prepare buffers. */ + pipe_mutex_lock(ws->bo_fence_lock); amdgpu_add_fence_dependencies(cs); + num_buffers = cur->num_real_buffers; + for (i = 0; i < num_buffers; i++) { + struct amdgpu_winsys_bo *bo = cur->real_buffers[i].bo; + p_atomic_inc(&bo->num_active_ioctls); + amdgpu_add_fence(bo, cur->fence); + } + + num_buffers = cur->num_slab_buffers; + for (i = 0; i < num_buffers; i++) { + struct amdgpu_winsys_bo *bo = cur->slab_buffers[i].bo; + p_atomic_inc(&bo->num_active_ioctls); + amdgpu_add_fence(bo, cur->fence); + } + pipe_mutex_unlock(ws->bo_fence_lock); + + amdgpu_cs_sync_flush(rcs); + /* Swap command streams. "cst" is going to be submitted. */ cs->csc = cs->cst; cs->cst = cur; /* Submit. */ - util_queue_add_job(&ws->cs_queue, cs, &cs->flush_completed, - amdgpu_cs_submit_ib, NULL); - /* The submission has been queued, unlock the fence now. 
*/ - mtx_unlock(&ws->bo_fence_lock); - - if (!(flags & RADEON_FLUSH_ASYNC)) { - amdgpu_cs_sync_flush(rcs); - error_code = cur->error_code; + if ((flags & RADEON_FLUSH_ASYNC) && + util_queue_is_initialized(&ws->cs_queue)) { + util_queue_add_job(&ws->cs_queue, cs, &cs->flush_completed, + amdgpu_cs_submit_ib, NULL); + } else { + amdgpu_cs_submit_ib(cs, 0); + error_code = cs->cst->error_code; } } else { amdgpu_cs_context_cleanup(cs->csc); @@ -1417,11 +1204,7 @@ static int amdgpu_cs_flush(struct radeon_winsys_cs *rcs, cs->main.base.used_gart = 0; cs->main.base.used_vram = 0; - if (cs->ring_type == RING_GFX) - ws->num_gfx_IBs++; - else if (cs->ring_type == RING_DMA) - ws->num_sdma_IBs++; - + ws->num_cs_flushes++; return error_code; } diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h index d83c1e0fe..5f181a5da 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h @@ -41,8 +41,6 @@ struct amdgpu_ctx { amdgpu_bo_handle user_fence_bo; uint64_t *user_fence_cpu_address_base; int refcount; - unsigned initial_num_total_rejected_cs; - unsigned num_rejected_cs; }; struct amdgpu_cs_buffer { @@ -84,30 +82,17 @@ struct amdgpu_cs_context { /* Buffers. 
*/ unsigned max_real_buffers; unsigned num_real_buffers; - struct amdgpu_cs_buffer *real_buffers; - - unsigned max_real_submit; amdgpu_bo_handle *handles; uint8_t *flags; + struct amdgpu_cs_buffer *real_buffers; unsigned num_slab_buffers; unsigned max_slab_buffers; struct amdgpu_cs_buffer *slab_buffers; - unsigned num_sparse_buffers; - unsigned max_sparse_buffers; - struct amdgpu_cs_buffer *sparse_buffers; - int buffer_indices_hashlist[4096]; - struct amdgpu_winsys_bo *last_added_bo; - unsigned last_added_bo_index; - unsigned last_added_bo_usage; - uint64_t last_added_bo_priority_usage; - - struct pipe_fence_handle **fence_dependencies; - unsigned num_fence_dependencies; - unsigned max_fence_dependencies; + unsigned max_dependencies; struct pipe_fence_handle *fence; @@ -232,9 +217,8 @@ amdgpu_bo_is_referenced_by_cs_with_usage(struct amdgpu_cs *cs, if (index == -1) return false; - buffer = bo->bo ? &cs->csc->real_buffers[index] : - bo->sparse ? &cs->csc->sparse_buffers[index] : - &cs->csc->slab_buffers[index]; + buffer = bo->bo ? 
&cs->csc->real_buffers[index] + : &cs->csc->slab_buffers[index]; return (buffer->usage & usage) != 0; } @@ -247,9 +231,6 @@ amdgpu_bo_is_referenced_by_any_cs(struct amdgpu_winsys_bo *bo) bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout, bool absolute); -void amdgpu_add_fences(struct amdgpu_winsys_bo *bo, - unsigned num_fences, - struct pipe_fence_handle **fences); void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs); void amdgpu_cs_init_functions(struct amdgpu_winsys *ws); void amdgpu_cs_submit_ib(void *job, int thread_index); diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c index 1a2b7c4af..c5462bc0e 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c @@ -30,32 +30,65 @@ */ #include "amdgpu_winsys.h" -#include "util/u_format.h" -static int amdgpu_surface_sanity(const struct pipe_resource *tex) +#ifndef NO_ENTRIES +#define NO_ENTRIES 32 +#endif + +#ifndef NO_MACRO_ENTRIES +#define NO_MACRO_ENTRIES 16 +#endif + +#ifndef CIASICIDGFXENGINE_SOUTHERNISLAND +#define CIASICIDGFXENGINE_SOUTHERNISLAND 0x0000000A +#endif + + +static int amdgpu_surface_sanity(const struct radeon_surf *surf) { - switch (tex->target) { - case PIPE_TEXTURE_1D: - if (tex->height0 > 1) + unsigned type = RADEON_SURF_GET(surf->flags, TYPE); + + if (!(surf->flags & RADEON_SURF_HAS_TILE_MODE_INDEX)) + return -EINVAL; + + /* all dimension must be at least 1 ! 
*/ + if (!surf->npix_x || !surf->npix_y || !surf->npix_z || + !surf->array_size) + return -EINVAL; + + if (!surf->blk_w || !surf->blk_h || !surf->blk_d) + return -EINVAL; + + switch (surf->nsamples) { + case 1: + case 2: + case 4: + case 8: + break; + default: + return -EINVAL; + } + + switch (type) { + case RADEON_SURF_TYPE_1D: + if (surf->npix_y > 1) return -EINVAL; /* fall through */ - case PIPE_TEXTURE_2D: - case PIPE_TEXTURE_RECT: - if (tex->depth0 > 1 || tex->array_size > 1) + case RADEON_SURF_TYPE_2D: + case RADEON_SURF_TYPE_CUBEMAP: + if (surf->npix_z > 1 || surf->array_size > 1) return -EINVAL; break; - case PIPE_TEXTURE_3D: - if (tex->array_size > 1) + case RADEON_SURF_TYPE_3D: + if (surf->array_size > 1) return -EINVAL; break; - case PIPE_TEXTURE_1D_ARRAY: - if (tex->height0 > 1) + case RADEON_SURF_TYPE_1D_ARRAY: + if (surf->npix_y > 1) return -EINVAL; /* fall through */ - case PIPE_TEXTURE_CUBE: - case PIPE_TEXTURE_2D_ARRAY: - case PIPE_TEXTURE_CUBE_ARRAY: - if (tex->depth0 > 1) + case RADEON_SURF_TYPE_2D_ARRAY: + if (surf->npix_z > 1) return -EINVAL; break; default: @@ -64,39 +97,494 @@ static int amdgpu_surface_sanity(const struct pipe_resource *tex) return 0; } +static void *ADDR_API allocSysMem(const ADDR_ALLOCSYSMEM_INPUT * pInput) +{ + return malloc(pInput->sizeInBytes); +} + +static ADDR_E_RETURNCODE ADDR_API freeSysMem(const ADDR_FREESYSMEM_INPUT * pInput) +{ + free(pInput->pVirtAddr); + return ADDR_OK; +} + +ADDR_HANDLE amdgpu_addr_create(struct amdgpu_winsys *ws) +{ + ADDR_CREATE_INPUT addrCreateInput = {0}; + ADDR_CREATE_OUTPUT addrCreateOutput = {0}; + ADDR_REGISTER_VALUE regValue = {0}; + ADDR_CREATE_FLAGS createFlags = {{0}}; + ADDR_E_RETURNCODE addrRet; + + addrCreateInput.size = sizeof(ADDR_CREATE_INPUT); + addrCreateOutput.size = sizeof(ADDR_CREATE_OUTPUT); + + regValue.noOfBanks = ws->amdinfo.mc_arb_ramcfg & 0x3; + regValue.gbAddrConfig = ws->amdinfo.gb_addr_cfg; + regValue.noOfRanks = (ws->amdinfo.mc_arb_ramcfg & 0x4) >> 2; + + 
regValue.backendDisables = ws->amdinfo.backend_disable[0]; + regValue.pTileConfig = ws->amdinfo.gb_tile_mode; + regValue.noOfEntries = ARRAY_SIZE(ws->amdinfo.gb_tile_mode); + if (ws->info.chip_class == SI) { + regValue.pMacroTileConfig = NULL; + regValue.noOfMacroEntries = 0; + } else { + regValue.pMacroTileConfig = ws->amdinfo.gb_macro_tile_mode; + regValue.noOfMacroEntries = ARRAY_SIZE(ws->amdinfo.gb_macro_tile_mode); + } + + createFlags.value = 0; + createFlags.useTileIndex = 1; + createFlags.degradeBaseLevel = 1; + createFlags.useHtileSliceAlign = 1; + + addrCreateInput.chipEngine = CIASICIDGFXENGINE_SOUTHERNISLAND; + addrCreateInput.chipFamily = ws->family; + addrCreateInput.chipRevision = ws->rev_id; + addrCreateInput.createFlags = createFlags; + addrCreateInput.callbacks.allocSysMem = allocSysMem; + addrCreateInput.callbacks.freeSysMem = freeSysMem; + addrCreateInput.callbacks.debugPrint = 0; + addrCreateInput.regValue = regValue; + + addrRet = AddrCreate(&addrCreateInput, &addrCreateOutput); + if (addrRet != ADDR_OK) + return NULL; + + return addrCreateOutput.hLib; +} + +static int compute_level(struct amdgpu_winsys *ws, + struct radeon_surf *surf, bool is_stencil, + unsigned level, unsigned type, bool compressed, + ADDR_COMPUTE_SURFACE_INFO_INPUT *AddrSurfInfoIn, + ADDR_COMPUTE_SURFACE_INFO_OUTPUT *AddrSurfInfoOut, + ADDR_COMPUTE_DCCINFO_INPUT *AddrDccIn, + ADDR_COMPUTE_DCCINFO_OUTPUT *AddrDccOut, + ADDR_COMPUTE_HTILE_INFO_INPUT *AddrHtileIn, + ADDR_COMPUTE_HTILE_INFO_OUTPUT *AddrHtileOut) +{ + struct radeon_surf_level *surf_level; + ADDR_E_RETURNCODE ret; + + AddrSurfInfoIn->mipLevel = level; + AddrSurfInfoIn->width = u_minify(surf->npix_x, level); + AddrSurfInfoIn->height = u_minify(surf->npix_y, level); + + if (type == RADEON_SURF_TYPE_3D) + AddrSurfInfoIn->numSlices = u_minify(surf->npix_z, level); + else if (type == RADEON_SURF_TYPE_CUBEMAP) + AddrSurfInfoIn->numSlices = 6; + else + AddrSurfInfoIn->numSlices = surf->array_size; + + if (level > 0) { + 
/* Set the base level pitch. This is needed for calculation + * of non-zero levels. */ + if (is_stencil) + AddrSurfInfoIn->basePitch = surf->stencil_level[0].nblk_x; + else + AddrSurfInfoIn->basePitch = surf->level[0].nblk_x; + + /* Convert blocks to pixels for compressed formats. */ + if (compressed) + AddrSurfInfoIn->basePitch *= surf->blk_w; + } + + ret = AddrComputeSurfaceInfo(ws->addrlib, + AddrSurfInfoIn, + AddrSurfInfoOut); + if (ret != ADDR_OK) { + return ret; + } + + surf_level = is_stencil ? &surf->stencil_level[level] : &surf->level[level]; + surf_level->offset = align64(surf->bo_size, AddrSurfInfoOut->baseAlign); + surf_level->slice_size = AddrSurfInfoOut->sliceSize; + surf_level->pitch_bytes = AddrSurfInfoOut->pitch * (is_stencil ? 1 : surf->bpe); + surf_level->npix_x = u_minify(surf->npix_x, level); + surf_level->npix_y = u_minify(surf->npix_y, level); + surf_level->npix_z = u_minify(surf->npix_z, level); + surf_level->nblk_x = AddrSurfInfoOut->pitch; + surf_level->nblk_y = AddrSurfInfoOut->height; + if (type == RADEON_SURF_TYPE_3D) + surf_level->nblk_z = AddrSurfInfoOut->depth; + else + surf_level->nblk_z = 1; + + switch (AddrSurfInfoOut->tileMode) { + case ADDR_TM_LINEAR_ALIGNED: + surf_level->mode = RADEON_SURF_MODE_LINEAR_ALIGNED; + break; + case ADDR_TM_1D_TILED_THIN1: + surf_level->mode = RADEON_SURF_MODE_1D; + break; + case ADDR_TM_2D_TILED_THIN1: + surf_level->mode = RADEON_SURF_MODE_2D; + break; + default: + assert(0); + } + + if (is_stencil) + surf->stencil_tiling_index[level] = AddrSurfInfoOut->tileIndex; + else + surf->tiling_index[level] = AddrSurfInfoOut->tileIndex; + + surf->bo_size = surf_level->offset + AddrSurfInfoOut->surfSize; + + /* Clear DCC fields at the beginning. */ + surf_level->dcc_offset = 0; + surf_level->dcc_enabled = false; + + /* The previous level's flag tells us if we can use DCC for this level. 
*/ + if (AddrSurfInfoIn->flags.dccCompatible && + (level == 0 || AddrDccOut->subLvlCompressible)) { + AddrDccIn->colorSurfSize = AddrSurfInfoOut->surfSize; + AddrDccIn->tileMode = AddrSurfInfoOut->tileMode; + AddrDccIn->tileInfo = *AddrSurfInfoOut->pTileInfo; + AddrDccIn->tileIndex = AddrSurfInfoOut->tileIndex; + AddrDccIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex; + + ret = AddrComputeDccInfo(ws->addrlib, + AddrDccIn, + AddrDccOut); + + if (ret == ADDR_OK) { + surf_level->dcc_offset = surf->dcc_size; + surf_level->dcc_fast_clear_size = AddrDccOut->dccFastClearSize; + surf_level->dcc_enabled = true; + surf->dcc_size = surf_level->dcc_offset + AddrDccOut->dccRamSize; + surf->dcc_alignment = MAX2(surf->dcc_alignment, AddrDccOut->dccRamBaseAlign); + } + } + + /* TC-compatible HTILE. */ + if (!is_stencil && + AddrSurfInfoIn->flags.depth && + AddrSurfInfoIn->flags.tcCompatible && + surf_level->mode == RADEON_SURF_MODE_2D && + level == 0) { + AddrHtileIn->flags.tcCompatible = 1; + AddrHtileIn->pitch = AddrSurfInfoOut->pitch; + AddrHtileIn->height = AddrSurfInfoOut->height; + AddrHtileIn->numSlices = AddrSurfInfoOut->depth; + AddrHtileIn->blockWidth = ADDR_HTILE_BLOCKSIZE_8; + AddrHtileIn->blockHeight = ADDR_HTILE_BLOCKSIZE_8; + AddrHtileIn->pTileInfo = AddrSurfInfoOut->pTileInfo; + AddrHtileIn->tileIndex = AddrSurfInfoOut->tileIndex; + AddrHtileIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex; + + ret = AddrComputeHtileInfo(ws->addrlib, + AddrHtileIn, + AddrHtileOut); + + if (ret == ADDR_OK) { + surf->htile_size = AddrHtileOut->htileBytes; + surf->htile_alignment = AddrHtileOut->baseAlign; + } + } + + return 0; +} + +#define G_009910_MICRO_TILE_MODE(x) (((x) >> 0) & 0x03) +#define G_009910_MICRO_TILE_MODE_NEW(x) (((x) >> 22) & 0x07) + +static void set_micro_tile_mode(struct radeon_surf *surf, + struct radeon_info *info) +{ + uint32_t tile_mode = info->si_tile_mode_array[surf->tiling_index[0]]; + + if (info->chip_class >= CIK) + surf->micro_tile_mode = 
G_009910_MICRO_TILE_MODE_NEW(tile_mode); + else + surf->micro_tile_mode = G_009910_MICRO_TILE_MODE(tile_mode); +} + +static unsigned cik_get_macro_tile_index(struct radeon_surf *surf) +{ + unsigned index, tileb; + + tileb = 8 * 8 * surf->bpe; + tileb = MIN2(surf->tile_split, tileb); + + for (index = 0; tileb > 64; index++) + tileb >>= 1; + + assert(index < 16); + return index; +} + static int amdgpu_surface_init(struct radeon_winsys *rws, - const struct pipe_resource *tex, - unsigned flags, unsigned bpe, - enum radeon_surf_mode mode, struct radeon_surf *surf) { struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws; + unsigned level, mode, type; + bool compressed; + ADDR_COMPUTE_SURFACE_INFO_INPUT AddrSurfInfoIn = {0}; + ADDR_COMPUTE_SURFACE_INFO_OUTPUT AddrSurfInfoOut = {0}; + ADDR_COMPUTE_DCCINFO_INPUT AddrDccIn = {0}; + ADDR_COMPUTE_DCCINFO_OUTPUT AddrDccOut = {0}; + ADDR_COMPUTE_HTILE_INFO_INPUT AddrHtileIn = {0}; + ADDR_COMPUTE_HTILE_INFO_OUTPUT AddrHtileOut = {0}; + ADDR_TILEINFO AddrTileInfoIn = {0}; + ADDR_TILEINFO AddrTileInfoOut = {0}; int r; - r = amdgpu_surface_sanity(tex); + r = amdgpu_surface_sanity(surf); if (r) return r; - surf->blk_w = util_format_get_blockwidth(tex->format); - surf->blk_h = util_format_get_blockheight(tex->format); - surf->bpe = bpe; - surf->flags = flags; + AddrSurfInfoIn.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_INPUT); + AddrSurfInfoOut.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_OUTPUT); + AddrDccIn.size = sizeof(ADDR_COMPUTE_DCCINFO_INPUT); + AddrDccOut.size = sizeof(ADDR_COMPUTE_DCCINFO_OUTPUT); + AddrHtileIn.size = sizeof(ADDR_COMPUTE_HTILE_INFO_INPUT); + AddrHtileOut.size = sizeof(ADDR_COMPUTE_HTILE_INFO_OUTPUT); + AddrSurfInfoOut.pTileInfo = &AddrTileInfoOut; + + type = RADEON_SURF_GET(surf->flags, TYPE); + mode = RADEON_SURF_GET(surf->flags, MODE); + compressed = surf->blk_w == 4 && surf->blk_h == 4; + + /* MSAA and FMASK require 2D tiling. 
*/ + if (surf->nsamples > 1 || + (surf->flags & RADEON_SURF_FMASK)) + mode = RADEON_SURF_MODE_2D; + + /* DB doesn't support linear layouts. */ + if (surf->flags & (RADEON_SURF_Z_OR_SBUFFER) && + mode < RADEON_SURF_MODE_1D) + mode = RADEON_SURF_MODE_1D; + + /* Set the requested tiling mode. */ + switch (mode) { + case RADEON_SURF_MODE_LINEAR_ALIGNED: + AddrSurfInfoIn.tileMode = ADDR_TM_LINEAR_ALIGNED; + break; + case RADEON_SURF_MODE_1D: + AddrSurfInfoIn.tileMode = ADDR_TM_1D_TILED_THIN1; + break; + case RADEON_SURF_MODE_2D: + AddrSurfInfoIn.tileMode = ADDR_TM_2D_TILED_THIN1; + break; + default: + assert(0); + } + + /* The format must be set correctly for the allocation of compressed + * textures to work. In other cases, setting the bpp is sufficient. */ + if (compressed) { + switch (surf->bpe) { + case 8: + AddrSurfInfoIn.format = ADDR_FMT_BC1; + break; + case 16: + AddrSurfInfoIn.format = ADDR_FMT_BC3; + break; + default: + assert(0); + } + } + else { + AddrDccIn.bpp = AddrSurfInfoIn.bpp = surf->bpe * 8; + } + + AddrDccIn.numSamples = AddrSurfInfoIn.numSamples = surf->nsamples; + AddrSurfInfoIn.tileIndex = -1; - struct ac_surf_config config; + /* Set the micro tile type. 
*/ + if (surf->flags & RADEON_SURF_SCANOUT) + AddrSurfInfoIn.tileType = ADDR_DISPLAYABLE; + else if (surf->flags & RADEON_SURF_Z_OR_SBUFFER) + AddrSurfInfoIn.tileType = ADDR_DEPTH_SAMPLE_ORDER; + else + AddrSurfInfoIn.tileType = ADDR_NON_DISPLAYABLE; - config.info.width = tex->width0; - config.info.height = tex->height0; - config.info.depth = tex->depth0; - config.info.array_size = tex->array_size; - config.info.samples = tex->nr_samples; - config.info.levels = tex->last_level + 1; - config.is_3d = !!(tex->target == PIPE_TEXTURE_3D); - config.is_cube = !!(tex->target == PIPE_TEXTURE_CUBE); + AddrSurfInfoIn.flags.color = !(surf->flags & RADEON_SURF_Z_OR_SBUFFER); + AddrSurfInfoIn.flags.depth = (surf->flags & RADEON_SURF_ZBUFFER) != 0; + AddrSurfInfoIn.flags.cube = type == RADEON_SURF_TYPE_CUBEMAP; + AddrSurfInfoIn.flags.display = (surf->flags & RADEON_SURF_SCANOUT) != 0; + AddrSurfInfoIn.flags.pow2Pad = surf->last_level > 0; + AddrSurfInfoIn.flags.tcCompatible = (surf->flags & RADEON_SURF_TC_COMPATIBLE_HTILE) != 0; - return ac_compute_surface(ws->addrlib, &ws->info, &config, mode, surf); + /* Only degrade the tile mode for space if TC-compatible HTILE hasn't been + * requested, because TC-compatible HTILE requires 2D tiling. + */ + AddrSurfInfoIn.flags.degrade4Space = !AddrSurfInfoIn.flags.tcCompatible; + + /* DCC notes: + * - If we add MSAA support, keep in mind that CB can't decompress 8bpp + * with samples >= 4. + * - Mipmapped array textures have low performance (discovered by a closed + * driver team). 
+ */ + AddrSurfInfoIn.flags.dccCompatible = ws->info.chip_class >= VI && + !(surf->flags & RADEON_SURF_Z_OR_SBUFFER) && + !(surf->flags & RADEON_SURF_DISABLE_DCC) && + !compressed && AddrDccIn.numSamples <= 1 && + ((surf->array_size == 1 && surf->npix_z == 1) || + surf->last_level == 0); + + AddrSurfInfoIn.flags.noStencil = (surf->flags & RADEON_SURF_SBUFFER) == 0; + AddrSurfInfoIn.flags.compressZ = AddrSurfInfoIn.flags.depth; + + /* noStencil = 0 can result in a depth part that is incompatible with + * mipmapped texturing. So set noStencil = 1 when mipmaps are requested (in + * this case, we may end up setting stencil_adjusted). + * + * TODO: update addrlib to a newer version, remove this, and + * use flags.matchStencilTileCfg = 1 as an alternative fix. + */ + if (surf->last_level > 0) + AddrSurfInfoIn.flags.noStencil = 1; + + /* Set preferred macrotile parameters. This is usually required + * for shared resources. This is for 2D tiling only. */ + if (AddrSurfInfoIn.tileMode >= ADDR_TM_2D_TILED_THIN1 && + surf->bankw && surf->bankh && surf->mtilea && surf->tile_split) { + /* If any of these parameters are incorrect, the calculation + * will fail. */ + AddrTileInfoIn.banks = surf->num_banks; + AddrTileInfoIn.bankWidth = surf->bankw; + AddrTileInfoIn.bankHeight = surf->bankh; + AddrTileInfoIn.macroAspectRatio = surf->mtilea; + AddrTileInfoIn.tileSplitBytes = surf->tile_split; + AddrTileInfoIn.pipeConfig = surf->pipe_config + 1; /* +1 compared to GB_TILE_MODE */ + AddrSurfInfoIn.flags.degrade4Space = 0; + AddrSurfInfoIn.pTileInfo = &AddrTileInfoIn; + + /* If AddrSurfInfoIn.pTileInfo is set, Addrlib doesn't set + * the tile index, because we are expected to know it if + * we know the other parameters. + * + * This is something that can easily be fixed in Addrlib. + * For now, just figure it out here. + * Note that only 2D_TILE_THIN1 is handled here. 
+ */ + assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER)); + assert(AddrSurfInfoIn.tileMode == ADDR_TM_2D_TILED_THIN1); + + if (ws->info.chip_class == SI) { + if (AddrSurfInfoIn.tileType == ADDR_DISPLAYABLE) { + if (surf->bpe == 2) + AddrSurfInfoIn.tileIndex = 11; /* 16bpp */ + else + AddrSurfInfoIn.tileIndex = 12; /* 32bpp */ + } else { + if (surf->bpe == 1) + AddrSurfInfoIn.tileIndex = 14; /* 8bpp */ + else if (surf->bpe == 2) + AddrSurfInfoIn.tileIndex = 15; /* 16bpp */ + else if (surf->bpe == 4) + AddrSurfInfoIn.tileIndex = 16; /* 32bpp */ + else + AddrSurfInfoIn.tileIndex = 17; /* 64bpp (and 128bpp) */ + } + } else { + /* CIK - VI */ + if (AddrSurfInfoIn.tileType == ADDR_DISPLAYABLE) + AddrSurfInfoIn.tileIndex = 10; /* 2D displayable */ + else + AddrSurfInfoIn.tileIndex = 14; /* 2D non-displayable */ + + /* Addrlib doesn't set this if tileIndex is forced like above. */ + AddrSurfInfoOut.macroModeIndex = cik_get_macro_tile_index(surf); + } + } + + surf->bo_size = 0; + surf->dcc_size = 0; + surf->dcc_alignment = 1; + surf->htile_size = 0; + surf->htile_alignment = 1; + + /* Calculate texture layout information. */ + for (level = 0; level <= surf->last_level; level++) { + r = compute_level(ws, surf, false, level, type, compressed, + &AddrSurfInfoIn, &AddrSurfInfoOut, + &AddrDccIn, &AddrDccOut, &AddrHtileIn, &AddrHtileOut); + if (r) + return r; + + if (level == 0) { + surf->bo_alignment = AddrSurfInfoOut.baseAlign; + surf->pipe_config = AddrSurfInfoOut.pTileInfo->pipeConfig - 1; + set_micro_tile_mode(surf, &ws->info); + + /* For 2D modes only. 
*/ + if (AddrSurfInfoOut.tileMode >= ADDR_TM_2D_TILED_THIN1) { + surf->bankw = AddrSurfInfoOut.pTileInfo->bankWidth; + surf->bankh = AddrSurfInfoOut.pTileInfo->bankHeight; + surf->mtilea = AddrSurfInfoOut.pTileInfo->macroAspectRatio; + surf->tile_split = AddrSurfInfoOut.pTileInfo->tileSplitBytes; + surf->num_banks = AddrSurfInfoOut.pTileInfo->banks; + surf->macro_tile_index = AddrSurfInfoOut.macroModeIndex; + } else { + surf->macro_tile_index = 0; + } + } + } + + /* Calculate texture layout information for stencil. */ + if (surf->flags & RADEON_SURF_SBUFFER) { + AddrSurfInfoIn.bpp = 8; + AddrSurfInfoIn.flags.depth = 0; + AddrSurfInfoIn.flags.stencil = 1; + AddrSurfInfoIn.flags.tcCompatible = 0; + /* This will be ignored if AddrSurfInfoIn.pTileInfo is NULL. */ + AddrTileInfoIn.tileSplitBytes = surf->stencil_tile_split; + + for (level = 0; level <= surf->last_level; level++) { + r = compute_level(ws, surf, true, level, type, compressed, + &AddrSurfInfoIn, &AddrSurfInfoOut, &AddrDccIn, &AddrDccOut, + NULL, NULL); + if (r) + return r; + + /* DB uses the depth pitch for both stencil and depth. */ + if (surf->stencil_level[level].nblk_x != surf->level[level].nblk_x) + surf->stencil_adjusted = true; + + if (level == 0) { + /* For 2D modes only. */ + if (AddrSurfInfoOut.tileMode >= ADDR_TM_2D_TILED_THIN1) { + surf->stencil_tile_split = + AddrSurfInfoOut.pTileInfo->tileSplitBytes; + } + } + } + } + + /* Recalculate the whole DCC miptree size including disabled levels. + * This is what addrlib does, but calling addrlib would be a lot more + * complicated. + */ + if (surf->dcc_size && surf->last_level > 0) { + surf->dcc_size = align64(surf->bo_size >> 8, + ws->info.pipe_interleave_bytes * + ws->info.num_tile_pipes); + } + + /* Make sure HTILE covers the whole miptree, because the shader reads + * TC-compatible HTILE even for levels where it's disabled by DB. 
+ */ + if (surf->htile_size && surf->last_level) + surf->htile_size *= 2; + + return 0; +} + +static int amdgpu_surface_best(struct radeon_winsys *rws, + struct radeon_surf *surf) +{ + return 0; } void amdgpu_surface_init_functions(struct amdgpu_winsys *ws) { ws->base.surface_init = amdgpu_surface_init; + ws->base.surface_best = amdgpu_surface_best; } diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c index 837c1e2aa..d92c0bd83 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c @@ -40,40 +40,331 @@ #include <stdio.h> #include <sys/stat.h> #include "amd/common/amdgpu_id.h" -#include "amd/common/sid.h" -#include "amd/common/gfx9d.h" -#ifndef AMDGPU_INFO_NUM_VRAM_CPU_PAGE_FAULTS -#define AMDGPU_INFO_NUM_VRAM_CPU_PAGE_FAULTS 0x1E +#define CIK_TILE_MODE_COLOR_2D 14 + +#define CIK__GB_TILE_MODE__PIPE_CONFIG(x) (((x) >> 6) & 0x1f) +#define CIK__PIPE_CONFIG__ADDR_SURF_P2 0 +#define CIK__PIPE_CONFIG__ADDR_SURF_P4_8x16 4 +#define CIK__PIPE_CONFIG__ADDR_SURF_P4_16x16 5 +#define CIK__PIPE_CONFIG__ADDR_SURF_P4_16x32 6 +#define CIK__PIPE_CONFIG__ADDR_SURF_P4_32x32 7 +#define CIK__PIPE_CONFIG__ADDR_SURF_P8_16x16_8x16 8 +#define CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_8x16 9 +#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_8x16 10 +#define CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_16x16 11 +#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x16 12 +#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x32 13 +#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x64_32x32 14 +#define CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16 16 +#define CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16 17 + +#ifndef AMDGPU_INFO_NUM_EVICTIONS +#define AMDGPU_INFO_NUM_EVICTIONS 0x18 #endif static struct util_hash_table *dev_tab = NULL; -static mtx_t dev_tab_mutex = _MTX_INITIALIZER_NP; +pipe_static_mutex(dev_tab_mutex); + +static unsigned cik_get_num_tile_pipes(struct amdgpu_gpu_info 
*info) +{ + unsigned mode2d = info->gb_tile_mode[CIK_TILE_MODE_COLOR_2D]; + + switch (CIK__GB_TILE_MODE__PIPE_CONFIG(mode2d)) { + case CIK__PIPE_CONFIG__ADDR_SURF_P2: + return 2; + case CIK__PIPE_CONFIG__ADDR_SURF_P4_8x16: + case CIK__PIPE_CONFIG__ADDR_SURF_P4_16x16: + case CIK__PIPE_CONFIG__ADDR_SURF_P4_16x32: + case CIK__PIPE_CONFIG__ADDR_SURF_P4_32x32: + return 4; + case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x16_8x16: + case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_8x16: + case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_8x16: + case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_16x16: + case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x16: + case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x32: + case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x64_32x32: + return 8; + case CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16: + case CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16: + return 16; + default: + fprintf(stderr, "Invalid CIK pipe configuration, assuming P2\n"); + assert(!"this should never occur"); + return 2; + } +} /* Helper function to do the ioctls needed for setup and init. */ static bool do_winsys_init(struct amdgpu_winsys *ws, int fd) { - if (!ac_query_gpu_info(fd, ws->dev, &ws->info, &ws->amdinfo)) + struct amdgpu_buffer_size_alignments alignment_info = {}; + struct amdgpu_heap_info vram, gtt; + struct drm_amdgpu_info_hw_ip dma = {}, uvd = {}, vce = {}; + uint32_t vce_version = 0, vce_feature = 0, uvd_version = 0, uvd_feature = 0; + uint32_t unused_feature; + int r, i, j; + drmDevicePtr devinfo; + + /* Get PCI info. */ + r = drmGetDevice(fd, &devinfo); + if (r) { + fprintf(stderr, "amdgpu: drmGetDevice failed.\n"); + goto fail; + } + ws->info.pci_domain = devinfo->businfo.pci->domain; + ws->info.pci_bus = devinfo->businfo.pci->bus; + ws->info.pci_dev = devinfo->businfo.pci->dev; + ws->info.pci_func = devinfo->businfo.pci->func; + drmFreeDevice(&devinfo); + + /* Query hardware and driver information. 
*/ + r = amdgpu_query_gpu_info(ws->dev, &ws->amdinfo); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_gpu_info failed.\n"); + goto fail; + } + + r = amdgpu_query_buffer_size_alignment(ws->dev, &alignment_info); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_buffer_size_alignment failed.\n"); + goto fail; + } + + r = amdgpu_query_heap_info(ws->dev, AMDGPU_GEM_DOMAIN_VRAM, 0, &vram); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_heap_info(vram) failed.\n"); + goto fail; + } + + r = amdgpu_query_heap_info(ws->dev, AMDGPU_GEM_DOMAIN_GTT, 0, &gtt); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_heap_info(gtt) failed.\n"); goto fail; + } + + r = amdgpu_query_hw_ip_info(ws->dev, AMDGPU_HW_IP_DMA, 0, &dma); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(dma) failed.\n"); + goto fail; + } + + r = amdgpu_query_hw_ip_info(ws->dev, AMDGPU_HW_IP_UVD, 0, &uvd); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(uvd) failed.\n"); + goto fail; + } + + r = amdgpu_query_firmware_version(ws->dev, AMDGPU_INFO_FW_GFX_ME, 0, 0, + &ws->info.me_fw_version, &unused_feature); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(me) failed.\n"); + goto fail; + } + + r = amdgpu_query_firmware_version(ws->dev, AMDGPU_INFO_FW_GFX_PFP, 0, 0, + &ws->info.pfp_fw_version, &unused_feature); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(pfp) failed.\n"); + goto fail; + } - /* LLVM 5.0 is required for GFX9. 
*/ - if (ws->info.chip_class >= GFX9 && HAVE_LLVM < 0x0500) { - fprintf(stderr, "amdgpu: LLVM 5.0 is required, got LLVM %i.%i\n", - HAVE_LLVM >> 8, HAVE_LLVM & 255); + r = amdgpu_query_firmware_version(ws->dev, AMDGPU_INFO_FW_GFX_CE, 0, 0, + &ws->info.ce_fw_version, &unused_feature); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(ce) failed.\n"); goto fail; } - ws->addrlib = amdgpu_addr_create(&ws->info, &ws->amdinfo, NULL); + r = amdgpu_query_firmware_version(ws->dev, AMDGPU_INFO_FW_UVD, 0, 0, + &uvd_version, &uvd_feature); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(uvd) failed.\n"); + goto fail; + } + + r = amdgpu_query_hw_ip_info(ws->dev, AMDGPU_HW_IP_VCE, 0, &vce); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vce) failed.\n"); + goto fail; + } + + r = amdgpu_query_firmware_version(ws->dev, AMDGPU_INFO_FW_VCE, 0, 0, + &vce_version, &vce_feature); + if (r) { + fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(vce) failed.\n"); + goto fail; + } + + /* Set chip identification. */ + ws->info.pci_id = ws->amdinfo.asic_id; /* TODO: is this correct? */ + ws->info.vce_harvest_config = ws->amdinfo.vce_harvest_config; + + switch (ws->info.pci_id) { +#define CHIPSET(pci_id, name, cfamily) case pci_id: ws->info.family = CHIP_##cfamily; break; +#include "pci_ids/radeonsi_pci_ids.h" +#undef CHIPSET + + default: + fprintf(stderr, "amdgpu: Invalid PCI ID.\n"); + goto fail; + } + + if (ws->info.family >= CHIP_TONGA) + ws->info.chip_class = VI; + else if (ws->info.family >= CHIP_BONAIRE) + ws->info.chip_class = CIK; + else if (ws->info.family >= CHIP_TAHITI) + ws->info.chip_class = SI; + else { + fprintf(stderr, "amdgpu: Unknown family.\n"); + goto fail; + } + + /* LLVM 3.6.1 is required for VI. 
*/ + if (ws->info.chip_class >= VI && + HAVE_LLVM == 0x0306 && MESA_LLVM_VERSION_PATCH < 1) { + fprintf(stderr, "amdgpu: LLVM 3.6.1 is required, got LLVM %i.%i.%i\n", + HAVE_LLVM >> 8, HAVE_LLVM & 255, MESA_LLVM_VERSION_PATCH); + goto fail; + } + + /* family and rev_id are for addrlib */ + switch (ws->info.family) { + case CHIP_TAHITI: + ws->family = FAMILY_SI; + ws->rev_id = SI_TAHITI_P_A0; + break; + case CHIP_PITCAIRN: + ws->family = FAMILY_SI; + ws->rev_id = SI_PITCAIRN_PM_A0; + break; + case CHIP_VERDE: + ws->family = FAMILY_SI; + ws->rev_id = SI_CAPEVERDE_M_A0; + break; + case CHIP_OLAND: + ws->family = FAMILY_SI; + ws->rev_id = SI_OLAND_M_A0; + break; + case CHIP_HAINAN: + ws->family = FAMILY_SI; + ws->rev_id = SI_HAINAN_V_A0; + break; + case CHIP_BONAIRE: + ws->family = FAMILY_CI; + ws->rev_id = CI_BONAIRE_M_A0; + break; + case CHIP_KAVERI: + ws->family = FAMILY_KV; + ws->rev_id = KV_SPECTRE_A0; + break; + case CHIP_KABINI: + ws->family = FAMILY_KV; + ws->rev_id = KB_KALINDI_A0; + break; + case CHIP_HAWAII: + ws->family = FAMILY_CI; + ws->rev_id = CI_HAWAII_P_A0; + break; + case CHIP_MULLINS: + ws->family = FAMILY_KV; + ws->rev_id = ML_GODAVARI_A0; + break; + case CHIP_TONGA: + ws->family = FAMILY_VI; + ws->rev_id = VI_TONGA_P_A0; + break; + case CHIP_ICELAND: + ws->family = FAMILY_VI; + ws->rev_id = VI_ICELAND_M_A0; + break; + case CHIP_CARRIZO: + ws->family = FAMILY_CZ; + ws->rev_id = CARRIZO_A0; + break; + case CHIP_STONEY: + ws->family = FAMILY_CZ; + ws->rev_id = STONEY_A0; + break; + case CHIP_FIJI: + ws->family = FAMILY_VI; + ws->rev_id = VI_FIJI_P_A0; + break; + case CHIP_POLARIS10: + ws->family = FAMILY_VI; + ws->rev_id = VI_POLARIS10_P_A0; + break; + case CHIP_POLARIS11: + ws->family = FAMILY_VI; + ws->rev_id = VI_POLARIS11_M_A0; + break; + default: + fprintf(stderr, "amdgpu: Unknown family.\n"); + goto fail; + } + + ws->addrlib = amdgpu_addr_create(ws); if (!ws->addrlib) { fprintf(stderr, "amdgpu: Cannot create addrlib.\n"); goto fail; } + /* Set 
which chips have dedicated VRAM. */ + ws->info.has_dedicated_vram = + !(ws->amdinfo.ids_flags & AMDGPU_IDS_FLAGS_FUSION); + + /* Set hardware information. */ + ws->info.gart_size = gtt.heap_size; + ws->info.vram_size = vram.heap_size; + /* The kernel can split large buffers in VRAM but not in GTT, so large + * allocations can fail or cause buffer movement failures in the kernel. + */ + ws->info.max_alloc_size = MIN2(ws->info.vram_size * 0.9, ws->info.gart_size * 0.7); + /* convert the shader clock from KHz to MHz */ + ws->info.max_shader_clock = ws->amdinfo.max_engine_clk / 1000; + ws->info.max_se = ws->amdinfo.num_shader_engines; + ws->info.max_sh_per_se = ws->amdinfo.num_shader_arrays_per_engine; + ws->info.has_uvd = uvd.available_rings != 0; + ws->info.uvd_fw_version = + uvd.available_rings ? uvd_version : 0; + ws->info.vce_fw_version = + vce.available_rings ? vce_version : 0; + ws->info.has_userptr = true; + ws->info.num_render_backends = ws->amdinfo.rb_pipes; + ws->info.clock_crystal_freq = ws->amdinfo.gpu_counter_freq; + ws->info.num_tile_pipes = cik_get_num_tile_pipes(&ws->amdinfo); + ws->info.pipe_interleave_bytes = 256 << ((ws->amdinfo.gb_addr_cfg >> 4) & 0x7); + ws->info.has_virtual_memory = true; + ws->info.has_sdma = dma.available_rings != 0; + + /* Get the number of good compute units. 
*/ + ws->info.num_good_compute_units = 0; + for (i = 0; i < ws->info.max_se; i++) + for (j = 0; j < ws->info.max_sh_per_se; j++) + ws->info.num_good_compute_units += + util_bitcount(ws->amdinfo.cu_bitmap[i][j]); + + memcpy(ws->info.si_tile_mode_array, ws->amdinfo.gb_tile_mode, + sizeof(ws->amdinfo.gb_tile_mode)); + ws->info.enabled_rb_mask = ws->amdinfo.enabled_rb_pipes_mask; + + memcpy(ws->info.cik_macrotile_mode_array, ws->amdinfo.gb_macro_tile_mode, + sizeof(ws->amdinfo.gb_macro_tile_mode)); + + ws->info.gart_page_size = alignment_info.size_remote; + + if (ws->info.chip_class == SI) + ws->info.gfx_ib_pad_with_type2 = TRUE; + ws->check_vm = strstr(debug_get_option("R600_DEBUG", ""), "check_vm") != NULL; return true; fail: + if (ws->addrlib) + AddrDestroy(ws->addrlib); amdgpu_device_deinitialize(ws->dev); ws->dev = NULL; return false; @@ -92,10 +383,10 @@ static void amdgpu_winsys_destroy(struct radeon_winsys *rws) if (util_queue_is_initialized(&ws->cs_queue)) util_queue_destroy(&ws->cs_queue); - mtx_destroy(&ws->bo_fence_lock); + pipe_mutex_destroy(ws->bo_fence_lock); pb_slabs_deinit(&ws->bo_slabs); pb_cache_deinit(&ws->bo_cache); - mtx_destroy(&ws->global_bo_list_lock); + pipe_mutex_destroy(ws->global_bo_list_lock); do_winsys_deinit(ws); FREE(rws); } @@ -131,50 +422,30 @@ static uint64_t amdgpu_query_value(struct radeon_winsys *rws, return ws->mapped_gtt; case RADEON_BUFFER_WAIT_TIME_NS: return ws->buffer_wait_time; - case RADEON_NUM_MAPPED_BUFFERS: - return ws->num_mapped_buffers; case RADEON_TIMESTAMP: amdgpu_query_info(ws->dev, AMDGPU_INFO_TIMESTAMP, 8, &retval); return retval; - case RADEON_NUM_GFX_IBS: - return ws->num_gfx_IBs; - case RADEON_NUM_SDMA_IBS: - return ws->num_sdma_IBs; - case RADEON_GFX_BO_LIST_COUNTER: - return ws->gfx_bo_list_counter; + case RADEON_NUM_CS_FLUSHES: + return ws->num_cs_flushes; case RADEON_NUM_BYTES_MOVED: amdgpu_query_info(ws->dev, AMDGPU_INFO_NUM_BYTES_MOVED, 8, &retval); return retval; case RADEON_NUM_EVICTIONS: 
amdgpu_query_info(ws->dev, AMDGPU_INFO_NUM_EVICTIONS, 8, &retval); return retval; - case RADEON_NUM_VRAM_CPU_PAGE_FAULTS: - amdgpu_query_info(ws->dev, AMDGPU_INFO_NUM_VRAM_CPU_PAGE_FAULTS, 8, &retval); - return retval; case RADEON_VRAM_USAGE: amdgpu_query_heap_info(ws->dev, AMDGPU_GEM_DOMAIN_VRAM, 0, &heap); return heap.heap_usage; - case RADEON_VRAM_VIS_USAGE: - amdgpu_query_heap_info(ws->dev, AMDGPU_GEM_DOMAIN_VRAM, - AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED, &heap); - return heap.heap_usage; case RADEON_GTT_USAGE: amdgpu_query_heap_info(ws->dev, AMDGPU_GEM_DOMAIN_GTT, 0, &heap); return heap.heap_usage; case RADEON_GPU_TEMPERATURE: - amdgpu_query_sensor_info(ws->dev, AMDGPU_INFO_SENSOR_GPU_TEMP, 4, &retval); - return retval; case RADEON_CURRENT_SCLK: - amdgpu_query_sensor_info(ws->dev, AMDGPU_INFO_SENSOR_GFX_SCLK, 4, &retval); - return retval; case RADEON_CURRENT_MCLK: - amdgpu_query_sensor_info(ws->dev, AMDGPU_INFO_SENSOR_GFX_MCLK, 4, &retval); - return retval; + return 0; case RADEON_GPU_RESET_COUNTER: assert(0); return 0; - case RADEON_CS_THREAD_TIME: - return util_queue_get_thread_time_nano(&ws->cs_queue, 0); } return 0; } @@ -203,6 +474,8 @@ static int compare_dev(void *key1, void *key2) return key1 != key2; } +DEBUG_GET_ONCE_BOOL_OPTION(thread, "RADEON_THREAD", true) + static bool amdgpu_winsys_unref(struct radeon_winsys *rws) { struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws; @@ -213,26 +486,18 @@ static bool amdgpu_winsys_unref(struct radeon_winsys *rws) * This must happen while the mutex is locked, so that * amdgpu_winsys_create in another thread doesn't get the winsys * from the table when the counter drops to 0. 
*/ - mtx_lock(&dev_tab_mutex); + pipe_mutex_lock(dev_tab_mutex); destroy = pipe_reference(&ws->reference, NULL); if (destroy && dev_tab) util_hash_table_remove(dev_tab, ws->dev); - mtx_unlock(&dev_tab_mutex); + pipe_mutex_unlock(dev_tab_mutex); return destroy; } -static const char* amdgpu_get_chip_name(struct radeon_winsys *ws) -{ - amdgpu_device_handle dev = ((struct amdgpu_winsys *)ws)->dev; - return amdgpu_get_marketing_name(dev); -} - - PUBLIC struct radeon_winsys * -amdgpu_winsys_create(int fd, unsigned flags, - radeon_screen_create_t screen_create) +amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create) { struct amdgpu_winsys *ws; drmVersionPtr version = drmGetVersion(fd); @@ -247,7 +512,7 @@ amdgpu_winsys_create(int fd, unsigned flags, drmFreeVersion(version); /* Look up the winsys from the dev table. */ - mtx_lock(&dev_tab_mutex); + pipe_mutex_lock(dev_tab_mutex); if (!dev_tab) dev_tab = util_hash_table_create(hash_dev, compare_dev); @@ -255,7 +520,7 @@ amdgpu_winsys_create(int fd, unsigned flags, * for the same fd. 
*/ r = amdgpu_device_initialize(fd, &drm_major, &drm_minor, &dev); if (r) { - mtx_unlock(&dev_tab_mutex); + pipe_mutex_unlock(dev_tab_mutex); fprintf(stderr, "amdgpu: amdgpu_device_initialize failed.\n"); return NULL; } @@ -264,7 +529,7 @@ amdgpu_winsys_create(int fd, unsigned flags, ws = util_hash_table_get(dev_tab, dev); if (ws) { pipe_reference(NULL, &ws->reference); - mtx_unlock(&dev_tab_mutex); + pipe_mutex_unlock(dev_tab_mutex); return &ws->base; } @@ -287,7 +552,7 @@ amdgpu_winsys_create(int fd, unsigned flags, if (!pb_slabs_init(&ws->bo_slabs, AMDGPU_SLAB_MIN_SIZE_LOG2, AMDGPU_SLAB_MAX_SIZE_LOG2, - RADEON_MAX_SLAB_HEAPS, + 12, /* number of heaps (domain/flags combinations) */ ws, amdgpu_bo_can_reclaim_slab, amdgpu_bo_slab_alloc, @@ -306,32 +571,27 @@ amdgpu_winsys_create(int fd, unsigned flags, ws->base.cs_request_feature = amdgpu_cs_request_feature; ws->base.query_value = amdgpu_query_value; ws->base.read_registers = amdgpu_read_registers; - ws->base.get_chip_name = amdgpu_get_chip_name; amdgpu_bo_init_functions(ws); amdgpu_cs_init_functions(ws); amdgpu_surface_init_functions(ws); LIST_INITHEAD(&ws->global_bo_list); - (void) mtx_init(&ws->global_bo_list_lock, mtx_plain); - (void) mtx_init(&ws->bo_fence_lock, mtx_plain); + pipe_mutex_init(ws->global_bo_list_lock); + pipe_mutex_init(ws->bo_fence_lock); - if (!util_queue_init(&ws->cs_queue, "amdgpu_cs", 8, 1, - UTIL_QUEUE_INIT_RESIZE_IF_FULL)) { - amdgpu_winsys_destroy(&ws->base); - mtx_unlock(&dev_tab_mutex); - return NULL; - } + if (sysconf(_SC_NPROCESSORS_ONLN) > 1 && debug_get_option_thread()) + util_queue_init(&ws->cs_queue, "amdgpu_cs", 8, 1); /* Create the screen at the end. The winsys must be initialized * completely. * * Alternatively, we could create the screen based on "ws->gen" * and link all drivers into one binary blob. 
*/ - ws->base.screen = screen_create(&ws->base, flags); + ws->base.screen = screen_create(&ws->base); if (!ws->base.screen) { amdgpu_winsys_destroy(&ws->base); - mtx_unlock(&dev_tab_mutex); + pipe_mutex_unlock(dev_tab_mutex); return NULL; } @@ -340,7 +600,7 @@ amdgpu_winsys_create(int fd, unsigned flags, /* We must unlock the mutex once the winsys is fully initialized, so that * other threads attempting to create the winsys from the same fd will * get a fully initialized winsys and not just half-way initialized. */ - mtx_unlock(&dev_tab_mutex); + pipe_mutex_unlock(dev_tab_mutex); return &ws->base; @@ -350,6 +610,6 @@ fail_cache: fail_alloc: FREE(ws); fail: - mtx_unlock(&dev_tab_mutex); + pipe_mutex_unlock(dev_tab_mutex); return NULL; } diff --git a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h index 7cd2f2048..69c663807 100644 --- a/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h +++ b/lib/mesa/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h @@ -41,9 +41,8 @@ struct amdgpu_cs; -#define AMDGPU_SLAB_MIN_SIZE_LOG2 9 /* 512 bytes */ -#define AMDGPU_SLAB_MAX_SIZE_LOG2 16 /* 64 KB */ -#define AMDGPU_SLAB_BO_SIZE_LOG2 17 /* 128 KB */ +#define AMDGPU_SLAB_MIN_SIZE_LOG2 9 +#define AMDGPU_SLAB_MAX_SIZE_LOG2 14 struct amdgpu_winsys { struct radeon_winsys base; @@ -53,20 +52,16 @@ struct amdgpu_winsys { amdgpu_device_handle dev; - mtx_t bo_fence_lock; + pipe_mutex bo_fence_lock; int num_cs; /* The number of command streams created. 
*/ - unsigned num_total_rejected_cs; uint32_t next_bo_unique_id; uint64_t allocated_vram; uint64_t allocated_gtt; uint64_t mapped_vram; uint64_t mapped_gtt; uint64_t buffer_wait_time; /* time spent in buffer_wait in ns */ - uint64_t num_gfx_IBs; - uint64_t num_sdma_IBs; - uint64_t num_mapped_buffers; - uint64_t gfx_bo_list_counter; + uint64_t num_cs_flushes; struct radeon_info info; @@ -75,11 +70,13 @@ struct amdgpu_winsys { struct amdgpu_gpu_info amdinfo; ADDR_HANDLE addrlib; + uint32_t rev_id; + unsigned family; bool check_vm; /* List of all allocated buffers */ - mtx_t global_bo_list_lock; + pipe_mutex global_bo_list_lock; struct list_head global_bo_list; unsigned num_buffers; }; @@ -91,5 +88,6 @@ amdgpu_winsys(struct radeon_winsys *base) } void amdgpu_surface_init_functions(struct amdgpu_winsys *ws); +ADDR_HANDLE amdgpu_addr_create(struct amdgpu_winsys *ws); #endif |