diff options
Diffstat (limited to 'lib/mesa/src/gallium/drivers/radeon')
22 files changed, 5757 insertions, 1886 deletions
diff --git a/lib/mesa/src/gallium/drivers/radeon/Makefile.am b/lib/mesa/src/gallium/drivers/radeon/Makefile.am index 13d8976de..a6fc145cb 100644 --- a/lib/mesa/src/gallium/drivers/radeon/Makefile.am +++ b/lib/mesa/src/gallium/drivers/radeon/Makefile.am @@ -16,7 +16,8 @@ libradeon_la_SOURCES = \ if NEED_RADEON_LLVM AM_CFLAGS += \ - $(LLVM_CFLAGS) + $(LLVM_CFLAGS) \ + $(LIBELF_CFLAGS) libradeon_la_SOURCES += \ $(LLVM_C_FILES) @@ -24,7 +25,7 @@ libradeon_la_SOURCES += \ libradeon_la_LIBADD = \ $(CLOCK_LIB) \ $(LLVM_LIBS) \ - $(ELF_LIB) + $(LIBELF_LIBS) libradeon_la_LDFLAGS = \ $(LLVM_LDFLAGS) diff --git a/lib/mesa/src/gallium/drivers/radeon/Makefile.in b/lib/mesa/src/gallium/drivers/radeon/Makefile.in index f9faa3eef..d720beb87 100644 --- a/lib/mesa/src/gallium/drivers/radeon/Makefile.in +++ b/lib/mesa/src/gallium/drivers/radeon/Makefile.in @@ -54,18 +54,19 @@ target_triplet = @target@ DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \ $(srcdir)/Makefile.sources $(top_srcdir)/bin/depcomp \ $(top_srcdir)/src/gallium/Automake.inc -@HAVE_LIBDRM_TRUE@am__append_1 = \ -@HAVE_LIBDRM_TRUE@ $(LIBDRM_LIBS) - -@HAVE_DRISW_TRUE@am__append_2 = \ +@HAVE_DRISW_TRUE@am__append_1 = \ @HAVE_DRISW_TRUE@ $(top_builddir)/src/gallium/winsys/sw/dri/libswdri.la -@HAVE_DRISW_KMS_TRUE@am__append_3 = \ +@HAVE_DRISW_KMS_TRUE@am__append_2 = \ @HAVE_DRISW_KMS_TRUE@ $(top_builddir)/src/gallium/winsys/sw/kms-dri/libswkmsdri.la \ @HAVE_DRISW_KMS_TRUE@ $(LIBDRM_LIBS) -@HAVE_GALLIUM_LLVM_TRUE@am__append_4 = \ -@HAVE_GALLIUM_LLVM_TRUE@ $(LLVM_CFLAGS) +@NEED_RADEON_LLVM_TRUE@am__append_3 = \ +@NEED_RADEON_LLVM_TRUE@ $(LLVM_CFLAGS) \ +@NEED_RADEON_LLVM_TRUE@ $(LIBELF_CFLAGS) + +@NEED_RADEON_LLVM_TRUE@am__append_4 = \ +@NEED_RADEON_LLVM_TRUE@ $(LLVM_C_FILES) subdir = src/gallium/drivers/radeon ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 @@ -86,16 +87,27 @@ CONFIG_CLEAN_FILES = CONFIG_CLEAN_VPATH_FILES = LTLIBRARIES = $(noinst_LTLIBRARIES) am__DEPENDENCIES_1 = -@HAVE_GALLIUM_LLVM_TRUE@libradeon_la_DEPENDENCIES = \ -@HAVE_GALLIUM_LLVM_TRUE@ $(am__DEPENDENCIES_1) \ -@HAVE_GALLIUM_LLVM_TRUE@ $(am__DEPENDENCIES_1) +@NEED_RADEON_LLVM_TRUE@libradeon_la_DEPENDENCIES = \ +@NEED_RADEON_LLVM_TRUE@ $(am__DEPENDENCIES_1) \ +@NEED_RADEON_LLVM_TRUE@ $(am__DEPENDENCIES_1) \ +@NEED_RADEON_LLVM_TRUE@ $(am__DEPENDENCIES_1) +am__libradeon_la_SOURCES_DIST = cayman_msaa.c r600_buffer_common.c \ + r600_cs.h r600_gpu_load.c r600_perfcounter.c \ + r600_pipe_common.c r600_pipe_common.h r600_query.c \ + r600_query.h r600_streamout.c r600_test_dma.c r600_texture.c \ + r600_viewport.c radeon_uvd.c radeon_uvd.h radeon_vce_40_2_2.c \ + radeon_vce_50.c radeon_vce_52.c radeon_vce.c radeon_vce.h \ + radeon_video.c radeon_video.h radeon_winsys.h \ + radeon_elf_util.c radeon_elf_util.h am__objects_1 = cayman_msaa.lo r600_buffer_common.lo r600_gpu_load.lo \ r600_perfcounter.lo r600_pipe_common.lo r600_query.lo \ r600_streamout.lo r600_test_dma.lo r600_texture.lo \ r600_viewport.lo radeon_uvd.lo radeon_vce_40_2_2.lo \ radeon_vce_50.lo radeon_vce_52.lo radeon_vce.lo \ radeon_video.lo -am_libradeon_la_OBJECTS = $(am__objects_1) +am__objects_2 = radeon_elf_util.lo +@NEED_RADEON_LLVM_TRUE@am__objects_3 = $(am__objects_2) +am_libradeon_la_OBJECTS = $(am__objects_1) $(am__objects_3) libradeon_la_OBJECTS = $(am_libradeon_la_OBJECTS) AM_V_lt = $(am__v_lt_@AM_V@) am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) @@ -139,7 +151,7 @@ am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@) am__v_CCLD_0 = @echo " CCLD " $@; am__v_CCLD_1 = SOURCES = $(libradeon_la_SOURCES) -DIST_SOURCES = $(libradeon_la_SOURCES) +DIST_SOURCES = $(am__libradeon_la_SOURCES_DIST) am__can_run_installinfo = \ case $$AM_UPDATE_INFO_DIR in \ n|no|NO) false;; \ @@ -153,8 +165,6 @@ AMDGPU_CFLAGS = @AMDGPU_CFLAGS@ AMDGPU_LIBS = @AMDGPU_LIBS@ AMTAR = @AMTAR@ AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ -ANDROID_CFLAGS = @ANDROID_CFLAGS@ -ANDROID_LIBS = @ANDROID_LIBS@ AR = @AR@ AUTOCONF = @AUTOCONF@ AUTOHEADER = @AUTOHEADER@ @@ -185,6 +195,8 @@ DLLTOOL = @DLLTOOL@ DLOPEN_LIBS = @DLOPEN_LIBS@ DRI2PROTO_CFLAGS = @DRI2PROTO_CFLAGS@ DRI2PROTO_LIBS = @DRI2PROTO_LIBS@ +DRI3PROTO_CFLAGS = @DRI3PROTO_CFLAGS@ +DRI3PROTO_LIBS = @DRI3PROTO_LIBS@ DRIGL_CFLAGS = @DRIGL_CFLAGS@ DRIGL_LIBS = @DRIGL_LIBS@ DRI_DRIVER_INSTALL_DIR = @DRI_DRIVER_INSTALL_DIR@ @@ -197,11 +209,10 @@ ECHO_C = @ECHO_C@ ECHO_N = @ECHO_N@ ECHO_T = @ECHO_T@ EGL_CFLAGS = @EGL_CFLAGS@ +EGL_CLIENT_APIS = @EGL_CLIENT_APIS@ EGL_LIB_DEPS = @EGL_LIB_DEPS@ EGL_NATIVE_PLATFORM = @EGL_NATIVE_PLATFORM@ EGREP = @EGREP@ -ETNAVIV_CFLAGS = @ETNAVIV_CFLAGS@ -ETNAVIV_LIBS = @ETNAVIV_LIBS@ EXEEXT = @EXEEXT@ EXPAT_CFLAGS = @EXPAT_CFLAGS@ EXPAT_LIBS = @EXPAT_LIBS@ @@ -249,27 +260,31 @@ LIBDRM_CFLAGS = @LIBDRM_CFLAGS@ LIBDRM_LIBS = @LIBDRM_LIBS@ LIBELF_CFLAGS = @LIBELF_CFLAGS@ LIBELF_LIBS = @LIBELF_LIBS@ -LIBGLVND_DATADIR = @LIBGLVND_DATADIR@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ -LIBSENSORS_LIBS = @LIBSENSORS_LIBS@ +LIBSENSORS_LDFLAGS = @LIBSENSORS_LDFLAGS@ +LIBSHA1_CFLAGS = @LIBSHA1_CFLAGS@ +LIBSHA1_LIBS = @LIBSHA1_LIBS@ LIBTOOL = @LIBTOOL@ -LIBUNWIND_CFLAGS = @LIBUNWIND_CFLAGS@ -LIBUNWIND_LIBS = @LIBUNWIND_LIBS@ LIB_DIR = @LIB_DIR@ LIB_EXT = @LIB_EXT@ LIPO = @LIPO@ +LLVM_BINDIR = @LLVM_BINDIR@ LLVM_CFLAGS = @LLVM_CFLAGS@ LLVM_CONFIG = @LLVM_CONFIG@ +LLVM_CPPFLAGS = @LLVM_CPPFLAGS@ LLVM_CXXFLAGS = @LLVM_CXXFLAGS@ LLVM_INCLUDEDIR = @LLVM_INCLUDEDIR@ LLVM_LDFLAGS = @LLVM_LDFLAGS@ +LLVM_LIBDIR = @LLVM_LIBDIR@ LLVM_LIBS = @LLVM_LIBS@ +LLVM_VERSION = @LLVM_VERSION@ LN_S = @LN_S@ LTLIBOBJS = @LTLIBOBJS@ MAINT = @MAINT@ MAKEINFO = @MAKEINFO@ MANIFEST_TOOL = @MANIFEST_TOOL@ +MESA_LLVM = @MESA_LLVM@ MKDIR_P = @MKDIR_P@ MSVC2013_COMPAT_CFLAGS = @MSVC2013_COMPAT_CFLAGS@ MSVC2013_COMPAT_CXXFLAGS = @MSVC2013_COMPAT_CXXFLAGS@ @@ -290,6 +305,8 @@ OMX_LIBS = @OMX_LIBS@ OMX_LIB_INSTALL_DIR = @OMX_LIB_INSTALL_DIR@ OPENCL_LIBNAME = @OPENCL_LIBNAME@ OPENCL_VERSION = @OPENCL_VERSION@ +OPENSSL_CFLAGS = @OPENSSL_CFLAGS@ +OPENSSL_LIBS = @OPENSSL_LIBS@ OSMESA_LIB = @OSMESA_LIB@ OSMESA_LIB_DEPS = @OSMESA_LIB_DEPS@ OSMESA_PC_LIB_PRIV = @OSMESA_PC_LIB_PRIV@ @@ -309,6 +326,8 @@ PKG_CONFIG = @PKG_CONFIG@ PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@ PKG_CONFIG_PATH = @PKG_CONFIG_PATH@ POSIX_SHELL = @POSIX_SHELL@ +PRESENTPROTO_CFLAGS = @PRESENTPROTO_CFLAGS@ +PRESENTPROTO_LIBS = @PRESENTPROTO_LIBS@ PTHREADSTUBS_CFLAGS = @PTHREADSTUBS_CFLAGS@ PTHREADSTUBS_LIBS = @PTHREADSTUBS_LIBS@ PTHREAD_CC = @PTHREAD_CC@ @@ -324,6 +343,8 @@ SED = @SED@ SELINUX_CFLAGS = @SELINUX_CFLAGS@ SELINUX_LIBS = @SELINUX_LIBS@ SET_MAKE = @SET_MAKE@ +SHA1_CFLAGS = @SHA1_CFLAGS@ +SHA1_LIBS = @SHA1_LIBS@ SHELL = @SHELL@ SIMPENROSE_CFLAGS = @SIMPENROSE_CFLAGS@ SIMPENROSE_LIBS = @SIMPENROSE_LIBS@ @@ -332,6 +353,7 @@ STRIP = @STRIP@ SWR_AVX2_CXXFLAGS = @SWR_AVX2_CXXFLAGS@ SWR_AVX_CXXFLAGS = @SWR_AVX_CXXFLAGS@ SWR_CXX11_CXXFLAGS = @SWR_CXX11_CXXFLAGS@ +TIMESTAMP_CMD = @TIMESTAMP_CMD@ VALGRIND_CFLAGS = @VALGRIND_CFLAGS@ VALGRIND_LIBS = @VALGRIND_LIBS@ VA_CFLAGS = @VA_CFLAGS@ @@ -347,6 +369,7 @@ VDPAU_LIB_INSTALL_DIR = @VDPAU_LIB_INSTALL_DIR@ VDPAU_MAJOR = @VDPAU_MAJOR@ VDPAU_MINOR = @VDPAU_MINOR@ VERSION = @VERSION@ +VG_LIB_DEPS = @VG_LIB_DEPS@ VISIBILITY_CFLAGS = @VISIBILITY_CFLAGS@ VISIBILITY_CXXFLAGS = @VISIBILITY_CXXFLAGS@ VL_CFLAGS = @VL_CFLAGS@ @@ -375,10 +398,9 @@ XVMC_LIBS = @XVMC_LIBS@ XVMC_LIB_INSTALL_DIR = @XVMC_LIB_INSTALL_DIR@ XVMC_MAJOR = @XVMC_MAJOR@ XVMC_MINOR = @XVMC_MINOR@ +XXD = @XXD@ YACC = @YACC@ YFLAGS = @YFLAGS@ -ZLIB_CFLAGS = @ZLIB_CFLAGS@ -ZLIB_LIBS = @ZLIB_LIBS@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ @@ -464,6 +486,10 @@ C_SOURCES := \ radeon_video.h \ radeon_winsys.h +LLVM_C_FILES := \ + radeon_elf_util.c \ + radeon_elf_util.h + GALLIUM_CFLAGS = \ -I$(top_srcdir)/include \ -I$(top_srcdir)/src \ @@ -511,8 +537,12 @@ GALLIUM_TARGET_CFLAGS = \ $(LIBDRM_CFLAGS) \ $(VISIBILITY_CFLAGS) -GALLIUM_COMMON_LIB_DEPS = -lm $(LIBUNWIND_LIBS) $(LIBSENSORS_LIBS) \ - $(CLOCK_LIB) $(PTHREAD_LIBS) $(DLOPEN_LIBS) $(am__append_1) +GALLIUM_COMMON_LIB_DEPS = \ + -lm \ + $(CLOCK_LIB) \ + $(PTHREAD_LIBS) \ + $(DLOPEN_LIBS) + GALLIUM_WINSYS_CFLAGS = \ -I$(top_srcdir)/src \ -I$(top_srcdir)/include \ @@ -524,20 +554,19 @@ GALLIUM_WINSYS_CFLAGS = \ GALLIUM_PIPE_LOADER_WINSYS_LIBS = \ $(top_builddir)/src/gallium/winsys/sw/null/libws_null.la \ $(top_builddir)/src/gallium/winsys/sw/wrapper/libwsw.la \ - $(am__append_2) $(am__append_3) + $(am__append_1) $(am__append_2) AM_CFLAGS = $(GALLIUM_DRIVER_CFLAGS) $(RADEON_CFLAGS) \ - -Wstrict-overflow=0 $(am__append_4) + -Wstrict-overflow=0 $(am__append_3) # ^^ disable warnings about overflows (os_time_timeout) noinst_LTLIBRARIES = libradeon.la -libradeon_la_SOURCES = \ - $(C_SOURCES) - -@HAVE_GALLIUM_LLVM_TRUE@libradeon_la_LIBADD = \ -@HAVE_GALLIUM_LLVM_TRUE@ $(CLOCK_LIB) \ -@HAVE_GALLIUM_LLVM_TRUE@ $(LLVM_LIBS) +libradeon_la_SOURCES = $(C_SOURCES) $(am__append_4) +@NEED_RADEON_LLVM_TRUE@libradeon_la_LIBADD = \ +@NEED_RADEON_LLVM_TRUE@ $(CLOCK_LIB) \ +@NEED_RADEON_LLVM_TRUE@ $(LLVM_LIBS) \ +@NEED_RADEON_LLVM_TRUE@ $(LIBELF_LIBS) -@HAVE_GALLIUM_LLVM_TRUE@libradeon_la_LDFLAGS = \ -@HAVE_GALLIUM_LLVM_TRUE@ $(LLVM_LDFLAGS) +@NEED_RADEON_LLVM_TRUE@libradeon_la_LDFLAGS = \ +@NEED_RADEON_LLVM_TRUE@ $(LLVM_LDFLAGS) EXTRA_DIST = \ LLVM_REVISION.txt @@ -607,6 +636,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r600_test_dma.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r600_texture.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r600_viewport.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/radeon_elf_util.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/radeon_uvd.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/radeon_vce.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/radeon_vce_40_2_2.Plo@am__quote@ diff --git a/lib/mesa/src/gallium/drivers/radeon/Makefile.sources b/lib/mesa/src/gallium/drivers/radeon/Makefile.sources index f63790c32..3e13dae3c 100644 --- a/lib/mesa/src/gallium/drivers/radeon/Makefile.sources +++ b/lib/mesa/src/gallium/drivers/radeon/Makefile.sources @@ -2,17 +2,21 @@ C_SOURCES := \ cayman_msaa.c \ r600_buffer_common.c \ r600_cs.h \ - r600d_common.h \ r600_gpu_load.c \ + r600_perfcounter.c \ r600_pipe_common.c \ r600_pipe_common.h \ r600_query.c \ + r600_query.h \ r600_streamout.c \ + r600_test_dma.c \ r600_texture.c \ + r600_viewport.c \ radeon_uvd.c \ radeon_uvd.h \ radeon_vce_40_2_2.c \ radeon_vce_50.c \ + radeon_vce_52.c \ radeon_vce.c \ radeon_vce.h \ radeon_video.c \ @@ -21,10 +25,4 @@ C_SOURCES := \ LLVM_C_FILES := \ radeon_elf_util.c \ - radeon_elf_util.h \ - radeon_llvm_emit.c \ - radeon_llvm_emit.h \ - radeon_llvm.h \ - radeon_llvm_util.c \ - radeon_llvm_util.h \ - radeon_setup_tgsi_llvm.c + radeon_elf_util.h diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_buffer_common.c b/lib/mesa/src/gallium/drivers/radeon/r600_buffer_common.c index 2d1058479..bbab58946 100644 --- a/lib/mesa/src/gallium/drivers/radeon/r600_buffer_common.c +++ b/lib/mesa/src/gallium/drivers/radeon/r600_buffer_common.c @@ -30,18 +30,18 @@ #include <inttypes.h> #include <stdio.h> -boolean r600_rings_is_buffer_referenced(struct r600_common_context *ctx, - struct radeon_winsys_cs_handle *buf, - enum radeon_bo_usage usage) +bool r600_rings_is_buffer_referenced(struct r600_common_context *ctx, + struct pb_buffer *buf, + enum radeon_bo_usage usage) { - if (ctx->ws->cs_is_buffer_referenced(ctx->rings.gfx.cs, buf, usage)) { - return TRUE; + if (ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, buf, usage)) { + return true; } - if (ctx->rings.dma.cs && ctx->rings.dma.cs->cdw && - ctx->ws->cs_is_buffer_referenced(ctx->rings.dma.cs, buf, usage)) { - return TRUE; + if (radeon_emitted(ctx->dma.cs, 0) && + ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, buf, usage)) { + return true; } - return FALSE; + return false; } void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx, @@ -52,7 +52,7 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx, bool busy = false; if (usage & PIPE_TRANSFER_UNSYNCHRONIZED) { - return ctx->ws->buffer_map(resource->cs_buf, NULL, usage); + return ctx->ws->buffer_map(resource->buf, NULL, usage); } if (!(usage & PIPE_TRANSFER_WRITE)) { @@ -60,26 +60,25 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx, rusage = RADEON_USAGE_WRITE; } - if (ctx->rings.gfx.cs->cdw != ctx->initial_gfx_cs_size && - ctx->ws->cs_is_buffer_referenced(ctx->rings.gfx.cs, - resource->cs_buf, rusage)) { + if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) && + ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, + resource->buf, rusage)) { if (usage & PIPE_TRANSFER_DONTBLOCK) { - ctx->rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); return NULL; } else { - ctx->rings.gfx.flush(ctx, 0, NULL); + ctx->gfx.flush(ctx, 0, NULL); busy = true; } } - if (ctx->rings.dma.cs && - ctx->rings.dma.cs->cdw && - ctx->ws->cs_is_buffer_referenced(ctx->rings.dma.cs, - resource->cs_buf, rusage)) { + if (radeon_emitted(ctx->dma.cs, 0) && + ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, + resource->buf, rusage)) { if (usage & PIPE_TRANSFER_DONTBLOCK) { - ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); return NULL; } else { - ctx->rings.dma.flush(ctx, 0, NULL); + ctx->dma.flush(ctx, 0, NULL); busy = true; } } @@ -90,31 +89,33 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx, } else { /* We will be wait for the GPU. Wait for any offloaded * CS flush to complete to avoid busy-waiting in the winsys. */ - ctx->ws->cs_sync_flush(ctx->rings.gfx.cs); - if (ctx->rings.dma.cs) - ctx->ws->cs_sync_flush(ctx->rings.dma.cs); + ctx->ws->cs_sync_flush(ctx->gfx.cs); + if (ctx->dma.cs) + ctx->ws->cs_sync_flush(ctx->dma.cs); } } /* Setting the CS to NULL will prevent doing checks we have done already. */ - return ctx->ws->buffer_map(resource->cs_buf, NULL, usage); + return ctx->ws->buffer_map(resource->buf, NULL, usage); } -bool r600_init_resource(struct r600_common_screen *rscreen, - struct r600_resource *res, - unsigned size, unsigned alignment, - bool use_reusable_pool) +void r600_init_resource_fields(struct r600_common_screen *rscreen, + struct r600_resource *res, + uint64_t size, unsigned alignment) { struct r600_texture *rtex = (struct r600_texture*)res; - struct pb_buffer *old_buf, *new_buf; - enum radeon_bo_flag flags = 0; + + res->bo_size = size; + res->bo_alignment = alignment; + res->flags = 0; switch (res->b.b.usage) { case PIPE_USAGE_STREAM: - flags = RADEON_FLAG_GTT_WC; + res->flags = RADEON_FLAG_GTT_WC; /* fall through */ case PIPE_USAGE_STAGING: - /* Transfers are likely to occur more often with these resources. */ + /* Transfers are likely to occur more often with these + * resources. */ res->domains = RADEON_DOMAIN_GTT; break; case PIPE_USAGE_DYNAMIC: @@ -124,52 +125,78 @@ bool r600_init_resource(struct r600_common_screen *rscreen, if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor < 40) { res->domains = RADEON_DOMAIN_GTT; - flags |= RADEON_FLAG_GTT_WC; + res->flags |= RADEON_FLAG_GTT_WC; break; } - flags |= RADEON_FLAG_CPU_ACCESS; + res->flags |= RADEON_FLAG_CPU_ACCESS; /* fall through */ case PIPE_USAGE_DEFAULT: case PIPE_USAGE_IMMUTABLE: default: - /* Not listing GTT here improves performance in some apps. */ + /* Not listing GTT here improves performance in some + * apps. */ res->domains = RADEON_DOMAIN_VRAM; - flags |= RADEON_FLAG_GTT_WC; + res->flags |= RADEON_FLAG_GTT_WC; break; } if (res->b.b.target == PIPE_BUFFER && res->b.b.flags & (PIPE_RESOURCE_FLAG_MAP_PERSISTENT | PIPE_RESOURCE_FLAG_MAP_COHERENT)) { - /* Use GTT for all persistent mappings with older kernels, - * because they didn't always flush the HDP cache before CS - * execution. + /* Use GTT for all persistent mappings with older + * kernels, because they didn't always flush the HDP + * cache before CS execution. * - * Write-combined CPU mappings are fine, the kernel ensures all CPU - * writes finish before the GPU executes a command stream. + * Write-combined CPU mappings are fine, the kernel + * ensures all CPU writes finish before the GPU + * executes a command stream. */ if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor < 40) res->domains = RADEON_DOMAIN_GTT; else if (res->domains & RADEON_DOMAIN_VRAM) - flags |= RADEON_FLAG_CPU_ACCESS; + res->flags |= RADEON_FLAG_CPU_ACCESS; } /* Tiled textures are unmappable. Always put them in VRAM. */ if (res->b.b.target != PIPE_BUFFER && rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D) { res->domains = RADEON_DOMAIN_VRAM; - flags &= ~RADEON_FLAG_CPU_ACCESS; - flags |= RADEON_FLAG_NO_CPU_ACCESS; + res->flags &= ~RADEON_FLAG_CPU_ACCESS; + res->flags |= RADEON_FLAG_NO_CPU_ACCESS | + RADEON_FLAG_GTT_WC; } + /* If VRAM is just stolen system memory, allow both VRAM and + * GTT, whichever has free space. If a buffer is evicted from + * VRAM to GTT, it will stay there. + */ + if (!rscreen->info.has_dedicated_vram && + res->domains == RADEON_DOMAIN_VRAM) + res->domains = RADEON_DOMAIN_VRAM_GTT; + if (rscreen->debug_flags & DBG_NO_WC) - flags &= ~RADEON_FLAG_GTT_WC; + res->flags &= ~RADEON_FLAG_GTT_WC; + + /* Set expected VRAM and GART usage for the buffer. */ + res->vram_usage = 0; + res->gart_usage = 0; + + if (res->domains & RADEON_DOMAIN_VRAM) + res->vram_usage = size; + else if (res->domains & RADEON_DOMAIN_GTT) + res->gart_usage = size; +} + +bool r600_alloc_resource(struct r600_common_screen *rscreen, + struct r600_resource *res) +{ + struct pb_buffer *old_buf, *new_buf; /* Allocate a new resource. */ - new_buf = rscreen->ws->buffer_create(rscreen->ws, size, alignment, - use_reusable_pool, - res->domains, flags); + new_buf = rscreen->ws->buffer_create(rscreen->ws, res->bo_size, + res->bo_alignment, + res->domains, res->flags); if (!new_buf) { return false; } @@ -179,11 +206,10 @@ bool r600_init_resource(struct r600_common_screen *rscreen, * the same buffer where one of the contexts invalidates it while * the others are using it. */ old_buf = res->buf; - res->cs_buf = rscreen->ws->buffer_get_cs_handle(new_buf); /* should be atomic */ res->buf = new_buf; /* should be atomic */ - if (rscreen->info.r600_virtual_address) - res->gpu_address = rscreen->ws->buffer_get_virtual_address(res->cs_buf); + if (rscreen->info.has_virtual_memory) + res->gpu_address = rscreen->ws->buffer_get_virtual_address(res->buf); else res->gpu_address = 0; @@ -192,8 +218,9 @@ bool r600_init_resource(struct r600_common_screen *rscreen, util_range_set_empty(&res->valid_buffer_range); res->TC_L2_dirty = false; + /* Print debug information. */ if (rscreen->debug_flags & DBG_VM && res->b.b.target == PIPE_BUFFER) { - fprintf(stderr, "VM start=0x%"PRIX64" end=0x%"PRIX64" | Buffer %u bytes\n", + fprintf(stderr, "VM start=0x%"PRIX64" end=0x%"PRIX64" | Buffer %"PRIu64" bytes\n", res->gpu_address, res->gpu_address + res->buf->size, res->buf->size); } @@ -210,6 +237,42 @@ static void r600_buffer_destroy(struct pipe_screen *screen, FREE(rbuffer); } +static bool +r600_invalidate_buffer(struct r600_common_context *rctx, + struct r600_resource *rbuffer) +{ + /* Shared buffers can't be reallocated. */ + if (rbuffer->is_shared) + return false; + + /* In AMD_pinned_memory, the user pointer association only gets + * broken when the buffer is explicitly re-allocated. + */ + if (rctx->ws->buffer_is_user_ptr(rbuffer->buf)) + return false; + + /* Check if mapping this buffer would cause waiting for the GPU. */ + if (r600_rings_is_buffer_referenced(rctx, rbuffer->buf, RADEON_USAGE_READWRITE) || + !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) { + rctx->invalidate_buffer(&rctx->b, &rbuffer->b.b); + } else { + util_range_set_empty(&rbuffer->valid_buffer_range); + } + + return true; +} + +void r600_invalidate_resource(struct pipe_context *ctx, + struct pipe_resource *resource) +{ + struct r600_common_context *rctx = (struct r600_common_context*)ctx; + struct r600_resource *rbuffer = r600_resource(resource); + + /* We currently only do anyting here for buffers */ + if (resource->target == PIPE_BUFFER) + (void)r600_invalidate_buffer(rctx, rbuffer); +} + static void *r600_buffer_get_transfer(struct pipe_context *ctx, struct pipe_resource *resource, unsigned level, @@ -220,7 +283,7 @@ static void *r600_buffer_get_transfer(struct pipe_context *ctx, unsigned offset) { struct r600_common_context *rctx = (struct r600_common_context*)ctx; - struct r600_transfer *transfer = util_slab_alloc(&rctx->pool_transfers); + struct r600_transfer *transfer = slab_alloc(&rctx->pool_transfers); transfer->transfer.resource = resource; transfer->transfer.level = level; @@ -240,7 +303,7 @@ static bool r600_can_dma_copy_buffer(struct r600_common_context *rctx, bool dword_aligned = !(dstx % 4) && !(srcx % 4) && !(size % 4); return rctx->screen->has_cp_dma || - (dword_aligned && (rctx->rings.dma.cs || + (dword_aligned && (rctx->dma.cs || rctx->screen->has_streamout)); } @@ -263,6 +326,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx, * in which case it can be mapped unsynchronized. */ if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED) && usage & PIPE_TRANSFER_WRITE && + !rbuffer->is_shared && !util_ranges_intersect(&rbuffer->valid_buffer_range, box->x, box->x + box->width)) { usage |= PIPE_TRANSFER_UNSYNCHRONIZED; } @@ -277,29 +341,31 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx, !(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) { assert(usage & PIPE_TRANSFER_WRITE); - /* Check if mapping this buffer would cause waiting for the GPU. */ - if (r600_rings_is_buffer_referenced(rctx, rbuffer->cs_buf, RADEON_USAGE_READWRITE) || - !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) { - rctx->invalidate_buffer(&rctx->b, &rbuffer->b.b); + if (r600_invalidate_buffer(rctx, rbuffer)) { + /* At this point, the buffer is always idle. */ + usage |= PIPE_TRANSFER_UNSYNCHRONIZED; + } else { + /* Fall back to a temporary buffer. */ + usage |= PIPE_TRANSFER_DISCARD_RANGE; } - /* At this point, the buffer is always idle. */ - usage |= PIPE_TRANSFER_UNSYNCHRONIZED; } - else if ((usage & PIPE_TRANSFER_DISCARD_RANGE) && - !(usage & PIPE_TRANSFER_UNSYNCHRONIZED) && - !(rscreen->debug_flags & DBG_NO_DISCARD_RANGE) && - r600_can_dma_copy_buffer(rctx, box->x, 0, box->width)) { + + if ((usage & PIPE_TRANSFER_DISCARD_RANGE) && + !(usage & (PIPE_TRANSFER_UNSYNCHRONIZED | + PIPE_TRANSFER_PERSISTENT)) && + !(rscreen->debug_flags & DBG_NO_DISCARD_RANGE) && + r600_can_dma_copy_buffer(rctx, box->x, 0, box->width)) { assert(usage & PIPE_TRANSFER_WRITE); /* Check if mapping this buffer would cause waiting for the GPU. */ - if (r600_rings_is_buffer_referenced(rctx, rbuffer->cs_buf, RADEON_USAGE_READWRITE) || + if (r600_rings_is_buffer_referenced(rctx, rbuffer->buf, RADEON_USAGE_READWRITE) || !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) { /* Do a wait-free write-only transfer using a temporary buffer. */ unsigned offset; struct r600_resource *staging = NULL; u_upload_alloc(rctx->uploader, 0, box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT), - &offset, (struct pipe_resource**)&staging, (void**)&data); + 256, &offset, (struct pipe_resource**)&staging, (void**)&data); if (staging) { data += box->x % R600_MAP_BUFFER_ALIGNMENT; @@ -311,23 +377,29 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx, usage |= PIPE_TRANSFER_UNSYNCHRONIZED; } } - /* Using a staging buffer in GTT for larger reads is much faster. */ + /* Use a staging buffer in cached GTT for reads. */ else if ((usage & PIPE_TRANSFER_READ) && - !(usage & PIPE_TRANSFER_WRITE) && - rbuffer->domains == RADEON_DOMAIN_VRAM && + !(usage & PIPE_TRANSFER_PERSISTENT) && + (rbuffer->domains & RADEON_DOMAIN_VRAM || + rbuffer->flags & RADEON_FLAG_GTT_WC) && r600_can_dma_copy_buffer(rctx, 0, box->x, box->width)) { struct r600_resource *staging; staging = (struct r600_resource*) pipe_buffer_create( - ctx->screen, PIPE_BIND_TRANSFER_READ, PIPE_USAGE_STAGING, + ctx->screen, 0, PIPE_USAGE_STAGING, box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT)); if (staging) { /* Copy the VRAM buffer to the staging buffer. */ rctx->dma_copy(ctx, &staging->b.b, 0, box->x % R600_MAP_BUFFER_ALIGNMENT, - 0, 0, resource, level, box); + 0, 0, resource, 0, box); - data = r600_buffer_map_sync_with_rings(rctx, staging, PIPE_TRANSFER_READ); + data = r600_buffer_map_sync_with_rings(rctx, staging, + usage & ~PIPE_TRANSFER_UNSYNCHRONIZED); + if (!data) { + r600_resource_reference(&staging, NULL); + return NULL; + } data += box->x % R600_MAP_BUFFER_ALIGNMENT; return r600_buffer_get_transfer(ctx, resource, level, usage, box, @@ -345,38 +417,81 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx, ptransfer, data, NULL, 0); } -static void r600_buffer_transfer_unmap(struct pipe_context *ctx, - struct pipe_transfer *transfer) +static void r600_buffer_do_flush_region(struct pipe_context *ctx, + struct pipe_transfer *transfer, + const struct pipe_box *box) { - struct r600_common_context *rctx = (struct r600_common_context*)ctx; struct r600_transfer *rtransfer = (struct r600_transfer*)transfer; struct r600_resource *rbuffer = r600_resource(transfer->resource); if (rtransfer->staging) { - if (rtransfer->transfer.usage & PIPE_TRANSFER_WRITE) { - struct pipe_resource *dst, *src; - unsigned soffset, doffset, size; - struct pipe_box box; + struct pipe_resource *dst, *src; + unsigned soffset; + struct pipe_box dma_box; - dst = transfer->resource; - src = &rtransfer->staging->b.b; - size = transfer->box.width; - doffset = transfer->box.x; - soffset = rtransfer->offset + transfer->box.x % R600_MAP_BUFFER_ALIGNMENT; + dst = transfer->resource; + src = &rtransfer->staging->b.b; + soffset = rtransfer->offset + box->x % R600_MAP_BUFFER_ALIGNMENT; - u_box_1d(soffset, size, &box); + u_box_1d(soffset, box->width, &dma_box); - /* Copy the staging buffer into the original one. */ - rctx->dma_copy(ctx, dst, 0, doffset, 0, 0, src, 0, &box); - } - pipe_resource_reference((struct pipe_resource**)&rtransfer->staging, NULL); + /* Copy the staging buffer into the original one. */ + ctx->resource_copy_region(ctx, dst, 0, box->x, 0, 0, src, 0, &dma_box); } - if (transfer->usage & PIPE_TRANSFER_WRITE) { - util_range_add(&rbuffer->valid_buffer_range, transfer->box.x, - transfer->box.x + transfer->box.width); + util_range_add(&rbuffer->valid_buffer_range, box->x, + box->x + box->width); +} + +static void r600_buffer_flush_region(struct pipe_context *ctx, + struct pipe_transfer *transfer, + const struct pipe_box *rel_box) +{ + if (transfer->usage & (PIPE_TRANSFER_WRITE | + PIPE_TRANSFER_FLUSH_EXPLICIT)) { + struct pipe_box box; + + u_box_1d(transfer->box.x + rel_box->x, rel_box->width, &box); + r600_buffer_do_flush_region(ctx, transfer, &box); } - util_slab_free(&rctx->pool_transfers, transfer); +} + +static void r600_buffer_transfer_unmap(struct pipe_context *ctx, + struct pipe_transfer *transfer) +{ + struct r600_common_context *rctx = (struct r600_common_context*)ctx; + struct r600_transfer *rtransfer = (struct r600_transfer*)transfer; + + if (transfer->usage & PIPE_TRANSFER_WRITE && + !(transfer->usage & PIPE_TRANSFER_FLUSH_EXPLICIT)) + r600_buffer_do_flush_region(ctx, transfer, &transfer->box); + + if (rtransfer->staging) + r600_resource_reference(&rtransfer->staging, NULL); + + slab_free(&rctx->pool_transfers, transfer); +} + +void r600_buffer_subdata(struct pipe_context *ctx, + struct pipe_resource *buffer, + unsigned usage, unsigned offset, + unsigned size, const void *data) +{ + struct pipe_transfer *transfer = NULL; + struct pipe_box box; + uint8_t *map = NULL; + + u_box_1d(offset, size, &box); + map = r600_buffer_transfer_map(ctx, buffer, 0, + PIPE_TRANSFER_WRITE | + PIPE_TRANSFER_DISCARD_RANGE | + usage, + &box, &transfer); + if (!map) + return; + + memcpy(map, data, size); + r600_buffer_transfer_unmap(ctx, transfer); } static const struct u_resource_vtbl r600_buffer_vtbl = @@ -384,9 +499,8 @@ static const struct u_resource_vtbl r600_buffer_vtbl = NULL, /* get_handle */ r600_buffer_destroy, /* resource_destroy */ r600_buffer_transfer_map, /* transfer_map */ - NULL, /* transfer_flush_region */ + r600_buffer_flush_region, /* transfer_flush_region */ r600_buffer_transfer_unmap, /* transfer_unmap */ - NULL /* transfer_inline_write */ }; static struct r600_resource * @@ -398,11 +512,14 @@ r600_alloc_buffer_struct(struct pipe_screen *screen, rbuffer = MALLOC_STRUCT(r600_resource); rbuffer->b.b = *templ; + rbuffer->b.b.next = NULL; pipe_reference_init(&rbuffer->b.b.reference, 1); rbuffer->b.b.screen = screen; rbuffer->b.vtbl = &r600_buffer_vtbl; rbuffer->buf = NULL; + rbuffer->bind_history = 0; rbuffer->TC_L2_dirty = false; + rbuffer->is_shared = false; util_range_init(&rbuffer->valid_buffer_range); return rbuffer; } @@ -414,13 +531,39 @@ struct pipe_resource *r600_buffer_create(struct pipe_screen *screen, struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; struct r600_resource *rbuffer = r600_alloc_buffer_struct(screen, templ); - if (!r600_init_resource(rscreen, rbuffer, templ->width0, alignment, TRUE)) { + r600_init_resource_fields(rscreen, rbuffer, templ->width0, alignment); + + if (templ->bind & PIPE_BIND_SHARED) + rbuffer->flags |= RADEON_FLAG_HANDLE; + + if (!r600_alloc_resource(rscreen, rbuffer)) { FREE(rbuffer); return NULL; } return &rbuffer->b.b; } +struct pipe_resource *r600_aligned_buffer_create(struct pipe_screen *screen, + unsigned bind, + unsigned usage, + unsigned size, + unsigned alignment) +{ + struct pipe_resource buffer; + + memset(&buffer, 0, sizeof buffer); + buffer.target = PIPE_BUFFER; + buffer.format = PIPE_FORMAT_R8_UNORM; + buffer.bind = bind; + buffer.usage = usage; + buffer.flags = 0; + buffer.width0 = size; + buffer.height0 = 1; + buffer.depth0 = 1; + buffer.array_size = 1; + return r600_buffer_create(screen, &buffer, alignment); +} + struct pipe_resource * r600_buffer_from_user_memory(struct pipe_screen *screen, const struct pipe_resource *templ, @@ -440,11 +583,9 @@ r600_buffer_from_user_memory(struct pipe_screen *screen, return NULL; } - rbuffer->cs_buf = ws->buffer_get_cs_handle(rbuffer->buf); - - if (rscreen->info.r600_virtual_address) + if (rscreen->info.has_virtual_memory) rbuffer->gpu_address = - ws->buffer_get_virtual_address(rbuffer->cs_buf); + ws->buffer_get_virtual_address(rbuffer->buf); else rbuffer->gpu_address = 0; diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_perfcounter.c b/lib/mesa/src/gallium/drivers/radeon/r600_perfcounter.c index f3529a1fe..0c55fc2a2 100644 --- a/lib/mesa/src/gallium/drivers/radeon/r600_perfcounter.c +++ b/lib/mesa/src/gallium/drivers/radeon/r600_perfcounter.c @@ -28,7 +28,7 @@ #include "util/u_memory.h" #include "r600_query.h" #include "r600_pipe_common.h" -#include "r600d_common.h" +#include "amd/common/r600d_common.h" /* Max counters per HW block */ #define R600_QUERY_MAX_COUNTERS 16 @@ -84,8 +84,8 @@ struct r600_pc_group { struct r600_pc_counter { unsigned base; - unsigned dwords; - unsigned stride; + unsigned qwords; + unsigned stride; /* in uint64s */ }; #define R600_PC_SHADERS_WINDOWING (1 << 31) @@ -115,6 +115,14 @@ static void r600_pc_query_destroy(struct r600_common_context *ctx, r600_query_hw_destroy(ctx, rquery); } +static bool r600_pc_query_prepare_buffer(struct r600_common_context *ctx, + struct r600_query_hw *hwquery, + struct r600_resource *buffer) +{ + /* no-op */ + return true; +} + static void r600_pc_query_emit_start(struct r600_common_context *ctx, struct r600_query_hw *hwquery, struct r600_resource *buffer, uint64_t va) @@ -172,7 +180,7 @@ static void r600_pc_query_emit_stop(struct r600_common_context *ctx, pc->emit_read(ctx, block, group->num_counters, group->selectors, buffer, va); - va += 4 * group->num_counters; + va += sizeof(uint64_t) * group->num_counters; } while (group->instance < 0 && ++instance < block->num_instances); } while (++se < se_end); } @@ -194,15 +202,15 @@ static void r600_pc_query_add_result(struct r600_common_context *ctx, union pipe_query_result *result) { struct r600_query_pc *query = (struct r600_query_pc *)hwquery; - uint32_t *results = buffer; + uint64_t *results = buffer; unsigned i, j; for (i = 0; i < query->num_counters; ++i) { struct r600_pc_counter *counter = &query->counters[i]; - for (j = 0; j < counter->dwords; ++j) { + for (j = 0; j < counter->qwords; ++j) { uint32_t value = results[counter->base + j * counter->stride]; - result->batch[i].u32 += value; + result->batch[i].u64 += value; } } } @@ -215,6 +223,7 @@ static struct r600_query_ops batch_query_ops = { }; static struct r600_query_hw_ops batch_query_hw_ops = { + .prepare_buffer = r600_pc_query_prepare_buffer, .emit_start = r600_pc_query_emit_start, .emit_stop = r600_pc_query_emit_stop, .clear_result = r600_pc_query_clear_result, @@ -310,7 +319,6 @@ struct pipe_query *r600_create_batch_query(struct pipe_context *ctx, query->b.b.ops = &batch_query_ops; query->b.ops = &batch_query_hw_ops; - query->b.flags = R600_QUERY_HW_FLAG_TIMER; query->num_counters = num_queries; @@ -362,7 +370,7 @@ struct pipe_query *r600_create_batch_query(struct pipe_context *ctx, instances *= block->num_instances; group->result_base = i; - query->b.result_size += 4 * instances * group->num_counters; + query->b.result_size += sizeof(uint64_t) * instances * group->num_counters; i += instances * group->num_counters; pc->get_size(block, group->num_counters, group->selectors, @@ -402,11 +410,11 @@ struct pipe_query *r600_create_batch_query(struct pipe_context *ctx, counter->base = group->result_base + j; counter->stride = group->num_counters; - counter->dwords = 1; + counter->qwords = 1; if ((block->flags & R600_PC_BLOCK_SE) && group->se < 0) - counter->dwords = screen->info.max_se; + counter->qwords = screen->info.max_se; if (group->instance < 0) - counter->dwords *= block->num_instances; + counter->qwords *= block->num_instances; } if (!r600_query_hw_init(rctx, &query->b)) @@ -419,8 +427,8 @@ error: return NULL; } -static boolean r600_init_block_names(struct r600_common_screen *screen, - struct r600_perfcounter_block *block) +static bool r600_init_block_names(struct r600_common_screen *screen, + struct r600_perfcounter_block *block) { unsigned i, j, k; unsigned groups_shader = 1, groups_se = 1, groups_instance = 1; @@ -453,7 +461,7 @@ static boolean r600_init_block_names(struct r600_common_screen *screen, block->group_names = MALLOC(block->num_groups * block->group_name_stride); if (!block->group_names) - return FALSE; + return false; groupname = block->group_names; for (i = 0; i < groups_shader; ++i) { @@ -488,7 +496,7 @@ static boolean r600_init_block_names(struct r600_common_screen *screen, block->selector_names = MALLOC(block->num_groups * block->num_selectors * block->selector_name_stride); if (!block->selector_names) - return FALSE; + return false; groupname = block->group_names; p = block->selector_names; @@ -500,7 +508,7 @@ static boolean r600_init_block_names(struct r600_common_screen *screen, groupname += block->group_name_stride; } - return TRUE; + return true; } int r600_get_perfcounter_info(struct r600_common_screen *screen, @@ -536,7 +544,7 @@ int r600_get_perfcounter_info(struct r600_common_screen *screen, info->name = block->selector_names + sub * block->selector_name_stride; info->query_type = R600_QUERY_FIRST_PERFCOUNTER + index; info->max_value.u64 = 0; - info->type = PIPE_DRIVER_QUERY_TYPE_UINT; + info->type = PIPE_DRIVER_QUERY_TYPE_UINT64; info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE; info->group_id = base_gid + sub / block->num_selectors; info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH; @@ -578,17 +586,17 @@ void r600_perfcounters_destroy(struct r600_common_screen *rscreen) rscreen->perfcounters->cleanup(rscreen); } -boolean r600_perfcounters_init(struct r600_perfcounters *pc, - unsigned num_blocks) +bool r600_perfcounters_init(struct r600_perfcounters *pc, + unsigned num_blocks) { pc->blocks = CALLOC(num_blocks, sizeof(struct r600_perfcounter_block)); if (!pc->blocks) - return FALSE; + return false; - pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", FALSE); - pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", FALSE); + pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false); + pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false); - return TRUE; + return true; } void r600_perfcounters_add_block(struct r600_common_screen *rscreen, diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c b/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c index 495fda0a8..f62bbf2e0 100644 --- a/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c +++ b/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c @@ -27,23 +27,118 @@ #include "r600_pipe_common.h" #include "r600_cs.h" #include "tgsi/tgsi_parse.h" +#include "util/list.h" #include "util/u_draw_quad.h" #include "util/u_memory.h" #include "util/u_format_s3tc.h" #include "util/u_upload_mgr.h" +#include "os/os_time.h" #include "vl/vl_decoder.h" #include "vl/vl_video_buffer.h" #include "radeon/radeon_video.h" #include <inttypes.h> +#include <sys/utsname.h> #ifndef HAVE_LLVM #define HAVE_LLVM 0 #endif +struct r600_multi_fence { + struct pipe_reference reference; + struct pipe_fence_handle *gfx; + struct pipe_fence_handle *sdma; + + /* If the context wasn't flushed at fence creation, this is non-NULL. */ + struct { + struct r600_common_context *ctx; + unsigned ib_index; + } gfx_unflushed; +}; + +/* + * shader binary helpers. + */ +void radeon_shader_binary_init(struct radeon_shader_binary *b) +{ + memset(b, 0, sizeof(*b)); +} + +void radeon_shader_binary_clean(struct radeon_shader_binary *b) +{ + if (!b) + return; + FREE(b->code); + FREE(b->config); + FREE(b->rodata); + FREE(b->global_symbol_offsets); + FREE(b->relocs); + FREE(b->disasm_string); + FREE(b->llvm_ir_string); +} + /* * pipe_context */ +void r600_gfx_write_fence(struct r600_common_context *ctx, struct r600_resource *buf, + uint64_t va, uint32_t old_value, uint32_t new_value) +{ + struct radeon_winsys_cs *cs = ctx->gfx.cs; + + if (ctx->chip_class == CIK || + ctx->chip_class == VI) { + /* Two EOP events are required to make all engines go idle + * (and optional cache flushes executed) before the timestamp + * is written. + */ + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); + radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_BOTTOM_OF_PIPE_TS) | + EVENT_INDEX(5)); + radeon_emit(cs, va); + radeon_emit(cs, (va >> 32) | EOP_DATA_SEL(1)); + radeon_emit(cs, old_value); /* immediate data */ + radeon_emit(cs, 0); /* unused */ + } + + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); + radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_BOTTOM_OF_PIPE_TS) | + EVENT_INDEX(5)); + radeon_emit(cs, va); + radeon_emit(cs, (va >> 32) | EOP_DATA_SEL(1)); + radeon_emit(cs, new_value); /* immediate data */ + radeon_emit(cs, 0); /* unused */ + + r600_emit_reloc(ctx, &ctx->gfx, buf, RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); +} + +unsigned r600_gfx_write_fence_dwords(struct r600_common_screen *screen) +{ + unsigned dwords = 6; + + if (screen->chip_class == CIK || + screen->chip_class == VI) + dwords *= 2; + + if (!screen->info.has_virtual_memory) + dwords += 2; + + return dwords; +} + +void r600_gfx_wait_fence(struct r600_common_context *ctx, + uint64_t va, uint32_t ref, uint32_t mask) +{ + struct radeon_winsys_cs *cs = ctx->gfx.cs; + + radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); + radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, ref); /* reference value */ + radeon_emit(cs, mask); /* mask */ + radeon_emit(cs, 4); /* poll interval */ +} + void r600_draw_rectangle(struct blitter_context *blitter, int x1, int y1, int x2, int y2, float depth, enum blitter_attrib_type type, @@ -77,7 +172,7 @@ void r600_draw_rectangle(struct blitter_context *blitter, /* Upload vertices. The hw rectangle has only 3 vertices, * I guess the 4th one is derived from the first 3. * The vertex specification should match u_blitter's vertex element state. */ - u_upload_alloc(rctx->uploader, 0, sizeof(float) * 24, &offset, &buf, (void**)&vb); + u_upload_alloc(rctx->uploader, 0, sizeof(float) * 24, 256, &offset, &buf, (void**)&vb); if (!buf) return; @@ -108,12 +203,89 @@ void r600_draw_rectangle(struct blitter_context *blitter, pipe_resource_reference(&buf, NULL); } -void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw) +void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw, + struct r600_resource *dst, struct r600_resource *src) +{ + uint64_t vram = 0, gtt = 0; + + if (dst) { + vram += dst->vram_usage; + gtt += dst->gart_usage; + } + if (src) { + vram += src->vram_usage; + gtt += src->gart_usage; + } + + /* Flush the GFX IB if DMA depends on it. */ + if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) && + ((dst && + ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, dst->buf, + RADEON_USAGE_READWRITE)) || + (src && + ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, src->buf, + RADEON_USAGE_WRITE)))) + ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + + /* Flush if there's not enough space, or if the memory usage per IB + * is too large. + */ + if (!ctx->ws->cs_check_space(ctx->dma.cs, num_dw) || + !radeon_cs_memory_below_limit(ctx->screen, ctx->dma.cs, vram, gtt)) { + ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + assert((num_dw + ctx->dma.cs->current.cdw) <= ctx->dma.cs->current.max_dw); + } + + /* If GPUVM is not supported, the CS checker needs 2 entries + * in the buffer list per packet, which has to be done manually. + */ + if (ctx->screen->info.has_virtual_memory) { + if (dst) + radeon_add_to_buffer_list(ctx, &ctx->dma, dst, + RADEON_USAGE_WRITE, + RADEON_PRIO_SDMA_BUFFER); + if (src) + radeon_add_to_buffer_list(ctx, &ctx->dma, src, + RADEON_USAGE_READ, + RADEON_PRIO_SDMA_BUFFER); + } +} + +/* This is required to prevent read-after-write hazards. */ +void r600_dma_emit_wait_idle(struct r600_common_context *rctx) { - /* Flush if there's not enough space. */ - if ((num_dw + ctx->rings.dma.cs->cdw) > ctx->rings.dma.cs->max_dw) { - ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); - assert((num_dw + ctx->rings.dma.cs->cdw) <= ctx->rings.dma.cs->max_dw); + struct radeon_winsys_cs *cs = rctx->dma.cs; + + /* done at the end of DMA calls, so increment this. */ + rctx->num_dma_calls++; + + /* IBs using too little memory are limited by the IB submission overhead. + * IBs using too much memory are limited by the kernel/TTM overhead. + * Too long IBs create CPU-GPU pipeline bubbles and add latency. + * + * This heuristic makes sure that DMA requests are executed + * very soon after the call is made and lowers memory usage. + * It improves texture upload performance by keeping the DMA + * engine busy while uploads are being submitted. + */ + if (cs->used_vram + cs->used_gart > 64 * 1024 * 1024) { + rctx->dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL); + return; + } + + r600_need_dma_space(rctx, 1, NULL, NULL); + + if (!radeon_emitted(cs, 0)) /* empty queue */ + return; + + /* NOP waits for idle on Evergreen and later. */ + if (rctx->chip_class >= CIK) + radeon_emit(cs, 0x00000000); /* NOP */ + else if (rctx->chip_class >= EVERGREEN) + radeon_emit(cs, 0xf0000000); /* NOP */ + else { + /* TODO: R600-R700 should use the FENCE packet. + * CS checker support is required. */ } } @@ -123,24 +295,9 @@ static void r600_memory_barrier(struct pipe_context *ctx, unsigned flags) void r600_preflush_suspend_features(struct r600_common_context *ctx) { - /* Disable render condition. */ - ctx->saved_render_cond = NULL; - ctx->saved_render_cond_cond = FALSE; - ctx->saved_render_cond_mode = 0; - if (ctx->current_render_cond) { - ctx->saved_render_cond = ctx->current_render_cond; - ctx->saved_render_cond_cond = ctx->current_render_cond_cond; - ctx->saved_render_cond_mode = ctx->current_render_cond_mode; - ctx->b.render_condition(&ctx->b, NULL, FALSE, 0); - } - /* suspend queries */ - ctx->queries_suspended_for_flush = false; - if (ctx->num_cs_dw_nontimer_queries_suspend) { - r600_suspend_nontimer_queries(ctx); - r600_suspend_timer_queries(ctx); - ctx->queries_suspended_for_flush = true; - } + if (!LIST_IS_EMPTY(&ctx->active_queries)) + r600_suspend_queries(ctx); ctx->streamout.suspended = false; if (ctx->streamout.begin_emitted) { @@ -157,48 +314,152 @@ void r600_postflush_resume_features(struct r600_common_context *ctx) } /* resume queries */ - if (ctx->queries_suspended_for_flush) { - r600_resume_nontimer_queries(ctx); - r600_resume_timer_queries(ctx); - } - - /* Re-enable render condition. */ - if (ctx->saved_render_cond) { - ctx->b.render_condition(&ctx->b, ctx->saved_render_cond, - ctx->saved_render_cond_cond, - ctx->saved_render_cond_mode); - } + if (!LIST_IS_EMPTY(&ctx->active_queries)) + r600_resume_queries(ctx); } static void r600_flush_from_st(struct pipe_context *ctx, struct pipe_fence_handle **fence, unsigned flags) { + struct pipe_screen *screen = ctx->screen; struct r600_common_context *rctx = (struct r600_common_context *)ctx; + struct radeon_winsys *ws = rctx->ws; unsigned rflags = 0; + struct pipe_fence_handle *gfx_fence = NULL; + struct pipe_fence_handle *sdma_fence = NULL; + bool deferred_fence = false; if (flags & PIPE_FLUSH_END_OF_FRAME) rflags |= RADEON_FLUSH_END_OF_FRAME; + if (flags & PIPE_FLUSH_DEFERRED) + rflags |= RADEON_FLUSH_ASYNC; + + if (rctx->dma.cs) { + rctx->dma.flush(rctx, rflags, fence ? &sdma_fence : NULL); + } + + if (!radeon_emitted(rctx->gfx.cs, rctx->initial_gfx_cs_size)) { + if (fence) + ws->fence_reference(&gfx_fence, rctx->last_gfx_fence); + if (!(rflags & RADEON_FLUSH_ASYNC)) + ws->cs_sync_flush(rctx->gfx.cs); + } else { + /* Instead of flushing, create a deferred fence. Constraints: + * - The state tracker must allow a deferred flush. + * - The state tracker must request a fence. + * Thread safety in fence_finish must be ensured by the state tracker. + */ + if (flags & PIPE_FLUSH_DEFERRED && fence) { + gfx_fence = rctx->ws->cs_get_next_fence(rctx->gfx.cs); + deferred_fence = true; + } else { + rctx->gfx.flush(rctx, rflags, fence ? &gfx_fence : NULL); + } + } + + /* Both engines can signal out of order, so we need to keep both fences. */ + if (fence) { + struct r600_multi_fence *multi_fence = + CALLOC_STRUCT(r600_multi_fence); + if (!multi_fence) + return; + + multi_fence->reference.count = 1; + /* If both fences are NULL, fence_finish will always return true. */ + multi_fence->gfx = gfx_fence; + multi_fence->sdma = sdma_fence; + + if (deferred_fence) { + multi_fence->gfx_unflushed.ctx = rctx; + multi_fence->gfx_unflushed.ib_index = rctx->num_gfx_cs_flushes; + } - if (rctx->rings.dma.cs) { - rctx->rings.dma.flush(rctx, rflags, NULL); + screen->fence_reference(screen, fence, NULL); + *fence = (struct pipe_fence_handle*)multi_fence; } - rctx->rings.gfx.flush(rctx, rflags, fence); } static void r600_flush_dma_ring(void *ctx, unsigned flags, struct pipe_fence_handle **fence) { struct r600_common_context *rctx = (struct r600_common_context *)ctx; - struct radeon_winsys_cs *cs = rctx->rings.dma.cs; - - if (!cs->cdw) { + struct radeon_winsys_cs *cs = rctx->dma.cs; + struct radeon_saved_cs saved; + bool check_vm = + (rctx->screen->debug_flags & DBG_CHECK_VM) && + rctx->check_vm_faults; + + if (!radeon_emitted(cs, 0)) { + if (fence) + rctx->ws->fence_reference(fence, rctx->last_sdma_fence); return; } - rctx->rings.dma.flushing = true; - rctx->ws->cs_flush(cs, flags, fence, 0); - rctx->rings.dma.flushing = false; + if (check_vm) + radeon_save_cs(rctx->ws, cs, &saved); + + rctx->ws->cs_flush(cs, flags, &rctx->last_sdma_fence); + if (fence) + rctx->ws->fence_reference(fence, rctx->last_sdma_fence); + + if (check_vm) { + /* Use conservative timeout 800ms, after which we won't wait any + * longer and assume the GPU is hung. + */ + rctx->ws->fence_wait(rctx->ws, rctx->last_sdma_fence, 800*1000*1000); + + rctx->check_vm_faults(rctx, &saved, RING_DMA); + radeon_clear_saved_cs(&saved); + } +} + +/** + * Store a linearized copy of all chunks of \p cs together with the buffer + * list in \p saved. + */ +void radeon_save_cs(struct radeon_winsys *ws, struct radeon_winsys_cs *cs, + struct radeon_saved_cs *saved) +{ + void *buf; + unsigned i; + + /* Save the IB chunks. */ + saved->num_dw = cs->prev_dw + cs->current.cdw; + saved->ib = MALLOC(4 * saved->num_dw); + if (!saved->ib) + goto oom; + + buf = saved->ib; + for (i = 0; i < cs->num_prev; ++i) { + memcpy(buf, cs->prev[i].buf, cs->prev[i].cdw * 4); + buf += cs->prev[i].cdw; + } + memcpy(buf, cs->current.buf, cs->current.cdw * 4); + + /* Save the buffer list. */ + saved->bo_count = ws->cs_get_buffer_list(cs, NULL); + saved->bo_list = CALLOC(saved->bo_count, + sizeof(saved->bo_list[0])); + if (!saved->bo_list) { + FREE(saved->ib); + goto oom; + } + ws->cs_get_buffer_list(cs, saved->bo_list); + + return; + +oom: + fprintf(stderr, "%s: out of memory\n", __func__); + memset(saved, 0, sizeof(*saved)); +} + +void radeon_clear_saved_cs(struct radeon_saved_cs *saved) +{ + FREE(saved->ib); + FREE(saved->bo_list); + + memset(saved, 0, sizeof(*saved)); } static enum pipe_reset_status r600_get_reset_status(struct pipe_context *ctx) @@ -214,31 +475,82 @@ static enum pipe_reset_status r600_get_reset_status(struct pipe_context *ctx) return PIPE_UNKNOWN_CONTEXT_RESET; } +static void r600_set_debug_callback(struct pipe_context *ctx, + const struct pipe_debug_callback *cb) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + + if (cb) + rctx->debug = *cb; + else + memset(&rctx->debug, 0, sizeof(rctx->debug)); +} + +static void r600_set_device_reset_callback(struct pipe_context *ctx, + const struct pipe_device_reset_callback *cb) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + + if (cb) + rctx->device_reset_callback = *cb; + else + memset(&rctx->device_reset_callback, 0, + sizeof(rctx->device_reset_callback)); +} + +bool r600_check_device_reset(struct r600_common_context *rctx) +{ + enum pipe_reset_status status; + + if (!rctx->device_reset_callback.reset) + return false; + + if (!rctx->b.get_device_reset_status) + return false; + + status = rctx->b.get_device_reset_status(&rctx->b); + if (status == PIPE_NO_RESET) + return false; + + rctx->device_reset_callback.reset(rctx->device_reset_callback.data, status); + return true; +} + bool r600_common_context_init(struct r600_common_context *rctx, - struct r600_common_screen *rscreen) + struct r600_common_screen *rscreen, + unsigned context_flags) { - util_slab_create(&rctx->pool_transfers, - sizeof(struct r600_transfer), 64, - UTIL_SLAB_SINGLETHREADED); + slab_create_child(&rctx->pool_transfers, &rscreen->pool_transfers); rctx->screen = rscreen; rctx->ws = rscreen->ws; rctx->family = rscreen->family; rctx->chip_class = rscreen->chip_class; - if (rscreen->family == CHIP_HAWAII) - rctx->max_db = 16; + if (rscreen->chip_class >= CIK) + rctx->max_db = MAX2(8, rscreen->info.num_render_backends); else if (rscreen->chip_class >= EVERGREEN) rctx->max_db = 8; else rctx->max_db = 4; + rctx->b.invalidate_resource = r600_invalidate_resource; rctx->b.transfer_map = u_transfer_map_vtbl; - rctx->b.transfer_flush_region = u_default_transfer_flush_region; + rctx->b.transfer_flush_region = u_transfer_flush_region_vtbl; rctx->b.transfer_unmap = u_transfer_unmap_vtbl; - rctx->b.transfer_inline_write = u_default_transfer_inline_write; - rctx->b.memory_barrier = r600_memory_barrier; + rctx->b.texture_subdata = u_default_texture_subdata; + rctx->b.memory_barrier = r600_memory_barrier; rctx->b.flush = r600_flush_from_st; + rctx->b.set_debug_callback = r600_set_debug_callback; + + /* evergreen_compute.c has a special codepath for global buffers. + * Everything else can use the direct path. + */ + if ((rscreen->chip_class == EVERGREEN || rscreen->chip_class == CAYMAN) && + (context_flags & PIPE_CONTEXT_COMPUTE_ONLY)) + rctx->b.buffer_subdata = u_default_buffer_subdata; + else + rctx->b.buffer_subdata = r600_buffer_subdata; if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 43) { rctx->b.get_device_reset_status = r600_get_reset_status; @@ -247,21 +559,23 @@ bool r600_common_context_init(struct r600_common_context *rctx, RADEON_GPU_RESET_COUNTER); } - LIST_INITHEAD(&rctx->texture_buffers); + rctx->b.set_device_reset_callback = r600_set_device_reset_callback; r600_init_context_texture_functions(rctx); + r600_init_viewport_functions(rctx); r600_streamout_init(rctx); r600_query_init(rctx); cayman_init_msaa(&rctx->b); - rctx->allocator_so_filled_size = u_suballocator_create(&rctx->b, 4096, 4, - 0, PIPE_USAGE_DEFAULT, TRUE); - if (!rctx->allocator_so_filled_size) + rctx->allocator_zeroed_memory = + u_suballocator_create(&rctx->b, rscreen->info.gart_page_size, + 0, PIPE_USAGE_DEFAULT, true); + if (!rctx->allocator_zeroed_memory) return false; - rctx->uploader = u_upload_create(&rctx->b, 1024 * 1024, 256, + rctx->uploader = u_upload_create(&rctx->b, 1024 * 1024, PIPE_BIND_INDEX_BUFFER | - PIPE_BIND_CONSTANT_BUFFER); + PIPE_BIND_CONSTANT_BUFFER, PIPE_USAGE_STREAM); if (!rctx->uploader) return false; @@ -269,11 +583,11 @@ bool r600_common_context_init(struct r600_common_context *rctx, if (!rctx->ctx) return false; - if (rscreen->info.r600_has_dma && !(rscreen->debug_flags & DBG_NO_ASYNC_DMA)) { - rctx->rings.dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA, - r600_flush_dma_ring, - rctx, NULL); - rctx->rings.dma.flush = r600_flush_dma_ring; + if (rscreen->info.has_sdma && !(rscreen->debug_flags & DBG_NO_ASYNC_DMA)) { + rctx->dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA, + r600_flush_dma_ring, + rctx); + rctx->dma.flush = r600_flush_dma_ring; } return true; @@ -281,46 +595,41 @@ bool r600_common_context_init(struct r600_common_context *rctx, void r600_common_context_cleanup(struct r600_common_context *rctx) { - if (rctx->rings.gfx.cs) - rctx->ws->cs_destroy(rctx->rings.gfx.cs); - if (rctx->rings.dma.cs) - rctx->ws->cs_destroy(rctx->rings.dma.cs); - if (rctx->ctx) - rctx->ws->ctx_destroy(rctx->ctx); + unsigned i,j; - if (rctx->uploader) { - u_upload_destroy(rctx->uploader); - } + /* Release DCC stats. */ + for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats); i++) { + assert(!rctx->dcc_stats[i].query_active); - util_slab_destroy(&rctx->pool_transfers); + for (j = 0; j < ARRAY_SIZE(rctx->dcc_stats[i].ps_stats); j++) + if (rctx->dcc_stats[i].ps_stats[j]) + rctx->b.destroy_query(&rctx->b, + rctx->dcc_stats[i].ps_stats[j]); - if (rctx->allocator_so_filled_size) { - u_suballocator_destroy(rctx->allocator_so_filled_size); + r600_texture_reference(&rctx->dcc_stats[i].tex, NULL); } -} -void r600_context_add_resource_size(struct pipe_context *ctx, struct pipe_resource *r) -{ - struct r600_common_context *rctx = (struct r600_common_context *)ctx; - struct r600_resource *rr = (struct r600_resource *)r; + if (rctx->query_result_shader) + rctx->b.delete_compute_state(&rctx->b, rctx->query_result_shader); - if (r == NULL) { - return; - } + if (rctx->gfx.cs) + rctx->ws->cs_destroy(rctx->gfx.cs); + if (rctx->dma.cs) + rctx->ws->cs_destroy(rctx->dma.cs); + if (rctx->ctx) + rctx->ws->ctx_destroy(rctx->ctx); - /* - * The idea is to compute a gross estimate of memory requirement of - * each draw call. After each draw call, memory will be precisely - * accounted. So the uncertainty is only on the current draw call. - * In practice this gave very good estimate (+/- 10% of the target - * memory limit). - */ - if (rr->domains & RADEON_DOMAIN_GTT) { - rctx->gtt += rr->buf->size; + if (rctx->uploader) { + u_upload_destroy(rctx->uploader); } - if (rr->domains & RADEON_DOMAIN_VRAM) { - rctx->vram += rr->buf->size; + + slab_destroy_child(&rctx->pool_transfers); + + if (rctx->allocator_zeroed_memory) { + u_suballocator_destroy(rctx->allocator_zeroed_memory); } + rctx->ws->fence_reference(&rctx->last_gfx_fence, NULL); + rctx->ws->fence_reference(&rctx->last_sdma_fence, NULL); } /* @@ -330,10 +639,8 @@ void r600_context_add_resource_size(struct pipe_context *ctx, struct pipe_resour static const struct debug_named_value common_debug_options[] = { /* logging */ { "tex", DBG_TEX, "Print texture info" }, - { "texmip", DBG_TEXMIP, "Print texture info (mipmapped only)" }, { "compute", DBG_COMPUTE, "Print compute info" }, { "vm", DBG_VM, "Print virtual addresses when creating resources" }, - { "trace_cs", DBG_TRACE_CS, "Trace cs and write rlockup_<csid>.c file with faulty cs" }, { "info", DBG_INFO, "Print driver information" }, /* shaders */ @@ -347,6 +654,10 @@ static const struct debug_named_value common_debug_options[] = { { "noir", DBG_NO_IR, "Don't print the LLVM IR"}, { "notgsi", DBG_NO_TGSI, "Don't print the TGSI"}, { "noasm", DBG_NO_ASM, "Don't print disassembled shaders"}, + { "preoptir", DBG_PREOPT_IR, "Print the LLVM IR before initial optimizations" }, + { "checkir", DBG_CHECK_IR, "Enable additional sanity checks on shader IR" }, + + { "testdma", DBG_TEST_DMA, "Invoke SDMA tests and exit." }, /* features */ { "nodma", DBG_NO_ASYNC_DMA, "Disable asynchronous DMA" }, @@ -359,6 +670,15 @@ static const struct debug_named_value common_debug_options[] = { { "forcedma", DBG_FORCE_DMA, "Use asynchronous DMA for all operations when possible." }, { "precompile", DBG_PRECOMPILE, "Compile one shader variant at shader creation." }, { "nowc", DBG_NO_WC, "Disable GTT write combining" }, + { "check_vm", DBG_CHECK_VM, "Check VM faults and dump debug info." }, + { "nodcc", DBG_NO_DCC, "Disable DCC." }, + { "nodccclear", DBG_NO_DCC_CLEAR, "Disable DCC fast clear." }, + { "norbplus", DBG_NO_RB_PLUS, "Disable RB+ on Stoney." }, + { "sisched", DBG_SI_SCHED, "Enable LLVM SI Machine Instruction Scheduler." }, + { "mono", DBG_MONOLITHIC_SHADERS, "Use old-style monolithic shaders compiled on demand" }, + { "noce", DBG_NO_CE, "Disable the constant engine"}, + { "unsafemath", DBG_UNSAFE_MATH, "Enable unsafe math shader optimizations" }, + { "nodccfb", DBG_NO_DCC_FB, "Disable separate DCC on the main framebuffer" }, DEBUG_NAMED_VALUE_END /* must be last */ }; @@ -415,6 +735,8 @@ static const char* r600_get_chip_name(struct r600_common_screen *rscreen) case CHIP_ICELAND: return "AMD ICELAND"; case CHIP_CARRIZO: return "AMD CARRIZO"; case CHIP_FIJI: return "AMD FIJI"; + case CHIP_POLARIS10: return "AMD POLARIS10"; + case CHIP_POLARIS11: return "AMD POLARIS11"; case CHIP_STONEY: return "AMD STONEY"; default: return "AMD unknown"; } @@ -535,25 +857,30 @@ const char *r600_get_llvm_processor_name(enum radeon_family family) case CHIP_KAVERI: return "kaveri"; case CHIP_HAWAII: return "hawaii"; case CHIP_MULLINS: -#if HAVE_LLVM >= 0x0305 return "mullins"; -#else - return "kabini"; -#endif case CHIP_TONGA: return "tonga"; case CHIP_ICELAND: return "iceland"; case CHIP_CARRIZO: return "carrizo"; - case CHIP_FIJI: return "fiji"; #if HAVE_LLVM <= 0x0307 + case CHIP_FIJI: return "tonga"; case CHIP_STONEY: return "carrizo"; #else + case CHIP_FIJI: return "fiji"; case CHIP_STONEY: return "stoney"; #endif +#if HAVE_LLVM <= 0x0308 + case CHIP_POLARIS10: return "tonga"; + case CHIP_POLARIS11: return "tonga"; +#else + case CHIP_POLARIS10: return "polaris10"; + case CHIP_POLARIS11: return "polaris11"; +#endif default: return ""; } } static int r600_get_compute_param(struct pipe_screen *screen, + enum pipe_shader_ir ir_type, enum pipe_compute_cap param, void *ret) { @@ -564,20 +891,19 @@ static int r600_get_compute_param(struct pipe_screen *screen, case PIPE_COMPUTE_CAP_IR_TARGET: { const char *gpu; const char *triple; - if (rscreen->family <= CHIP_ARUBA || HAVE_LLVM < 0x0306) { + if (rscreen->family <= CHIP_ARUBA) { triple = "r600--"; } else { - triple = "amdgcn--"; + if (HAVE_LLVM < 0x0400) { + triple = "amdgcn--"; + } else { + triple = "amdgcn-mesa-mesa3d"; + } } switch(rscreen->family) { /* Clang < 3.6 is missing Hainan in its list of * GPUs, so we need to use the name of a similar GPU. */ -#if HAVE_LLVM < 0x0306 - case CHIP_HAINAN: - gpu = "oland"; - break; -#endif default: gpu = r600_get_llvm_processor_name(rscreen->family); break; @@ -600,32 +926,51 @@ static int r600_get_compute_param(struct pipe_screen *screen, uint64_t *grid_size = ret; grid_size[0] = 65535; grid_size[1] = 65535; - grid_size[2] = 1; + grid_size[2] = 65535; } return 3 * sizeof(uint64_t) ; case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE: if (ret) { uint64_t *block_size = ret; - block_size[0] = 256; - block_size[1] = 256; - block_size[2] = 256; + if (rscreen->chip_class >= SI && HAVE_LLVM >= 0x309 && + ir_type == PIPE_SHADER_IR_TGSI) { + block_size[0] = 2048; + block_size[1] = 2048; + block_size[2] = 2048; + } else { + block_size[0] = 256; + block_size[1] = 256; + block_size[2] = 256; + } } return 3 * sizeof(uint64_t); case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK: if (ret) { uint64_t *max_threads_per_block = ret; - *max_threads_per_block = 256; + if (rscreen->chip_class >= SI && HAVE_LLVM >= 0x309 && + ir_type == PIPE_SHADER_IR_TGSI) + *max_threads_per_block = 2048; + else + *max_threads_per_block = 256; } return sizeof(uint64_t); + case PIPE_COMPUTE_CAP_ADDRESS_BITS: + if (ret) { + uint32_t *address_bits = ret; + address_bits[0] = 32; + if (rscreen->chip_class >= SI) + address_bits[0] = 64; + } + return 1 * sizeof(uint32_t); case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE: if (ret) { uint64_t *max_global_size = ret; uint64_t max_mem_alloc_size; - r600_get_compute_param(screen, + r600_get_compute_param(screen, ir_type, PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE, &max_mem_alloc_size); @@ -636,8 +981,8 @@ static int r600_get_compute_param(struct pipe_screen *screen, * 4 * MAX_MEM_ALLOC_SIZE. */ *max_global_size = MIN2(4 * max_mem_alloc_size, - rscreen->info.gart_size + - rscreen->info.vram_size); + MAX2(rscreen->info.gart_size, + rscreen->info.vram_size)); } return sizeof(uint64_t); @@ -661,24 +1006,21 @@ static int r600_get_compute_param(struct pipe_screen *screen, if (ret) { uint64_t *max_mem_alloc_size = ret; - /* XXX: The limit in older kernels is 256 MB. We - * should add a query here for newer kernels. - */ - *max_mem_alloc_size = 256 * 1024 * 1024; + *max_mem_alloc_size = rscreen->info.max_alloc_size; } return sizeof(uint64_t); case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY: if (ret) { uint32_t *max_clock_frequency = ret; - *max_clock_frequency = rscreen->info.max_sclk; + *max_clock_frequency = rscreen->info.max_shader_clock; } return sizeof(uint32_t); case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS: if (ret) { uint32_t *max_compute_units = ret; - *max_compute_units = rscreen->info.max_compute_units; + *max_compute_units = rscreen->info.num_good_compute_units; } return sizeof(uint32_t); @@ -696,6 +1038,16 @@ static int r600_get_compute_param(struct pipe_screen *screen, *subgroup_size = r600_wavefront_size(rscreen->family); } return sizeof(uint32_t); + case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK: + if (ret) { + uint64_t *max_variable_threads_per_block = ret; + if (rscreen->chip_class >= SI && HAVE_LLVM >= 0x309 && + ir_type == PIPE_SHADER_IR_TGSI) + *max_variable_threads_per_block = SI_MAX_VARIABLE_THREADS_PER_BLOCK; + else + *max_variable_threads_per_block = 0; + } + return sizeof(uint64_t); } fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param); @@ -707,188 +1059,116 @@ static uint64_t r600_get_timestamp(struct pipe_screen *screen) struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; return 1000000 * rscreen->ws->query_value(rscreen->ws, RADEON_TIMESTAMP) / - rscreen->info.r600_clock_crystal_freq; -} - -static int r600_get_driver_query_info(struct pipe_screen *screen, - unsigned index, - struct pipe_driver_query_info *info) -{ - struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; - struct pipe_driver_query_info list[] = { - {"num-compilations", R600_QUERY_NUM_COMPILATIONS, {0}, PIPE_DRIVER_QUERY_TYPE_UINT64, - PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE}, - {"num-shaders-created", R600_QUERY_NUM_SHADERS_CREATED, {0}, PIPE_DRIVER_QUERY_TYPE_UINT64, - PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE}, - {"draw-calls", R600_QUERY_DRAW_CALLS, {0}}, - {"requested-VRAM", R600_QUERY_REQUESTED_VRAM, {rscreen->info.vram_size}, PIPE_DRIVER_QUERY_TYPE_BYTES}, - {"requested-GTT", R600_QUERY_REQUESTED_GTT, {rscreen->info.gart_size}, PIPE_DRIVER_QUERY_TYPE_BYTES}, - {"buffer-wait-time", R600_QUERY_BUFFER_WAIT_TIME, {0}, PIPE_DRIVER_QUERY_TYPE_MICROSECONDS, - PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE}, - {"num-cs-flushes", R600_QUERY_NUM_CS_FLUSHES, {0}}, - {"num-bytes-moved", R600_QUERY_NUM_BYTES_MOVED, {0}, PIPE_DRIVER_QUERY_TYPE_BYTES, - PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE}, - {"VRAM-usage", R600_QUERY_VRAM_USAGE, {rscreen->info.vram_size}, PIPE_DRIVER_QUERY_TYPE_BYTES}, - {"GTT-usage", R600_QUERY_GTT_USAGE, {rscreen->info.gart_size}, PIPE_DRIVER_QUERY_TYPE_BYTES}, - {"GPU-load", R600_QUERY_GPU_LOAD, {100}}, - {"temperature", R600_QUERY_GPU_TEMPERATURE, {100}}, - {"shader-clock", R600_QUERY_CURRENT_GPU_SCLK, {0}, PIPE_DRIVER_QUERY_TYPE_HZ}, - {"memory-clock", R600_QUERY_CURRENT_GPU_MCLK, {0}, PIPE_DRIVER_QUERY_TYPE_HZ}, - }; - unsigned num_queries; - - if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42) - num_queries = Elements(list); - else if (rscreen->info.drm_major == 3) - num_queries = Elements(list) - 3; - else - num_queries = Elements(list) - 4; - - if (!info) - return num_queries; - - if (index >= num_queries) - return 0; - - *info = list[index]; - return 1; + rscreen->info.clock_crystal_freq; } static void r600_fence_reference(struct pipe_screen *screen, - struct pipe_fence_handle **ptr, - struct pipe_fence_handle *fence) + struct pipe_fence_handle **dst, + struct pipe_fence_handle *src) { - struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws; - - rws->fence_reference(ptr, fence); + struct radeon_winsys *ws = ((struct r600_common_screen*)screen)->ws; + struct r600_multi_fence **rdst = (struct r600_multi_fence **)dst; + struct r600_multi_fence *rsrc = (struct r600_multi_fence *)src; + + if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) { + ws->fence_reference(&(*rdst)->gfx, NULL); + ws->fence_reference(&(*rdst)->sdma, NULL); + FREE(*rdst); + } + *rdst = rsrc; } static boolean r600_fence_finish(struct pipe_screen *screen, + struct pipe_context *ctx, struct pipe_fence_handle *fence, uint64_t timeout) { struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws; + struct r600_multi_fence *rfence = (struct r600_multi_fence *)fence; + struct r600_common_context *rctx = + ctx ? (struct r600_common_context*)ctx : NULL; + int64_t abs_timeout = os_time_get_absolute_timeout(timeout); - return rws->fence_wait(rws, fence, timeout); -} + if (rfence->sdma) { + if (!rws->fence_wait(rws, rfence->sdma, timeout)) + return false; -static bool r600_interpret_tiling(struct r600_common_screen *rscreen, - uint32_t tiling_config) -{ - switch ((tiling_config & 0xe) >> 1) { - case 0: - rscreen->tiling_info.num_channels = 1; - break; - case 1: - rscreen->tiling_info.num_channels = 2; - break; - case 2: - rscreen->tiling_info.num_channels = 4; - break; - case 3: - rscreen->tiling_info.num_channels = 8; - break; - default: - return false; + /* Recompute the timeout after waiting. */ + if (timeout && timeout != PIPE_TIMEOUT_INFINITE) { + int64_t time = os_time_get_nano(); + timeout = abs_timeout > time ? abs_timeout - time : 0; + } } - switch ((tiling_config & 0x30) >> 4) { - case 0: - rscreen->tiling_info.num_banks = 4; - break; - case 1: - rscreen->tiling_info.num_banks = 8; - break; - default: - return false; + if (!rfence->gfx) + return true; - } - switch ((tiling_config & 0xc0) >> 6) { - case 0: - rscreen->tiling_info.group_bytes = 256; - break; - case 1: - rscreen->tiling_info.group_bytes = 512; - break; - default: - return false; - } - return true; -} + /* Flush the gfx IB if it hasn't been flushed yet. */ + if (rctx && + rfence->gfx_unflushed.ctx == rctx && + rfence->gfx_unflushed.ib_index == rctx->num_gfx_cs_flushes) { + rctx->gfx.flush(rctx, timeout ? 0 : RADEON_FLUSH_ASYNC, NULL); + rfence->gfx_unflushed.ctx = NULL; -static bool evergreen_interpret_tiling(struct r600_common_screen *rscreen, - uint32_t tiling_config) -{ - switch (tiling_config & 0xf) { - case 0: - rscreen->tiling_info.num_channels = 1; - break; - case 1: - rscreen->tiling_info.num_channels = 2; - break; - case 2: - rscreen->tiling_info.num_channels = 4; - break; - case 3: - rscreen->tiling_info.num_channels = 8; - break; - default: - return false; - } + if (!timeout) + return false; - switch ((tiling_config & 0xf0) >> 4) { - case 0: - rscreen->tiling_info.num_banks = 4; - break; - case 1: - rscreen->tiling_info.num_banks = 8; - break; - case 2: - rscreen->tiling_info.num_banks = 16; - break; - default: - return false; + /* Recompute the timeout after all that. */ + if (timeout && timeout != PIPE_TIMEOUT_INFINITE) { + int64_t time = os_time_get_nano(); + timeout = abs_timeout > time ? abs_timeout - time : 0; + } } - switch ((tiling_config & 0xf00) >> 8) { - case 0: - rscreen->tiling_info.group_bytes = 256; - break; - case 1: - rscreen->tiling_info.group_bytes = 512; - break; - default: - return false; - } - return true; + return rws->fence_wait(rws, rfence->gfx, timeout); } -static bool r600_init_tiling(struct r600_common_screen *rscreen) +static void r600_query_memory_info(struct pipe_screen *screen, + struct pipe_memory_info *info) { - uint32_t tiling_config = rscreen->info.r600_tiling_config; - - /* set default group bytes, overridden by tiling info ioctl */ - if (rscreen->chip_class <= R700) { - rscreen->tiling_info.group_bytes = 256; - } else { - rscreen->tiling_info.group_bytes = 512; - } - - if (!tiling_config) - return true; - - if (rscreen->chip_class <= R700) { - return r600_interpret_tiling(rscreen, tiling_config); - } else { - return evergreen_interpret_tiling(rscreen, tiling_config); - } + struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; + struct radeon_winsys *ws = rscreen->ws; + unsigned vram_usage, gtt_usage; + + info->total_device_memory = rscreen->info.vram_size / 1024; + info->total_staging_memory = rscreen->info.gart_size / 1024; + + /* The real TTM memory usage is somewhat random, because: + * + * 1) TTM delays freeing memory, because it can only free it after + * fences expire. + * + * 2) The memory usage can be really low if big VRAM evictions are + * taking place, but the real usage is well above the size of VRAM. + * + * Instead, return statistics of this process. + */ + vram_usage = ws->query_value(ws, RADEON_REQUESTED_VRAM_MEMORY) / 1024; + gtt_usage = ws->query_value(ws, RADEON_REQUESTED_GTT_MEMORY) / 1024; + + info->avail_device_memory = + vram_usage <= info->total_device_memory ? + info->total_device_memory - vram_usage : 0; + info->avail_staging_memory = + gtt_usage <= info->total_staging_memory ? + info->total_staging_memory - gtt_usage : 0; + + info->device_memory_evicted = + ws->query_value(ws, RADEON_NUM_BYTES_MOVED) / 1024; + + if (rscreen->info.drm_major == 3 && rscreen->info.drm_minor >= 4) + info->nr_device_memory_evictions = + ws->query_value(ws, RADEON_NUM_EVICTIONS); + else + /* Just return the number of evicted 64KB pages. */ + info->nr_device_memory_evictions = info->device_memory_evicted / 64; } struct pipe_resource *r600_resource_create_common(struct pipe_screen *screen, const struct pipe_resource *templ) { if (templ->target == PIPE_BUFFER) { - return r600_buffer_create(screen, templ, 4096); + return r600_buffer_create(screen, templ, 256); } else { return r600_texture_create(screen, templ); } @@ -897,10 +1177,15 @@ struct pipe_resource *r600_resource_create_common(struct pipe_screen *screen, bool r600_common_screen_init(struct r600_common_screen *rscreen, struct radeon_winsys *ws) { - char llvm_string[32] = {}; + char llvm_string[32] = {}, kernel_version[128] = {}; + struct utsname uname_data; ws->query_info(ws, &rscreen->info); + if (uname(&uname_data) == 0) + snprintf(kernel_version, sizeof(kernel_version), + " / %s", uname_data.release); + #if HAVE_LLVM snprintf(llvm_string, sizeof(llvm_string), ", LLVM %i.%i.%i", (HAVE_LLVM >> 8) & 0xff, @@ -908,22 +1193,22 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen, #endif snprintf(rscreen->renderer_string, sizeof(rscreen->renderer_string), - "%s (DRM %i.%i.%i%s)", + "%s (DRM %i.%i.%i%s%s)", r600_get_chip_name(rscreen), rscreen->info.drm_major, rscreen->info.drm_minor, rscreen->info.drm_patchlevel, - llvm_string); + kernel_version, llvm_string); rscreen->b.get_name = r600_get_name; rscreen->b.get_vendor = r600_get_vendor; rscreen->b.get_device_vendor = r600_get_device_vendor; rscreen->b.get_compute_param = r600_get_compute_param; rscreen->b.get_paramf = r600_get_paramf; - rscreen->b.get_driver_query_info = r600_get_driver_query_info; rscreen->b.get_timestamp = r600_get_timestamp; rscreen->b.fence_finish = r600_fence_finish; rscreen->b.fence_reference = r600_fence_reference; rscreen->b.resource_destroy = u_resource_destroy_vtbl; rscreen->b.resource_from_user_memory = r600_buffer_from_user_memory; + rscreen->b.query_memory_info = r600_query_memory_info; if (rscreen->info.has_uvd) { rscreen->b.get_video_param = rvid_get_video_param; @@ -934,109 +1219,115 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen, } r600_init_screen_texture_functions(rscreen); + r600_init_screen_query_functions(rscreen); rscreen->ws = ws; rscreen->family = rscreen->info.family; rscreen->chip_class = rscreen->info.chip_class; rscreen->debug_flags = debug_get_flags_option("R600_DEBUG", common_debug_options, 0); - if (!r600_init_tiling(rscreen)) { - return false; + slab_create_parent(&rscreen->pool_transfers, sizeof(struct r600_transfer), 64); + + rscreen->force_aniso = MIN2(16, debug_get_num_option("R600_TEX_ANISO", -1)); + if (rscreen->force_aniso >= 0) { + printf("radeon: Forcing anisotropy filter to %ix\n", + /* round down to a power of two */ + 1 << util_logbase2(rscreen->force_aniso)); } + util_format_s3tc_init(); pipe_mutex_init(rscreen->aux_context_lock); pipe_mutex_init(rscreen->gpu_load_mutex); - if (((rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 28) || - rscreen->info.drm_major == 3) && - (rscreen->debug_flags & DBG_TRACE_CS)) { - rscreen->trace_bo = (struct r600_resource*)pipe_buffer_create(&rscreen->b, - PIPE_BIND_CUSTOM, - PIPE_USAGE_STAGING, - 4096); - if (rscreen->trace_bo) { - rscreen->trace_ptr = rscreen->ws->buffer_map(rscreen->trace_bo->cs_buf, NULL, - PIPE_TRANSFER_UNSYNCHRONIZED); - } - } - if (rscreen->debug_flags & DBG_INFO) { printf("pci_id = 0x%x\n", rscreen->info.pci_id); - printf("family = %i\n", rscreen->info.family); + printf("family = %i (%s)\n", rscreen->info.family, + r600_get_chip_name(rscreen)); printf("chip_class = %i\n", rscreen->info.chip_class); - printf("gart_size = %i MB\n", (int)(rscreen->info.gart_size >> 20)); - printf("vram_size = %i MB\n", (int)(rscreen->info.vram_size >> 20)); - printf("max_sclk = %i\n", rscreen->info.max_sclk); - printf("max_compute_units = %i\n", rscreen->info.max_compute_units); - printf("max_se = %i\n", rscreen->info.max_se); - printf("max_sh_per_se = %i\n", rscreen->info.max_sh_per_se); - printf("drm = %i.%i.%i\n", rscreen->info.drm_major, - rscreen->info.drm_minor, rscreen->info.drm_patchlevel); + printf("gart_size = %i MB\n", (int)DIV_ROUND_UP(rscreen->info.gart_size, 1024*1024)); + printf("vram_size = %i MB\n", (int)DIV_ROUND_UP(rscreen->info.vram_size, 1024*1024)); + printf("max_alloc_size = %i MB\n", + (int)DIV_ROUND_UP(rscreen->info.max_alloc_size, 1024*1024)); + printf("has_virtual_memory = %i\n", rscreen->info.has_virtual_memory); + printf("gfx_ib_pad_with_type2 = %i\n", rscreen->info.gfx_ib_pad_with_type2); + printf("has_sdma = %i\n", rscreen->info.has_sdma); printf("has_uvd = %i\n", rscreen->info.has_uvd); + printf("me_fw_version = %i\n", rscreen->info.me_fw_version); + printf("pfp_fw_version = %i\n", rscreen->info.pfp_fw_version); + printf("ce_fw_version = %i\n", rscreen->info.ce_fw_version); printf("vce_fw_version = %i\n", rscreen->info.vce_fw_version); - printf("r600_num_backends = %i\n", rscreen->info.r600_num_backends); - printf("r600_clock_crystal_freq = %i\n", rscreen->info.r600_clock_crystal_freq); - printf("r600_tiling_config = 0x%x\n", rscreen->info.r600_tiling_config); - printf("r600_num_tile_pipes = %i\n", rscreen->info.r600_num_tile_pipes); - printf("r600_max_pipes = %i\n", rscreen->info.r600_max_pipes); - printf("r600_virtual_address = %i\n", rscreen->info.r600_virtual_address); - printf("r600_has_dma = %i\n", rscreen->info.r600_has_dma); - printf("r600_backend_map = %i\n", rscreen->info.r600_backend_map); - printf("r600_backend_map_valid = %i\n", rscreen->info.r600_backend_map_valid); - printf("si_tile_mode_array_valid = %i\n", rscreen->info.si_tile_mode_array_valid); - printf("cik_macrotile_mode_array_valid = %i\n", rscreen->info.cik_macrotile_mode_array_valid); + printf("vce_harvest_config = %i\n", rscreen->info.vce_harvest_config); + printf("clock_crystal_freq = %i\n", rscreen->info.clock_crystal_freq); + printf("drm = %i.%i.%i\n", rscreen->info.drm_major, + rscreen->info.drm_minor, rscreen->info.drm_patchlevel); + printf("has_userptr = %i\n", rscreen->info.has_userptr); + + printf("r600_max_quad_pipes = %i\n", rscreen->info.r600_max_quad_pipes); + printf("max_shader_clock = %i\n", rscreen->info.max_shader_clock); + printf("num_good_compute_units = %i\n", rscreen->info.num_good_compute_units); + printf("max_se = %i\n", rscreen->info.max_se); + printf("max_sh_per_se = %i\n", rscreen->info.max_sh_per_se); + + printf("r600_gb_backend_map = %i\n", rscreen->info.r600_gb_backend_map); + printf("r600_gb_backend_map_valid = %i\n", rscreen->info.r600_gb_backend_map_valid); + printf("r600_num_banks = %i\n", rscreen->info.r600_num_banks); + printf("num_render_backends = %i\n", rscreen->info.num_render_backends); + printf("num_tile_pipes = %i\n", rscreen->info.num_tile_pipes); + printf("pipe_interleave_bytes = %i\n", rscreen->info.pipe_interleave_bytes); } return true; } void r600_destroy_common_screen(struct r600_common_screen *rscreen) { + r600_perfcounters_destroy(rscreen); r600_gpu_load_kill_thread(rscreen); pipe_mutex_destroy(rscreen->gpu_load_mutex); pipe_mutex_destroy(rscreen->aux_context_lock); rscreen->aux_context->destroy(rscreen->aux_context); - if (rscreen->trace_bo) - pipe_resource_reference((struct pipe_resource**)&rscreen->trace_bo, NULL); + slab_destroy_parent(&rscreen->pool_transfers); rscreen->ws->destroy(rscreen->ws); FREE(rscreen); } bool r600_can_dump_shader(struct r600_common_screen *rscreen, - const struct tgsi_token *tokens) + unsigned processor) { - /* Compute shader don't have tgsi_tokens */ - if (!tokens) - return (rscreen->debug_flags & DBG_CS) != 0; - - switch (tgsi_get_processor_type(tokens)) { - case TGSI_PROCESSOR_VERTEX: + switch (processor) { + case PIPE_SHADER_VERTEX: return (rscreen->debug_flags & DBG_VS) != 0; - case TGSI_PROCESSOR_TESS_CTRL: + case PIPE_SHADER_TESS_CTRL: return (rscreen->debug_flags & DBG_TCS) != 0; - case TGSI_PROCESSOR_TESS_EVAL: + case PIPE_SHADER_TESS_EVAL: return (rscreen->debug_flags & DBG_TES) != 0; - case TGSI_PROCESSOR_GEOMETRY: + case PIPE_SHADER_GEOMETRY: return (rscreen->debug_flags & DBG_GS) != 0; - case TGSI_PROCESSOR_FRAGMENT: + case PIPE_SHADER_FRAGMENT: return (rscreen->debug_flags & DBG_PS) != 0; - case TGSI_PROCESSOR_COMPUTE: + case PIPE_SHADER_COMPUTE: return (rscreen->debug_flags & DBG_CS) != 0; default: return false; } } +bool r600_extra_shader_checks(struct r600_common_screen *rscreen, unsigned processor) +{ + return (rscreen->debug_flags & DBG_CHECK_IR) || + r600_can_dump_shader(rscreen, processor); +} + void r600_screen_clear_buffer(struct r600_common_screen *rscreen, struct pipe_resource *dst, - unsigned offset, unsigned size, unsigned value, - bool is_framebuffer) + uint64_t offset, uint64_t size, unsigned value, + enum r600_coherency coher) { struct r600_common_context *rctx = (struct r600_common_context*)rscreen->aux_context; pipe_mutex_lock(rscreen->aux_context_lock); - rctx->clear_buffer(&rctx->b, dst, offset, size, value, is_framebuffer); + rctx->clear_buffer(&rctx->b, dst, offset, size, value, coher); rscreen->aux_context->flush(rscreen->aux_context, NULL, 0); pipe_mutex_unlock(rscreen->aux_context_lock); } diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.h b/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.h index 29db1cc4e..86772c0af 100644 --- a/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.h +++ b/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.h @@ -39,31 +39,22 @@ #include "util/u_blitter.h" #include "util/list.h" #include "util/u_range.h" -#include "util/u_slab.h" +#include "util/slab.h" #include "util/u_suballoc.h" #include "util/u_transfer.h" +#define ATI_VENDOR_ID 0x1002 + #define R600_RESOURCE_FLAG_TRANSFER (PIPE_RESOURCE_FLAG_DRV_PRIV << 0) #define R600_RESOURCE_FLAG_FLUSHED_DEPTH (PIPE_RESOURCE_FLAG_DRV_PRIV << 1) #define R600_RESOURCE_FLAG_FORCE_TILING (PIPE_RESOURCE_FLAG_DRV_PRIV << 2) - -#define R600_QUERY_DRAW_CALLS (PIPE_QUERY_DRIVER_SPECIFIC + 0) -#define R600_QUERY_REQUESTED_VRAM (PIPE_QUERY_DRIVER_SPECIFIC + 1) -#define R600_QUERY_REQUESTED_GTT (PIPE_QUERY_DRIVER_SPECIFIC + 2) -#define R600_QUERY_BUFFER_WAIT_TIME (PIPE_QUERY_DRIVER_SPECIFIC + 3) -#define R600_QUERY_NUM_CS_FLUSHES (PIPE_QUERY_DRIVER_SPECIFIC + 4) -#define R600_QUERY_NUM_BYTES_MOVED (PIPE_QUERY_DRIVER_SPECIFIC + 5) -#define R600_QUERY_VRAM_USAGE (PIPE_QUERY_DRIVER_SPECIFIC + 6) -#define R600_QUERY_GTT_USAGE (PIPE_QUERY_DRIVER_SPECIFIC + 7) -#define R600_QUERY_GPU_TEMPERATURE (PIPE_QUERY_DRIVER_SPECIFIC + 8) -#define R600_QUERY_CURRENT_GPU_SCLK (PIPE_QUERY_DRIVER_SPECIFIC + 9) -#define R600_QUERY_CURRENT_GPU_MCLK (PIPE_QUERY_DRIVER_SPECIFIC + 10) -#define R600_QUERY_GPU_LOAD (PIPE_QUERY_DRIVER_SPECIFIC + 11) -#define R600_QUERY_NUM_COMPILATIONS (PIPE_QUERY_DRIVER_SPECIFIC + 12) -#define R600_QUERY_NUM_SHADERS_CREATED (PIPE_QUERY_DRIVER_SPECIFIC + 13) +#define R600_RESOURCE_FLAG_DISABLE_DCC (PIPE_RESOURCE_FLAG_DRV_PRIV << 3) #define R600_CONTEXT_STREAMOUT_FLUSH (1u << 0) -#define R600_CONTEXT_PRIVATE_FLAG (1u << 1) +/* Pipeline & streamout query controls. */ +#define R600_CONTEXT_START_PIPELINE_STATS (1u << 1) +#define R600_CONTEXT_STOP_PIPELINE_STATS (1u << 2) +#define R600_CONTEXT_PRIVATE_FLAG (1u << 3) /* special primitive types */ #define R600_PRIM_RECTANGLE_LIST PIPE_PRIM_MAX @@ -71,10 +62,10 @@ /* Debug flags. */ /* logging */ #define DBG_TEX (1 << 0) -#define DBG_TEXMIP (1 << 1) +/* gap - reuse */ #define DBG_COMPUTE (1 << 2) #define DBG_VM (1 << 3) -#define DBG_TRACE_CS (1 << 4) +/* gap - reuse */ /* shader logging */ #define DBG_FS (1 << 5) #define DBG_VS (1 << 6) @@ -86,6 +77,10 @@ #define DBG_NO_IR (1 << 12) #define DBG_NO_TGSI (1 << 13) #define DBG_NO_ASM (1 << 14) +#define DBG_PREOPT_IR (1 << 15) +#define DBG_CHECK_IR (1 << 16) +/* gaps */ +#define DBG_TEST_DMA (1 << 20) /* Bits 21-31 are reserved for the r600g driver. */ /* features */ #define DBG_NO_ASYNC_DMA (1llu << 32) @@ -98,13 +93,40 @@ #define DBG_PRECOMPILE (1llu << 39) #define DBG_INFO (1llu << 40) #define DBG_NO_WC (1llu << 41) +#define DBG_CHECK_VM (1llu << 42) +#define DBG_NO_DCC (1llu << 43) +#define DBG_NO_DCC_CLEAR (1llu << 44) +#define DBG_NO_RB_PLUS (1llu << 45) +#define DBG_SI_SCHED (1llu << 46) +#define DBG_MONOLITHIC_SHADERS (1llu << 47) +#define DBG_NO_CE (1llu << 48) +#define DBG_UNSAFE_MATH (1llu << 49) +#define DBG_NO_DCC_FB (1llu << 50) #define R600_MAP_BUFFER_ALIGNMENT 64 +#define R600_MAX_VIEWPORTS 16 + +#define SI_MAX_VARIABLE_THREADS_PER_BLOCK 1024 + +enum r600_coherency { + R600_COHERENCY_NONE, /* no cache flushes needed */ + R600_COHERENCY_SHADER, + R600_COHERENCY_CB_META, +}; + +#ifdef PIPE_ARCH_BIG_ENDIAN +#define R600_BIG_ENDIAN 1 +#else +#define R600_BIG_ENDIAN 0 +#endif struct r600_common_context; +struct r600_perfcounters; +struct tgsi_shader_info; +struct r600_qbo_state; struct radeon_shader_reloc { - char *name; + char name[32]; uint64_t offset; }; @@ -137,18 +159,31 @@ struct radeon_shader_binary { /** Disassembled shader in a string. */ char *disasm_string; + char *llvm_ir_string; }; +void radeon_shader_binary_init(struct radeon_shader_binary *b); +void radeon_shader_binary_clean(struct radeon_shader_binary *b); + +/* Only 32-bit buffer allocations are supported, gallium doesn't support more + * at the moment. + */ struct r600_resource { struct u_resource b; /* Winsys objects. */ struct pb_buffer *buf; - struct radeon_winsys_cs_handle *cs_buf; uint64_t gpu_address; + /* Memory usage if the buffer placement is optimal. */ + uint64_t vram_usage; + uint64_t gart_usage; - /* Resource state. */ + /* Resource properties. */ + uint64_t bo_size; + unsigned bo_alignment; enum radeon_bo_domain domains; + enum radeon_bo_flag flags; + unsigned bind_history; /* The buffer range which is initialized (with a write transfer, * streamout, DMA, or as a random access target). The rest of @@ -171,6 +206,10 @@ struct r600_resource { * use TC L2. */ bool TC_L2_dirty; + + /* Whether the resource has been exported via resource_get_handle. */ + bool is_shared; + unsigned external_usage; /* PIPE_HANDLE_USAGE_* */ }; struct r600_transfer { @@ -180,51 +219,107 @@ struct r600_transfer { }; struct r600_fmask_info { - unsigned offset; - unsigned size; + uint64_t offset; + uint64_t size; unsigned alignment; - unsigned pitch; + unsigned pitch_in_pixels; unsigned bank_height; unsigned slice_tile_max; unsigned tile_mode_index; }; struct r600_cmask_info { - unsigned offset; - unsigned size; + uint64_t offset; + uint64_t size; unsigned alignment; + unsigned pitch; + unsigned height; + unsigned xalign; + unsigned yalign; unsigned slice_tile_max; unsigned base_address_reg; }; +struct r600_htile_info { + unsigned pitch; + unsigned height; + unsigned xalign; + unsigned yalign; + unsigned alignment; +}; + struct r600_texture { struct r600_resource resource; - unsigned size; - unsigned pitch_override; + uint64_t size; + unsigned num_level0_transfers; + enum pipe_format db_render_format; bool is_depth; + bool db_compatible; + bool can_sample_z; + bool can_sample_s; unsigned dirty_level_mask; /* each bit says if that mipmap is compressed */ + unsigned stencil_dirty_level_mask; /* each bit says if that mipmap is compressed */ struct r600_texture *flushed_depth_texture; - boolean is_flushing_texture; struct radeon_surf surface; /* Colorbuffer compression and fast clear. */ struct r600_fmask_info fmask; struct r600_cmask_info cmask; struct r600_resource *cmask_buffer; + uint64_t dcc_offset; /* 0 = disabled */ unsigned cb_color_info; /* fast clear enable bit */ unsigned color_clear_value[2]; + unsigned last_msaa_resolve_target_micro_mode; /* Depth buffer compression and fast clear. */ + struct r600_htile_info htile; struct r600_resource *htile_buffer; + bool tc_compatible_htile; bool depth_cleared; /* if it was cleared at least once */ float depth_clear_value; + bool stencil_cleared; /* if it was cleared at least once */ + uint8_t stencil_clear_value; bool non_disp_tiling; /* R600-Cayman only */ + + /* Whether the texture is a displayable back buffer and needs DCC + * decompression, which is expensive. Therefore, it's enabled only + * if statistics suggest that it will pay off and it's allocated + * separately. It can't be bound as a sampler by apps. Limited to + * target == 2D and last_level == 0. If enabled, dcc_offset contains + * the absolute GPUVM address, not the relative one. + */ + struct r600_resource *dcc_separate_buffer; + /* When DCC is temporarily disabled, the separate buffer is here. */ + struct r600_resource *last_dcc_separate_buffer; + /* We need to track DCC dirtiness, because st/dri usually calls + * flush_resource twice per frame (not a bug) and we don't wanna + * decompress DCC twice. Also, the dirty tracking must be done even + * if DCC isn't used, because it's required by the DCC usage analysis + * for a possible future enablement. + */ + bool separate_dcc_dirty; + /* Statistics gathering for the DCC enablement heuristic. */ + bool dcc_gather_statistics; + /* Estimate of how much this color buffer is written to in units of + * full-screen draws: ps_invocations / (width * height) + * Shader kills, late Z, and blending with trivial discards make it + * inaccurate (we need to count CB updates, not PS invocations). + */ + unsigned ps_draw_ratio; + /* The number of clears since the last DCC usage analysis. */ + unsigned num_slow_clears; + + /* Counter that should be non-zero if the texture is bound to a + * framebuffer. Implemented in radeonsi only. + */ + uint32_t framebuffers_bound; }; struct r600_surface { struct pipe_surface base; + const struct radeon_surf_level *level_info; bool color_initialized; bool depth_initialized; @@ -232,6 +327,8 @@ struct r600_surface { /* Misc. color flags. */ bool alphatest_bypass; bool export_16bpc; + bool color_is_int8; + bool color_is_int10; /* Color registers. */ unsigned cb_color_info; @@ -247,6 +344,10 @@ struct r600_surface { unsigned cb_color_fmask_slice; /* EG and later */ unsigned cb_color_cmask; /* CB_COLORn_TILE (r600 only) */ unsigned cb_color_mask; /* R600 only */ + unsigned spi_shader_col_format; /* SI+, no blending, no alpha-to-coverage. */ + unsigned spi_shader_col_format_alpha; /* SI+, alpha-to-coverage */ + unsigned spi_shader_col_format_blend; /* SI+, blending without alpha. */ + unsigned spi_shader_col_format_blend_alpha; /* SI+, blending with alpha. */ struct r600_resource *cb_buffer_fmask; /* Used for FMASK relocations. R600 only */ struct r600_resource *cb_buffer_cmask; /* Used for CMASK relocations. R600 only */ @@ -263,13 +364,6 @@ struct r600_surface { unsigned db_htile_surface; unsigned db_htile_data_base; unsigned db_preload_control; /* EG and later */ - unsigned pa_su_poly_offset_db_fmt_cntl; -}; - -struct r600_tiling_info { - unsigned num_channels; - unsigned num_banks; - unsigned group_bytes; }; struct r600_common_screen { @@ -278,20 +372,20 @@ struct r600_common_screen { enum radeon_family family; enum chip_class chip_class; struct radeon_info info; - struct r600_tiling_info tiling_info; uint64_t debug_flags; bool has_cp_dma; bool has_streamout; + struct slab_parent_pool pool_transfers; + + /* Texture filter settings. */ + int force_aniso; /* -1 = disabled */ + /* Auxiliary context. Mainly used to initialize resources. * It must be locked prior to using and flushed before unlocking. */ struct pipe_context *aux_context; pipe_mutex aux_context_lock; - struct r600_resource *trace_bo; - uint32_t *trace_ptr; - unsigned cs_count; - /* This must be in the screen, because UE4 uses one context for * compilation and another one for rendering. */ @@ -308,7 +402,49 @@ struct r600_common_screen { unsigned gpu_load_counter_idle; volatile unsigned gpu_load_stop_thread; /* bool */ - char renderer_string[64]; + char renderer_string[100]; + + /* Performance counters. */ + struct r600_perfcounters *perfcounters; + + /* If pipe_screen wants to re-emit the framebuffer state of all + * contexts, it should atomically increment this. Each context will + * compare this with its own last known value of the counter before + * drawing and re-emit the framebuffer state accordingly. + */ + unsigned dirty_fb_counter; + + /* Atomically increment this counter when an existing texture's + * metadata is enabled or disabled in a way that requires changing + * contexts' compressed texture binding masks. + */ + unsigned compressed_colortex_counter; + + /* Atomically increment this counter when an existing texture's + * backing buffer or tile mode parameters have changed that requires + * recomputation of shader descriptors. + */ + unsigned dirty_tex_descriptor_counter; + + struct { + /* Context flags to set so that all writes from earlier jobs + * in the CP are seen by L2 clients. + */ + unsigned cp_to_L2; + + /* Context flags to set so that all writes from earlier + * compute jobs are seen by L2 clients. + */ + unsigned compute_to_L2; + } barrier_flags; + + void (*query_opaque_metadata)(struct r600_common_screen *rscreen, + struct r600_texture *rtex, + struct radeon_bo_metadata *md); + + void (*apply_opaque_metadata)(struct r600_common_screen *rscreen, + struct r600_texture *rtex, + struct radeon_bo_metadata *md); }; /* This encapsulates a state or an operation which can emitted into the GPU @@ -316,8 +452,7 @@ struct r600_common_screen { struct r600_atom { void (*emit)(struct r600_common_context *ctx, struct r600_atom *state); unsigned num_dw; - unsigned short id; /* used by r600 only */ - bool dirty; + unsigned short id; }; struct r600_so_target { @@ -358,16 +493,40 @@ struct r600_streamout { int num_prims_gen_queries; }; +struct r600_signed_scissor { + int minx; + int miny; + int maxx; + int maxy; +}; + +struct r600_scissors { + struct r600_atom atom; + unsigned dirty_mask; + struct pipe_scissor_state states[R600_MAX_VIEWPORTS]; +}; + +struct r600_viewports { + struct r600_atom atom; + unsigned dirty_mask; + unsigned depth_range_dirty_mask; + struct pipe_viewport_state states[R600_MAX_VIEWPORTS]; + struct r600_signed_scissor as_scissor[R600_MAX_VIEWPORTS]; +}; + struct r600_ring { struct radeon_winsys_cs *cs; - bool flushing; void (*flush)(void *ctx, unsigned flags, struct pipe_fence_handle **fence); }; -struct r600_rings { - struct r600_ring gfx; - struct r600_ring dma; +/* Saved CS data for debugging features. */ +struct radeon_saved_cs { + uint32_t *ib; + unsigned num_dw; + + struct radeon_bo_list_item *bo_list; + unsigned bo_count; }; struct r600_common_context { @@ -378,13 +537,20 @@ struct r600_common_context { struct radeon_winsys_ctx *ctx; enum radeon_family family; enum chip_class chip_class; - struct r600_rings rings; + struct r600_ring gfx; + struct r600_ring dma; + struct pipe_fence_handle *last_gfx_fence; + struct pipe_fence_handle *last_sdma_fence; + unsigned num_gfx_cs_flushes; unsigned initial_gfx_cs_size; unsigned gpu_reset_counter; + unsigned last_dirty_fb_counter; + unsigned last_compressed_colortex_counter; + unsigned last_dirty_tex_descriptor_counter; struct u_upload_mgr *uploader; - struct u_suballocator *allocator_so_filled_size; - struct util_slab_mempool pool_transfers; + struct u_suballocator *allocator_zeroed_memory; + struct slab_child_pool pool_transfers; /* Current unaccounted memory usage. */ uint64_t vram; @@ -392,38 +558,43 @@ struct r600_common_context { /* States. */ struct r600_streamout streamout; + struct r600_scissors scissors; + struct r600_viewports viewports; + bool scissor_enabled; + bool clip_halfz; + bool vs_writes_viewport_index; + bool vs_disables_clipping_viewport; /* Additional context states. */ unsigned flags; /* flush flags */ /* Queries. */ - /* The list of active queries. Only one query of each type can be active. */ + /* Maintain the list of active queries for pausing between IBs. */ int num_occlusion_queries; - /* Keep track of non-timer queries, because they should be suspended - * during context flushing. - * The timer queries (TIME_ELAPSED) shouldn't be suspended for blits, - * but they should be suspended between IBs. */ - struct list_head active_nontimer_queries; - struct list_head active_timer_queries; - unsigned num_cs_dw_nontimer_queries_suspend; - unsigned num_cs_dw_timer_queries_suspend; - /* If queries have been suspended. */ - bool queries_suspended_for_flush; + int num_perfect_occlusion_queries; + struct list_head active_queries; + unsigned num_cs_dw_queries_suspend; /* Additional hardware info. */ unsigned backend_mask; unsigned max_db; /* for OQ */ /* Misc stats. */ unsigned num_draw_calls; + unsigned num_spill_draw_calls; + unsigned num_compute_calls; + unsigned num_spill_compute_calls; + unsigned num_dma_calls; + unsigned num_vs_flushes; + unsigned num_ps_flushes; + unsigned num_cs_flushes; + uint64_t num_alloc_tex_transfer_bytes; + unsigned last_tex_ps_draw_ratio; /* for query */ /* Render condition. */ - struct pipe_query *current_render_cond; - unsigned current_render_cond_mode; - boolean current_render_cond_cond; - boolean predicate_drawing; - /* For context flushing. */ - struct pipe_query *saved_render_cond; - boolean saved_render_cond_cond; - unsigned saved_render_cond_mode; + struct r600_atom render_cond_atom; + struct pipe_query *render_cond; + unsigned render_cond_mode; + bool render_cond_invert; + bool render_cond_force_off; /* for u_blitter */ /* MSAA sample locations. * The first index is the sample index. @@ -434,10 +605,29 @@ struct r600_common_context { float sample_locations_8x[8][2]; float sample_locations_16x[16][2]; - /* The list of all texture buffer objects in this context. - * This list is walked when a buffer is invalidated/reallocated and - * the GPU addresses are updated. */ - struct list_head texture_buffers; + /* Statistics gathering for the DCC enablement heuristic. It can't be + * in r600_texture because r600_texture can be shared by multiple + * contexts. This is for back buffers only. We shouldn't get too many + * of those. + * + * X11 DRI3 rotates among a finite set of back buffers. They should + * all fit in this array. If they don't, separate DCC might never be + * enabled by DCC stat gathering. + */ + struct { + struct r600_texture *tex; + /* Query queue: 0 = usually active, 1 = waiting, 2 = readback. */ + struct pipe_query *ps_stats[3]; + /* If all slots are used and another slot is needed, + * the least recently used slot is evicted based on this. */ + int64_t last_use_timestamp; + bool query_active; + } dcc_stats[5]; + + struct pipe_debug_callback debug; + struct pipe_device_reset_callback device_reset_callback; + + void *query_result_shader; /* Copy one resource to another using async DMA. */ void (*dma_copy)(struct pipe_context *ctx, @@ -449,8 +639,8 @@ struct r600_common_context { const struct pipe_box *src_box); void (*clear_buffer)(struct pipe_context *ctx, struct pipe_resource *dst, - unsigned offset, unsigned size, unsigned value, - bool is_framebuffer); + uint64_t offset, uint64_t size, unsigned value, + enum r600_coherency coher); void (*blit_decompress_depth)(struct pipe_context *ctx, struct r600_texture *texture, @@ -459,6 +649,9 @@ struct r600_common_context { unsigned first_layer, unsigned last_layer, unsigned first_sample, unsigned last_sample); + void (*decompress_dcc)(struct pipe_context *ctx, + struct r600_texture *rtex); + /* Reallocate the buffer and update all resource bindings where * the buffer is bound, including all resource descriptors. */ void (*invalidate_buffer)(struct pipe_context *ctx, struct pipe_resource *buf); @@ -466,34 +659,58 @@ struct r600_common_context { /* Enable or disable occlusion queries. */ void (*set_occlusion_query_state)(struct pipe_context *ctx, bool enable); + void (*save_qbo_state)(struct pipe_context *ctx, struct r600_qbo_state *st); + /* This ensures there is enough space in the command stream. */ void (*need_gfx_cs_space)(struct pipe_context *ctx, unsigned num_dw, bool include_draw_vbo); void (*set_atom_dirty)(struct r600_common_context *ctx, struct r600_atom *atom, bool dirty); + + void (*check_vm_faults)(struct r600_common_context *ctx, + struct radeon_saved_cs *saved, + enum ring_type ring); }; /* r600_buffer.c */ -boolean r600_rings_is_buffer_referenced(struct r600_common_context *ctx, - struct radeon_winsys_cs_handle *buf, - enum radeon_bo_usage usage); +bool r600_rings_is_buffer_referenced(struct r600_common_context *ctx, + struct pb_buffer *buf, + enum radeon_bo_usage usage); void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx, struct r600_resource *resource, unsigned usage); -bool r600_init_resource(struct r600_common_screen *rscreen, - struct r600_resource *res, - unsigned size, unsigned alignment, - bool use_reusable_pool); +void r600_buffer_subdata(struct pipe_context *ctx, + struct pipe_resource *buffer, + unsigned usage, unsigned offset, + unsigned size, const void *data); +void r600_init_resource_fields(struct r600_common_screen *rscreen, + struct r600_resource *res, + uint64_t size, unsigned alignment); +bool r600_alloc_resource(struct r600_common_screen *rscreen, + struct r600_resource *res); struct pipe_resource *r600_buffer_create(struct pipe_screen *screen, const struct pipe_resource *templ, unsigned alignment); +struct pipe_resource * r600_aligned_buffer_create(struct pipe_screen *screen, + unsigned bind, + unsigned usage, + unsigned size, + unsigned alignment); struct pipe_resource * r600_buffer_from_user_memory(struct pipe_screen *screen, const struct pipe_resource *templ, void *user_memory); +void +r600_invalidate_resource(struct pipe_context *ctx, + struct pipe_resource *resource); /* r600_common_pipe.c */ +void r600_gfx_write_fence(struct r600_common_context *ctx, struct r600_resource *buf, + uint64_t va, uint32_t old_value, uint32_t new_value); +unsigned r600_gfx_write_fence_dwords(struct r600_common_screen *screen); +void r600_gfx_wait_fence(struct r600_common_context *ctx, + uint64_t va, uint32_t ref, uint32_t mask); void r600_draw_rectangle(struct blitter_context *blitter, int x1, int y1, int x2, int y2, float depth, enum blitter_attrib_type type, @@ -504,30 +721,40 @@ void r600_destroy_common_screen(struct r600_common_screen *rscreen); void r600_preflush_suspend_features(struct r600_common_context *ctx); void r600_postflush_resume_features(struct r600_common_context *ctx); bool r600_common_context_init(struct r600_common_context *rctx, - struct r600_common_screen *rscreen); + struct r600_common_screen *rscreen, + unsigned context_flags); void r600_common_context_cleanup(struct r600_common_context *rctx); -void r600_context_add_resource_size(struct pipe_context *ctx, struct pipe_resource *r); bool r600_can_dump_shader(struct r600_common_screen *rscreen, - const struct tgsi_token *tokens); + unsigned processor); +bool r600_extra_shader_checks(struct r600_common_screen *rscreen, + unsigned processor); void r600_screen_clear_buffer(struct r600_common_screen *rscreen, struct pipe_resource *dst, - unsigned offset, unsigned size, unsigned value, - bool is_framebuffer); + uint64_t offset, uint64_t size, unsigned value, + enum r600_coherency coher); struct pipe_resource *r600_resource_create_common(struct pipe_screen *screen, const struct pipe_resource *templ); const char *r600_get_llvm_processor_name(enum radeon_family family); -void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw); +void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw, + struct r600_resource *dst, struct r600_resource *src); +void r600_dma_emit_wait_idle(struct r600_common_context *rctx); +void radeon_save_cs(struct radeon_winsys *ws, struct radeon_winsys_cs *cs, + struct radeon_saved_cs *saved); +void radeon_clear_saved_cs(struct radeon_saved_cs *saved); +bool r600_check_device_reset(struct r600_common_context *rctx); /* r600_gpu_load.c */ void r600_gpu_load_kill_thread(struct r600_common_screen *rscreen); uint64_t r600_gpu_load_begin(struct r600_common_screen *rscreen); unsigned r600_gpu_load_end(struct r600_common_screen *rscreen, uint64_t begin); +/* r600_perfcounters.c */ +void r600_perfcounters_destroy(struct r600_common_screen *rscreen); + /* r600_query.c */ +void r600_init_screen_query_functions(struct r600_common_screen *rscreen); void r600_query_init(struct r600_common_context *rctx); -void r600_suspend_nontimer_queries(struct r600_common_context *ctx); -void r600_resume_nontimer_queries(struct r600_common_context *ctx); -void r600_suspend_timer_queries(struct r600_common_context *ctx); -void r600_resume_timer_queries(struct r600_common_context *ctx); +void r600_suspend_queries(struct r600_common_context *ctx); +void r600_resume_queries(struct r600_common_context *ctx); void r600_query_init_backend_mask(struct r600_common_context *ctx); /* r600_streamout.c */ @@ -541,7 +768,17 @@ void r600_update_prims_generated_query_state(struct r600_common_context *rctx, unsigned type, int diff); void r600_streamout_init(struct r600_common_context *rctx); +/* r600_test_dma.c */ +void r600_test_dma(struct r600_common_screen *rscreen); + /* r600_texture.c */ +bool r600_prepare_for_dma_blit(struct r600_common_context *rctx, + struct r600_texture *rdst, + unsigned dst_level, unsigned dstx, + unsigned dsty, unsigned dstz, + struct r600_texture *rsrc, + unsigned src_level, + const struct pipe_box *src_box); void r600_texture_get_fmask_info(struct r600_common_screen *rscreen, struct r600_texture *rtex, unsigned nr_samples, @@ -552,21 +789,48 @@ void r600_texture_get_cmask_info(struct r600_common_screen *rscreen, bool r600_init_flushed_depth_texture(struct pipe_context *ctx, struct pipe_resource *texture, struct r600_texture **staging); +void r600_print_texture_info(struct r600_texture *rtex, FILE *f); struct pipe_resource *r600_texture_create(struct pipe_screen *screen, const struct pipe_resource *templ); +bool vi_dcc_formats_compatible(enum pipe_format format1, + enum pipe_format format2); +void vi_dcc_disable_if_incompatible_format(struct r600_common_context *rctx, + struct pipe_resource *tex, + unsigned level, + enum pipe_format view_format); struct pipe_surface *r600_create_surface_custom(struct pipe_context *pipe, struct pipe_resource *texture, const struct pipe_surface *templ, unsigned width, unsigned height); -unsigned r600_translate_colorswap(enum pipe_format format); +unsigned r600_translate_colorswap(enum pipe_format format, bool do_endian_swap); +void vi_separate_dcc_start_query(struct pipe_context *ctx, + struct r600_texture *tex); +void vi_separate_dcc_stop_query(struct pipe_context *ctx, + struct r600_texture *tex); +void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx, + struct r600_texture *tex); +void vi_dcc_clear_level(struct r600_common_context *rctx, + struct r600_texture *rtex, + unsigned level, unsigned clear_value); void evergreen_do_fast_color_clear(struct r600_common_context *rctx, struct pipe_framebuffer_state *fb, struct r600_atom *fb_state, - unsigned *buffers, + unsigned *buffers, unsigned *dirty_cbufs, const union pipe_color_union *color); +bool r600_texture_disable_dcc(struct r600_common_context *rctx, + struct r600_texture *rtex); void r600_init_screen_texture_functions(struct r600_common_screen *rscreen); void r600_init_context_texture_functions(struct r600_common_context *rctx); +/* r600_viewport.c */ +void evergreen_apply_scissor_bug_workaround(struct r600_common_context *rctx, + struct pipe_scissor_state *scissor); +void r600_viewport_set_rast_deps(struct r600_common_context *rctx, + bool scissor_enable, bool clip_halfz); +void r600_update_vs_writes_viewport_index(struct r600_common_context *rctx, + struct tgsi_shader_info *info); +void r600_init_viewport_functions(struct r600_common_context *rctx); + /* cayman_msaa.c */ extern const uint32_t eg_sample_locs_2x[4]; extern const unsigned eg_max_dist_2x; @@ -577,7 +841,8 @@ void cayman_get_sample_position(struct pipe_context *ctx, unsigned sample_count, void cayman_init_msaa(struct pipe_context *ctx); void cayman_emit_msaa_sample_locs(struct radeon_winsys_cs *cs, int nr_samples); void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples, - int ps_iter_samples, int overrast_samples); + int ps_iter_samples, int overrast_samples, + unsigned sc_mode_cntl_1); /* Inline helpers. */ @@ -594,13 +859,57 @@ r600_resource_reference(struct r600_resource **ptr, struct r600_resource *res) (struct pipe_resource *)res); } +static inline void +r600_texture_reference(struct r600_texture **ptr, struct r600_texture *res) +{ + pipe_resource_reference((struct pipe_resource **)ptr, &res->resource.b.b); +} + +static inline void +r600_context_add_resource_size(struct pipe_context *ctx, struct pipe_resource *r) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + struct r600_resource *res = (struct r600_resource *)r; + + if (res) { + /* Add memory usage for need_gfx_cs_space */ + rctx->vram += res->vram_usage; + rctx->gtt += res->gart_usage; + } +} + +static inline bool r600_get_strmout_en(struct r600_common_context *rctx) +{ + return rctx->streamout.streamout_enabled || + rctx->streamout.prims_gen_query_enabled; +} + +#define SQ_TEX_XY_FILTER_POINT 0x00 +#define SQ_TEX_XY_FILTER_BILINEAR 0x01 +#define SQ_TEX_XY_FILTER_ANISO_POINT 0x02 +#define SQ_TEX_XY_FILTER_ANISO_BILINEAR 0x03 + +static inline unsigned eg_tex_filter(unsigned filter, unsigned max_aniso) +{ + if (filter == PIPE_TEX_FILTER_LINEAR) + return max_aniso > 1 ? SQ_TEX_XY_FILTER_ANISO_BILINEAR + : SQ_TEX_XY_FILTER_BILINEAR; + else + return max_aniso > 1 ? SQ_TEX_XY_FILTER_ANISO_POINT + : SQ_TEX_XY_FILTER_POINT; +} + static inline unsigned r600_tex_aniso_filter(unsigned filter) { - if (filter <= 1) return 0; - if (filter <= 2) return 1; - if (filter <= 4) return 2; - if (filter <= 8) return 3; - /* else */ return 4; + if (filter < 2) + return 0; + if (filter < 4) + return 1; + if (filter < 8) + return 2; + if (filter < 16) + return 3; + return 4; } static inline unsigned r600_wavefront_size(enum radeon_family family) @@ -623,19 +932,38 @@ static inline unsigned r600_wavefront_size(enum radeon_family family) } } +static inline enum radeon_bo_priority +r600_get_sampler_view_priority(struct r600_resource *res) +{ + if (res->b.b.target == PIPE_BUFFER) + return RADEON_PRIO_SAMPLER_BUFFER; + + if (res->b.b.nr_samples > 1) + return RADEON_PRIO_SAMPLER_TEXTURE_MSAA; + + return RADEON_PRIO_SAMPLER_TEXTURE; +} + +static inline bool +r600_can_sample_zs(struct r600_texture *tex, bool stencil_sampler) +{ + return (stencil_sampler && tex->can_sample_s) || + (!stencil_sampler && tex->can_sample_z); +} + #define COMPUTE_DBG(rscreen, fmt, args...) \ do { \ if ((rscreen->b.debug_flags & DBG_COMPUTE)) fprintf(stderr, fmt, ##args); \ } while (0); #define R600_ERR(fmt, args...) \ - fprintf(stderr, "EE %s:%d %s - "fmt, __FILE__, __LINE__, __func__, ##args) + fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args) /* For MSAA sample positions. */ #define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y) \ - (((s0x) & 0xf) | (((s0y) & 0xf) << 4) | \ - (((s1x) & 0xf) << 8) | (((s1y) & 0xf) << 12) | \ - (((s2x) & 0xf) << 16) | (((s2y) & 0xf) << 20) | \ - (((s3x) & 0xf) << 24) | (((s3y) & 0xf) << 28)) + (((s0x) & 0xf) | (((unsigned)(s0y) & 0xf) << 4) | \ + (((unsigned)(s1x) & 0xf) << 8) | (((unsigned)(s1y) & 0xf) << 12) | \ + (((unsigned)(s2x) & 0xf) << 16) | (((unsigned)(s2y) & 0xf) << 20) | \ + (((unsigned)(s3x) & 0xf) << 24) | (((unsigned)(s3y) & 0xf) << 28)) #endif diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_query.c b/lib/mesa/src/gallium/drivers/radeon/r600_query.c index 65339bbb6..4b6767dd3 100644 --- a/lib/mesa/src/gallium/drivers/radeon/r600_query.c +++ b/lib/mesa/src/gallium/drivers/radeon/r600_query.c @@ -22,81 +22,317 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include "r600_query.h" #include "r600_cs.h" #include "util/u_memory.h" +#include "util/u_upload_mgr.h" +#include "tgsi/tgsi_text.h" -struct r600_query_buffer { - /* The buffer where query results are stored. */ - struct r600_resource *buf; - /* Offset of the next free result after current query data */ - unsigned results_end; - /* If a query buffer is full, a new buffer is created and the old one - * is put in here. When we calculate the result, we sum up the samples - * from all buffers. */ - struct r600_query_buffer *previous; +struct r600_hw_query_params { + unsigned start_offset; + unsigned end_offset; + unsigned fence_offset; + unsigned pair_stride; + unsigned pair_count; }; -struct r600_query { - /* The query buffer and how many results are in it. */ - struct r600_query_buffer buffer; - /* The type of query */ - unsigned type; - /* Size of the result in memory for both begin_query and end_query, - * this can be one or two numbers, or it could even be a size of a structure. */ - unsigned result_size; - /* The number of dwords for begin_query or end_query. */ - unsigned num_cs_dw; - /* linked list of queries */ - struct list_head list; - /* for custom non-GPU queries */ +/* Queries without buffer handling or suspend/resume. */ +struct r600_query_sw { + struct r600_query b; + uint64_t begin_result; uint64_t end_result; /* Fence for GPU_FINISHED. */ struct pipe_fence_handle *fence; - /* For transform feedback: which stream the query is for */ - unsigned stream; }; - -static bool r600_is_timer_query(unsigned type) +static void r600_query_sw_destroy(struct r600_common_context *rctx, + struct r600_query *rquery) { - return type == PIPE_QUERY_TIME_ELAPSED || - type == PIPE_QUERY_TIMESTAMP; + struct pipe_screen *screen = rctx->b.screen; + struct r600_query_sw *query = (struct r600_query_sw *)rquery; + + screen->fence_reference(screen, &query->fence, NULL); + FREE(query); } -static bool r600_query_needs_begin(unsigned type) +static enum radeon_value_id winsys_id_from_type(unsigned type) { - return type != PIPE_QUERY_GPU_FINISHED && - type != PIPE_QUERY_TIMESTAMP; + switch (type) { + case R600_QUERY_REQUESTED_VRAM: return RADEON_REQUESTED_VRAM_MEMORY; + case R600_QUERY_REQUESTED_GTT: return RADEON_REQUESTED_GTT_MEMORY; + case R600_QUERY_MAPPED_VRAM: return RADEON_MAPPED_VRAM; + case R600_QUERY_MAPPED_GTT: return RADEON_MAPPED_GTT; + case R600_QUERY_BUFFER_WAIT_TIME: return RADEON_BUFFER_WAIT_TIME_NS; + case R600_QUERY_NUM_CTX_FLUSHES: return RADEON_NUM_CS_FLUSHES; + case R600_QUERY_NUM_BYTES_MOVED: return RADEON_NUM_BYTES_MOVED; + case R600_QUERY_NUM_EVICTIONS: return RADEON_NUM_EVICTIONS; + case R600_QUERY_VRAM_USAGE: return RADEON_VRAM_USAGE; + case R600_QUERY_GTT_USAGE: return RADEON_GTT_USAGE; + case R600_QUERY_GPU_TEMPERATURE: return RADEON_GPU_TEMPERATURE; + case R600_QUERY_CURRENT_GPU_SCLK: return RADEON_CURRENT_SCLK; + case R600_QUERY_CURRENT_GPU_MCLK: return RADEON_CURRENT_MCLK; + default: unreachable("query type does not correspond to winsys id"); + } } -static struct r600_resource *r600_new_query_buffer(struct r600_common_context *ctx, unsigned type) +static bool r600_query_sw_begin(struct r600_common_context *rctx, + struct r600_query *rquery) { - unsigned j, i, num_results, buf_size = 4096; - uint32_t *results; + struct r600_query_sw *query = (struct r600_query_sw *)rquery; - /* Non-GPU queries. */ - switch (type) { + switch(query->b.type) { case PIPE_QUERY_TIMESTAMP_DISJOINT: case PIPE_QUERY_GPU_FINISHED: + break; case R600_QUERY_DRAW_CALLS: + query->begin_result = rctx->num_draw_calls; + break; + case R600_QUERY_SPILL_DRAW_CALLS: + query->begin_result = rctx->num_spill_draw_calls; + break; + case R600_QUERY_COMPUTE_CALLS: + query->begin_result = rctx->num_compute_calls; + break; + case R600_QUERY_SPILL_COMPUTE_CALLS: + query->begin_result = rctx->num_spill_compute_calls; + break; + case R600_QUERY_DMA_CALLS: + query->begin_result = rctx->num_dma_calls; + break; + case R600_QUERY_NUM_VS_FLUSHES: + query->begin_result = rctx->num_vs_flushes; + break; + case R600_QUERY_NUM_PS_FLUSHES: + query->begin_result = rctx->num_ps_flushes; + break; + case R600_QUERY_NUM_CS_FLUSHES: + query->begin_result = rctx->num_cs_flushes; + break; case R600_QUERY_REQUESTED_VRAM: case R600_QUERY_REQUESTED_GTT: + case R600_QUERY_MAPPED_VRAM: + case R600_QUERY_MAPPED_GTT: + case R600_QUERY_VRAM_USAGE: + case R600_QUERY_GTT_USAGE: + case R600_QUERY_GPU_TEMPERATURE: + case R600_QUERY_CURRENT_GPU_SCLK: + case R600_QUERY_CURRENT_GPU_MCLK: + case R600_QUERY_BACK_BUFFER_PS_DRAW_RATIO: + query->begin_result = 0; + break; case R600_QUERY_BUFFER_WAIT_TIME: - case R600_QUERY_NUM_CS_FLUSHES: + case R600_QUERY_NUM_CTX_FLUSHES: case R600_QUERY_NUM_BYTES_MOVED: + case R600_QUERY_NUM_EVICTIONS: { + enum radeon_value_id ws_id = winsys_id_from_type(query->b.type); + query->begin_result = rctx->ws->query_value(rctx->ws, ws_id); + break; + } + case R600_QUERY_GPU_LOAD: + query->begin_result = r600_gpu_load_begin(rctx->screen); + break; + case R600_QUERY_NUM_COMPILATIONS: + query->begin_result = p_atomic_read(&rctx->screen->num_compilations); + break; + case R600_QUERY_NUM_SHADERS_CREATED: + query->begin_result = p_atomic_read(&rctx->screen->num_shaders_created); + break; + case R600_QUERY_GPIN_ASIC_ID: + case R600_QUERY_GPIN_NUM_SIMD: + case R600_QUERY_GPIN_NUM_RB: + case R600_QUERY_GPIN_NUM_SPI: + case R600_QUERY_GPIN_NUM_SE: + break; + default: + unreachable("r600_query_sw_begin: bad query type"); + } + + return true; +} + +static bool r600_query_sw_end(struct r600_common_context *rctx, + struct r600_query *rquery) +{ + struct r600_query_sw *query = (struct r600_query_sw *)rquery; + + switch(query->b.type) { + case PIPE_QUERY_TIMESTAMP_DISJOINT: + break; + case PIPE_QUERY_GPU_FINISHED: + rctx->b.flush(&rctx->b, &query->fence, PIPE_FLUSH_DEFERRED); + break; + case R600_QUERY_DRAW_CALLS: + query->end_result = rctx->num_draw_calls; + break; + case R600_QUERY_SPILL_DRAW_CALLS: + query->end_result = rctx->num_spill_draw_calls; + break; + case R600_QUERY_COMPUTE_CALLS: + query->end_result = rctx->num_compute_calls; + break; + case R600_QUERY_SPILL_COMPUTE_CALLS: + query->end_result = rctx->num_spill_compute_calls; + break; + case R600_QUERY_DMA_CALLS: + query->end_result = rctx->num_dma_calls; + break; + case R600_QUERY_NUM_VS_FLUSHES: + query->end_result = rctx->num_vs_flushes; + break; + case R600_QUERY_NUM_PS_FLUSHES: + query->end_result = rctx->num_ps_flushes; + break; + case R600_QUERY_NUM_CS_FLUSHES: + query->end_result = rctx->num_cs_flushes; + break; + case R600_QUERY_REQUESTED_VRAM: + case R600_QUERY_REQUESTED_GTT: + case R600_QUERY_MAPPED_VRAM: + case R600_QUERY_MAPPED_GTT: case R600_QUERY_VRAM_USAGE: case R600_QUERY_GTT_USAGE: case R600_QUERY_GPU_TEMPERATURE: case R600_QUERY_CURRENT_GPU_SCLK: case R600_QUERY_CURRENT_GPU_MCLK: + case R600_QUERY_BUFFER_WAIT_TIME: + case R600_QUERY_NUM_CTX_FLUSHES: + case R600_QUERY_NUM_BYTES_MOVED: + case R600_QUERY_NUM_EVICTIONS: { + enum radeon_value_id ws_id = winsys_id_from_type(query->b.type); + query->end_result = rctx->ws->query_value(rctx->ws, ws_id); + break; + } case R600_QUERY_GPU_LOAD: + query->end_result = r600_gpu_load_end(rctx->screen, + query->begin_result); + query->begin_result = 0; + break; case R600_QUERY_NUM_COMPILATIONS: + query->end_result = p_atomic_read(&rctx->screen->num_compilations); + break; case R600_QUERY_NUM_SHADERS_CREATED: + query->end_result = p_atomic_read(&rctx->screen->num_shaders_created); + break; + case R600_QUERY_BACK_BUFFER_PS_DRAW_RATIO: + query->end_result = rctx->last_tex_ps_draw_ratio; + break; + case R600_QUERY_GPIN_ASIC_ID: + case R600_QUERY_GPIN_NUM_SIMD: + case R600_QUERY_GPIN_NUM_RB: + case R600_QUERY_GPIN_NUM_SPI: + case R600_QUERY_GPIN_NUM_SE: + break; + default: + unreachable("r600_query_sw_end: bad query type"); + } + + return true; +} + +static bool r600_query_sw_get_result(struct r600_common_context *rctx, + struct r600_query *rquery, + bool wait, + union pipe_query_result *result) +{ + struct r600_query_sw *query = (struct r600_query_sw *)rquery; + + switch (query->b.type) { + case PIPE_QUERY_TIMESTAMP_DISJOINT: + /* Convert from cycles per millisecond to cycles per second (Hz). */ + result->timestamp_disjoint.frequency = + (uint64_t)rctx->screen->info.clock_crystal_freq * 1000; + result->timestamp_disjoint.disjoint = false; + return true; + case PIPE_QUERY_GPU_FINISHED: { + struct pipe_screen *screen = rctx->b.screen; + result->b = screen->fence_finish(screen, &rctx->b, query->fence, + wait ? PIPE_TIMEOUT_INFINITE : 0); + return result->b; + } + + case R600_QUERY_GPIN_ASIC_ID: + result->u32 = 0; + return true; + case R600_QUERY_GPIN_NUM_SIMD: + result->u32 = rctx->screen->info.num_good_compute_units; + return true; + case R600_QUERY_GPIN_NUM_RB: + result->u32 = rctx->screen->info.num_render_backends; + return true; + case R600_QUERY_GPIN_NUM_SPI: + result->u32 = 1; /* all supported chips have one SPI per SE */ + return true; + case R600_QUERY_GPIN_NUM_SE: + result->u32 = rctx->screen->info.max_se; + return true; + } + + result->u64 = query->end_result - query->begin_result; + + switch (query->b.type) { + case R600_QUERY_BUFFER_WAIT_TIME: + case R600_QUERY_GPU_TEMPERATURE: + result->u64 /= 1000; + break; + case R600_QUERY_CURRENT_GPU_SCLK: + case R600_QUERY_CURRENT_GPU_MCLK: + result->u64 *= 1000000; + break; + } + + return true; +} + + +static struct r600_query_ops sw_query_ops = { + .destroy = r600_query_sw_destroy, + .begin = r600_query_sw_begin, + .end = r600_query_sw_end, + .get_result = r600_query_sw_get_result, + .get_result_resource = NULL +}; + +static struct pipe_query *r600_query_sw_create(struct pipe_context *ctx, + unsigned query_type) +{ + struct r600_query_sw *query; + + query = CALLOC_STRUCT(r600_query_sw); + if (!query) return NULL; + + query->b.type = query_type; + query->b.ops = &sw_query_ops; + + return (struct pipe_query *)query; +} + +void r600_query_hw_destroy(struct r600_common_context *rctx, + struct r600_query *rquery) +{ + struct r600_query_hw *query = (struct r600_query_hw *)rquery; + struct r600_query_buffer *prev = query->buffer.previous; + + /* Release all query buffers. */ + while (prev) { + struct r600_query_buffer *qbuf = prev; + prev = prev->previous; + r600_resource_reference(&qbuf->buf, NULL); + FREE(qbuf); } + r600_resource_reference(&query->buffer.buf, NULL); + FREE(rquery); +} + +static struct r600_resource *r600_new_query_buffer(struct r600_common_context *ctx, + struct r600_query_hw *query) +{ + unsigned buf_size = MAX2(query->result_size, + ctx->screen->info.min_alloc_size); + /* Queries are normally read by the CPU after * being written by the gpu, hence staging is probably a good * usage pattern. @@ -104,15 +340,37 @@ static struct r600_resource *r600_new_query_buffer(struct r600_common_context *c struct r600_resource *buf = (struct r600_resource*) pipe_buffer_create(ctx->b.screen, PIPE_BIND_CUSTOM, PIPE_USAGE_STAGING, buf_size); + if (!buf) + return NULL; - switch (type) { - case PIPE_QUERY_OCCLUSION_COUNTER: - case PIPE_QUERY_OCCLUSION_PREDICATE: - results = r600_buffer_map_sync_with_rings(ctx, buf, PIPE_TRANSFER_WRITE); - memset(results, 0, buf_size); + if (!query->ops->prepare_buffer(ctx, query, buf)) { + r600_resource_reference(&buf, NULL); + return NULL; + } + + return buf; +} + +static bool r600_query_hw_prepare_buffer(struct r600_common_context *ctx, + struct r600_query_hw *query, + struct r600_resource *buffer) +{ + /* Callers ensure that the buffer is currently unused by the GPU. */ + uint32_t *results = ctx->ws->buffer_map(buffer->buf, NULL, + PIPE_TRANSFER_WRITE | + PIPE_TRANSFER_UNSYNCHRONIZED); + if (!results) + return false; + + memset(results, 0, buffer->b.b.width0); + + if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER || + query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE) { + unsigned num_results; + unsigned i, j; /* Set top bits for unused backends. */ - num_results = buf_size / (16 * ctx->max_db); + num_results = buffer->b.b.width0 / query->result_size; for (j = 0; j < num_results; j++) { for (i = 0; i < ctx->max_db; i++) { if (!(ctx->backend_mask & (1<<i))) { @@ -122,22 +380,118 @@ static struct r600_resource *r600_new_query_buffer(struct r600_common_context *c } results += 4 * ctx->max_db; } + } + + return true; +} + +static void r600_query_hw_get_result_resource(struct r600_common_context *rctx, + struct r600_query *rquery, + bool wait, + enum pipe_query_value_type result_type, + int index, + struct pipe_resource *resource, + unsigned offset); + +static struct r600_query_ops query_hw_ops = { + .destroy = r600_query_hw_destroy, + .begin = r600_query_hw_begin, + .end = r600_query_hw_end, + .get_result = r600_query_hw_get_result, + .get_result_resource = r600_query_hw_get_result_resource, +}; + +static void r600_query_hw_do_emit_start(struct r600_common_context *ctx, + struct r600_query_hw *query, + struct r600_resource *buffer, + uint64_t va); +static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx, + struct r600_query_hw *query, + struct r600_resource *buffer, + uint64_t va); +static void r600_query_hw_add_result(struct r600_common_context *ctx, + struct r600_query_hw *, void *buffer, + union pipe_query_result *result); +static void r600_query_hw_clear_result(struct r600_query_hw *, + union pipe_query_result *); + +static struct r600_query_hw_ops query_hw_default_hw_ops = { + .prepare_buffer = r600_query_hw_prepare_buffer, + .emit_start = r600_query_hw_do_emit_start, + .emit_stop = r600_query_hw_do_emit_stop, + .clear_result = r600_query_hw_clear_result, + .add_result = r600_query_hw_add_result, +}; + +bool r600_query_hw_init(struct r600_common_context *rctx, + struct r600_query_hw *query) +{ + query->buffer.buf = r600_new_query_buffer(rctx, query); + if (!query->buffer.buf) + return false; + + return true; +} + +static struct pipe_query *r600_query_hw_create(struct r600_common_context *rctx, + unsigned query_type, + unsigned index) +{ + struct r600_query_hw *query = CALLOC_STRUCT(r600_query_hw); + if (!query) + return NULL; + + query->b.type = query_type; + query->b.ops = &query_hw_ops; + query->ops = &query_hw_default_hw_ops; + + switch (query_type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + query->result_size = 16 * rctx->max_db; + query->result_size += 16; /* for the fence + alignment */ + query->num_cs_dw_begin = 6; + query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rctx->screen); break; case PIPE_QUERY_TIME_ELAPSED: + query->result_size = 24; + query->num_cs_dw_begin = 8; + query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rctx->screen); + break; case PIPE_QUERY_TIMESTAMP: + query->result_size = 16; + query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rctx->screen); + query->flags = R600_QUERY_HW_FLAG_NO_START; break; case PIPE_QUERY_PRIMITIVES_EMITTED: case PIPE_QUERY_PRIMITIVES_GENERATED: case PIPE_QUERY_SO_STATISTICS: case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + /* NumPrimitivesWritten, PrimitiveStorageNeeded. */ + query->result_size = 32; + query->num_cs_dw_begin = 6; + query->num_cs_dw_end = 6; + query->stream = index; + break; case PIPE_QUERY_PIPELINE_STATISTICS: - results = r600_buffer_map_sync_with_rings(ctx, buf, PIPE_TRANSFER_WRITE); - memset(results, 0, buf_size); + /* 11 values on EG, 8 on R600. */ + query->result_size = (rctx->chip_class >= EVERGREEN ? 11 : 8) * 16; + query->result_size += 8; /* for the fence + alignment */ + query->num_cs_dw_begin = 6; + query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rctx->screen); break; default: assert(0); + FREE(query); + return NULL; } - return buf; + + if (!r600_query_hw_init(rctx, query)) { + FREE(query); + return NULL; + } + + return (struct pipe_query *)query; } static void r600_update_occlusion_query_state(struct r600_common_context *rctx, @@ -146,20 +500,28 @@ static void r600_update_occlusion_query_state(struct r600_common_context *rctx, if (type == PIPE_QUERY_OCCLUSION_COUNTER || type == PIPE_QUERY_OCCLUSION_PREDICATE) { bool old_enable = rctx->num_occlusion_queries != 0; - bool enable; + bool old_perfect_enable = + rctx->num_perfect_occlusion_queries != 0; + bool enable, perfect_enable; rctx->num_occlusion_queries += diff; assert(rctx->num_occlusion_queries >= 0); + if (type == PIPE_QUERY_OCCLUSION_COUNTER) { + rctx->num_perfect_occlusion_queries += diff; + assert(rctx->num_perfect_occlusion_queries >= 0); + } + enable = rctx->num_occlusion_queries != 0; + perfect_enable = rctx->num_perfect_occlusion_queries != 0; - if (enable != old_enable) { + if (enable != old_enable || perfect_enable != old_perfect_enable) { rctx->set_occlusion_query_state(&rctx->b, enable); } } } -static unsigned event_type_for_stream(struct r600_query *query) +static unsigned event_type_for_stream(struct r600_query_hw *query) { switch (query->stream) { default: @@ -170,28 +532,14 @@ static unsigned event_type_for_stream(struct r600_query *query) } } -static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_query *query) +static void r600_query_hw_do_emit_start(struct r600_common_context *ctx, + struct r600_query_hw *query, + struct r600_resource *buffer, + uint64_t va) { - struct radeon_winsys_cs *cs = ctx->rings.gfx.cs; - uint64_t va; - - r600_update_occlusion_query_state(ctx, query->type, 1); - r600_update_prims_generated_query_state(ctx, query->type, 1); - ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw * 2, TRUE); - - /* Get a new query buffer if needed. */ - if (query->buffer.results_end + query->result_size > query->buffer.buf->b.b.width0) { - struct r600_query_buffer *qbuf = MALLOC_STRUCT(r600_query_buffer); - *qbuf = query->buffer; - query->buffer.buf = r600_new_query_buffer(ctx, query->type); - query->buffer.results_end = 0; - query->buffer.previous = qbuf; - } + struct radeon_winsys_cs *cs = ctx->gfx.cs; - /* emit begin query */ - va = query->buffer.buf->gpu_address + query->buffer.results_end; - - switch (query->type) { + switch (query->b.type) { case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); @@ -210,7 +558,7 @@ static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_q break; case PIPE_QUERY_TIME_ELAPSED: radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); - radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5)); + radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_BOTTOM_OF_PIPE_TS) | EVENT_INDEX(5)); radeon_emit(cs, va); radeon_emit(cs, (3 << 29) | ((va >> 32) & 0xFFFF)); radeon_emit(cs, 0); @@ -225,226 +573,210 @@ static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_q default: assert(0); } - r600_emit_reloc(ctx, &ctx->rings.gfx, query->buffer.buf, RADEON_USAGE_WRITE, - RADEON_PRIO_MIN); - - if (r600_is_timer_query(query->type)) - ctx->num_cs_dw_timer_queries_suspend += query->num_cs_dw; - else - ctx->num_cs_dw_nontimer_queries_suspend += query->num_cs_dw; + r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE, + RADEON_PRIO_QUERY); } -static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_query *query) +static void r600_query_hw_emit_start(struct r600_common_context *ctx, + struct r600_query_hw *query) { - struct radeon_winsys_cs *cs = ctx->rings.gfx.cs; uint64_t va; - /* The queries which need begin already called this in begin_query. */ - if (!r600_query_needs_begin(query->type)) { - ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw, FALSE); + if (!query->buffer.buf) + return; // previous buffer allocation failure + + r600_update_occlusion_query_state(ctx, query->b.type, 1); + r600_update_prims_generated_query_state(ctx, query->b.type, 1); + + ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw_begin + query->num_cs_dw_end, + true); + + /* Get a new query buffer if needed. */ + if (query->buffer.results_end + query->result_size > query->buffer.buf->b.b.width0) { + struct r600_query_buffer *qbuf = MALLOC_STRUCT(r600_query_buffer); + *qbuf = query->buffer; + query->buffer.results_end = 0; + query->buffer.previous = qbuf; + query->buffer.buf = r600_new_query_buffer(ctx, query); + if (!query->buffer.buf) + return; } - va = query->buffer.buf->gpu_address; + /* emit begin query */ + va = query->buffer.buf->gpu_address + query->buffer.results_end; - /* emit end query */ - switch (query->type) { + query->ops->emit_start(ctx, query, query->buffer.buf, va); + + ctx->num_cs_dw_queries_suspend += query->num_cs_dw_end; +} + +static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx, + struct r600_query_hw *query, + struct r600_resource *buffer, + uint64_t va) +{ + struct radeon_winsys_cs *cs = ctx->gfx.cs; + uint64_t fence_va = 0; + + switch (query->b.type) { case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: - va += query->buffer.results_end + 8; + va += 8; radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1)); radeon_emit(cs, va); radeon_emit(cs, (va >> 32) & 0xFFFF); + + fence_va = va + ctx->max_db * 16 - 8; break; case PIPE_QUERY_PRIMITIVES_EMITTED: case PIPE_QUERY_PRIMITIVES_GENERATED: case PIPE_QUERY_SO_STATISTICS: case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - va += query->buffer.results_end + query->result_size/2; + va += query->result_size/2; radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); radeon_emit(cs, EVENT_TYPE(event_type_for_stream(query)) | EVENT_INDEX(3)); radeon_emit(cs, va); radeon_emit(cs, (va >> 32) & 0xFFFF); break; case PIPE_QUERY_TIME_ELAPSED: - va += query->buffer.results_end + query->result_size/2; + va += 8; /* fall through */ case PIPE_QUERY_TIMESTAMP: radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); - radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5)); + radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_BOTTOM_OF_PIPE_TS) | EVENT_INDEX(5)); radeon_emit(cs, va); radeon_emit(cs, (3 << 29) | ((va >> 32) & 0xFFFF)); radeon_emit(cs, 0); radeon_emit(cs, 0); + + fence_va = va + 8; break; - case PIPE_QUERY_PIPELINE_STATISTICS: - va += query->buffer.results_end + query->result_size/2; + case PIPE_QUERY_PIPELINE_STATISTICS: { + unsigned sample_size = (query->result_size - 8) / 2; + + va += sample_size; radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); radeon_emit(cs, va); radeon_emit(cs, (va >> 32) & 0xFFFF); + + fence_va = va + sample_size; break; + } default: assert(0); } - r600_emit_reloc(ctx, &ctx->rings.gfx, query->buffer.buf, RADEON_USAGE_WRITE, - RADEON_PRIO_MIN); + r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE, + RADEON_PRIO_QUERY); - query->buffer.results_end += query->result_size; + if (fence_va) + r600_gfx_write_fence(ctx, query->buffer.buf, fence_va, 0, 0x80000000); +} - if (r600_query_needs_begin(query->type)) { - if (r600_is_timer_query(query->type)) - ctx->num_cs_dw_timer_queries_suspend -= query->num_cs_dw; - else - ctx->num_cs_dw_nontimer_queries_suspend -= query->num_cs_dw; +static void r600_query_hw_emit_stop(struct r600_common_context *ctx, + struct r600_query_hw *query) +{ + uint64_t va; + + if (!query->buffer.buf) + return; // previous buffer allocation failure + + /* The queries which need begin already called this in begin_query. */ + if (query->flags & R600_QUERY_HW_FLAG_NO_START) { + ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw_end, false); } - r600_update_occlusion_query_state(ctx, query->type, -1); - r600_update_prims_generated_query_state(ctx, query->type, -1); -} + /* emit end query */ + va = query->buffer.buf->gpu_address + query->buffer.results_end; -static void r600_emit_query_predication(struct r600_common_context *ctx, struct r600_query *query, - int operation, bool flag_wait) -{ - struct radeon_winsys_cs *cs = ctx->rings.gfx.cs; - uint32_t op = PRED_OP(operation); + query->ops->emit_stop(ctx, query, query->buffer.buf, va); - /* if true then invert, see GL_ARB_conditional_render_inverted */ - if (ctx->current_render_cond_cond) - op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visable/overflow */ - else - op |= PREDICATION_DRAW_VISIBLE; /* Draw if visable/overflow */ + query->buffer.results_end += query->result_size; - if (operation == PREDICATION_OP_CLEAR) { - ctx->need_gfx_cs_space(&ctx->b, 3, FALSE); + if (!(query->flags & R600_QUERY_HW_FLAG_NO_START)) + ctx->num_cs_dw_queries_suspend -= query->num_cs_dw_end; - radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0)); - radeon_emit(cs, 0); - radeon_emit(cs, PRED_OP(PREDICATION_OP_CLEAR)); - } else { - struct r600_query_buffer *qbuf; - unsigned count; - /* Find how many results there are. */ - count = 0; - for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { - count += qbuf->results_end / query->result_size; - } - - ctx->need_gfx_cs_space(&ctx->b, 5 * count, TRUE); - - op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW; - - /* emit predicate packets for all data blocks */ - for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { - unsigned results_base = 0; - uint64_t va = qbuf->buf->gpu_address; - - while (results_base < qbuf->results_end) { - radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0)); - radeon_emit(cs, (va + results_base) & 0xFFFFFFFFUL); - radeon_emit(cs, op | (((va + results_base) >> 32UL) & 0xFF)); - r600_emit_reloc(ctx, &ctx->rings.gfx, qbuf->buf, RADEON_USAGE_READ, - RADEON_PRIO_MIN); - results_base += query->result_size; - - /* set CONTINUE bit for all packets except the first */ - op |= PREDICATION_CONTINUE; - } - } - } + r600_update_occlusion_query_state(ctx, query->b.type, -1); + r600_update_prims_generated_query_state(ctx, query->b.type, -1); } -static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index) +static void r600_emit_query_predication(struct r600_common_context *ctx, + struct r600_atom *atom) { - struct r600_common_context *rctx = (struct r600_common_context *)ctx; - struct r600_query *query; - bool skip_allocation = false; + struct radeon_winsys_cs *cs = ctx->gfx.cs; + struct r600_query_hw *query = (struct r600_query_hw *)ctx->render_cond; + struct r600_query_buffer *qbuf; + uint32_t op; + bool flag_wait; - query = CALLOC_STRUCT(r600_query); - if (query == NULL) - return NULL; + if (!query) + return; - query->type = query_type; + flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT || + ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT; - switch (query_type) { + switch (query->b.type) { case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: - query->result_size = 16 * rctx->max_db; - query->num_cs_dw = 6; - break; - break; - case PIPE_QUERY_TIME_ELAPSED: - query->result_size = 16; - query->num_cs_dw = 8; - break; - case PIPE_QUERY_TIMESTAMP: - query->result_size = 8; - query->num_cs_dw = 8; + op = PRED_OP(PREDICATION_OP_ZPASS); break; case PIPE_QUERY_PRIMITIVES_EMITTED: case PIPE_QUERY_PRIMITIVES_GENERATED: case PIPE_QUERY_SO_STATISTICS: case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - /* NumPrimitivesWritten, PrimitiveStorageNeeded. */ - query->result_size = 32; - query->num_cs_dw = 6; - query->stream = index; - break; - case PIPE_QUERY_PIPELINE_STATISTICS: - /* 11 values on EG, 8 on R600. */ - query->result_size = (rctx->chip_class >= EVERGREEN ? 11 : 8) * 16; - query->num_cs_dw = 6; - break; - /* Non-GPU queries and queries not requiring a buffer. */ - case PIPE_QUERY_TIMESTAMP_DISJOINT: - case PIPE_QUERY_GPU_FINISHED: - case R600_QUERY_DRAW_CALLS: - case R600_QUERY_REQUESTED_VRAM: - case R600_QUERY_REQUESTED_GTT: - case R600_QUERY_BUFFER_WAIT_TIME: - case R600_QUERY_NUM_CS_FLUSHES: - case R600_QUERY_NUM_BYTES_MOVED: - case R600_QUERY_VRAM_USAGE: - case R600_QUERY_GTT_USAGE: - case R600_QUERY_GPU_TEMPERATURE: - case R600_QUERY_CURRENT_GPU_SCLK: - case R600_QUERY_CURRENT_GPU_MCLK: - case R600_QUERY_GPU_LOAD: - case R600_QUERY_NUM_COMPILATIONS: - case R600_QUERY_NUM_SHADERS_CREATED: - skip_allocation = true; + op = PRED_OP(PREDICATION_OP_PRIMCOUNT); break; default: assert(0); - FREE(query); - return NULL; + return; } - if (!skip_allocation) { - query->buffer.buf = r600_new_query_buffer(rctx, query_type); - if (!query->buffer.buf) { - FREE(query); - return NULL; + /* if true then invert, see GL_ARB_conditional_render_inverted */ + if (ctx->render_cond_invert) + op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visable/overflow */ + else + op |= PREDICATION_DRAW_VISIBLE; /* Draw if visable/overflow */ + + op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW; + + /* emit predicate packets for all data blocks */ + for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { + unsigned results_base = 0; + uint64_t va = qbuf->buf->gpu_address; + + while (results_base < qbuf->results_end) { + radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0)); + radeon_emit(cs, va + results_base); + radeon_emit(cs, op | (((va + results_base) >> 32) & 0xFF)); + r600_emit_reloc(ctx, &ctx->gfx, qbuf->buf, RADEON_USAGE_READ, + RADEON_PRIO_QUERY); + results_base += query->result_size; + + /* set CONTINUE bit for all packets except the first */ + op |= PREDICATION_CONTINUE; } } - return (struct pipe_query*)query; } -static void r600_destroy_query(struct pipe_context *ctx, struct pipe_query *query) +static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index) { - struct r600_query *rquery = (struct r600_query*)query; - struct r600_query_buffer *prev = rquery->buffer.previous; + struct r600_common_context *rctx = (struct r600_common_context *)ctx; - /* Release all query buffers. */ - while (prev) { - struct r600_query_buffer *qbuf = prev; - prev = prev->previous; - pipe_resource_reference((struct pipe_resource**)&qbuf->buf, NULL); - FREE(qbuf); - } + if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT || + query_type == PIPE_QUERY_GPU_FINISHED || + query_type >= PIPE_QUERY_DRIVER_SPECIFIC) + return r600_query_sw_create(ctx, query_type); - pipe_resource_reference((struct pipe_resource**)&rquery->buffer.buf, NULL); - FREE(query); + return r600_query_hw_create(rctx, query_type, index); +} + +static void r600_destroy_query(struct pipe_context *ctx, struct pipe_query *query) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + struct r600_query *rquery = (struct r600_query *)query; + + rquery->ops->destroy(rctx, rquery); } static boolean r600_begin_query(struct pipe_context *ctx, @@ -452,139 +784,141 @@ static boolean r600_begin_query(struct pipe_context *ctx, { struct r600_common_context *rctx = (struct r600_common_context *)ctx; struct r600_query *rquery = (struct r600_query *)query; - struct r600_query_buffer *prev = rquery->buffer.previous; - if (!r600_query_needs_begin(rquery->type)) { - assert(0); - return false; - } + return rquery->ops->begin(rctx, rquery); +} - /* Non-GPU queries. */ - switch (rquery->type) { - case PIPE_QUERY_TIMESTAMP_DISJOINT: - return true; - case R600_QUERY_DRAW_CALLS: - rquery->begin_result = rctx->num_draw_calls; - return true; - case R600_QUERY_REQUESTED_VRAM: - case R600_QUERY_REQUESTED_GTT: - case R600_QUERY_VRAM_USAGE: - case R600_QUERY_GTT_USAGE: - case R600_QUERY_GPU_TEMPERATURE: - case R600_QUERY_CURRENT_GPU_SCLK: - case R600_QUERY_CURRENT_GPU_MCLK: - rquery->begin_result = 0; - return true; - case R600_QUERY_BUFFER_WAIT_TIME: - rquery->begin_result = rctx->ws->query_value(rctx->ws, RADEON_BUFFER_WAIT_TIME_NS) / 1000; - return true; - case R600_QUERY_NUM_CS_FLUSHES: - rquery->begin_result = rctx->ws->query_value(rctx->ws, RADEON_NUM_CS_FLUSHES); - return true; - case R600_QUERY_NUM_BYTES_MOVED: - rquery->begin_result = rctx->ws->query_value(rctx->ws, RADEON_NUM_BYTES_MOVED); - return true; - case R600_QUERY_GPU_LOAD: - rquery->begin_result = r600_gpu_load_begin(rctx->screen); - return true; - case R600_QUERY_NUM_COMPILATIONS: - rquery->begin_result = p_atomic_read(&rctx->screen->num_compilations); - return true; - case R600_QUERY_NUM_SHADERS_CREATED: - rquery->begin_result = p_atomic_read(&rctx->screen->num_shaders_created); - return true; - } +void r600_query_hw_reset_buffers(struct r600_common_context *rctx, + struct r600_query_hw *query) +{ + struct r600_query_buffer *prev = query->buffer.previous; /* Discard the old query buffers. */ while (prev) { struct r600_query_buffer *qbuf = prev; prev = prev->previous; - pipe_resource_reference((struct pipe_resource**)&qbuf->buf, NULL); + r600_resource_reference(&qbuf->buf, NULL); FREE(qbuf); } + query->buffer.results_end = 0; + query->buffer.previous = NULL; + /* Obtain a new buffer if the current one can't be mapped without a stall. */ - if (r600_rings_is_buffer_referenced(rctx, rquery->buffer.buf->cs_buf, RADEON_USAGE_READWRITE) || - !rctx->ws->buffer_wait(rquery->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) { - pipe_resource_reference((struct pipe_resource**)&rquery->buffer.buf, NULL); - rquery->buffer.buf = r600_new_query_buffer(rctx, rquery->type); + if (r600_rings_is_buffer_referenced(rctx, query->buffer.buf->buf, RADEON_USAGE_READWRITE) || + !rctx->ws->buffer_wait(query->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) { + r600_resource_reference(&query->buffer.buf, NULL); + query->buffer.buf = r600_new_query_buffer(rctx, query); + } else { + if (!query->ops->prepare_buffer(rctx, query, query->buffer.buf)) + r600_resource_reference(&query->buffer.buf, NULL); } +} + +bool r600_query_hw_begin(struct r600_common_context *rctx, + struct r600_query *rquery) +{ + struct r600_query_hw *query = (struct r600_query_hw *)rquery; - rquery->buffer.results_end = 0; - rquery->buffer.previous = NULL; + if (query->flags & R600_QUERY_HW_FLAG_NO_START) { + assert(0); + return false; + } - r600_emit_query_begin(rctx, rquery); + if (!(query->flags & R600_QUERY_HW_FLAG_BEGIN_RESUMES)) + r600_query_hw_reset_buffers(rctx, query); - if (r600_is_timer_query(rquery->type)) - LIST_ADDTAIL(&rquery->list, &rctx->active_timer_queries); - else - LIST_ADDTAIL(&rquery->list, &rctx->active_nontimer_queries); - return true; + r600_query_hw_emit_start(rctx, query); + if (!query->buffer.buf) + return false; + + LIST_ADDTAIL(&query->list, &rctx->active_queries); + return true; } -static void r600_end_query(struct pipe_context *ctx, struct pipe_query *query) +static bool r600_end_query(struct pipe_context *ctx, struct pipe_query *query) { struct r600_common_context *rctx = (struct r600_common_context *)ctx; struct r600_query *rquery = (struct r600_query *)query; - /* Non-GPU queries. */ - switch (rquery->type) { - case PIPE_QUERY_TIMESTAMP_DISJOINT: - return; - case PIPE_QUERY_GPU_FINISHED: - rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC, &rquery->fence); - return; - case R600_QUERY_DRAW_CALLS: - rquery->end_result = rctx->num_draw_calls; - return; - case R600_QUERY_REQUESTED_VRAM: - rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_REQUESTED_VRAM_MEMORY); - return; - case R600_QUERY_REQUESTED_GTT: - rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_REQUESTED_GTT_MEMORY); - return; - case R600_QUERY_BUFFER_WAIT_TIME: - rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_BUFFER_WAIT_TIME_NS) / 1000; - return; - case R600_QUERY_NUM_CS_FLUSHES: - rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_NUM_CS_FLUSHES); - return; - case R600_QUERY_NUM_BYTES_MOVED: - rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_NUM_BYTES_MOVED); - return; - case R600_QUERY_VRAM_USAGE: - rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_VRAM_USAGE); - return; - case R600_QUERY_GTT_USAGE: - rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_GTT_USAGE); - return; - case R600_QUERY_GPU_TEMPERATURE: - rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_GPU_TEMPERATURE) / 1000; - return; - case R600_QUERY_CURRENT_GPU_SCLK: - rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_CURRENT_SCLK) * 1000000; - return; - case R600_QUERY_CURRENT_GPU_MCLK: - rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_CURRENT_MCLK) * 1000000; - return; - case R600_QUERY_GPU_LOAD: - rquery->end_result = r600_gpu_load_end(rctx->screen, rquery->begin_result); - return; - case R600_QUERY_NUM_COMPILATIONS: - rquery->end_result = p_atomic_read(&rctx->screen->num_compilations); - return; - case R600_QUERY_NUM_SHADERS_CREATED: - rquery->end_result = p_atomic_read(&rctx->screen->num_shaders_created); - return; - } + return rquery->ops->end(rctx, rquery); +} - r600_emit_query_end(rctx, rquery); +bool r600_query_hw_end(struct r600_common_context *rctx, + struct r600_query *rquery) +{ + struct r600_query_hw *query = (struct r600_query_hw *)rquery; + + if (query->flags & R600_QUERY_HW_FLAG_NO_START) + r600_query_hw_reset_buffers(rctx, query); + + r600_query_hw_emit_stop(rctx, query); + + if (!(query->flags & R600_QUERY_HW_FLAG_NO_START)) + LIST_DELINIT(&query->list); + + if (!query->buffer.buf) + return false; + + return true; +} + +static void r600_get_hw_query_params(struct r600_common_context *rctx, + struct r600_query_hw *rquery, int index, + struct r600_hw_query_params *params) +{ + params->pair_stride = 0; + params->pair_count = 1; - if (r600_query_needs_begin(rquery->type)) - LIST_DELINIT(&rquery->list); + switch (rquery->b.type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + params->start_offset = 0; + params->end_offset = 8; + params->fence_offset = rctx->max_db * 16; + params->pair_stride = 16; + params->pair_count = rctx->max_db; + break; + case PIPE_QUERY_TIME_ELAPSED: + params->start_offset = 0; + params->end_offset = 8; + params->fence_offset = 16; + break; + case PIPE_QUERY_TIMESTAMP: + params->start_offset = 0; + params->end_offset = 0; + params->fence_offset = 8; + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + params->start_offset = 8; + params->end_offset = 24; + params->fence_offset = params->end_offset + 4; + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: + params->start_offset = 0; + params->end_offset = 16; + params->fence_offset = params->end_offset + 4; + break; + case PIPE_QUERY_SO_STATISTICS: + params->start_offset = 8 - index * 8; + params->end_offset = 24 - index * 8; + params->fence_offset = params->end_offset + 4; + break; + case PIPE_QUERY_PIPELINE_STATISTICS: + { + /* Offsets apply to EG+ */ + static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80}; + params->start_offset = offsets[index]; + params->end_offset = 88 + offsets[index]; + params->fence_offset = 2 * 88; + break; + } + default: + unreachable("r600_get_hw_query_params unsupported"); + } } -static unsigned r600_query_read_result(char *map, unsigned start_index, unsigned end_index, +static unsigned r600_query_read_result(void *map, unsigned start_index, unsigned end_index, bool test_status_bit) { uint32_t *current_result = (uint32_t*)map; @@ -602,84 +936,34 @@ static unsigned r600_query_read_result(char *map, unsigned start_index, unsigned return 0; } -static boolean r600_get_query_buffer_result(struct r600_common_context *ctx, - struct r600_query *query, - struct r600_query_buffer *qbuf, - boolean wait, - union pipe_query_result *result) +static void r600_query_hw_add_result(struct r600_common_context *ctx, + struct r600_query_hw *query, + void *buffer, + union pipe_query_result *result) { - struct pipe_screen *screen = ctx->b.screen; - unsigned results_base = 0; - char *map; - - /* Non-GPU queries. */ - switch (query->type) { - case PIPE_QUERY_TIMESTAMP_DISJOINT: - /* Convert from cycles per millisecond to cycles per second (Hz). */ - result->timestamp_disjoint.frequency = - (uint64_t)ctx->screen->info.r600_clock_crystal_freq * 1000; - result->timestamp_disjoint.disjoint = FALSE; - return TRUE; - case PIPE_QUERY_GPU_FINISHED: - result->b = screen->fence_finish(screen, query->fence, - wait ? PIPE_TIMEOUT_INFINITE : 0); - return result->b; - case R600_QUERY_DRAW_CALLS: - case R600_QUERY_REQUESTED_VRAM: - case R600_QUERY_REQUESTED_GTT: - case R600_QUERY_BUFFER_WAIT_TIME: - case R600_QUERY_NUM_CS_FLUSHES: - case R600_QUERY_NUM_BYTES_MOVED: - case R600_QUERY_VRAM_USAGE: - case R600_QUERY_GTT_USAGE: - case R600_QUERY_GPU_TEMPERATURE: - case R600_QUERY_CURRENT_GPU_SCLK: - case R600_QUERY_CURRENT_GPU_MCLK: - case R600_QUERY_NUM_COMPILATIONS: - case R600_QUERY_NUM_SHADERS_CREATED: - result->u64 = query->end_result - query->begin_result; - return TRUE; - case R600_QUERY_GPU_LOAD: - result->u64 = query->end_result; - return TRUE; - } - - map = r600_buffer_map_sync_with_rings(ctx, qbuf->buf, - PIPE_TRANSFER_READ | - (wait ? 0 : PIPE_TRANSFER_DONTBLOCK)); - if (!map) - return FALSE; - - /* count all results across all data blocks */ - switch (query->type) { - case PIPE_QUERY_OCCLUSION_COUNTER: - while (results_base != qbuf->results_end) { + switch (query->b.type) { + case PIPE_QUERY_OCCLUSION_COUNTER: { + for (unsigned i = 0; i < ctx->max_db; ++i) { + unsigned results_base = i * 16; result->u64 += - r600_query_read_result(map + results_base, 0, 2, true); - results_base += 16; + r600_query_read_result(buffer + results_base, 0, 2, true); } break; - case PIPE_QUERY_OCCLUSION_PREDICATE: - while (results_base != qbuf->results_end) { + } + case PIPE_QUERY_OCCLUSION_PREDICATE: { + for (unsigned i = 0; i < ctx->max_db; ++i) { + unsigned results_base = i * 16; result->b = result->b || - r600_query_read_result(map + results_base, 0, 2, true) != 0; - results_base += 16; + r600_query_read_result(buffer + results_base, 0, 2, true) != 0; } break; + } case PIPE_QUERY_TIME_ELAPSED: - while (results_base != qbuf->results_end) { - result->u64 += - r600_query_read_result(map + results_base, 0, 2, false); - results_base += query->result_size; - } + result->u64 += r600_query_read_result(buffer, 0, 2, false); break; case PIPE_QUERY_TIMESTAMP: - { - uint32_t *current_result = (uint32_t*)map; - result->u64 = (uint64_t)current_result[0] | - (uint64_t)current_result[1] << 32; + result->u64 = *(uint64_t*)buffer; break; - } case PIPE_QUERY_PRIMITIVES_EMITTED: /* SAMPLE_STREAMOUTSTATS stores this structure: * { @@ -687,84 +971,64 @@ static boolean r600_get_query_buffer_result(struct r600_common_context *ctx, * u64 PrimitiveStorageNeeded; * } * We only need NumPrimitivesWritten here. */ - while (results_base != qbuf->results_end) { - result->u64 += - r600_query_read_result(map + results_base, 2, 6, true); - results_base += query->result_size; - } + result->u64 += r600_query_read_result(buffer, 2, 6, true); break; case PIPE_QUERY_PRIMITIVES_GENERATED: /* Here we read PrimitiveStorageNeeded. */ - while (results_base != qbuf->results_end) { - result->u64 += - r600_query_read_result(map + results_base, 0, 4, true); - results_base += query->result_size; - } + result->u64 += r600_query_read_result(buffer, 0, 4, true); break; case PIPE_QUERY_SO_STATISTICS: - while (results_base != qbuf->results_end) { - result->so_statistics.num_primitives_written += - r600_query_read_result(map + results_base, 2, 6, true); - result->so_statistics.primitives_storage_needed += - r600_query_read_result(map + results_base, 0, 4, true); - results_base += query->result_size; - } + result->so_statistics.num_primitives_written += + r600_query_read_result(buffer, 2, 6, true); + result->so_statistics.primitives_storage_needed += + r600_query_read_result(buffer, 0, 4, true); break; case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - while (results_base != qbuf->results_end) { - result->b = result->b || - r600_query_read_result(map + results_base, 2, 6, true) != - r600_query_read_result(map + results_base, 0, 4, true); - results_base += query->result_size; - } + result->b = result->b || + r600_query_read_result(buffer, 2, 6, true) != + r600_query_read_result(buffer, 0, 4, true); break; case PIPE_QUERY_PIPELINE_STATISTICS: if (ctx->chip_class >= EVERGREEN) { - while (results_base != qbuf->results_end) { - result->pipeline_statistics.ps_invocations += - r600_query_read_result(map + results_base, 0, 22, false); - result->pipeline_statistics.c_primitives += - r600_query_read_result(map + results_base, 2, 24, false); - result->pipeline_statistics.c_invocations += - r600_query_read_result(map + results_base, 4, 26, false); - result->pipeline_statistics.vs_invocations += - r600_query_read_result(map + results_base, 6, 28, false); - result->pipeline_statistics.gs_invocations += - r600_query_read_result(map + results_base, 8, 30, false); - result->pipeline_statistics.gs_primitives += - r600_query_read_result(map + results_base, 10, 32, false); - result->pipeline_statistics.ia_primitives += - r600_query_read_result(map + results_base, 12, 34, false); - result->pipeline_statistics.ia_vertices += - r600_query_read_result(map + results_base, 14, 36, false); - result->pipeline_statistics.hs_invocations += - r600_query_read_result(map + results_base, 16, 38, false); - result->pipeline_statistics.ds_invocations += - r600_query_read_result(map + results_base, 18, 40, false); - result->pipeline_statistics.cs_invocations += - r600_query_read_result(map + results_base, 20, 42, false); - results_base += query->result_size; - } + result->pipeline_statistics.ps_invocations += + r600_query_read_result(buffer, 0, 22, false); + result->pipeline_statistics.c_primitives += + r600_query_read_result(buffer, 2, 24, false); + result->pipeline_statistics.c_invocations += + r600_query_read_result(buffer, 4, 26, false); + result->pipeline_statistics.vs_invocations += + r600_query_read_result(buffer, 6, 28, false); + result->pipeline_statistics.gs_invocations += + r600_query_read_result(buffer, 8, 30, false); + result->pipeline_statistics.gs_primitives += + r600_query_read_result(buffer, 10, 32, false); + result->pipeline_statistics.ia_primitives += + r600_query_read_result(buffer, 12, 34, false); + result->pipeline_statistics.ia_vertices += + r600_query_read_result(buffer, 14, 36, false); + result->pipeline_statistics.hs_invocations += + r600_query_read_result(buffer, 16, 38, false); + result->pipeline_statistics.ds_invocations += + r600_query_read_result(buffer, 18, 40, false); + result->pipeline_statistics.cs_invocations += + r600_query_read_result(buffer, 20, 42, false); } else { - while (results_base != qbuf->results_end) { - result->pipeline_statistics.ps_invocations += - r600_query_read_result(map + results_base, 0, 16, false); - result->pipeline_statistics.c_primitives += - r600_query_read_result(map + results_base, 2, 18, false); - result->pipeline_statistics.c_invocations += - r600_query_read_result(map + results_base, 4, 20, false); - result->pipeline_statistics.vs_invocations += - r600_query_read_result(map + results_base, 6, 22, false); - result->pipeline_statistics.gs_invocations += - r600_query_read_result(map + results_base, 8, 24, false); - result->pipeline_statistics.gs_primitives += - r600_query_read_result(map + results_base, 10, 26, false); - result->pipeline_statistics.ia_primitives += - r600_query_read_result(map + results_base, 12, 28, false); - result->pipeline_statistics.ia_vertices += - r600_query_read_result(map + results_base, 14, 30, false); - results_base += query->result_size; - } + result->pipeline_statistics.ps_invocations += + r600_query_read_result(buffer, 0, 16, false); + result->pipeline_statistics.c_primitives += + r600_query_read_result(buffer, 2, 18, false); + result->pipeline_statistics.c_invocations += + r600_query_read_result(buffer, 4, 20, false); + result->pipeline_statistics.vs_invocations += + r600_query_read_result(buffer, 6, 22, false); + result->pipeline_statistics.gs_invocations += + r600_query_read_result(buffer, 8, 24, false); + result->pipeline_statistics.gs_primitives += + r600_query_read_result(buffer, 10, 26, false); + result->pipeline_statistics.ia_primitives += + r600_query_read_result(buffer, 12, 28, false); + result->pipeline_statistics.ia_vertices += + r600_query_read_result(buffer, 14, 30, false); } #if 0 /* for testing */ printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, " @@ -786,118 +1050,482 @@ static boolean r600_get_query_buffer_result(struct r600_common_context *ctx, default: assert(0); } - - return TRUE; } static boolean r600_get_query_result(struct pipe_context *ctx, - struct pipe_query *query, - boolean wait, union pipe_query_result *result) + struct pipe_query *query, boolean wait, + union pipe_query_result *result) { struct r600_common_context *rctx = (struct r600_common_context *)ctx; struct r600_query *rquery = (struct r600_query *)query; + + return rquery->ops->get_result(rctx, rquery, wait, result); +} + +static void r600_get_query_result_resource(struct pipe_context *ctx, + struct pipe_query *query, + boolean wait, + enum pipe_query_value_type result_type, + int index, + struct pipe_resource *resource, + unsigned offset) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + struct r600_query *rquery = (struct r600_query *)query; + + rquery->ops->get_result_resource(rctx, rquery, wait, result_type, index, + resource, offset); +} + +static void r600_query_hw_clear_result(struct r600_query_hw *query, + union pipe_query_result *result) +{ + util_query_clear_result(result, query->b.type); +} + +bool r600_query_hw_get_result(struct r600_common_context *rctx, + struct r600_query *rquery, + bool wait, union pipe_query_result *result) +{ + struct r600_query_hw *query = (struct r600_query_hw *)rquery; struct r600_query_buffer *qbuf; - util_query_clear_result(result, rquery->type); + query->ops->clear_result(query, result); + + for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { + unsigned results_base = 0; + void *map; - for (qbuf = &rquery->buffer; qbuf; qbuf = qbuf->previous) { - if (!r600_get_query_buffer_result(rctx, rquery, qbuf, wait, result)) { - return FALSE; + map = r600_buffer_map_sync_with_rings(rctx, qbuf->buf, + PIPE_TRANSFER_READ | + (wait ? 0 : PIPE_TRANSFER_DONTBLOCK)); + if (!map) + return false; + + while (results_base != qbuf->results_end) { + query->ops->add_result(rctx, query, map + results_base, + result); + results_base += query->result_size; } } /* Convert the time to expected units. */ if (rquery->type == PIPE_QUERY_TIME_ELAPSED || rquery->type == PIPE_QUERY_TIMESTAMP) { - result->u64 = (1000000 * result->u64) / rctx->screen->info.r600_clock_crystal_freq; + result->u64 = (1000000 * result->u64) / rctx->screen->info.clock_crystal_freq; } - return TRUE; + return true; } -static void r600_render_condition(struct pipe_context *ctx, - struct pipe_query *query, - boolean condition, - uint mode) +/* Create the compute shader that is used to collect the results. + * + * One compute grid with a single thread is launched for every query result + * buffer. The thread (optionally) reads a previous summary buffer, then + * accumulates data from the query result buffer, and writes the result either + * to a summary buffer to be consumed by the next grid invocation or to the + * user-supplied buffer. + * + * Data layout: + * + * CONST + * 0.x = end_offset + * 0.y = result_stride + * 0.z = result_count + * 0.w = bit field: + * 1: read previously accumulated values + * 2: write accumulated values for chaining + * 4: write result available + * 8: convert result to boolean (0/1) + * 16: only read one dword and use that as result + * 32: apply timestamp conversion + * 64: store full 64 bits result + * 128: store signed 32 bits result + * 1.x = fence_offset + * 1.y = pair_stride + * 1.z = pair_count + * + * BUFFER[0] = query result buffer + * BUFFER[1] = previous summary buffer + * BUFFER[2] = next summary buffer or user-supplied buffer + */ +static void r600_create_query_result_shader(struct r600_common_context *rctx) { - struct r600_common_context *rctx = (struct r600_common_context *)ctx; - struct r600_query *rquery = (struct r600_query *)query; - bool wait_flag = false; - - rctx->current_render_cond = query; - rctx->current_render_cond_cond = condition; - rctx->current_render_cond_mode = mode; + /* TEMP[0].xy = accumulated result so far + * TEMP[0].z = result not available + * + * TEMP[1].x = current result index + * TEMP[1].y = current pair index + */ + static const char text_tmpl[] = + "COMP\n" + "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n" + "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" + "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" + "DCL BUFFER[0]\n" + "DCL BUFFER[1]\n" + "DCL BUFFER[2]\n" + "DCL CONST[0..1]\n" + "DCL TEMP[0..5]\n" + "IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n" + "IMM[1] UINT32 {1, 2, 4, 8}\n" + "IMM[2] UINT32 {16, 32, 64, 128}\n" + "IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */ + + "AND TEMP[5], CONST[0].wwww, IMM[2].xxxx\n" + "UIF TEMP[5]\n" + /* Check result availability. */ + "LOAD TEMP[1].x, BUFFER[0], CONST[1].xxxx\n" + "ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n" + "MOV TEMP[1], TEMP[0].zzzz\n" + "NOT TEMP[0].z, TEMP[0].zzzz\n" + + /* Load result if available. */ + "UIF TEMP[1]\n" + "LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n" + "ENDIF\n" + "ELSE\n" + /* Load previously accumulated result if requested. */ + "MOV TEMP[0], IMM[0].xxxx\n" + "AND TEMP[4], CONST[0].wwww, IMM[1].xxxx\n" + "UIF TEMP[4]\n" + "LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n" + "ENDIF\n" + + "MOV TEMP[1].x, IMM[0].xxxx\n" + "BGNLOOP\n" + /* Break if accumulated result so far is not available. */ + "UIF TEMP[0].zzzz\n" + "BRK\n" + "ENDIF\n" + + /* Break if result_index >= result_count. */ + "USGE TEMP[5], TEMP[1].xxxx, CONST[0].zzzz\n" + "UIF TEMP[5]\n" + "BRK\n" + "ENDIF\n" + + /* Load fence and check result availability */ + "UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0].yyyy, CONST[1].xxxx\n" + "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n" + "ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n" + "NOT TEMP[0].z, TEMP[0].zzzz\n" + "UIF TEMP[0].zzzz\n" + "BRK\n" + "ENDIF\n" + + "MOV TEMP[1].y, IMM[0].xxxx\n" + "BGNLOOP\n" + /* Load start and end. */ + "UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0].yyyy\n" + "UMAD TEMP[5].x, TEMP[1].yyyy, CONST[1].yyyy, TEMP[5].xxxx\n" + "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n" + + "UADD TEMP[5].x, TEMP[5].xxxx, CONST[0].xxxx\n" + "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].xxxx\n" + + "U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n" + "U64ADD TEMP[0].xy, TEMP[0], TEMP[3]\n" + + /* Increment pair index */ + "UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n" + "USGE TEMP[5], TEMP[1].yyyy, CONST[1].zzzz\n" + "UIF TEMP[5]\n" + "BRK\n" + "ENDIF\n" + "ENDLOOP\n" + + /* Increment result index */ + "UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n" + "ENDLOOP\n" + "ENDIF\n" + + "AND TEMP[4], CONST[0].wwww, IMM[1].yyyy\n" + "UIF TEMP[4]\n" + /* Store accumulated data for chaining. */ + "STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n" + "ELSE\n" + "AND TEMP[4], CONST[0].wwww, IMM[1].zzzz\n" + "UIF TEMP[4]\n" + /* Store result availability. */ + "NOT TEMP[0].z, TEMP[0]\n" + "AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n" + "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n" + + "AND TEMP[4], CONST[0].wwww, IMM[2].zzzz\n" + "UIF TEMP[4]\n" + "STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n" + "ENDIF\n" + "ELSE\n" + /* Store result if it is available. */ + "NOT TEMP[4], TEMP[0].zzzz\n" + "UIF TEMP[4]\n" + /* Apply timestamp conversion */ + "AND TEMP[4], CONST[0].wwww, IMM[2].yyyy\n" + "UIF TEMP[4]\n" + "U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n" + "U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n" + "ENDIF\n" + + /* Convert to boolean */ + "AND TEMP[4], CONST[0].wwww, IMM[1].wwww\n" + "UIF TEMP[4]\n" + "U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[0].xxxx\n" + "AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n" + "MOV TEMP[0].y, IMM[0].xxxx\n" + "ENDIF\n" + + "AND TEMP[4], CONST[0].wwww, IMM[2].zzzz\n" + "UIF TEMP[4]\n" + "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n" + "ELSE\n" + /* Clamping */ + "UIF TEMP[0].yyyy\n" + "MOV TEMP[0].x, IMM[0].wwww\n" + "ENDIF\n" + + "AND TEMP[4], CONST[0].wwww, IMM[2].wwww\n" + "UIF TEMP[4]\n" + "UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n" + "ENDIF\n" + + "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n" + "ENDIF\n" + "ENDIF\n" + "ENDIF\n" + "ENDIF\n" + + "END\n"; + + char text[sizeof(text_tmpl) + 32]; + struct tgsi_token tokens[1024]; + struct pipe_compute_state state = {}; + + /* Hard code the frequency into the shader so that the backend can + * use the full range of optimizations for divide-by-constant. + */ + snprintf(text, sizeof(text), text_tmpl, + rctx->screen->info.clock_crystal_freq); - if (query == NULL) { - if (rctx->predicate_drawing) { - rctx->predicate_drawing = false; - r600_emit_query_predication(rctx, NULL, PREDICATION_OP_CLEAR, false); - } + if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { + assert(false); return; } - if (mode == PIPE_RENDER_COND_WAIT || - mode == PIPE_RENDER_COND_BY_REGION_WAIT) { - wait_flag = true; + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = tokens; + + rctx->query_result_shader = rctx->b.create_compute_state(&rctx->b, &state); +} + +static void r600_restore_qbo_state(struct r600_common_context *rctx, + struct r600_qbo_state *st) +{ + rctx->b.bind_compute_state(&rctx->b, st->saved_compute); + + rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &st->saved_const0); + pipe_resource_reference(&st->saved_const0.buffer, NULL); + + rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo); + for (unsigned i = 0; i < 3; ++i) + pipe_resource_reference(&st->saved_ssbo[i].buffer, NULL); +} + +static void r600_query_hw_get_result_resource(struct r600_common_context *rctx, + struct r600_query *rquery, + bool wait, + enum pipe_query_value_type result_type, + int index, + struct pipe_resource *resource, + unsigned offset) +{ + struct r600_query_hw *query = (struct r600_query_hw *)rquery; + struct r600_query_buffer *qbuf; + struct r600_query_buffer *qbuf_prev; + struct pipe_resource *tmp_buffer = NULL; + unsigned tmp_buffer_offset = 0; + struct r600_qbo_state saved_state = {}; + struct pipe_grid_info grid = {}; + struct pipe_constant_buffer constant_buffer = {}; + struct pipe_shader_buffer ssbo[3]; + struct r600_hw_query_params params; + struct { + uint32_t end_offset; + uint32_t result_stride; + uint32_t result_count; + uint32_t config; + uint32_t fence_offset; + uint32_t pair_stride; + uint32_t pair_count; + } consts; + + if (!rctx->query_result_shader) { + r600_create_query_result_shader(rctx); + if (!rctx->query_result_shader) + return; } - rctx->predicate_drawing = true; + if (query->buffer.previous) { + u_suballocator_alloc(rctx->allocator_zeroed_memory, 16, 16, + &tmp_buffer_offset, &tmp_buffer); + if (!tmp_buffer) + return; + } - switch (rquery->type) { - case PIPE_QUERY_OCCLUSION_COUNTER: - case PIPE_QUERY_OCCLUSION_PREDICATE: - r600_emit_query_predication(rctx, rquery, PREDICATION_OP_ZPASS, wait_flag); + rctx->save_qbo_state(&rctx->b, &saved_state); + + r600_get_hw_query_params(rctx, query, index >= 0 ? index : 0, ¶ms); + consts.end_offset = params.end_offset - params.start_offset; + consts.fence_offset = params.fence_offset - params.start_offset; + consts.result_stride = query->result_size; + consts.pair_stride = params.pair_stride; + consts.pair_count = params.pair_count; + + constant_buffer.buffer_size = sizeof(consts); + constant_buffer.user_buffer = &consts; + + ssbo[1].buffer = tmp_buffer; + ssbo[1].buffer_offset = tmp_buffer_offset; + ssbo[1].buffer_size = 16; + + ssbo[2] = ssbo[1]; + + rctx->b.bind_compute_state(&rctx->b, rctx->query_result_shader); + + grid.block[0] = 1; + grid.block[1] = 1; + grid.block[2] = 1; + grid.grid[0] = 1; + grid.grid[1] = 1; + grid.grid[2] = 1; + + consts.config = 0; + if (index < 0) + consts.config |= 4; + if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE || + query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) + consts.config |= 8; + else if (query->b.type == PIPE_QUERY_TIMESTAMP || + query->b.type == PIPE_QUERY_TIME_ELAPSED) + consts.config |= 32; + + switch (result_type) { + case PIPE_QUERY_TYPE_U64: + case PIPE_QUERY_TYPE_I64: + consts.config |= 64; break; - case PIPE_QUERY_PRIMITIVES_EMITTED: - case PIPE_QUERY_PRIMITIVES_GENERATED: - case PIPE_QUERY_SO_STATISTICS: - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - r600_emit_query_predication(rctx, rquery, PREDICATION_OP_PRIMCOUNT, wait_flag); + case PIPE_QUERY_TYPE_I32: + consts.config |= 128; + break; + case PIPE_QUERY_TYPE_U32: break; - default: - assert(0); } -} -static void r600_suspend_queries(struct r600_common_context *ctx, - struct list_head *query_list, - unsigned *num_cs_dw_queries_suspend) -{ - struct r600_query *query; + rctx->flags |= rctx->screen->barrier_flags.cp_to_L2; + + for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) { + if (query->b.type != PIPE_QUERY_TIMESTAMP) { + qbuf_prev = qbuf->previous; + consts.result_count = qbuf->results_end / query->result_size; + consts.config &= ~3; + if (qbuf != &query->buffer) + consts.config |= 1; + if (qbuf->previous) + consts.config |= 2; + } else { + /* Only read the last timestamp. */ + qbuf_prev = NULL; + consts.result_count = 0; + consts.config |= 16; + params.start_offset += qbuf->results_end - query->result_size; + } - LIST_FOR_EACH_ENTRY(query, query_list, list) { - r600_emit_query_end(ctx, query); + rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer); + + ssbo[0].buffer = &qbuf->buf->b.b; + ssbo[0].buffer_offset = params.start_offset; + ssbo[0].buffer_size = qbuf->results_end - params.start_offset; + + if (!qbuf->previous) { + ssbo[2].buffer = resource; + ssbo[2].buffer_offset = offset; + ssbo[2].buffer_size = 8; + + ((struct r600_resource *)resource)->TC_L2_dirty = true; + } + + rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo); + + if (wait && qbuf == &query->buffer) { + uint64_t va; + + /* Wait for result availability. Wait only for readiness + * of the last entry, since the fence writes should be + * serialized in the CP. + */ + va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size; + va += params.fence_offset; + + r600_gfx_wait_fence(rctx, va, 0x80000000, 0x80000000); + } + + rctx->b.launch_grid(&rctx->b, &grid); + rctx->flags |= rctx->screen->barrier_flags.compute_to_L2; } - assert(*num_cs_dw_queries_suspend == 0); + + r600_restore_qbo_state(rctx, &saved_state); + pipe_resource_reference(&tmp_buffer, NULL); } -void r600_suspend_nontimer_queries(struct r600_common_context *ctx) +static void r600_render_condition(struct pipe_context *ctx, + struct pipe_query *query, + boolean condition, + uint mode) { - r600_suspend_queries(ctx, &ctx->active_nontimer_queries, - &ctx->num_cs_dw_nontimer_queries_suspend); + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + struct r600_query_hw *rquery = (struct r600_query_hw *)query; + struct r600_query_buffer *qbuf; + struct r600_atom *atom = &rctx->render_cond_atom; + + rctx->render_cond = query; + rctx->render_cond_invert = condition; + rctx->render_cond_mode = mode; + + /* Compute the size of SET_PREDICATION packets. */ + atom->num_dw = 0; + if (query) { + for (qbuf = &rquery->buffer; qbuf; qbuf = qbuf->previous) + atom->num_dw += (qbuf->results_end / rquery->result_size) * 5; + } + + rctx->set_atom_dirty(rctx, atom, query != NULL); } -void r600_suspend_timer_queries(struct r600_common_context *ctx) +void r600_suspend_queries(struct r600_common_context *ctx) { - r600_suspend_queries(ctx, &ctx->active_timer_queries, - &ctx->num_cs_dw_timer_queries_suspend); + struct r600_query_hw *query; + + LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) { + r600_query_hw_emit_stop(ctx, query); + } + assert(ctx->num_cs_dw_queries_suspend == 0); } static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context *ctx, struct list_head *query_list) { - struct r600_query *query; + struct r600_query_hw *query; unsigned num_dw = 0; LIST_FOR_EACH_ENTRY(query, query_list, list) { /* begin + end */ - num_dw += query->num_cs_dw * 2; + num_dw += query->num_cs_dw_begin + query->num_cs_dw_end; /* Workaround for the fact that * num_cs_dw_nontimer_queries_suspend is incremented for every * resumed query, which raises the bar in need_cs_space for * queries about to be resumed. */ - num_dw += query->num_cs_dw; + num_dw += query->num_cs_dw_end; } /* primitives generated query */ num_dw += ctx->streamout.enable_atom.num_dw; @@ -907,48 +1535,34 @@ static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context * return num_dw; } -static void r600_resume_queries(struct r600_common_context *ctx, - struct list_head *query_list, - unsigned *num_cs_dw_queries_suspend) +void r600_resume_queries(struct r600_common_context *ctx) { - struct r600_query *query; - unsigned num_cs_dw = r600_queries_num_cs_dw_for_resuming(ctx, query_list); + struct r600_query_hw *query; + unsigned num_cs_dw = r600_queries_num_cs_dw_for_resuming(ctx, &ctx->active_queries); - assert(*num_cs_dw_queries_suspend == 0); + assert(ctx->num_cs_dw_queries_suspend == 0); /* Check CS space here. Resuming must not be interrupted by flushes. */ - ctx->need_gfx_cs_space(&ctx->b, num_cs_dw, TRUE); + ctx->need_gfx_cs_space(&ctx->b, num_cs_dw, true); - LIST_FOR_EACH_ENTRY(query, query_list, list) { - r600_emit_query_begin(ctx, query); + LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) { + r600_query_hw_emit_start(ctx, query); } } -void r600_resume_nontimer_queries(struct r600_common_context *ctx) -{ - r600_resume_queries(ctx, &ctx->active_nontimer_queries, - &ctx->num_cs_dw_nontimer_queries_suspend); -} - -void r600_resume_timer_queries(struct r600_common_context *ctx) -{ - r600_resume_queries(ctx, &ctx->active_timer_queries, - &ctx->num_cs_dw_timer_queries_suspend); -} - /* Get backends mask */ void r600_query_init_backend_mask(struct r600_common_context *ctx) { - struct radeon_winsys_cs *cs = ctx->rings.gfx.cs; + struct radeon_winsys_cs *cs = ctx->gfx.cs; struct r600_resource *buffer; uint32_t *results; - unsigned num_backends = ctx->screen->info.r600_num_backends; + unsigned num_backends = ctx->screen->info.num_render_backends; unsigned i, mask = 0; /* if backend_map query is supported by the kernel */ - if (ctx->screen->info.r600_backend_map_valid) { - unsigned num_tile_pipes = ctx->screen->info.r600_num_tile_pipes; - unsigned backend_map = ctx->screen->info.r600_backend_map; + if (ctx->screen->info.r600_gb_backend_map_valid) { + unsigned num_tile_pipes = ctx->screen->info.num_tile_pipes; + unsigned backend_map = ctx->screen->info.r600_gb_backend_map; unsigned item_width, item_mask; if (ctx->chip_class >= EVERGREEN) { @@ -959,7 +1573,7 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx) item_mask = 0x3; } - while(num_tile_pipes--) { + while (num_tile_pipes--) { i = backend_map & item_mask; mask |= (1<<i); backend_map >>= item_width; @@ -990,7 +1604,8 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx) radeon_emit(cs, buffer->gpu_address); radeon_emit(cs, buffer->gpu_address >> 32); - r600_emit_reloc(ctx, &ctx->rings.gfx, buffer, RADEON_USAGE_WRITE, RADEON_PRIO_MIN); + r600_emit_reloc(ctx, &ctx->gfx, buffer, + RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); /* analyze results */ results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_TRANSFER_READ); @@ -1003,7 +1618,7 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx) } } - pipe_resource_reference((struct pipe_resource**)&buffer, NULL); + r600_resource_reference(&buffer, NULL); if (mask != 0) { ctx->backend_mask = mask; @@ -1016,17 +1631,167 @@ err: return; } +#define XFULL(name_, query_type_, type_, result_type_, group_id_) \ + { \ + .name = name_, \ + .query_type = R600_QUERY_##query_type_, \ + .type = PIPE_DRIVER_QUERY_TYPE_##type_, \ + .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, \ + .group_id = group_id_ \ + } + +#define X(name_, query_type_, type_, result_type_) \ + XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0) + +#define XG(group_, name_, query_type_, type_, result_type_) \ + XFULL(name_, query_type_, type_, result_type_, R600_QUERY_GROUP_##group_) + +static struct pipe_driver_query_info r600_driver_query_list[] = { + X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE), + X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE), + X("draw-calls", DRAW_CALLS, UINT64, AVERAGE), + X("spill-draw-calls", SPILL_DRAW_CALLS, UINT64, AVERAGE), + X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE), + X("spill-compute-calls", SPILL_COMPUTE_CALLS, UINT64, AVERAGE), + X("dma-calls", DMA_CALLS, UINT64, AVERAGE), + X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE), + X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE), + X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE), + X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE), + X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE), + X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE), + X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE), + X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE), + X("num-ctx-flushes", NUM_CTX_FLUSHES, UINT64, AVERAGE), + X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE), + X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE), + X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE), + X("GTT-usage", GTT_USAGE, BYTES, AVERAGE), + X("back-buffer-ps-draw-ratio", BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE), + + /* GPIN queries are for the benefit of old versions of GPUPerfStudio, + * which use it as a fallback path to detect the GPU type. + * + * Note: The names of these queries are significant for GPUPerfStudio + * (and possibly their order as well). */ + XG(GPIN, "GPIN_000", GPIN_ASIC_ID, UINT, AVERAGE), + XG(GPIN, "GPIN_001", GPIN_NUM_SIMD, UINT, AVERAGE), + XG(GPIN, "GPIN_002", GPIN_NUM_RB, UINT, AVERAGE), + XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE), + XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE), + + /* The following queries must be at the end of the list because their + * availability is adjusted dynamically based on the DRM version. */ + X("GPU-load", GPU_LOAD, UINT64, AVERAGE), + X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE), + X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE), + X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE), +}; + +#undef X +#undef XG +#undef XFULL + +static unsigned r600_get_num_queries(struct r600_common_screen *rscreen) +{ + if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42) + return ARRAY_SIZE(r600_driver_query_list); + else if (rscreen->info.drm_major == 3) + return ARRAY_SIZE(r600_driver_query_list) - 3; + else + return ARRAY_SIZE(r600_driver_query_list) - 4; +} + +static int r600_get_driver_query_info(struct pipe_screen *screen, + unsigned index, + struct pipe_driver_query_info *info) +{ + struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; + unsigned num_queries = r600_get_num_queries(rscreen); + + if (!info) { + unsigned num_perfcounters = + r600_get_perfcounter_info(rscreen, 0, NULL); + + return num_queries + num_perfcounters; + } + + if (index >= num_queries) + return r600_get_perfcounter_info(rscreen, index - num_queries, info); + + *info = r600_driver_query_list[index]; + + switch (info->query_type) { + case R600_QUERY_REQUESTED_VRAM: + case R600_QUERY_VRAM_USAGE: + case R600_QUERY_MAPPED_VRAM: + info->max_value.u64 = rscreen->info.vram_size; + break; + case R600_QUERY_REQUESTED_GTT: + case R600_QUERY_GTT_USAGE: + case R600_QUERY_MAPPED_GTT: + info->max_value.u64 = rscreen->info.gart_size; + break; + case R600_QUERY_GPU_TEMPERATURE: + info->max_value.u64 = 125; + break; + } + + if (info->group_id != ~(unsigned)0 && rscreen->perfcounters) + info->group_id += rscreen->perfcounters->num_groups; + + return 1; +} + +/* Note: Unfortunately, GPUPerfStudio hardcodes the order of hardware + * performance counter groups, so be careful when changing this and related + * functions. + */ +static int r600_get_driver_query_group_info(struct pipe_screen *screen, + unsigned index, + struct pipe_driver_query_group_info *info) +{ + struct r600_common_screen *rscreen = (struct r600_common_screen *)screen; + unsigned num_pc_groups = 0; + + if (rscreen->perfcounters) + num_pc_groups = rscreen->perfcounters->num_groups; + + if (!info) + return num_pc_groups + R600_NUM_SW_QUERY_GROUPS; + + if (index < num_pc_groups) + return r600_get_perfcounter_group_info(rscreen, index, info); + + index -= num_pc_groups; + if (index >= R600_NUM_SW_QUERY_GROUPS) + return 0; + + info->name = "GPIN"; + info->max_active_queries = 5; + info->num_queries = 5; + return 1; +} + void r600_query_init(struct r600_common_context *rctx) { rctx->b.create_query = r600_create_query; + rctx->b.create_batch_query = r600_create_batch_query; rctx->b.destroy_query = r600_destroy_query; rctx->b.begin_query = r600_begin_query; rctx->b.end_query = r600_end_query; rctx->b.get_query_result = r600_get_query_result; + rctx->b.get_query_result_resource = r600_get_query_result_resource; + rctx->render_cond_atom.emit = r600_emit_query_predication; - if (((struct r600_common_screen*)rctx->b.screen)->info.r600_num_backends > 0) + if (((struct r600_common_screen*)rctx->b.screen)->info.num_render_backends > 0) rctx->b.render_condition = r600_render_condition; - LIST_INITHEAD(&rctx->active_nontimer_queries); - LIST_INITHEAD(&rctx->active_timer_queries); + LIST_INITHEAD(&rctx->active_queries); +} + +void r600_init_screen_query_functions(struct r600_common_screen *rscreen) +{ + rscreen->b.get_driver_query_info = r600_get_driver_query_info; + rscreen->b.get_driver_query_group_info = r600_get_driver_query_group_info; } diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_query.h b/lib/mesa/src/gallium/drivers/radeon/r600_query.h index 8b2c4e3fe..14c433d91 100644 --- a/lib/mesa/src/gallium/drivers/radeon/r600_query.h +++ b/lib/mesa/src/gallium/drivers/radeon/r600_query.h @@ -29,10 +29,12 @@ #define R600_QUERY_H #include "pipe/p_defines.h" +#include "pipe/p_state.h" #include "util/list.h" struct pipe_context; struct pipe_query; +struct pipe_resource; struct r600_common_context; struct r600_common_screen; @@ -40,26 +42,40 @@ struct r600_query; struct r600_query_hw; struct r600_resource; -#define R600_QUERY_DRAW_CALLS (PIPE_QUERY_DRIVER_SPECIFIC + 0) -#define R600_QUERY_REQUESTED_VRAM (PIPE_QUERY_DRIVER_SPECIFIC + 1) -#define R600_QUERY_REQUESTED_GTT (PIPE_QUERY_DRIVER_SPECIFIC + 2) -#define R600_QUERY_BUFFER_WAIT_TIME (PIPE_QUERY_DRIVER_SPECIFIC + 3) -#define R600_QUERY_NUM_CS_FLUSHES (PIPE_QUERY_DRIVER_SPECIFIC + 4) -#define R600_QUERY_NUM_BYTES_MOVED (PIPE_QUERY_DRIVER_SPECIFIC + 5) -#define R600_QUERY_VRAM_USAGE (PIPE_QUERY_DRIVER_SPECIFIC + 6) -#define R600_QUERY_GTT_USAGE (PIPE_QUERY_DRIVER_SPECIFIC + 7) -#define R600_QUERY_GPU_TEMPERATURE (PIPE_QUERY_DRIVER_SPECIFIC + 8) -#define R600_QUERY_CURRENT_GPU_SCLK (PIPE_QUERY_DRIVER_SPECIFIC + 9) -#define R600_QUERY_CURRENT_GPU_MCLK (PIPE_QUERY_DRIVER_SPECIFIC + 10) -#define R600_QUERY_GPU_LOAD (PIPE_QUERY_DRIVER_SPECIFIC + 11) -#define R600_QUERY_NUM_COMPILATIONS (PIPE_QUERY_DRIVER_SPECIFIC + 12) -#define R600_QUERY_NUM_SHADERS_CREATED (PIPE_QUERY_DRIVER_SPECIFIC + 13) -#define R600_QUERY_GPIN_ASIC_ID (PIPE_QUERY_DRIVER_SPECIFIC + 14) -#define R600_QUERY_GPIN_NUM_SIMD (PIPE_QUERY_DRIVER_SPECIFIC + 15) -#define R600_QUERY_GPIN_NUM_RB (PIPE_QUERY_DRIVER_SPECIFIC + 16) -#define R600_QUERY_GPIN_NUM_SPI (PIPE_QUERY_DRIVER_SPECIFIC + 17) -#define R600_QUERY_GPIN_NUM_SE (PIPE_QUERY_DRIVER_SPECIFIC + 18) -#define R600_QUERY_FIRST_PERFCOUNTER (PIPE_QUERY_DRIVER_SPECIFIC + 100) +enum { + R600_QUERY_DRAW_CALLS = PIPE_QUERY_DRIVER_SPECIFIC, + R600_QUERY_SPILL_DRAW_CALLS, + R600_QUERY_COMPUTE_CALLS, + R600_QUERY_SPILL_COMPUTE_CALLS, + R600_QUERY_DMA_CALLS, + R600_QUERY_NUM_VS_FLUSHES, + R600_QUERY_NUM_PS_FLUSHES, + R600_QUERY_NUM_CS_FLUSHES, + R600_QUERY_REQUESTED_VRAM, + R600_QUERY_REQUESTED_GTT, + R600_QUERY_MAPPED_VRAM, + R600_QUERY_MAPPED_GTT, + R600_QUERY_BUFFER_WAIT_TIME, + R600_QUERY_NUM_CTX_FLUSHES, + R600_QUERY_NUM_BYTES_MOVED, + R600_QUERY_NUM_EVICTIONS, + R600_QUERY_VRAM_USAGE, + R600_QUERY_GTT_USAGE, + R600_QUERY_GPU_TEMPERATURE, + R600_QUERY_CURRENT_GPU_SCLK, + R600_QUERY_CURRENT_GPU_MCLK, + R600_QUERY_GPU_LOAD, + R600_QUERY_NUM_COMPILATIONS, + R600_QUERY_NUM_SHADERS_CREATED, + R600_QUERY_BACK_BUFFER_PS_DRAW_RATIO, + R600_QUERY_GPIN_ASIC_ID, + R600_QUERY_GPIN_NUM_SIMD, + R600_QUERY_GPIN_NUM_RB, + R600_QUERY_GPIN_NUM_SPI, + R600_QUERY_GPIN_NUM_SE, + + R600_QUERY_FIRST_PERFCOUNTER = PIPE_QUERY_DRIVER_SPECIFIC + 100, +}; enum { R600_QUERY_GROUP_GPIN = 0, @@ -68,11 +84,17 @@ enum { struct r600_query_ops { void (*destroy)(struct r600_common_context *, struct r600_query *); - boolean (*begin)(struct r600_common_context *, struct r600_query *); - void (*end)(struct r600_common_context *, struct r600_query *); - boolean (*get_result)(struct r600_common_context *, - struct r600_query *, boolean wait, - union pipe_query_result *result); + bool (*begin)(struct r600_common_context *, struct r600_query *); + bool (*end)(struct r600_common_context *, struct r600_query *); + bool (*get_result)(struct r600_common_context *, + struct r600_query *, bool wait, + union pipe_query_result *result); + void (*get_result_resource)(struct r600_common_context *, + struct r600_query *, bool wait, + enum pipe_query_value_type result_type, + int index, + struct pipe_resource *resource, + unsigned offset); }; struct r600_query { @@ -84,12 +106,13 @@ struct r600_query { enum { R600_QUERY_HW_FLAG_NO_START = (1 << 0), - R600_QUERY_HW_FLAG_TIMER = (1 << 1), - R600_QUERY_HW_FLAG_PREDICATE = (1 << 2), + /* gap */ + /* whether begin_query doesn't clear the result */ + R600_QUERY_HW_FLAG_BEGIN_RESUMES = (1 << 2), }; struct r600_query_hw_ops { - void (*prepare_buffer)(struct r600_common_context *, + bool (*prepare_buffer)(struct r600_common_context *, struct r600_query_hw *, struct r600_resource *); void (*emit_start)(struct r600_common_context *, @@ -134,18 +157,18 @@ struct r600_query_hw { unsigned stream; }; -boolean r600_query_hw_init(struct r600_common_context *rctx, - struct r600_query_hw *query); +bool r600_query_hw_init(struct r600_common_context *rctx, + struct r600_query_hw *query); void r600_query_hw_destroy(struct r600_common_context *rctx, struct r600_query *rquery); -boolean r600_query_hw_begin(struct r600_common_context *rctx, - struct r600_query *rquery); -void r600_query_hw_end(struct r600_common_context *rctx, +bool r600_query_hw_begin(struct r600_common_context *rctx, + struct r600_query *rquery); +bool r600_query_hw_end(struct r600_common_context *rctx, struct r600_query *rquery); -boolean r600_query_hw_get_result(struct r600_common_context *rctx, - struct r600_query *rquery, - boolean wait, - union pipe_query_result *result); +bool r600_query_hw_get_result(struct r600_common_context *rctx, + struct r600_query *rquery, + bool wait, + union pipe_query_result *result); /* Performance counters */ enum { @@ -227,8 +250,8 @@ struct r600_perfcounters { void (*cleanup)(struct r600_common_screen *); - boolean separate_se; - boolean separate_instance; + bool separate_se; + bool separate_instance; }; struct pipe_query *r600_create_batch_query(struct pipe_context *ctx, @@ -242,12 +265,20 @@ int r600_get_perfcounter_group_info(struct r600_common_screen *, unsigned index, struct pipe_driver_query_group_info *info); -boolean r600_perfcounters_init(struct r600_perfcounters *, unsigned num_blocks); +bool r600_perfcounters_init(struct r600_perfcounters *, unsigned num_blocks); void r600_perfcounters_add_block(struct r600_common_screen *, struct r600_perfcounters *, const char *name, unsigned flags, unsigned counters, unsigned selectors, unsigned instances, void *data); void r600_perfcounters_do_destroy(struct r600_perfcounters *); +void r600_query_hw_reset_buffers(struct r600_common_context *rctx, + struct r600_query_hw *query); + +struct r600_qbo_state { + void *saved_compute; + struct pipe_constant_buffer saved_const0; + struct pipe_shader_buffer saved_ssbo[3]; +}; #endif /* R600_QUERY_H */ diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_streamout.c b/lib/mesa/src/gallium/drivers/radeon/r600_streamout.c index 0853f636a..b5296aa56 100644 --- a/lib/mesa/src/gallium/drivers/radeon/r600_streamout.c +++ b/lib/mesa/src/gallium/drivers/radeon/r600_streamout.c @@ -46,7 +46,7 @@ r600_create_so_target(struct pipe_context *ctx, return NULL; } - u_suballocator_alloc(rctx->allocator_so_filled_size, 4, + u_suballocator_alloc(rctx->allocator_zeroed_memory, 4, 4, &t->buf_filled_size_offset, (struct pipe_resource**)&t->buf_filled_size); if (!t->buf_filled_size) { @@ -70,7 +70,7 @@ static void r600_so_target_destroy(struct pipe_context *ctx, { struct r600_so_target *t = (struct r600_so_target*)target; pipe_resource_reference(&t->b.buffer, NULL); - pipe_resource_reference((struct pipe_resource**)&t->buf_filled_size, NULL); + r600_resource_reference(&t->buf_filled_size, NULL); FREE(t); } @@ -116,7 +116,7 @@ void r600_set_streamout_targets(struct pipe_context *ctx, { struct r600_common_context *rctx = (struct r600_common_context *)ctx; unsigned i; - unsigned append_bitmask = 0; + unsigned enabled_mask = 0, append_bitmask = 0; /* Stop streamout. */ if (rctx->streamout.num_targets && rctx->streamout.begin_emitted) { @@ -126,18 +126,19 @@ void r600_set_streamout_targets(struct pipe_context *ctx, /* Set the new targets. */ for (i = 0; i < num_targets; i++) { pipe_so_target_reference((struct pipe_stream_output_target**)&rctx->streamout.targets[i], targets[i]); + if (!targets[i]) + continue; + r600_context_add_resource_size(ctx, targets[i]->buffer); + enabled_mask |= 1 << i; if (offsets[i] == ((unsigned)-1)) - append_bitmask |= 1 << i; + append_bitmask |= 1 << i; } for (; i < rctx->streamout.num_targets; i++) { pipe_so_target_reference((struct pipe_stream_output_target**)&rctx->streamout.targets[i], NULL); } - rctx->streamout.enabled_mask = (num_targets >= 1 && targets[0] ? 1 : 0) | - (num_targets >= 2 && targets[1] ? 2 : 0) | - (num_targets >= 3 && targets[2] ? 4 : 0) | - (num_targets >= 4 && targets[3] ? 8 : 0); + rctx->streamout.enabled_mask = enabled_mask; rctx->streamout.num_targets = num_targets; rctx->streamout.append_bitmask = append_bitmask; @@ -152,7 +153,7 @@ void r600_set_streamout_targets(struct pipe_context *ctx, static void r600_flush_vgt_streamout(struct r600_common_context *rctx) { - struct radeon_winsys_cs *cs = rctx->rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->gfx.cs; unsigned reg_strmout_cntl; /* The register is at different places on different ASICs. */ @@ -165,9 +166,9 @@ static void r600_flush_vgt_streamout(struct r600_common_context *rctx) } if (rctx->chip_class >= CIK) { - cik_write_uconfig_reg(cs, reg_strmout_cntl, 0); + radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0); } else { - r600_write_config_reg(cs, reg_strmout_cntl, 0); + radeon_set_config_reg(cs, reg_strmout_cntl, 0); } radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); @@ -184,7 +185,7 @@ static void r600_flush_vgt_streamout(struct r600_common_context *rctx) static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->gfx.cs; struct r600_so_target **t = rctx->streamout.targets; unsigned *stride_in_dw = rctx->streamout.stride_in_dw; unsigned i, update_flags = 0; @@ -201,7 +202,7 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r /* SI binds streamout buffers as shader resources. * VGT only counts primitives and tells the shader * through SGPRs what to do. */ - r600_write_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2); + radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2); radeon_emit(cs, (t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */ radeon_emit(cs, stride_in_dw[i]); /* VTX_STRIDE (in DW) */ @@ -210,14 +211,14 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r update_flags |= SURFACE_BASE_UPDATE_STRMOUT(i); - r600_write_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 3); + radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 3); radeon_emit(cs, (t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */ radeon_emit(cs, stride_in_dw[i]); /* VTX_STRIDE (in DW) */ radeon_emit(cs, va >> 8); /* BUFFER_BASE */ - r600_emit_reloc(rctx, &rctx->rings.gfx, r600_resource(t[i]->b.buffer), - RADEON_USAGE_WRITE, RADEON_PRIO_SHADER_RESOURCE_RW); + r600_emit_reloc(rctx, &rctx->gfx, r600_resource(t[i]->b.buffer), + RADEON_USAGE_WRITE, RADEON_PRIO_SHADER_RW_BUFFER); /* R7xx requires this packet after updating BUFFER_BASE. * Without this, R7xx locks up. */ @@ -226,8 +227,8 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r radeon_emit(cs, i); radeon_emit(cs, va >> 8); - r600_emit_reloc(rctx, &rctx->rings.gfx, r600_resource(t[i]->b.buffer), - RADEON_USAGE_WRITE, RADEON_PRIO_SHADER_RESOURCE_RW); + r600_emit_reloc(rctx, &rctx->gfx, r600_resource(t[i]->b.buffer), + RADEON_USAGE_WRITE, RADEON_PRIO_SHADER_RW_BUFFER); } } @@ -244,8 +245,8 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r radeon_emit(cs, va); /* src address lo */ radeon_emit(cs, va >> 32); /* src address hi */ - r600_emit_reloc(rctx, &rctx->rings.gfx, t[i]->buf_filled_size, - RADEON_USAGE_READ, RADEON_PRIO_MIN); + r600_emit_reloc(rctx, &rctx->gfx, t[i]->buf_filled_size, + RADEON_USAGE_READ, RADEON_PRIO_SO_FILLED_SIZE); } else { /* Start from the beginning. */ radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); @@ -267,7 +268,7 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r void r600_emit_streamout_end(struct r600_common_context *rctx) { - struct radeon_winsys_cs *cs = rctx->rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->gfx.cs; struct r600_so_target **t = rctx->streamout.targets; unsigned i; uint64_t va; @@ -288,14 +289,14 @@ void r600_emit_streamout_end(struct r600_common_context *rctx) radeon_emit(cs, 0); /* unused */ radeon_emit(cs, 0); /* unused */ - r600_emit_reloc(rctx, &rctx->rings.gfx, t[i]->buf_filled_size, - RADEON_USAGE_WRITE, RADEON_PRIO_MIN); + r600_emit_reloc(rctx, &rctx->gfx, t[i]->buf_filled_size, + RADEON_USAGE_WRITE, RADEON_PRIO_SO_FILLED_SIZE); /* Zero the buffer size. The counters (primitives generated, * primitives emitted) may be enabled even if there is not * buffer bound. This ensures that the primitives-emitted query * won't increment. */ - r600_write_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0); + radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0); t[i]->buf_filled_size_valid = true; } @@ -311,12 +312,6 @@ void r600_emit_streamout_end(struct r600_common_context *rctx) * are no buffers bound. */ -static bool r600_get_strmout_en(struct r600_common_context *rctx) -{ - return rctx->streamout.streamout_enabled || - rctx->streamout.prims_gen_query_enabled; -} - static void r600_emit_streamout_enable(struct r600_common_context *rctx, struct r600_atom *atom) { @@ -336,8 +331,8 @@ static void r600_emit_streamout_enable(struct r600_common_context *rctx, S_028B94_STREAMOUT_2_EN(r600_get_strmout_en(rctx)) | S_028B94_STREAMOUT_3_EN(r600_get_strmout_en(rctx)); } - r600_write_context_reg(rctx->rings.gfx.cs, strmout_buffer_reg, strmout_buffer_val); - r600_write_context_reg(rctx->rings.gfx.cs, strmout_config_reg, strmout_config_val); + radeon_set_context_reg(rctx->gfx.cs, strmout_buffer_reg, strmout_buffer_val); + radeon_set_context_reg(rctx->gfx.cs, strmout_config_reg, strmout_config_val); } static void r600_set_streamout_enable(struct r600_common_context *rctx, bool enable) diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_texture.c b/lib/mesa/src/gallium/drivers/radeon/r600_texture.c index e9bd4a21f..27035c0fa 100644 --- a/lib/mesa/src/gallium/drivers/radeon/r600_texture.c +++ b/lib/mesa/src/gallium/drivers/radeon/r600_texture.c @@ -26,12 +26,82 @@ */ #include "r600_pipe_common.h" #include "r600_cs.h" +#include "r600_query.h" #include "util/u_format.h" #include "util/u_memory.h" #include "util/u_pack_color.h" +#include "util/u_surface.h" +#include "os/os_time.h" #include <errno.h> #include <inttypes.h> +static void r600_texture_discard_cmask(struct r600_common_screen *rscreen, + struct r600_texture *rtex); +static unsigned r600_choose_tiling(struct r600_common_screen *rscreen, + const struct pipe_resource *templ); + + +bool r600_prepare_for_dma_blit(struct r600_common_context *rctx, + struct r600_texture *rdst, + unsigned dst_level, unsigned dstx, + unsigned dsty, unsigned dstz, + struct r600_texture *rsrc, + unsigned src_level, + const struct pipe_box *src_box) +{ + if (!rctx->dma.cs) + return false; + + if (util_format_get_blocksizebits(rdst->resource.b.b.format) != + util_format_get_blocksizebits(rsrc->resource.b.b.format)) + return false; + + /* MSAA: Blits don't exist in the real world. */ + if (rsrc->resource.b.b.nr_samples > 1 || + rdst->resource.b.b.nr_samples > 1) + return false; + + /* Depth-stencil surfaces: + * When dst is linear, the DB->CB copy preserves HTILE. + * When dst is tiled, the 3D path must be used to update HTILE. + */ + if (rsrc->is_depth || rdst->is_depth) + return false; + + /* DCC as: + * src: Use the 3D path. DCC decompression is expensive. + * dst: Use the 3D path to compress the pixels with DCC. + */ + if ((rsrc->dcc_offset && rsrc->surface.level[src_level].dcc_enabled) || + (rdst->dcc_offset && rdst->surface.level[dst_level].dcc_enabled)) + return false; + + /* CMASK as: + * src: Both texture and SDMA paths need decompression. Use SDMA. + * dst: If overwriting the whole texture, discard CMASK and use + * SDMA. Otherwise, use the 3D path. + */ + if (rdst->cmask.size && rdst->dirty_level_mask & (1 << dst_level)) { + /* The CMASK clear is only enabled for the first level. */ + assert(dst_level == 0); + if (!util_texrange_covers_whole_level(&rdst->resource.b.b, dst_level, + dstx, dsty, dstz, src_box->width, + src_box->height, src_box->depth)) + return false; + + r600_texture_discard_cmask(rctx->screen, rdst); + } + + /* All requirements are met. Prepare textures for SDMA. */ + if (rsrc->cmask.size && rsrc->dirty_level_mask & (1 << src_level)) + rctx->b.flush_resource(&rctx->b, &rsrc->resource.b.b); + + assert(!(rsrc->dirty_level_mask & (1 << src_level))); + assert(!(rdst->dirty_level_mask & (1 << dst_level))); + + return true; +} + /* Same as resource_copy_region, except that both upsampling and downsampling are allowed. */ static void r600_copy_region_with_blit(struct pipe_context *pipe, struct pipe_resource *dst, @@ -122,7 +192,8 @@ static int r600_init_surface(struct r600_common_screen *rscreen, struct radeon_surf *surface, const struct pipe_resource *ptex, unsigned array_mode, - bool is_flushed_depth) + bool is_flushed_depth, + bool tc_compatible_htile) { const struct util_format_description *desc = util_format_description(ptex->format); @@ -169,8 +240,9 @@ static int r600_init_surface(struct r600_common_screen *rscreen, surface->flags |= RADEON_SURF_SET(RADEON_SURF_TYPE_1D_ARRAY, TYPE); surface->array_size = ptex->array_size; break; - case PIPE_TEXTURE_2D_ARRAY: case PIPE_TEXTURE_CUBE_ARRAY: /* cube array layout like 2d array */ + assert(ptex->array_size % 6 == 0); + case PIPE_TEXTURE_2D_ARRAY: surface->flags |= RADEON_SURF_SET(RADEON_SURF_TYPE_2D_ARRAY, TYPE); surface->array_size = ptex->array_size; break; @@ -181,29 +253,55 @@ static int r600_init_surface(struct r600_common_screen *rscreen, default: return -EINVAL; } - if (ptex->bind & PIPE_BIND_SCANOUT) { - surface->flags |= RADEON_SURF_SCANOUT; - } if (!is_flushed_depth && is_depth) { surface->flags |= RADEON_SURF_ZBUFFER; + if (tc_compatible_htile && + array_mode == RADEON_SURF_MODE_2D) { + /* TC-compatible HTILE only supports Z32_FLOAT. + * Promote Z16 to Z32. DB->CB copies will convert + * the format for transfers. + */ + surface->bpe = 4; + surface->flags |= RADEON_SURF_TC_COMPATIBLE_HTILE; + } + if (is_stencil) { surface->flags |= RADEON_SURF_SBUFFER | RADEON_SURF_HAS_SBUFFER_MIPTREE; } } + if (rscreen->chip_class >= SI) { surface->flags |= RADEON_SURF_HAS_TILE_MODE_INDEX; } + + if (rscreen->chip_class >= VI && + (ptex->flags & R600_RESOURCE_FLAG_DISABLE_DCC || + ptex->format == PIPE_FORMAT_R9G9B9E5_FLOAT)) + surface->flags |= RADEON_SURF_DISABLE_DCC; + + if (ptex->bind & PIPE_BIND_SCANOUT) { + /* This should catch bugs in gallium users setting incorrect flags. */ + assert(surface->nsamples == 1 && + surface->array_size == 1 && + surface->npix_z == 1 && + surface->last_level == 0 && + !(surface->flags & RADEON_SURF_Z_OR_SBUFFER)); + + surface->flags |= RADEON_SURF_SCANOUT; + } return 0; } static int r600_setup_surface(struct pipe_screen *screen, struct r600_texture *rtex, - unsigned pitch_in_bytes_override) + unsigned pitch_in_bytes_override, + unsigned offset) { struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; + unsigned i; int r; r = rscreen->ws->surface_init(rscreen->ws, &rtex->surface); @@ -220,39 +318,292 @@ static int r600_setup_surface(struct pipe_screen *screen, rtex->surface.level[0].nblk_x = pitch_in_bytes_override / rtex->surface.bpe; rtex->surface.level[0].pitch_bytes = pitch_in_bytes_override; rtex->surface.level[0].slice_size = pitch_in_bytes_override * rtex->surface.level[0].nblk_y; - if (rtex->surface.flags & RADEON_SURF_SBUFFER) { - rtex->surface.stencil_offset = - rtex->surface.stencil_level[0].offset = rtex->surface.level[0].slice_size; - } + } + + if (offset) { + for (i = 0; i < ARRAY_SIZE(rtex->surface.level); ++i) + rtex->surface.level[i].offset += offset; } return 0; } -static boolean r600_texture_get_handle(struct pipe_screen* screen, - struct pipe_resource *ptex, - struct winsys_handle *whandle) +static void r600_texture_init_metadata(struct r600_texture *rtex, + struct radeon_bo_metadata *metadata) { - struct r600_texture *rtex = (struct r600_texture*)ptex; - struct r600_resource *resource = &rtex->resource; struct radeon_surf *surface = &rtex->surface; + + memset(metadata, 0, sizeof(*metadata)); + metadata->microtile = surface->level[0].mode >= RADEON_SURF_MODE_1D ? + RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR; + metadata->macrotile = surface->level[0].mode >= RADEON_SURF_MODE_2D ? + RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR; + metadata->pipe_config = surface->pipe_config; + metadata->bankw = surface->bankw; + metadata->bankh = surface->bankh; + metadata->tile_split = surface->tile_split; + metadata->mtilea = surface->mtilea; + metadata->num_banks = surface->num_banks; + metadata->stride = surface->level[0].pitch_bytes; + metadata->scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0; +} + +static void r600_dirty_all_framebuffer_states(struct r600_common_screen *rscreen) +{ + p_atomic_inc(&rscreen->dirty_fb_counter); +} + +static void r600_eliminate_fast_color_clear(struct r600_common_context *rctx, + struct r600_texture *rtex) +{ + struct r600_common_screen *rscreen = rctx->screen; + struct pipe_context *ctx = &rctx->b; + + if (ctx == rscreen->aux_context) + pipe_mutex_lock(rscreen->aux_context_lock); + + ctx->flush_resource(ctx, &rtex->resource.b.b); + ctx->flush(ctx, NULL, 0); + + if (ctx == rscreen->aux_context) + pipe_mutex_unlock(rscreen->aux_context_lock); +} + +static void r600_texture_discard_cmask(struct r600_common_screen *rscreen, + struct r600_texture *rtex) +{ + if (!rtex->cmask.size) + return; + + assert(rtex->resource.b.b.nr_samples <= 1); + + /* Disable CMASK. */ + memset(&rtex->cmask, 0, sizeof(rtex->cmask)); + rtex->cmask.base_address_reg = rtex->resource.gpu_address >> 8; + rtex->dirty_level_mask = 0; + + if (rscreen->chip_class >= SI) + rtex->cb_color_info &= ~SI_S_028C70_FAST_CLEAR(1); + else + rtex->cb_color_info &= ~EG_S_028C70_FAST_CLEAR(1); + + if (rtex->cmask_buffer != &rtex->resource) + r600_resource_reference(&rtex->cmask_buffer, NULL); + + /* Notify all contexts about the change. */ + r600_dirty_all_framebuffer_states(rscreen); + p_atomic_inc(&rscreen->compressed_colortex_counter); +} + +static bool r600_can_disable_dcc(struct r600_texture *rtex) +{ + /* We can't disable DCC if it can be written by another process. */ + return rtex->dcc_offset && + (!rtex->resource.is_shared || + !(rtex->resource.external_usage & PIPE_HANDLE_USAGE_WRITE)); +} + +static bool r600_texture_discard_dcc(struct r600_common_screen *rscreen, + struct r600_texture *rtex) +{ + if (!r600_can_disable_dcc(rtex)) + return false; + + assert(rtex->dcc_separate_buffer == NULL); + + /* Disable DCC. */ + rtex->dcc_offset = 0; + + /* Notify all contexts about the change. */ + r600_dirty_all_framebuffer_states(rscreen); + return true; +} + +/** + * Disable DCC for the texture. (first decompress, then discard metadata). + * + * There is unresolved multi-context synchronization issue between + * screen::aux_context and the current context. If applications do this with + * multiple contexts, it's already undefined behavior for them and we don't + * have to worry about that. The scenario is: + * + * If context 1 disables DCC and context 2 has queued commands that write + * to the texture via CB with DCC enabled, and the order of operations is + * as follows: + * context 2 queues draw calls rendering to the texture, but doesn't flush + * context 1 disables DCC and flushes + * context 1 & 2 reset descriptors and FB state + * context 2 flushes (new compressed tiles written by the draw calls) + * context 1 & 2 read garbage, because DCC is disabled, yet there are + * compressed tiled + * + * \param rctx the current context if you have one, or rscreen->aux_context + * if you don't. + */ +bool r600_texture_disable_dcc(struct r600_common_context *rctx, + struct r600_texture *rtex) +{ + struct r600_common_screen *rscreen = rctx->screen; + + if (!r600_can_disable_dcc(rtex)) + return false; + + if (&rctx->b == rscreen->aux_context) + pipe_mutex_lock(rscreen->aux_context_lock); + + /* Decompress DCC. */ + rctx->decompress_dcc(&rctx->b, rtex); + rctx->b.flush(&rctx->b, NULL, 0); + + if (&rctx->b == rscreen->aux_context) + pipe_mutex_unlock(rscreen->aux_context_lock); + + return r600_texture_discard_dcc(rscreen, rtex); +} + +static void r600_degrade_tile_mode_to_linear(struct r600_common_context *rctx, + struct r600_texture *rtex, + bool invalidate_storage) +{ + struct pipe_screen *screen = rctx->b.screen; + struct r600_texture *new_tex; + struct pipe_resource templ = rtex->resource.b.b; + unsigned i; + + templ.bind |= PIPE_BIND_LINEAR; + + /* r600g doesn't react to dirty_tex_descriptor_counter */ + if (rctx->chip_class < SI) + return; + + if (rtex->resource.is_shared || + rtex->surface.level[0].mode == RADEON_SURF_MODE_LINEAR_ALIGNED) + return; + + /* This fails with MSAA, depth, and compressed textures. */ + if (r600_choose_tiling(rctx->screen, &templ) != + RADEON_SURF_MODE_LINEAR_ALIGNED) + return; + + new_tex = (struct r600_texture*)screen->resource_create(screen, &templ); + if (!new_tex) + return; + + /* Copy the pixels to the new texture. */ + if (!invalidate_storage) { + for (i = 0; i <= templ.last_level; i++) { + struct pipe_box box; + + u_box_3d(0, 0, 0, + u_minify(templ.width0, i), u_minify(templ.height0, i), + util_max_layer(&templ, i) + 1, &box); + + rctx->dma_copy(&rctx->b, &new_tex->resource.b.b, i, 0, 0, 0, + &rtex->resource.b.b, i, &box); + } + } + + r600_texture_discard_cmask(rctx->screen, rtex); + r600_texture_discard_dcc(rctx->screen, rtex); + + /* Replace the structure fields of rtex. */ + rtex->resource.b.b.bind = templ.bind; + pb_reference(&rtex->resource.buf, new_tex->resource.buf); + rtex->resource.gpu_address = new_tex->resource.gpu_address; + rtex->resource.vram_usage = new_tex->resource.vram_usage; + rtex->resource.gart_usage = new_tex->resource.gart_usage; + rtex->resource.bo_size = new_tex->resource.bo_size; + rtex->resource.bo_alignment = new_tex->resource.bo_alignment; + rtex->resource.domains = new_tex->resource.domains; + rtex->resource.flags = new_tex->resource.flags; + rtex->size = new_tex->size; + rtex->surface = new_tex->surface; + rtex->non_disp_tiling = new_tex->non_disp_tiling; + rtex->cb_color_info = new_tex->cb_color_info; + rtex->cmask = new_tex->cmask; /* needed even without CMASK */ + + assert(!rtex->htile_buffer); + assert(!rtex->cmask.size); + assert(!rtex->fmask.size); + assert(!rtex->dcc_offset); + assert(!rtex->is_depth); + + r600_texture_reference(&new_tex, NULL); + + r600_dirty_all_framebuffer_states(rctx->screen); + p_atomic_inc(&rctx->screen->dirty_tex_descriptor_counter); +} + +static boolean r600_texture_get_handle(struct pipe_screen* screen, + struct pipe_context *ctx, + struct pipe_resource *resource, + struct winsys_handle *whandle, + unsigned usage) +{ struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; + struct r600_common_context *rctx = (struct r600_common_context*) + (ctx ? ctx : rscreen->aux_context); + struct r600_resource *res = (struct r600_resource*)resource; + struct r600_texture *rtex = (struct r600_texture*)resource; + struct radeon_bo_metadata metadata; + bool update_metadata = false; + + /* This is not supported now, but it might be required for OpenCL + * interop in the future. + */ + if (resource->target != PIPE_BUFFER && + (resource->nr_samples > 1 || rtex->is_depth)) + return false; + + if (resource->target != PIPE_BUFFER) { + /* Since shader image stores don't support DCC on VI, + * disable it for external clients that want write + * access. + */ + if (usage & PIPE_HANDLE_USAGE_WRITE && rtex->dcc_offset) { + if (r600_texture_disable_dcc(rctx, rtex)) + update_metadata = true; + } - rscreen->ws->buffer_set_tiling(resource->buf, - NULL, - surface->level[0].mode >= RADEON_SURF_MODE_1D ? - RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR, - surface->level[0].mode >= RADEON_SURF_MODE_2D ? - RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR, - surface->pipe_config, - surface->bankw, surface->bankh, - surface->tile_split, - surface->stencil_tile_split, - surface->mtilea, surface->num_banks, - surface->level[0].pitch_bytes, - (surface->flags & RADEON_SURF_SCANOUT) != 0); - - return rscreen->ws->buffer_get_handle(resource->buf, - surface->level[0].pitch_bytes, whandle); + if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) && + (rtex->cmask.size || rtex->dcc_offset)) { + /* Eliminate fast clear (both CMASK and DCC) */ + r600_eliminate_fast_color_clear(rctx, rtex); + + /* Disable CMASK if flush_resource isn't going + * to be called. + */ + if (rtex->cmask.size) + r600_texture_discard_cmask(rscreen, rtex); + } + + /* Set metadata. */ + if (!res->is_shared || update_metadata) { + r600_texture_init_metadata(rtex, &metadata); + if (rscreen->query_opaque_metadata) + rscreen->query_opaque_metadata(rscreen, rtex, + &metadata); + + rscreen->ws->buffer_set_metadata(res->buf, &metadata); + } + } + + if (res->is_shared) { + /* USAGE_EXPLICIT_FLUSH must be cleared if at least one user + * doesn't set it. + */ + res->external_usage |= usage & ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH; + if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH)) + res->external_usage &= ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH; + } else { + res->is_shared = true; + res->external_usage = usage; + } + + return rscreen->ws->buffer_get_handle(res->buf, + rtex->surface.level[0].pitch_bytes, + rtex->surface.level[0].offset, + rtex->surface.level[0].slice_size, + whandle); } static void r600_texture_destroy(struct pipe_screen *screen, @@ -261,14 +612,15 @@ static void r600_texture_destroy(struct pipe_screen *screen, struct r600_texture *rtex = (struct r600_texture*)ptex; struct r600_resource *resource = &rtex->resource; - if (rtex->flushed_depth_texture) - pipe_resource_reference((struct pipe_resource **)&rtex->flushed_depth_texture, NULL); + r600_texture_reference(&rtex->flushed_depth_texture, NULL); - pipe_resource_reference((struct pipe_resource**)&rtex->htile_buffer, NULL); + r600_resource_reference(&rtex->htile_buffer, NULL); if (rtex->cmask_buffer != &rtex->resource) { - pipe_resource_reference((struct pipe_resource**)&rtex->cmask_buffer, NULL); + r600_resource_reference(&rtex->cmask_buffer, NULL); } pb_reference(&resource->buf, NULL); + r600_resource_reference(&rtex->dcc_separate_buffer, NULL); + r600_resource_reference(&rtex->last_dcc_separate_buffer, NULL); FREE(rtex); } @@ -335,7 +687,7 @@ void r600_texture_get_fmask_info(struct r600_common_screen *rscreen, out->slice_tile_max -= 1; out->tile_mode_index = fmask.tiling_index[0]; - out->pitch = fmask.level[0].nblk_x; + out->pitch_in_pixels = fmask.level[0].nblk_x; out->bank_height = fmask.bankh; out->alignment = MAX2(256, fmask.bo_alignment); out->size = fmask.bo_size; @@ -347,7 +699,7 @@ static void r600_texture_allocate_fmask(struct r600_common_screen *rscreen, r600_texture_get_fmask_info(rscreen, rtex, rtex->resource.b.b.nr_samples, &rtex->fmask); - rtex->fmask.offset = align(rtex->size, rtex->fmask.alignment); + rtex->fmask.offset = align64(rtex->size, rtex->fmask.alignment); rtex->size = rtex->fmask.offset + rtex->fmask.size; } @@ -360,8 +712,8 @@ void r600_texture_get_cmask_info(struct r600_common_screen *rscreen, unsigned cmask_tile_elements = cmask_tile_width * cmask_tile_height; unsigned element_bits = 4; unsigned cmask_cache_bits = 1024; - unsigned num_pipes = rscreen->tiling_info.num_channels; - unsigned pipe_interleave_bytes = rscreen->tiling_info.group_bytes; + unsigned num_pipes = rscreen->info.num_tile_pipes; + unsigned pipe_interleave_bytes = rscreen->info.pipe_interleave_bytes; unsigned elements_per_macro_tile = (cmask_cache_bits / element_bits) * num_pipes; unsigned pixels_per_macro_tile = elements_per_macro_tile * cmask_tile_elements; @@ -379,6 +731,10 @@ void r600_texture_get_cmask_info(struct r600_common_screen *rscreen, assert(macro_tile_width % 128 == 0); assert(macro_tile_height % 128 == 0); + out->pitch = pitch_elements; + out->height = height; + out->xalign = macro_tile_width; + out->yalign = macro_tile_height; out->slice_tile_max = ((pitch_elements * height) / (128*128)) - 1; out->alignment = MAX2(256, base_align); out->size = (util_max_layer(&rtex->resource.b.b, 0) + 1) * @@ -389,8 +745,8 @@ static void si_texture_get_cmask_info(struct r600_common_screen *rscreen, struct r600_texture *rtex, struct r600_cmask_info *out) { - unsigned pipe_interleave_bytes = rscreen->tiling_info.group_bytes; - unsigned num_pipes = rscreen->tiling_info.num_channels; + unsigned pipe_interleave_bytes = rscreen->info.pipe_interleave_bytes; + unsigned num_pipes = rscreen->info.num_tile_pipes; unsigned cl_width, cl_height; switch (num_pipes) { @@ -424,6 +780,10 @@ static void si_texture_get_cmask_info(struct r600_common_screen *rscreen, /* Each element of CMASK is a nibble. */ unsigned slice_bytes = slice_elements / 2; + out->pitch = width; + out->height = height; + out->xalign = cl_width * 8; + out->yalign = cl_height * 8; out->slice_tile_max = (width * height) / (128*128); if (out->slice_tile_max) out->slice_tile_max -= 1; @@ -442,7 +802,7 @@ static void r600_texture_allocate_cmask(struct r600_common_screen *rscreen, r600_texture_get_cmask_info(rscreen, rtex, &rtex->cmask); } - rtex->cmask.offset = align(rtex->size, rtex->cmask.alignment); + rtex->cmask.offset = align64(rtex->size, rtex->cmask.alignment); rtex->size = rtex->cmask.offset + rtex->cmask.size; if (rscreen->chip_class >= SI) @@ -466,8 +826,9 @@ static void r600_texture_alloc_cmask_separate(struct r600_common_screen *rscreen } rtex->cmask_buffer = (struct r600_resource *) - pipe_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM, - PIPE_USAGE_DEFAULT, rtex->cmask.size); + r600_aligned_buffer_create(&rscreen->b, 0, PIPE_USAGE_DEFAULT, + rtex->cmask.size, + rtex->cmask.alignment); if (rtex->cmask_buffer == NULL) { rtex->cmask.size = 0; return; @@ -480,6 +841,8 @@ static void r600_texture_alloc_cmask_separate(struct r600_common_screen *rscreen rtex->cb_color_info |= SI_S_028C70_FAST_CLEAR(1); else rtex->cb_color_info |= EG_S_028C70_FAST_CLEAR(1); + + p_atomic_inc(&rscreen->compressed_colortex_counter); } static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen, @@ -487,7 +850,7 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen, { unsigned cl_width, cl_height, width, height; unsigned slice_elements, slice_bytes, pipe_interleave_bytes, base_align; - unsigned num_pipes = rscreen->tiling_info.num_channels; + unsigned num_pipes = rscreen->info.num_tile_pipes; if (rscreen->chip_class <= EVERGREEN && rscreen->info.drm_major == 2 && rscreen->info.drm_minor < 26) @@ -505,6 +868,16 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen, rscreen->info.drm_major == 2 && rscreen->info.drm_minor < 38) return 0; + /* Overalign HTILE on P2 configs to work around GPU hangs in + * piglit/depthstencil-render-miplevels 585. + * + * This has been confirmed to help Kabini & Stoney, where the hangs + * are always reproducible. I think I have seen the test hang + * on Carrizo too, though it was very rare there. + */ + if (rscreen->chip_class >= CIK && num_pipes < 4) + num_pipes = 4; + switch (num_pipes) { case 1: cl_width = 32; @@ -537,9 +910,15 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen, slice_elements = (width * height) / (8 * 8); slice_bytes = slice_elements * 4; - pipe_interleave_bytes = rscreen->tiling_info.group_bytes; + pipe_interleave_bytes = rscreen->info.pipe_interleave_bytes; base_align = num_pipes * pipe_interleave_bytes; + rtex->htile.pitch = width; + rtex->htile.height = height; + rtex->htile.xalign = cl_width * 8; + rtex->htile.yalign = cl_height * 8; + rtex->htile.alignment = base_align; + return (util_max_layer(&rtex->resource.b.b, 0) + 1) * align(slice_bytes, base_align); } @@ -547,21 +926,126 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen, static void r600_texture_allocate_htile(struct r600_common_screen *rscreen, struct r600_texture *rtex) { - unsigned htile_size = r600_texture_get_htile_size(rscreen, rtex); + uint64_t htile_size, alignment; + uint32_t clear_value; + + if (rtex->tc_compatible_htile) { + htile_size = rtex->surface.htile_size; + alignment = rtex->surface.htile_alignment; + clear_value = 0x0000030F; + } else { + htile_size = r600_texture_get_htile_size(rscreen, rtex); + alignment = rtex->htile.alignment; + clear_value = 0; + } if (!htile_size) return; rtex->htile_buffer = (struct r600_resource*) - pipe_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM, - PIPE_USAGE_DEFAULT, htile_size); + r600_aligned_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM, + PIPE_USAGE_DEFAULT, + htile_size, alignment); if (rtex->htile_buffer == NULL) { /* this is not a fatal error as we can still keep rendering * without htile buffer */ R600_ERR("Failed to create buffer object for htile buffer.\n"); } else { - r600_screen_clear_buffer(rscreen, &rtex->htile_buffer->b.b, 0, - htile_size, 0, true); + r600_screen_clear_buffer(rscreen, &rtex->htile_buffer->b.b, + 0, htile_size, clear_value, + R600_COHERENCY_NONE); + } +} + +void r600_print_texture_info(struct r600_texture *rtex, FILE *f) +{ + int i; + + fprintf(f, " Info: npix_x=%u, npix_y=%u, npix_z=%u, blk_w=%u, " + "blk_h=%u, blk_d=%u, array_size=%u, last_level=%u, " + "bpe=%u, nsamples=%u, flags=0x%x, %s\n", + rtex->surface.npix_x, rtex->surface.npix_y, + rtex->surface.npix_z, rtex->surface.blk_w, + rtex->surface.blk_h, rtex->surface.blk_d, + rtex->surface.array_size, rtex->surface.last_level, + rtex->surface.bpe, rtex->surface.nsamples, + rtex->surface.flags, util_format_short_name(rtex->resource.b.b.format)); + + fprintf(f, " Layout: size=%"PRIu64", alignment=%"PRIu64", bankw=%u, " + "bankh=%u, nbanks=%u, mtilea=%u, tilesplit=%u, pipeconfig=%u, scanout=%u\n", + rtex->surface.bo_size, rtex->surface.bo_alignment, rtex->surface.bankw, + rtex->surface.bankh, rtex->surface.num_banks, rtex->surface.mtilea, + rtex->surface.tile_split, rtex->surface.pipe_config, + (rtex->surface.flags & RADEON_SURF_SCANOUT) != 0); + + if (rtex->fmask.size) + fprintf(f, " FMask: offset=%"PRIu64", size=%"PRIu64", alignment=%u, pitch_in_pixels=%u, " + "bankh=%u, slice_tile_max=%u, tile_mode_index=%u\n", + rtex->fmask.offset, rtex->fmask.size, rtex->fmask.alignment, + rtex->fmask.pitch_in_pixels, rtex->fmask.bank_height, + rtex->fmask.slice_tile_max, rtex->fmask.tile_mode_index); + + if (rtex->cmask.size) + fprintf(f, " CMask: offset=%"PRIu64", size=%"PRIu64", alignment=%u, pitch=%u, " + "height=%u, xalign=%u, yalign=%u, slice_tile_max=%u\n", + rtex->cmask.offset, rtex->cmask.size, rtex->cmask.alignment, + rtex->cmask.pitch, rtex->cmask.height, rtex->cmask.xalign, + rtex->cmask.yalign, rtex->cmask.slice_tile_max); + + if (rtex->htile_buffer) + fprintf(f, " HTile: size=%u, alignment=%u, pitch=%u, height=%u, " + "xalign=%u, yalign=%u, TC_compatible = %u\n", + rtex->htile_buffer->b.b.width0, + rtex->htile_buffer->buf->alignment, rtex->htile.pitch, + rtex->htile.height, rtex->htile.xalign, rtex->htile.yalign, + rtex->tc_compatible_htile); + + if (rtex->dcc_offset) { + fprintf(f, " DCC: offset=%"PRIu64", size=%"PRIu64", alignment=%"PRIu64"\n", + rtex->dcc_offset, rtex->surface.dcc_size, + rtex->surface.dcc_alignment); + for (i = 0; i <= rtex->surface.last_level; i++) + fprintf(f, " DCCLevel[%i]: enabled=%u, offset=%"PRIu64", " + "fast_clear_size=%"PRIu64"\n", + i, rtex->surface.level[i].dcc_enabled, + rtex->surface.level[i].dcc_offset, + rtex->surface.level[i].dcc_fast_clear_size); + } + + for (i = 0; i <= rtex->surface.last_level; i++) + fprintf(f, " Level[%i]: offset=%"PRIu64", slice_size=%"PRIu64", " + "npix_x=%u, npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, " + "nblk_z=%u, pitch_bytes=%u, mode=%u\n", + i, rtex->surface.level[i].offset, + rtex->surface.level[i].slice_size, + u_minify(rtex->resource.b.b.width0, i), + u_minify(rtex->resource.b.b.height0, i), + u_minify(rtex->resource.b.b.depth0, i), + rtex->surface.level[i].nblk_x, + rtex->surface.level[i].nblk_y, + rtex->surface.level[i].nblk_z, + rtex->surface.level[i].pitch_bytes, + rtex->surface.level[i].mode); + + if (rtex->surface.flags & RADEON_SURF_SBUFFER) { + fprintf(f, " StencilLayout: tilesplit=%u\n", + rtex->surface.stencil_tile_split); + for (i = 0; i <= rtex->surface.last_level; i++) { + fprintf(f, " StencilLevel[%i]: offset=%"PRIu64", " + "slice_size=%"PRIu64", npix_x=%u, " + "npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, " + "nblk_z=%u, pitch_bytes=%u, mode=%u\n", + i, rtex->surface.stencil_level[i].offset, + rtex->surface.stencil_level[i].slice_size, + u_minify(rtex->resource.b.b.width0, i), + u_minify(rtex->resource.b.b.height0, i), + u_minify(rtex->resource.b.b.depth0, i), + rtex->surface.stencil_level[i].nblk_x, + rtex->surface.stencil_level[i].nblk_y, + rtex->surface.stencil_level[i].nblk_z, + rtex->surface.stencil_level[i].pitch_bytes, + rtex->surface.stencil_level[i].mode); + } } } @@ -570,6 +1054,7 @@ static struct r600_texture * r600_texture_create_object(struct pipe_screen *screen, const struct pipe_resource *base, unsigned pitch_in_bytes_override, + unsigned offset, struct pb_buffer *buf, struct radeon_surf *surface) { @@ -578,36 +1063,67 @@ r600_texture_create_object(struct pipe_screen *screen, struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; rtex = CALLOC_STRUCT(r600_texture); - if (rtex == NULL) + if (!rtex) return NULL; resource = &rtex->resource; resource->b.b = *base; + resource->b.b.next = NULL; resource->b.vtbl = &r600_texture_vtbl; pipe_reference_init(&resource->b.b.reference, 1); resource->b.b.screen = screen; - rtex->pitch_override = pitch_in_bytes_override; /* don't include stencil-only formats which we don't support for rendering */ rtex->is_depth = util_format_has_depth(util_format_description(rtex->resource.b.b.format)); rtex->surface = *surface; - if (r600_setup_surface(screen, rtex, pitch_in_bytes_override)) { + if (r600_setup_surface(screen, rtex, pitch_in_bytes_override, offset)) { FREE(rtex); return NULL; } + rtex->tc_compatible_htile = rtex->surface.htile_size != 0; + assert(!!(rtex->surface.flags & RADEON_SURF_TC_COMPATIBLE_HTILE) == + rtex->tc_compatible_htile); + + /* TC-compatible HTILE only supports Z32_FLOAT. */ + if (rtex->tc_compatible_htile) + rtex->db_render_format = PIPE_FORMAT_Z32_FLOAT; + else + rtex->db_render_format = base->format; + /* Tiled depth textures utilize the non-displayable tile order. * This must be done after r600_setup_surface. * Applies to R600-Cayman. */ rtex->non_disp_tiling = rtex->is_depth && rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D; + /* Applies to GCN. */ + rtex->last_msaa_resolve_target_micro_mode = rtex->surface.micro_tile_mode; + + /* Disable separate DCC at the beginning. DRI2 doesn't reuse buffers + * between frames, so the only thing that can enable separate DCC + * with DRI2 is multiple slow clears within a frame. + */ + rtex->ps_draw_ratio = 0; if (rtex->is_depth) { + if (base->flags & (R600_RESOURCE_FLAG_TRANSFER | + R600_RESOURCE_FLAG_FLUSHED_DEPTH) || + rscreen->chip_class >= EVERGREEN) { + rtex->can_sample_z = !rtex->surface.depth_adjusted; + rtex->can_sample_s = !rtex->surface.stencil_adjusted; + } else { + if (rtex->resource.b.b.nr_samples <= 1 && + (rtex->resource.b.b.format == PIPE_FORMAT_Z16_UNORM || + rtex->resource.b.b.format == PIPE_FORMAT_Z32_FLOAT)) + rtex->can_sample_z = true; + } + if (!(base->flags & (R600_RESOURCE_FLAG_TRANSFER | - R600_RESOURCE_FLAG_FLUSHED_DEPTH)) && - !(rscreen->debug_flags & DBG_NO_HYPERZ)) { + R600_RESOURCE_FLAG_FLUSHED_DEPTH))) { + rtex->db_compatible = true; - r600_texture_allocate_htile(rscreen, rtex); + if (!(rscreen->debug_flags & DBG_NO_HYPERZ)) + r600_texture_allocate_htile(rscreen, rtex); } } else { if (base->nr_samples > 1) { @@ -621,27 +1137,56 @@ r600_texture_create_object(struct pipe_screen *screen, return NULL; } } + + /* Shared textures must always set up DCC here. + * If it's not present, it will be disabled by + * apply_opaque_metadata later. + */ + if (rtex->surface.dcc_size && + (buf || !(rscreen->debug_flags & DBG_NO_DCC)) && + !(rtex->surface.flags & RADEON_SURF_SCANOUT)) { + /* Reserve space for the DCC buffer. */ + rtex->dcc_offset = align64(rtex->size, rtex->surface.dcc_alignment); + rtex->size = rtex->dcc_offset + rtex->surface.dcc_size; + } } /* Now create the backing buffer. */ if (!buf) { - if (!r600_init_resource(rscreen, resource, rtex->size, - rtex->surface.bo_alignment, TRUE)) { + r600_init_resource_fields(rscreen, resource, rtex->size, + rtex->surface.bo_alignment); + + resource->flags |= RADEON_FLAG_HANDLE; + + if (!r600_alloc_resource(rscreen, resource)) { FREE(rtex); return NULL; } } else { resource->buf = buf; - resource->cs_buf = rscreen->ws->buffer_get_cs_handle(buf); - resource->gpu_address = rscreen->ws->buffer_get_virtual_address(resource->cs_buf); - resource->domains = rscreen->ws->buffer_get_initial_domain(resource->cs_buf); + resource->gpu_address = rscreen->ws->buffer_get_virtual_address(resource->buf); + resource->bo_size = buf->size; + resource->bo_alignment = buf->alignment; + resource->domains = rscreen->ws->buffer_get_initial_domain(resource->buf); + if (resource->domains & RADEON_DOMAIN_VRAM) + resource->vram_usage = buf->size; + else if (resource->domains & RADEON_DOMAIN_GTT) + resource->gart_usage = buf->size; } if (rtex->cmask.size) { /* Initialize the cmask to 0xCC (= compressed state). */ r600_screen_clear_buffer(rscreen, &rtex->cmask_buffer->b.b, rtex->cmask.offset, rtex->cmask.size, - 0xCCCCCCCC, true); + 0xCCCCCCCC, R600_COHERENCY_NONE); + } + + /* Initialize DCC only if the texture is not being imported. */ + if (!buf && rtex->dcc_offset) { + r600_screen_clear_buffer(rscreen, &rtex->resource.b.b, + rtex->dcc_offset, + rtex->surface.dcc_size, + 0xFFFFFFFF, R600_COHERENCY_NONE); } /* Initialize the CMASK base register value. */ @@ -656,50 +1201,12 @@ r600_texture_create_object(struct pipe_screen *screen, base->nr_samples ? base->nr_samples : 1, util_format_short_name(base->format)); } - if (rscreen->debug_flags & DBG_TEX || - (rtex->resource.b.b.last_level > 0 && rscreen->debug_flags & DBG_TEXMIP)) { - printf("Texture: npix_x=%u, npix_y=%u, npix_z=%u, blk_w=%u, " - "blk_h=%u, blk_d=%u, array_size=%u, last_level=%u, " - "bpe=%u, nsamples=%u, flags=0x%x, %s\n", - rtex->surface.npix_x, rtex->surface.npix_y, - rtex->surface.npix_z, rtex->surface.blk_w, - rtex->surface.blk_h, rtex->surface.blk_d, - rtex->surface.array_size, rtex->surface.last_level, - rtex->surface.bpe, rtex->surface.nsamples, - rtex->surface.flags, util_format_short_name(base->format)); - for (int i = 0; i <= rtex->surface.last_level; i++) { - printf(" L %i: offset=%"PRIu64", slice_size=%"PRIu64", npix_x=%u, " - "npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, " - "nblk_z=%u, pitch_bytes=%u, mode=%u\n", - i, rtex->surface.level[i].offset, - rtex->surface.level[i].slice_size, - u_minify(rtex->resource.b.b.width0, i), - u_minify(rtex->resource.b.b.height0, i), - u_minify(rtex->resource.b.b.depth0, i), - rtex->surface.level[i].nblk_x, - rtex->surface.level[i].nblk_y, - rtex->surface.level[i].nblk_z, - rtex->surface.level[i].pitch_bytes, - rtex->surface.level[i].mode); - } - if (rtex->surface.flags & RADEON_SURF_SBUFFER) { - for (int i = 0; i <= rtex->surface.last_level; i++) { - printf(" S %i: offset=%"PRIu64", slice_size=%"PRIu64", npix_x=%u, " - "npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, " - "nblk_z=%u, pitch_bytes=%u, mode=%u\n", - i, rtex->surface.stencil_level[i].offset, - rtex->surface.stencil_level[i].slice_size, - u_minify(rtex->resource.b.b.width0, i), - u_minify(rtex->resource.b.b.height0, i), - u_minify(rtex->resource.b.b.depth0, i), - rtex->surface.stencil_level[i].nblk_x, - rtex->surface.stencil_level[i].nblk_y, - rtex->surface.stencil_level[i].nblk_z, - rtex->surface.stencil_level[i].pitch_bytes, - rtex->surface.stencil_level[i].mode); - } - } + if (rscreen->debug_flags & DBG_TEX) { + puts("Texture:"); + r600_print_texture_info(rtex, stdout); + fflush(stdout); } + return rtex; } @@ -725,13 +1232,12 @@ static unsigned r600_choose_tiling(struct r600_common_screen *rscreen, force_tiling = true; /* Handle common candidates for the linear mode. - * Compressed textures must always be tiled. */ - if (!force_tiling && !util_format_is_compressed(templ->format)) { - /* Not everything can be linear, so we cannot enforce it - * for all textures. */ - if ((rscreen->debug_flags & DBG_NO_TILING) && - (!util_format_is_depth_or_stencil(templ->format) || - !(templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH))) + * Compressed textures and DB surfaces must always be tiled. + */ + if (!force_tiling && !util_format_is_compressed(templ->format) && + (!util_format_is_depth_or_stencil(templ->format) || + templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH)) { + if (rscreen->debug_flags & DBG_NO_TILING) return RADEON_SURF_MODE_LINEAR_ALIGNED; /* Tiling doesn't work with the 422 (SUBSAMPLED) formats on R600+. */ @@ -773,11 +1279,20 @@ struct pipe_resource *r600_texture_create(struct pipe_screen *screen, { struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; struct radeon_surf surface = {0}; + bool is_flushed_depth = templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH; + bool tc_compatible_htile = + rscreen->chip_class >= VI && + (templ->flags & PIPE_RESOURCE_FLAG_TEXTURING_MORE_LIKELY) && + !(rscreen->debug_flags & DBG_NO_HYPERZ) && + !is_flushed_depth && + templ->nr_samples <= 1 && /* TC-compat HTILE is less efficient with MSAA */ + util_format_is_depth_or_stencil(templ->format); + int r; r = r600_init_surface(rscreen, &surface, templ, r600_choose_tiling(rscreen, templ), - templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH); + is_flushed_depth, tc_compatible_htile); if (r) { return NULL; } @@ -785,55 +1300,70 @@ struct pipe_resource *r600_texture_create(struct pipe_screen *screen, if (r) { return NULL; } - return (struct pipe_resource *)r600_texture_create_object(screen, templ, + return (struct pipe_resource *)r600_texture_create_object(screen, templ, 0, 0, NULL, &surface); } static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen, const struct pipe_resource *templ, - struct winsys_handle *whandle) + struct winsys_handle *whandle, + unsigned usage) { struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; struct pb_buffer *buf = NULL; - unsigned stride = 0; + unsigned stride = 0, offset = 0; unsigned array_mode; - enum radeon_bo_layout micro, macro; struct radeon_surf surface; - bool scanout; int r; + struct radeon_bo_metadata metadata = {}; + struct r600_texture *rtex; /* Support only 2D textures without mipmaps */ if ((templ->target != PIPE_TEXTURE_2D && templ->target != PIPE_TEXTURE_RECT) || templ->depth0 != 1 || templ->last_level != 0) return NULL; - buf = rscreen->ws->buffer_from_handle(rscreen->ws, whandle, &stride); + buf = rscreen->ws->buffer_from_handle(rscreen->ws, whandle, &stride, &offset); if (!buf) return NULL; - rscreen->ws->buffer_get_tiling(buf, µ, ¯o, - &surface.bankw, &surface.bankh, - &surface.tile_split, - &surface.stencil_tile_split, - &surface.mtilea, &scanout); + rscreen->ws->buffer_get_metadata(buf, &metadata); + + surface.pipe_config = metadata.pipe_config; + surface.bankw = metadata.bankw; + surface.bankh = metadata.bankh; + surface.tile_split = metadata.tile_split; + surface.mtilea = metadata.mtilea; + surface.num_banks = metadata.num_banks; - if (macro == RADEON_LAYOUT_TILED) + if (metadata.macrotile == RADEON_LAYOUT_TILED) array_mode = RADEON_SURF_MODE_2D; - else if (micro == RADEON_LAYOUT_TILED) + else if (metadata.microtile == RADEON_LAYOUT_TILED) array_mode = RADEON_SURF_MODE_1D; else array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED; - r = r600_init_surface(rscreen, &surface, templ, array_mode, false); + r = r600_init_surface(rscreen, &surface, templ, array_mode, + false, false); if (r) { return NULL; } - if (scanout) + if (metadata.scanout) surface.flags |= RADEON_SURF_SCANOUT; - return (struct pipe_resource *)r600_texture_create_object(screen, templ, - stride, buf, &surface); + rtex = r600_texture_create_object(screen, templ, stride, + offset, buf, &surface); + if (!rtex) + return NULL; + + rtex->resource.is_shared = true; + rtex->resource.external_usage = usage; + + if (rscreen->apply_opaque_metadata) + rscreen->apply_opaque_metadata(rscreen, rtex, &metadata); + + return &rtex->resource.b.b; } bool r600_init_flushed_depth_texture(struct pipe_context *ctx, @@ -844,12 +1374,44 @@ bool r600_init_flushed_depth_texture(struct pipe_context *ctx, struct pipe_resource resource; struct r600_texture **flushed_depth_texture = staging ? staging : &rtex->flushed_depth_texture; + enum pipe_format pipe_format = texture->format; + + if (!staging) { + if (rtex->flushed_depth_texture) + return true; /* it's ready */ + + if (!rtex->can_sample_z && rtex->can_sample_s) { + switch (pipe_format) { + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + /* Save memory by not allocating the S plane. */ + pipe_format = PIPE_FORMAT_Z32_FLOAT; + break; + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + /* Save memory bandwidth by not copying the + * stencil part during flush. + * + * This potentially increases memory bandwidth + * if an application uses both Z and S texturing + * simultaneously (a flushed Z24S8 texture + * would be stored compactly), but how often + * does that really happen? + */ + pipe_format = PIPE_FORMAT_Z24X8_UNORM; + break; + default:; + } + } else if (!rtex->can_sample_s && rtex->can_sample_z) { + assert(util_format_has_stencil(util_format_description(pipe_format))); - if (!staging && rtex->flushed_depth_texture) - return true; /* it's ready */ + /* DB->CB copies to an 8bpp surface don't work. */ + pipe_format = PIPE_FORMAT_X24S8_UINT; + } + } + memset(&resource, 0, sizeof(resource)); resource.target = texture->target; - resource.format = texture->format; + resource.format = pipe_format; resource.width0 = texture->width0; resource.height0 = texture->height0; resource.depth0 = texture->depth0; @@ -869,7 +1431,6 @@ bool r600_init_flushed_depth_texture(struct pipe_context *ctx, return false; } - (*flushed_depth_texture)->is_flushing_texture = TRUE; (*flushed_depth_texture)->non_disp_tiling = false; return true; } @@ -894,24 +1455,52 @@ static void r600_init_temp_resource_from_box(struct pipe_resource *res, res->flags = flags; /* We must set the correct texture target and dimensions for a 3D box. */ - if (box->depth > 1 && util_max_layer(orig, level) > 0) - res->target = orig->target; - else - res->target = PIPE_TEXTURE_2D; - - switch (res->target) { - case PIPE_TEXTURE_1D_ARRAY: - case PIPE_TEXTURE_2D_ARRAY: - case PIPE_TEXTURE_CUBE_ARRAY: + if (box->depth > 1 && util_max_layer(orig, level) > 0) { + res->target = PIPE_TEXTURE_2D_ARRAY; res->array_size = box->depth; - break; - case PIPE_TEXTURE_3D: - res->depth0 = box->depth; - break; - default:; + } else { + res->target = PIPE_TEXTURE_2D; } } +static bool r600_can_invalidate_texture(struct r600_common_screen *rscreen, + struct r600_texture *rtex, + unsigned transfer_usage, + const struct pipe_box *box) +{ + /* r600g doesn't react to dirty_tex_descriptor_counter */ + return rscreen->chip_class >= SI && + !rtex->resource.is_shared && + !(transfer_usage & PIPE_TRANSFER_READ) && + rtex->resource.b.b.last_level == 0 && + util_texrange_covers_whole_level(&rtex->resource.b.b, 0, + box->x, box->y, box->z, + box->width, box->height, + box->depth); +} + +static void r600_texture_invalidate_storage(struct r600_common_context *rctx, + struct r600_texture *rtex) +{ + struct r600_common_screen *rscreen = rctx->screen; + + /* There is no point in discarding depth and tiled buffers. */ + assert(!rtex->is_depth); + assert(rtex->surface.level[0].mode == RADEON_SURF_MODE_LINEAR_ALIGNED); + + /* Reallocate the buffer in the same pipe_resource. */ + r600_alloc_resource(rscreen, &rtex->resource); + + /* Initialize the CMASK base address (needed even without CMASK). */ + rtex->cmask.base_address_reg = + (rtex->resource.gpu_address + rtex->cmask.offset) >> 8; + + r600_dirty_all_framebuffer_states(rscreen); + p_atomic_inc(&rscreen->dirty_tex_descriptor_counter); + + rctx->num_alloc_tex_transfer_bytes += rtex->size; +} + static void *r600_texture_transfer_map(struct pipe_context *ctx, struct pipe_resource *texture, unsigned level, @@ -922,41 +1511,61 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx, struct r600_common_context *rctx = (struct r600_common_context*)ctx; struct r600_texture *rtex = (struct r600_texture*)texture; struct r600_transfer *trans; - boolean use_staging_texture = FALSE; struct r600_resource *buf; unsigned offset = 0; char *map; + bool use_staging_texture = false; - /* We cannot map a tiled texture directly because the data is - * in a different order, therefore we do detiling using a blit. - * - * Also, use a temporary in GTT memory for read transfers, as - * the CPU is much happier reading out of cached system memory - * than uncached VRAM. - */ - if (rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D) { - use_staging_texture = TRUE; - } else if ((usage & PIPE_TRANSFER_READ) && !(usage & PIPE_TRANSFER_MAP_DIRECTLY) && - (rtex->resource.domains == RADEON_DOMAIN_VRAM)) { - /* Untiled buffers in VRAM, which is slow for CPU reads */ - use_staging_texture = TRUE; - } else if (!(usage & PIPE_TRANSFER_READ) && - (r600_rings_is_buffer_referenced(rctx, rtex->resource.cs_buf, RADEON_USAGE_READWRITE) || - !rctx->ws->buffer_wait(rtex->resource.buf, 0, RADEON_USAGE_READWRITE))) { - /* Use a staging texture for uploads if the underlying BO is busy. */ - use_staging_texture = TRUE; - } + assert(!(texture->flags & R600_RESOURCE_FLAG_TRANSFER)); - if (texture->flags & R600_RESOURCE_FLAG_TRANSFER) { - use_staging_texture = FALSE; - } + /* Depth textures use staging unconditionally. */ + if (!rtex->is_depth) { + /* Degrade the tile mode if we get too many transfers on APUs. + * On dGPUs, the staging texture is always faster. + * Only count uploads that are at least 4x4 pixels large. + */ + if (!rctx->screen->info.has_dedicated_vram && + level == 0 && + box->width >= 4 && box->height >= 4 && + p_atomic_inc_return(&rtex->num_level0_transfers) == 10) { + bool can_invalidate = + r600_can_invalidate_texture(rctx->screen, rtex, + usage, box); + + r600_degrade_tile_mode_to_linear(rctx, rtex, + can_invalidate); + } - if (use_staging_texture && (usage & PIPE_TRANSFER_MAP_DIRECTLY)) { - return NULL; + /* Tiled textures need to be converted into a linear texture for CPU + * access. The staging texture is always linear and is placed in GART. + * + * Reading from VRAM is slow, always use the staging texture in + * this case. + * + * Use the staging texture for uploads if the underlying BO + * is busy. + */ + if (rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D) + use_staging_texture = true; + else if (usage & PIPE_TRANSFER_READ) + use_staging_texture = (rtex->resource.domains & + RADEON_DOMAIN_VRAM) != 0; + /* Write & linear only: */ + else if (r600_rings_is_buffer_referenced(rctx, rtex->resource.buf, + RADEON_USAGE_READWRITE) || + !rctx->ws->buffer_wait(rtex->resource.buf, 0, + RADEON_USAGE_READWRITE)) { + /* It's busy. */ + if (r600_can_invalidate_texture(rctx->screen, rtex, + usage, box)) + r600_texture_invalidate_storage(rctx, rtex); + else + use_staging_texture = true; + } } trans = CALLOC_STRUCT(r600_transfer); - if (trans == NULL) + if (!trans) return NULL; trans->transfer.resource = texture; trans->transfer.level = level; @@ -998,7 +1607,7 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx, r600_copy_region_with_blit(ctx, temp, 0, 0, 0, 0, texture, level, box); rctx->blit_decompress_depth(ctx, (struct r600_texture*)temp, staging_depth, 0, 0, 0, box->depth, 0, 0); - pipe_resource_reference((struct pipe_resource**)&temp, NULL); + pipe_resource_reference(&temp, NULL); } } else { @@ -1021,6 +1630,7 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx, trans->transfer.stride = staging_depth->surface.level[level].pitch_bytes; trans->transfer.layer_stride = staging_depth->surface.level[level].slice_size; trans->staging = (struct r600_resource*)staging_depth; + buf = trans->staging; } else if (use_staging_texture) { struct pipe_resource resource; struct r600_texture *staging; @@ -1032,7 +1642,7 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx, /* Create the temporary texture. */ staging = (struct r600_texture*)ctx->screen->resource_create(ctx->screen, &resource); - if (staging == NULL) { + if (!staging) { R600_ERR("failed to create temporary texture to hold untiled copy\n"); FREE(trans); return NULL; @@ -1040,26 +1650,23 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx, trans->staging = &staging->resource; trans->transfer.stride = staging->surface.level[0].pitch_bytes; trans->transfer.layer_stride = staging->surface.level[0].slice_size; - if (usage & PIPE_TRANSFER_READ) { + + if (usage & PIPE_TRANSFER_READ) r600_copy_to_staging_texture(ctx, trans); - } + else + usage |= PIPE_TRANSFER_UNSYNCHRONIZED; + + buf = trans->staging; } else { /* the resource is mapped directly */ trans->transfer.stride = rtex->surface.level[level].pitch_bytes; trans->transfer.layer_stride = rtex->surface.level[level].slice_size; offset = r600_texture_get_offset(rtex, level, box); - } - - if (trans->staging) { - buf = trans->staging; - if (!rtex->is_depth && !(usage & PIPE_TRANSFER_READ)) - usage |= PIPE_TRANSFER_UNSYNCHRONIZED; - } else { buf = &rtex->resource; } if (!(map = r600_buffer_map_sync_with_rings(rctx, buf, usage))) { - pipe_resource_reference((struct pipe_resource**)&trans->staging, NULL); + r600_resource_reference(&trans->staging, NULL); FREE(trans); return NULL; } @@ -1071,6 +1678,7 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx, static void r600_texture_transfer_unmap(struct pipe_context *ctx, struct pipe_transfer* transfer) { + struct r600_common_context *rctx = (struct r600_common_context*)ctx; struct r600_transfer *rtransfer = (struct r600_transfer*)transfer; struct pipe_resource *texture = transfer->resource; struct r600_texture *rtex = (struct r600_texture*)texture; @@ -1086,8 +1694,28 @@ static void r600_texture_transfer_unmap(struct pipe_context *ctx, } } - if (rtransfer->staging) - pipe_resource_reference((struct pipe_resource**)&rtransfer->staging, NULL); + if (rtransfer->staging) { + rctx->num_alloc_tex_transfer_bytes += rtransfer->staging->buf->size; + r600_resource_reference(&rtransfer->staging, NULL); + } + + /* Heuristic for {upload, draw, upload, draw, ..}: + * + * Flush the gfx IB if we've allocated too much texture storage. + * + * The idea is that we don't want to build IBs that use too much + * memory and put pressure on the kernel memory manager and we also + * want to make temporary and invalidated buffers go idle ASAP to + * decrease the total memory usage or make them reusable. The memory + * usage will be slightly higher than given here because of the buffer + * cache in the winsys. + * + * The result is that the kernel memory manager is never a bottleneck. + */ + if (rctx->num_alloc_tex_transfer_bytes > rctx->screen->info.gart_size / 4) { + rctx->gfx.flush(rctx, RADEON_FLUSH_ASYNC, NULL); + rctx->num_alloc_tex_transfer_bytes = 0; + } FREE(transfer); } @@ -1097,19 +1725,118 @@ static const struct u_resource_vtbl r600_texture_vtbl = NULL, /* get_handle */ r600_texture_destroy, /* resource_destroy */ r600_texture_transfer_map, /* transfer_map */ - NULL, /* transfer_flush_region */ + u_default_transfer_flush_region, /* transfer_flush_region */ r600_texture_transfer_unmap, /* transfer_unmap */ - NULL /* transfer_inline_write */ }; +/* DCC channel type categories within which formats can be reinterpreted + * while keeping the same DCC encoding. The swizzle must also match. */ +enum dcc_channel_type { + dcc_channel_float32, + dcc_channel_uint32, + dcc_channel_sint32, + dcc_channel_float16, + dcc_channel_uint16, + dcc_channel_sint16, + dcc_channel_uint_10_10_10_2, + dcc_channel_uint8, + dcc_channel_sint8, + dcc_channel_incompatible, +}; + +/* Return the type of DCC encoding. */ +static enum dcc_channel_type +vi_get_dcc_channel_type(const struct util_format_description *desc) +{ + int i; + + /* Find the first non-void channel. */ + for (i = 0; i < desc->nr_channels; i++) + if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) + break; + if (i == desc->nr_channels) + return dcc_channel_incompatible; + + switch (desc->channel[i].size) { + case 32: + if (desc->channel[i].type == UTIL_FORMAT_TYPE_FLOAT) + return dcc_channel_float32; + if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) + return dcc_channel_uint32; + return dcc_channel_sint32; + case 16: + if (desc->channel[i].type == UTIL_FORMAT_TYPE_FLOAT) + return dcc_channel_float16; + if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) + return dcc_channel_uint16; + return dcc_channel_sint16; + case 10: + return dcc_channel_uint_10_10_10_2; + case 8: + if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) + return dcc_channel_uint8; + return dcc_channel_sint8; + default: + return dcc_channel_incompatible; + } +} + +/* Return if it's allowed to reinterpret one format as another with DCC enabled. */ +bool vi_dcc_formats_compatible(enum pipe_format format1, + enum pipe_format format2) +{ + const struct util_format_description *desc1, *desc2; + enum dcc_channel_type type1, type2; + int i; + + if (format1 == format2) + return true; + + desc1 = util_format_description(format1); + desc2 = util_format_description(format2); + + if (desc1->nr_channels != desc2->nr_channels) + return false; + + /* Swizzles must be the same. */ + for (i = 0; i < desc1->nr_channels; i++) + if (desc1->swizzle[i] <= PIPE_SWIZZLE_W && + desc2->swizzle[i] <= PIPE_SWIZZLE_W && + desc1->swizzle[i] != desc2->swizzle[i]) + return false; + + type1 = vi_get_dcc_channel_type(desc1); + type2 = vi_get_dcc_channel_type(desc2); + + return type1 != dcc_channel_incompatible && + type2 != dcc_channel_incompatible && + type1 == type2; +} + +void vi_dcc_disable_if_incompatible_format(struct r600_common_context *rctx, + struct pipe_resource *tex, + unsigned level, + enum pipe_format view_format) +{ + struct r600_texture *rtex = (struct r600_texture *)tex; + + if (rtex->dcc_offset && + rtex->surface.level[level].dcc_enabled && + !vi_dcc_formats_compatible(tex->format, view_format)) + if (!r600_texture_disable_dcc(rctx, (struct r600_texture*)tex)) + rctx->decompress_dcc(&rctx->b, rtex); +} + struct pipe_surface *r600_create_surface_custom(struct pipe_context *pipe, struct pipe_resource *texture, const struct pipe_surface *templ, unsigned width, unsigned height) { + struct r600_common_context *rctx = (struct r600_common_context*)pipe; + struct r600_texture *rtex = (struct r600_texture*)texture; struct r600_surface *surface = CALLOC_STRUCT(r600_surface); - if (surface == NULL) + if (!surface) return NULL; assert(templ->u.tex.first_layer <= util_max_layer(texture, templ->u.tex.level)); @@ -1122,6 +1849,13 @@ struct pipe_surface *r600_create_surface_custom(struct pipe_context *pipe, surface->base.width = width; surface->base.height = height; surface->base.u = templ->u; + surface->level_info = &rtex->surface.level[templ->u.tex.level]; + + if (texture->target != PIPE_BUFFER) + vi_dcc_disable_if_incompatible_format(rctx, texture, + templ->u.tex.level, + templ->format); + return &surface->base; } @@ -1130,27 +1864,112 @@ static struct pipe_surface *r600_create_surface(struct pipe_context *pipe, const struct pipe_surface *templ) { unsigned level = templ->u.tex.level; + unsigned width = u_minify(tex->width0, level); + unsigned height = u_minify(tex->height0, level); + + if (tex->target != PIPE_BUFFER && templ->format != tex->format) { + const struct util_format_description *tex_desc + = util_format_description(tex->format); + const struct util_format_description *templ_desc + = util_format_description(templ->format); + + assert(tex_desc->block.bits == templ_desc->block.bits); + + /* Adjust size of surface if and only if the block width or + * height is changed. */ + if (tex_desc->block.width != templ_desc->block.width || + tex_desc->block.height != templ_desc->block.height) { + unsigned nblks_x = util_format_get_nblocksx(tex->format, width); + unsigned nblks_y = util_format_get_nblocksy(tex->format, height); + + width = nblks_x * templ_desc->block.width; + height = nblks_y * templ_desc->block.height; + } + } - return r600_create_surface_custom(pipe, tex, templ, - u_minify(tex->width0, level), - u_minify(tex->height0, level)); + return r600_create_surface_custom(pipe, tex, templ, width, height); } static void r600_surface_destroy(struct pipe_context *pipe, struct pipe_surface *surface) { struct r600_surface *surf = (struct r600_surface*)surface; - pipe_resource_reference((struct pipe_resource**)&surf->cb_buffer_fmask, NULL); - pipe_resource_reference((struct pipe_resource**)&surf->cb_buffer_cmask, NULL); + r600_resource_reference(&surf->cb_buffer_fmask, NULL); + r600_resource_reference(&surf->cb_buffer_cmask, NULL); pipe_resource_reference(&surface->texture, NULL); FREE(surface); } -unsigned r600_translate_colorswap(enum pipe_format format) +static void r600_clear_texture(struct pipe_context *pipe, + struct pipe_resource *tex, + unsigned level, + const struct pipe_box *box, + const void *data) +{ + struct pipe_screen *screen = pipe->screen; + struct r600_texture *rtex = (struct r600_texture*)tex; + struct pipe_surface tmpl = {{0}}; + struct pipe_surface *sf; + const struct util_format_description *desc = + util_format_description(tex->format); + + tmpl.format = tex->format; + tmpl.u.tex.first_layer = box->z; + tmpl.u.tex.last_layer = box->z + box->depth - 1; + tmpl.u.tex.level = level; + sf = pipe->create_surface(pipe, tex, &tmpl); + if (!sf) + return; + + if (rtex->is_depth) { + unsigned clear; + float depth; + uint8_t stencil = 0; + + /* Depth is always present. */ + clear = PIPE_CLEAR_DEPTH; + desc->unpack_z_float(&depth, 0, data, 0, 1, 1); + + if (rtex->surface.flags & RADEON_SURF_SBUFFER) { + clear |= PIPE_CLEAR_STENCIL; + desc->unpack_s_8uint(&stencil, 0, data, 0, 1, 1); + } + + pipe->clear_depth_stencil(pipe, sf, clear, depth, stencil, + box->x, box->y, + box->width, box->height, false); + } else { + union pipe_color_union color; + + /* pipe_color_union requires the full vec4 representation. */ + if (util_format_is_pure_uint(tex->format)) + desc->unpack_rgba_uint(color.ui, 0, data, 0, 1, 1); + else if (util_format_is_pure_sint(tex->format)) + desc->unpack_rgba_sint(color.i, 0, data, 0, 1, 1); + else + desc->unpack_rgba_float(color.f, 0, data, 0, 1, 1); + + if (screen->is_format_supported(screen, tex->format, + tex->target, 0, + PIPE_BIND_RENDER_TARGET)) { + pipe->clear_render_target(pipe, sf, &color, + box->x, box->y, + box->width, box->height, false); + } else { + /* Software fallback - just for R9G9B9E5_FLOAT */ + util_clear_render_target(pipe, sf, &color, + box->x, box->y, + box->width, box->height); + } + } + pipe_surface_reference(&sf, NULL); +} + +unsigned r600_translate_colorswap(enum pipe_format format, bool do_endian_swap) { const struct util_format_description *desc = util_format_description(format); -#define HAS_SWIZZLE(chan,swz) (desc->swizzle[chan] == UTIL_FORMAT_SWIZZLE_##swz) +#define HAS_SWIZZLE(chan,swz) (desc->swizzle[chan] == PIPE_SWIZZLE_##swz) if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */ return V_0280A0_SWAP_STD; @@ -1173,7 +1992,8 @@ unsigned r600_translate_colorswap(enum pipe_format format) else if ((HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(1,X)) || (HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(1,NONE)) || (HAS_SWIZZLE(0,NONE) && HAS_SWIZZLE(1,X))) - return V_0280A0_SWAP_STD_REV; /* YX__ */ + /* YX__ */ + return (do_endian_swap ? V_0280A0_SWAP_STD : V_0280A0_SWAP_STD_REV); else if (HAS_SWIZZLE(0,X) && HAS_SWIZZLE(3,Y)) return V_0280A0_SWAP_ALT; /* X__Y */ else if (HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(3,X)) @@ -1181,25 +2001,269 @@ unsigned r600_translate_colorswap(enum pipe_format format) break; case 3: if (HAS_SWIZZLE(0,X)) - return V_0280A0_SWAP_STD; /* XYZ */ + return (do_endian_swap ? V_0280A0_SWAP_STD_REV : V_0280A0_SWAP_STD); else if (HAS_SWIZZLE(0,Z)) return V_0280A0_SWAP_STD_REV; /* ZYX */ break; case 4: /* check the middle channels, the 1st and 4th channel can be NONE */ - if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,Z)) + if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,Z)) { return V_0280A0_SWAP_STD; /* XYZW */ - else if (HAS_SWIZZLE(1,Z) && HAS_SWIZZLE(2,Y)) + } else if (HAS_SWIZZLE(1,Z) && HAS_SWIZZLE(2,Y)) { return V_0280A0_SWAP_STD_REV; /* WZYX */ - else if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,X)) + } else if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,X)) { return V_0280A0_SWAP_ALT; /* ZYXW */ - else if (HAS_SWIZZLE(1,X) && HAS_SWIZZLE(2,Y)) - return V_0280A0_SWAP_ALT_REV; /* WXYZ */ + } else if (HAS_SWIZZLE(1,Z) && HAS_SWIZZLE(2,W)) { + /* YZWX */ + if (desc->is_array) + return V_0280A0_SWAP_ALT_REV; + else + return (do_endian_swap ? V_0280A0_SWAP_ALT : V_0280A0_SWAP_ALT_REV); + } break; } return ~0U; } +/* PIPELINE_STAT-BASED DCC ENABLEMENT FOR DISPLAYABLE SURFACES */ + +static void vi_dcc_clean_up_context_slot(struct r600_common_context *rctx, + int slot) +{ + int i; + + if (rctx->dcc_stats[slot].query_active) + vi_separate_dcc_stop_query(&rctx->b, + rctx->dcc_stats[slot].tex); + + for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats[slot].ps_stats); i++) + if (rctx->dcc_stats[slot].ps_stats[i]) { + rctx->b.destroy_query(&rctx->b, + rctx->dcc_stats[slot].ps_stats[i]); + rctx->dcc_stats[slot].ps_stats[i] = NULL; + } + + r600_texture_reference(&rctx->dcc_stats[slot].tex, NULL); +} + +/** + * Return the per-context slot where DCC statistics queries for the texture live. + */ +static unsigned vi_get_context_dcc_stats_index(struct r600_common_context *rctx, + struct r600_texture *tex) +{ + int i, empty_slot = -1; + + /* Remove zombie textures (textures kept alive by this array only). */ + for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats); i++) + if (rctx->dcc_stats[i].tex && + rctx->dcc_stats[i].tex->resource.b.b.reference.count == 1) + vi_dcc_clean_up_context_slot(rctx, i); + + /* Find the texture. */ + for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats); i++) { + /* Return if found. */ + if (rctx->dcc_stats[i].tex == tex) { + rctx->dcc_stats[i].last_use_timestamp = os_time_get(); + return i; + } + + /* Record the first seen empty slot. */ + if (empty_slot == -1 && !rctx->dcc_stats[i].tex) + empty_slot = i; + } + + /* Not found. Remove the oldest member to make space in the array. */ + if (empty_slot == -1) { + int oldest_slot = 0; + + /* Find the oldest slot. */ + for (i = 1; i < ARRAY_SIZE(rctx->dcc_stats); i++) + if (rctx->dcc_stats[oldest_slot].last_use_timestamp > + rctx->dcc_stats[i].last_use_timestamp) + oldest_slot = i; + + /* Clean up the oldest slot. */ + vi_dcc_clean_up_context_slot(rctx, oldest_slot); + empty_slot = oldest_slot; + } + + /* Add the texture to the new slot. */ + r600_texture_reference(&rctx->dcc_stats[empty_slot].tex, tex); + rctx->dcc_stats[empty_slot].last_use_timestamp = os_time_get(); + return empty_slot; +} + +static struct pipe_query * +vi_create_resuming_pipestats_query(struct pipe_context *ctx) +{ + struct r600_query_hw *query = (struct r600_query_hw*) + ctx->create_query(ctx, PIPE_QUERY_PIPELINE_STATISTICS, 0); + + query->flags |= R600_QUERY_HW_FLAG_BEGIN_RESUMES; + return (struct pipe_query*)query; +} + +/** + * Called when binding a color buffer. + */ +void vi_separate_dcc_start_query(struct pipe_context *ctx, + struct r600_texture *tex) +{ + struct r600_common_context *rctx = (struct r600_common_context*)ctx; + unsigned i = vi_get_context_dcc_stats_index(rctx, tex); + + assert(!rctx->dcc_stats[i].query_active); + + if (!rctx->dcc_stats[i].ps_stats[0]) + rctx->dcc_stats[i].ps_stats[0] = vi_create_resuming_pipestats_query(ctx); + + /* begin or resume the query */ + ctx->begin_query(ctx, rctx->dcc_stats[i].ps_stats[0]); + rctx->dcc_stats[i].query_active = true; +} + +/** + * Called when unbinding a color buffer. + */ +void vi_separate_dcc_stop_query(struct pipe_context *ctx, + struct r600_texture *tex) +{ + struct r600_common_context *rctx = (struct r600_common_context*)ctx; + unsigned i = vi_get_context_dcc_stats_index(rctx, tex); + + assert(rctx->dcc_stats[i].query_active); + assert(rctx->dcc_stats[i].ps_stats[0]); + + /* pause or end the query */ + ctx->end_query(ctx, rctx->dcc_stats[i].ps_stats[0]); + rctx->dcc_stats[i].query_active = false; +} + +static bool vi_should_enable_separate_dcc(struct r600_texture *tex) +{ + /* The minimum number of fullscreen draws per frame that is required + * to enable DCC. */ + return tex->ps_draw_ratio + tex->num_slow_clears >= 5; +} + +/* Called by fast clear. */ +static void vi_separate_dcc_try_enable(struct r600_common_context *rctx, + struct r600_texture *tex) +{ + /* The intent is to use this with shared displayable back buffers, + * but it's not strictly limited only to them. + */ + if (!tex->resource.is_shared || + !(tex->resource.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) || + tex->resource.b.b.target != PIPE_TEXTURE_2D || + tex->surface.last_level > 0 || + !tex->surface.dcc_size) + return; + + if (tex->dcc_offset) + return; /* already enabled */ + + /* Enable the DCC stat gathering. */ + if (!tex->dcc_gather_statistics) { + tex->dcc_gather_statistics = true; + vi_separate_dcc_start_query(&rctx->b, tex); + } + + if (!vi_should_enable_separate_dcc(tex)) + return; /* stats show that DCC decompression is too expensive */ + + assert(tex->surface.level[0].dcc_enabled); + assert(!tex->dcc_separate_buffer); + + r600_texture_discard_cmask(rctx->screen, tex); + + /* Get a DCC buffer. */ + if (tex->last_dcc_separate_buffer) { + assert(tex->dcc_gather_statistics); + assert(!tex->dcc_separate_buffer); + tex->dcc_separate_buffer = tex->last_dcc_separate_buffer; + tex->last_dcc_separate_buffer = NULL; + } else { + tex->dcc_separate_buffer = (struct r600_resource*) + r600_aligned_buffer_create(rctx->b.screen, 0, + PIPE_USAGE_DEFAULT, + tex->surface.dcc_size, + tex->surface.dcc_alignment); + if (!tex->dcc_separate_buffer) + return; + } + + /* dcc_offset is the absolute GPUVM address. */ + tex->dcc_offset = tex->dcc_separate_buffer->gpu_address; + + /* no need to flag anything since this is called by fast clear that + * flags framebuffer state + */ +} + +/** + * Called by pipe_context::flush_resource, the place where DCC decompression + * takes place. + */ +void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx, + struct r600_texture *tex) +{ + struct r600_common_context *rctx = (struct r600_common_context*)ctx; + struct pipe_query *tmp; + unsigned i = vi_get_context_dcc_stats_index(rctx, tex); + bool query_active = rctx->dcc_stats[i].query_active; + bool disable = false; + + if (rctx->dcc_stats[i].ps_stats[2]) { + union pipe_query_result result; + + /* Read the results. */ + ctx->get_query_result(ctx, rctx->dcc_stats[i].ps_stats[2], + true, &result); + r600_query_hw_reset_buffers(rctx, + (struct r600_query_hw*) + rctx->dcc_stats[i].ps_stats[2]); + + /* Compute the approximate number of fullscreen draws. */ + tex->ps_draw_ratio = + result.pipeline_statistics.ps_invocations / + (tex->resource.b.b.width0 * tex->resource.b.b.height0); + rctx->last_tex_ps_draw_ratio = tex->ps_draw_ratio; + + disable = tex->dcc_separate_buffer && + !vi_should_enable_separate_dcc(tex); + } + + tex->num_slow_clears = 0; + + /* stop the statistics query for ps_stats[0] */ + if (query_active) + vi_separate_dcc_stop_query(ctx, tex); + + /* Move the queries in the queue by one. */ + tmp = rctx->dcc_stats[i].ps_stats[2]; + rctx->dcc_stats[i].ps_stats[2] = rctx->dcc_stats[i].ps_stats[1]; + rctx->dcc_stats[i].ps_stats[1] = rctx->dcc_stats[i].ps_stats[0]; + rctx->dcc_stats[i].ps_stats[0] = tmp; + + /* create and start a new query as ps_stats[0] */ + if (query_active) + vi_separate_dcc_start_query(ctx, tex); + + if (disable) { + assert(!tex->last_dcc_separate_buffer); + tex->last_dcc_separate_buffer = tex->dcc_separate_buffer; + tex->dcc_separate_buffer = NULL; + tex->dcc_offset = 0; + /* no need to flag anything since this is called after + * decompression that re-sets framebuffer state + */ + } +} + +/* FAST COLOR CLEAR */ + static void evergreen_set_clear_color(struct r600_texture *rtex, enum pipe_format surface_format, const union pipe_color_union *color) @@ -1208,7 +2272,16 @@ static void evergreen_set_clear_color(struct r600_texture *rtex, memset(&uc, 0, sizeof(uc)); - if (util_format_is_pure_uint(surface_format)) { + if (util_format_get_blocksizebits(surface_format) == 128) { + /* DCC fast clear only: + * CLEAR_WORD0 = R = G = B + * CLEAR_WORD1 = A + */ + assert(color->ui[0] == color->ui[1] && + color->ui[0] == color->ui[2]); + uc.ui[0] = color->ui[0]; + uc.ui[1] = color->ui[3]; + } else if (util_format_is_pure_uint(surface_format)) { util_format_write_4ui(surface_format, color->ui, 0, &uc, 0, 0, 0, 1, 1); } else if (util_format_is_pure_sint(surface_format)) { util_format_write_4i(surface_format, color->i, 0, &uc, 0, 0, 0, 1, 1); @@ -1219,15 +2292,210 @@ static void evergreen_set_clear_color(struct r600_texture *rtex, memcpy(rtex->color_clear_value, &uc, 2 * sizeof(uint32_t)); } +static bool vi_get_fast_clear_parameters(enum pipe_format surface_format, + const union pipe_color_union *color, + uint32_t* reset_value, + bool* clear_words_needed) +{ + bool values[4] = {}; + int i; + bool main_value = false; + bool extra_value = false; + int extra_channel; + const struct util_format_description *desc = util_format_description(surface_format); + + if (desc->block.bits == 128 && + (color->ui[0] != color->ui[1] || + color->ui[0] != color->ui[2])) + return false; + + *clear_words_needed = true; + *reset_value = 0x20202020U; + + /* If we want to clear without needing a fast clear eliminate step, we + * can set each channel to 0 or 1 (or 0/max for integer formats). We + * have two sets of flags, one for the last or first channel(extra) and + * one for the other channels(main). + */ + + if (surface_format == PIPE_FORMAT_R11G11B10_FLOAT || + surface_format == PIPE_FORMAT_B5G6R5_UNORM || + surface_format == PIPE_FORMAT_B5G6R5_SRGB) { + extra_channel = -1; + } else if (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN) { + if(r600_translate_colorswap(surface_format, false) <= 1) + extra_channel = desc->nr_channels - 1; + else + extra_channel = 0; + } else + return true; + + for (i = 0; i < 4; ++i) { + int index = desc->swizzle[i] - PIPE_SWIZZLE_X; + + if (desc->swizzle[i] < PIPE_SWIZZLE_X || + desc->swizzle[i] > PIPE_SWIZZLE_W) + continue; + + if (desc->channel[i].pure_integer && + desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) { + /* Use the maximum value for clamping the clear color. */ + int max = u_bit_consecutive(0, desc->channel[i].size - 1); + + values[i] = color->i[i] != 0; + if (color->i[i] != 0 && MIN2(color->i[i], max) != max) + return true; + } else if (desc->channel[i].pure_integer && + desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) { + /* Use the maximum value for clamping the clear color. */ + unsigned max = u_bit_consecutive(0, desc->channel[i].size); + + values[i] = color->ui[i] != 0U; + if (color->ui[i] != 0U && MIN2(color->ui[i], max) != max) + return true; + } else { + values[i] = color->f[i] != 0.0F; + if (color->f[i] != 0.0F && color->f[i] != 1.0F) + return true; + } + + if (index == extra_channel) + extra_value = values[i]; + else + main_value = values[i]; + } + + for (int i = 0; i < 4; ++i) + if (values[i] != main_value && + desc->swizzle[i] - PIPE_SWIZZLE_X != extra_channel && + desc->swizzle[i] >= PIPE_SWIZZLE_X && + desc->swizzle[i] <= PIPE_SWIZZLE_W) + return true; + + *clear_words_needed = false; + if (main_value) + *reset_value |= 0x80808080U; + + if (extra_value) + *reset_value |= 0x40404040U; + return true; +} + +void vi_dcc_clear_level(struct r600_common_context *rctx, + struct r600_texture *rtex, + unsigned level, unsigned clear_value) +{ + struct pipe_resource *dcc_buffer; + uint64_t dcc_offset; + + assert(rtex->dcc_offset && rtex->surface.level[level].dcc_enabled); + + if (rtex->dcc_separate_buffer) { + dcc_buffer = &rtex->dcc_separate_buffer->b.b; + dcc_offset = 0; + } else { + dcc_buffer = &rtex->resource.b.b; + dcc_offset = rtex->dcc_offset; + } + + dcc_offset += rtex->surface.level[level].dcc_offset; + + rctx->clear_buffer(&rctx->b, dcc_buffer, dcc_offset, + rtex->surface.level[level].dcc_fast_clear_size, + clear_value, R600_COHERENCY_CB_META); +} + +/* Set the same micro tile mode as the destination of the last MSAA resolve. + * This allows hitting the MSAA resolve fast path, which requires that both + * src and dst micro tile modes match. + */ +static void si_set_optimal_micro_tile_mode(struct r600_common_screen *rscreen, + struct r600_texture *rtex) +{ + if (rtex->resource.is_shared || + rtex->surface.nsamples <= 1 || + rtex->surface.micro_tile_mode == rtex->last_msaa_resolve_target_micro_mode) + return; + + assert(rtex->surface.level[0].mode == RADEON_SURF_MODE_2D); + assert(rtex->surface.last_level == 0); + + /* These magic numbers were copied from addrlib. It doesn't use any + * definitions for them either. They are all 2D_TILED_THIN1 modes with + * different bpp and micro tile mode. + */ + if (rscreen->chip_class >= CIK) { + switch (rtex->last_msaa_resolve_target_micro_mode) { + case 0: /* displayable */ + rtex->surface.tiling_index[0] = 10; + break; + case 1: /* thin */ + rtex->surface.tiling_index[0] = 14; + break; + case 3: /* rotated */ + rtex->surface.tiling_index[0] = 28; + break; + default: /* depth, thick */ + assert(!"unexpected micro mode"); + return; + } + } else { /* SI */ + switch (rtex->last_msaa_resolve_target_micro_mode) { + case 0: /* displayable */ + switch (rtex->surface.bpe) { + case 1: + rtex->surface.tiling_index[0] = 10; + break; + case 2: + rtex->surface.tiling_index[0] = 11; + break; + default: /* 4, 8 */ + rtex->surface.tiling_index[0] = 12; + break; + } + break; + case 1: /* thin */ + switch (rtex->surface.bpe) { + case 1: + rtex->surface.tiling_index[0] = 14; + break; + case 2: + rtex->surface.tiling_index[0] = 15; + break; + case 4: + rtex->surface.tiling_index[0] = 16; + break; + default: /* 8, 16 */ + rtex->surface.tiling_index[0] = 17; + break; + } + break; + default: /* depth, thick */ + assert(!"unexpected micro mode"); + return; + } + } + + rtex->surface.micro_tile_mode = rtex->last_msaa_resolve_target_micro_mode; + + p_atomic_inc(&rscreen->dirty_fb_counter); + p_atomic_inc(&rscreen->dirty_tex_descriptor_counter); +} + void evergreen_do_fast_color_clear(struct r600_common_context *rctx, struct pipe_framebuffer_state *fb, struct r600_atom *fb_state, - unsigned *buffers, + unsigned *buffers, unsigned *dirty_cbufs, const union pipe_color_union *color) { int i; - if (rctx->current_render_cond) + /* This function is broken in BE, so just disable this path for now */ +#ifdef PIPE_ARCH_BIG_ENDIAN + return; +#endif + + if (rctx->render_cond) return; for (i = 0; i < fb->nr_cbufs; i++) { @@ -1243,11 +2511,6 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx, tex = (struct r600_texture *)fb->cbufs[i]->texture; - /* 128-bit formats are unusupported */ - if (util_format_get_blocksizebits(fb->cbufs[i]->format) > 64) { - continue; - } - /* the clear is allowed if all layers are bound */ if (fb->cbufs[i]->u.tex.first_layer != 0 || fb->cbufs[i]->u.tex.last_layer != util_max_layer(&tex->resource.b.b, 0)) { @@ -1264,6 +2527,14 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx, continue; } + /* shared textures can't use fast clear without an explicit flush, + * because there is no way to communicate the clear color among + * all clients + */ + if (tex->resource.is_shared && + !(tex->resource.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH)) + continue; + /* fast color clear with 1D tiling doesn't work on old kernels and CIK */ if (tex->surface.level[0].mode == RADEON_SURF_MODE_1D && rctx->chip_class >= CIK && @@ -1272,18 +2543,72 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx, continue; } - /* ensure CMASK is enabled */ - r600_texture_alloc_cmask_separate(rctx->screen, tex); - if (tex->cmask.size == 0) { - continue; + /* Fast clear is the most appropriate place to enable DCC for + * displayable surfaces. + */ + if (rctx->chip_class >= VI && + !(rctx->screen->debug_flags & DBG_NO_DCC_FB)) { + vi_separate_dcc_try_enable(rctx, tex); + + /* Stoney can't do a CMASK-based clear, so all clears are + * considered to be hypothetically slow clears, which + * is weighed when determining to enable separate DCC. + */ + if (tex->dcc_gather_statistics && + rctx->family == CHIP_STONEY) + tex->num_slow_clears++; + } + + /* Try to clear DCC first, otherwise try CMASK. */ + if (tex->dcc_offset && tex->surface.level[0].dcc_enabled) { + uint32_t reset_value; + bool clear_words_needed; + + if (rctx->screen->debug_flags & DBG_NO_DCC_CLEAR) + continue; + + if (!vi_get_fast_clear_parameters(fb->cbufs[i]->format, + color, &reset_value, + &clear_words_needed)) + continue; + + vi_dcc_clear_level(rctx, tex, 0, reset_value); + + if (clear_words_needed) + tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level; + tex->separate_dcc_dirty = true; + } else { + /* 128-bit formats are unusupported */ + if (util_format_get_blocksizebits(fb->cbufs[i]->format) > 64) { + continue; + } + + /* Stoney/RB+ doesn't work with CMASK fast clear. */ + if (rctx->family == CHIP_STONEY) + continue; + + /* ensure CMASK is enabled */ + r600_texture_alloc_cmask_separate(rctx->screen, tex); + if (tex->cmask.size == 0) { + continue; + } + + /* Do the fast clear. */ + rctx->clear_buffer(&rctx->b, &tex->cmask_buffer->b.b, + tex->cmask.offset, tex->cmask.size, 0, + R600_COHERENCY_CB_META); + + tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level; } - /* Do the fast clear. */ + /* We can change the micro tile mode before a full clear. */ + if (rctx->screen->chip_class >= SI) + si_set_optimal_micro_tile_mode(rctx->screen, tex); + evergreen_set_clear_color(tex, fb->cbufs[i]->format, color); - rctx->clear_buffer(&rctx->b, &tex->cmask_buffer->b.b, - tex->cmask.offset, tex->cmask.size, 0, true); - tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level; + if (dirty_cbufs) + *dirty_cbufs |= 1 << i; rctx->set_atom_dirty(rctx, fb_state, true); *buffers &= ~clear_bit; } @@ -1299,4 +2624,5 @@ void r600_init_context_texture_functions(struct r600_common_context *rctx) { rctx->b.create_surface = r600_create_surface; rctx->b.surface_destroy = r600_surface_destroy; + rctx->b.clear_texture = r600_clear_texture; } diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_elf_util.c b/lib/mesa/src/gallium/drivers/radeon/radeon_elf_util.c new file mode 100644 index 000000000..8aaa85d02 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_elf_util.c @@ -0,0 +1,197 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: Tom Stellard <thomas.stellard@amd.com> + * + */ + +#include "radeon_elf_util.h" +#include "r600_pipe_common.h" + +#include "util/u_memory.h" + +#include <gelf.h> +#include <libelf.h> +#include <stdio.h> + +static void parse_symbol_table(Elf_Data *symbol_table_data, + const GElf_Shdr *symbol_table_header, + struct radeon_shader_binary *binary) +{ + GElf_Sym symbol; + unsigned i = 0; + unsigned symbol_count = + symbol_table_header->sh_size / symbol_table_header->sh_entsize; + + /* We are over allocating this list, because symbol_count gives the + * total number of symbols, and we will only be filling the list + * with offsets of global symbols. The memory savings from + * allocating the correct size of this list will be small, and + * I don't think it is worth the cost of pre-computing the number + * of global symbols. + */ + binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t)); + + while (gelf_getsym(symbol_table_data, i++, &symbol)) { + unsigned i; + if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL || + symbol.st_shndx == 0 /* Undefined symbol */) { + continue; + } + + binary->global_symbol_offsets[binary->global_symbol_count] = + symbol.st_value; + + /* Sort the list using bubble sort. This list will usually + * be small. */ + for (i = binary->global_symbol_count; i > 0; --i) { + uint64_t lhs = binary->global_symbol_offsets[i - 1]; + uint64_t rhs = binary->global_symbol_offsets[i]; + if (lhs < rhs) { + break; + } + binary->global_symbol_offsets[i] = lhs; + binary->global_symbol_offsets[i - 1] = rhs; + } + ++binary->global_symbol_count; + } +} + +static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols, + unsigned symbol_sh_link, + struct radeon_shader_binary *binary) +{ + unsigned i; + + if (!relocs || !symbols || !binary->reloc_count) { + return; + } + binary->relocs = CALLOC(binary->reloc_count, + sizeof(struct radeon_shader_reloc)); + for (i = 0; i < binary->reloc_count; i++) { + GElf_Sym symbol; + GElf_Rel rel; + char *symbol_name; + struct radeon_shader_reloc *reloc = &binary->relocs[i]; + + gelf_getrel(relocs, i, &rel); + gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol); + symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name); + + reloc->offset = rel.r_offset; + strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1); + reloc->name[sizeof(reloc->name)-1] = 0; + } +} + +void radeon_elf_read(const char *elf_data, unsigned elf_size, + struct radeon_shader_binary *binary) +{ + char *elf_buffer; + Elf *elf; + Elf_Scn *section = NULL; + Elf_Data *symbols = NULL, *relocs = NULL; + size_t section_str_index; + unsigned symbol_sh_link = 0; + + /* One of the libelf implementations + * (http://www.mr511.de/software/english.htm) requires calling + * elf_version() before elf_memory(). + */ + elf_version(EV_CURRENT); + elf_buffer = MALLOC(elf_size); + memcpy(elf_buffer, elf_data, elf_size); + + elf = elf_memory(elf_buffer, elf_size); + + elf_getshdrstrndx(elf, §ion_str_index); + + while ((section = elf_nextscn(elf, section))) { + const char *name; + Elf_Data *section_data = NULL; + GElf_Shdr section_header; + if (gelf_getshdr(section, §ion_header) != §ion_header) { + fprintf(stderr, "Failed to read ELF section header\n"); + return; + } + name = elf_strptr(elf, section_str_index, section_header.sh_name); + if (!strcmp(name, ".text")) { + section_data = elf_getdata(section, section_data); + binary->code_size = section_data->d_size; + binary->code = MALLOC(binary->code_size * sizeof(unsigned char)); + memcpy(binary->code, section_data->d_buf, binary->code_size); + } else if (!strcmp(name, ".AMDGPU.config")) { + section_data = elf_getdata(section, section_data); + binary->config_size = section_data->d_size; + binary->config = MALLOC(binary->config_size * sizeof(unsigned char)); + memcpy(binary->config, section_data->d_buf, binary->config_size); + } else if (!strcmp(name, ".AMDGPU.disasm")) { + /* Always read disassembly if it's available. */ + section_data = elf_getdata(section, section_data); + binary->disasm_string = strndup(section_data->d_buf, + section_data->d_size); + } else if (!strncmp(name, ".rodata", 7)) { + section_data = elf_getdata(section, section_data); + binary->rodata_size = section_data->d_size; + binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char)); + memcpy(binary->rodata, section_data->d_buf, binary->rodata_size); + } else if (!strncmp(name, ".symtab", 7)) { + symbols = elf_getdata(section, section_data); + symbol_sh_link = section_header.sh_link; + parse_symbol_table(symbols, §ion_header, binary); + } else if (!strcmp(name, ".rel.text")) { + relocs = elf_getdata(section, section_data); + binary->reloc_count = section_header.sh_size / + section_header.sh_entsize; + } + } + + parse_relocs(elf, relocs, symbols, symbol_sh_link, binary); + + if (elf){ + elf_end(elf); + } + FREE(elf_buffer); + + /* Cache the config size per symbol */ + if (binary->global_symbol_count) { + binary->config_size_per_symbol = + binary->config_size / binary->global_symbol_count; + } else { + binary->global_symbol_count = 1; + binary->config_size_per_symbol = binary->config_size; + } +} + +const unsigned char *radeon_shader_binary_config_start( + const struct radeon_shader_binary *binary, + uint64_t symbol_offset) +{ + unsigned i; + for (i = 0; i < binary->global_symbol_count; ++i) { + if (binary->global_symbol_offsets[i] == symbol_offset) { + unsigned offset = i * binary->config_size_per_symbol; + return binary->config + offset; + } + } + return binary->config; +} diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_elf_util.h b/lib/mesa/src/gallium/drivers/radeon/radeon_elf_util.h new file mode 100644 index 000000000..c2af9e0df --- /dev/null +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_elf_util.h @@ -0,0 +1,50 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: Tom Stellard <thomas.stellard@amd.com> + * + */ + +#ifndef RADEON_ELF_UTIL_H +#define RADEON_ELF_UTIL_H + +#include <stdint.h> + +struct radeon_shader_binary; +struct radeon_shader_reloc; + +/* + * Parse the elf binary stored in \p elf_data and create a + * radeon_shader_binary object. + */ +void radeon_elf_read(const char *elf_data, unsigned elf_size, + struct radeon_shader_binary *binary); + +/** + * @returns A pointer to the start of the configuration information for + * the function starting at \p symbol_offset of the binary. + */ +const unsigned char *radeon_shader_binary_config_start( + const struct radeon_shader_binary *binary, + uint64_t symbol_offset); + +#endif /* RADEON_ELF_UTIL_H */ diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c b/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c index 55c216aa5..fb1491a28 100644 --- a/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c @@ -57,7 +57,9 @@ #define FB_BUFFER_OFFSET 0x1000 #define FB_BUFFER_SIZE 2048 +#define FB_BUFFER_SIZE_TONGA (2048 * 64) #define IT_SCALING_TABLE_SIZE 992 +#define UVD_SESSION_CONTEXT_SIZE (128 * 1024) /* UVD decoder representation */ struct ruvd_decoder { @@ -78,6 +80,7 @@ struct ruvd_decoder { struct rvid_buffer msg_fb_it_buffers[NUM_BUFFERS]; struct ruvd_msg *msg; uint32_t *fb; + unsigned fb_size; uint8_t *it; struct rvid_buffer bs_buffers[NUM_BUFFERS]; @@ -87,38 +90,40 @@ struct ruvd_decoder { struct rvid_buffer dpb; bool use_legacy; struct rvid_buffer ctx; + struct rvid_buffer sessionctx; }; /* flush IB to the hardware */ -static void flush(struct ruvd_decoder *dec) +static int flush(struct ruvd_decoder *dec, unsigned flags) { - dec->ws->cs_flush(dec->cs, RADEON_FLUSH_ASYNC, NULL, 0); + return dec->ws->cs_flush(dec->cs, flags, NULL); } /* add a new set register command to the IB */ static void set_reg(struct ruvd_decoder *dec, unsigned reg, uint32_t val) { - uint32_t *pm4 = dec->cs->buf; - pm4[dec->cs->cdw++] = RUVD_PKT0(reg >> 2, 0); - pm4[dec->cs->cdw++] = val; + radeon_emit(dec->cs, RUVD_PKT0(reg >> 2, 0)); + radeon_emit(dec->cs, val); } /* send a command to the VCPU through the GPCOM registers */ static void send_cmd(struct ruvd_decoder *dec, unsigned cmd, - struct radeon_winsys_cs_handle* cs_buf, uint32_t off, + struct pb_buffer* buf, uint32_t off, enum radeon_bo_usage usage, enum radeon_bo_domain domain) { int reloc_idx; - reloc_idx = dec->ws->cs_add_reloc(dec->cs, cs_buf, usage, domain, - RADEON_PRIO_MIN); + reloc_idx = dec->ws->cs_add_buffer(dec->cs, buf, usage | RADEON_USAGE_SYNCHRONIZED, + domain, + RADEON_PRIO_UVD); if (!dec->use_legacy) { uint64_t addr; - addr = dec->ws->buffer_get_virtual_address(cs_buf); + addr = dec->ws->buffer_get_virtual_address(buf); addr = addr + off; set_reg(dec, RUVD_GPCOM_VCPU_DATA0, addr); set_reg(dec, RUVD_GPCOM_VCPU_DATA1, addr >> 32); } else { + off += dec->ws->buffer_get_reloc_offset(buf); set_reg(dec, RUVD_GPCOM_VCPU_DATA0, off); set_reg(dec, RUVD_GPCOM_VCPU_DATA1, reloc_idx * 4); } @@ -142,13 +147,13 @@ static void map_msg_fb_it_buf(struct ruvd_decoder *dec) buf = &dec->msg_fb_it_buffers[dec->cur_buffer]; /* and map it for CPU access */ - ptr = dec->ws->buffer_map(buf->res->cs_buf, dec->cs, PIPE_TRANSFER_WRITE); + ptr = dec->ws->buffer_map(buf->res->buf, dec->cs, PIPE_TRANSFER_WRITE); /* calc buffer offsets */ dec->msg = (struct ruvd_msg *)ptr; dec->fb = (uint32_t *)(ptr + FB_BUFFER_OFFSET); if (have_it(dec)) - dec->it = (uint8_t *)(ptr + FB_BUFFER_OFFSET + FB_BUFFER_SIZE); + dec->it = (uint8_t *)(ptr + FB_BUFFER_OFFSET + dec->fb_size); } /* unmap and send a message command to the VCPU */ @@ -164,13 +169,19 @@ static void send_msg_buf(struct ruvd_decoder *dec) buf = &dec->msg_fb_it_buffers[dec->cur_buffer]; /* unmap the buffer */ - dec->ws->buffer_unmap(buf->res->cs_buf); + dec->ws->buffer_unmap(buf->res->buf); dec->msg = NULL; dec->fb = NULL; dec->it = NULL; + + if (dec->sessionctx.res) + send_cmd(dec, RUVD_CMD_SESSION_CONTEXT_BUFFER, + dec->sessionctx.res->buf, 0, RADEON_USAGE_READWRITE, + RADEON_DOMAIN_VRAM); + /* and send it to the hardware */ - send_cmd(dec, RUVD_CMD_MSG_BUFFER, buf->res->cs_buf, 0, + send_cmd(dec, RUVD_CMD_MSG_BUFFER, buf->res->buf, 0, RADEON_USAGE_READ, RADEON_DOMAIN_GTT); } @@ -207,7 +218,61 @@ static uint32_t profile2stream_type(struct ruvd_decoder *dec, unsigned family) } } -static unsigned calc_ctx_size(struct ruvd_decoder *dec) +static unsigned calc_ctx_size_h264_perf(struct ruvd_decoder *dec) +{ + unsigned width_in_mb, height_in_mb, ctx_size; + unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH); + unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT); + + unsigned max_references = dec->base.max_references + 1; + + // picture width & height in 16 pixel units + width_in_mb = width / VL_MACROBLOCK_WIDTH; + height_in_mb = align(height / VL_MACROBLOCK_HEIGHT, 2); + + if (!dec->use_legacy) { + unsigned fs_in_mb = width_in_mb * height_in_mb; + unsigned num_dpb_buffer; + switch(dec->base.level) { + case 30: + num_dpb_buffer = 8100 / fs_in_mb; + break; + case 31: + num_dpb_buffer = 18000 / fs_in_mb; + break; + case 32: + num_dpb_buffer = 20480 / fs_in_mb; + break; + case 41: + num_dpb_buffer = 32768 / fs_in_mb; + break; + case 42: + num_dpb_buffer = 34816 / fs_in_mb; + break; + case 50: + num_dpb_buffer = 110400 / fs_in_mb; + break; + case 51: + num_dpb_buffer = 184320 / fs_in_mb; + break; + default: + num_dpb_buffer = 184320 / fs_in_mb; + break; + } + num_dpb_buffer++; + max_references = MAX2(MIN2(NUM_H264_REFS, num_dpb_buffer), max_references); + ctx_size = max_references * align(width_in_mb * height_in_mb * 192, 256); + } else { + // the firmware seems to always assume a minimum of ref frames + max_references = MAX2(NUM_H264_REFS, max_references); + // macroblock context buffer + ctx_size = align(width_in_mb * height_in_mb * max_references * 192, 256); + } + + return ctx_size; +} + +static unsigned calc_ctx_size_h265_main(struct ruvd_decoder *dec) { unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH); unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT); @@ -224,6 +289,39 @@ static unsigned calc_ctx_size(struct ruvd_decoder *dec) return ((width + 255) / 16) * ((height + 255) / 16) * 16 * max_references + 52 * 1024; } +static unsigned calc_ctx_size_h265_main10(struct ruvd_decoder *dec, struct pipe_h265_picture_desc *pic) +{ + unsigned block_size, log2_ctb_size, width_in_ctb, height_in_ctb, num_16x16_block_per_ctb; + unsigned context_buffer_size_per_ctb_row, cm_buffer_size, max_mb_address, db_left_tile_pxl_size; + unsigned db_left_tile_ctx_size = 4096 / 16 * (32 + 16 * 4); + + unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH); + unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT); + unsigned coeff_10bit = (pic->pps->sps->bit_depth_luma_minus8 || pic->pps->sps->bit_depth_chroma_minus8) ? 2 : 1; + + unsigned max_references = dec->base.max_references + 1; + + if (dec->base.width * dec->base.height >= 4096*2000) + max_references = MAX2(max_references, 8); + else + max_references = MAX2(max_references, 17); + + block_size = (1 << (pic->pps->sps->log2_min_luma_coding_block_size_minus3 + 3)); + log2_ctb_size = block_size + pic->pps->sps->log2_diff_max_min_luma_coding_block_size; + + width_in_ctb = (width + ((1 << log2_ctb_size) - 1)) >> log2_ctb_size; + height_in_ctb = (height + ((1 << log2_ctb_size) - 1)) >> log2_ctb_size; + + num_16x16_block_per_ctb = ((1 << log2_ctb_size) >> 4) * ((1 << log2_ctb_size) >> 4); + context_buffer_size_per_ctb_row = align(width_in_ctb * num_16x16_block_per_ctb * 16, 256); + max_mb_address = (unsigned) ceil(height * 8 / 2048.0); + + cm_buffer_size = max_references * context_buffer_size_per_ctb_row * height_in_ctb; + db_left_tile_pxl_size = coeff_10bit * (max_mb_address * 2 * 2048 + 1024); + + return cm_buffer_size + db_left_tile_ctx_size + db_left_tile_pxl_size; +} + /* calculate size of reference picture buffer */ static unsigned calc_dpb_size(struct ruvd_decoder *dec) { @@ -282,17 +380,23 @@ static unsigned calc_dpb_size(struct ruvd_decoder *dec) num_dpb_buffer++; max_references = MAX2(MIN2(NUM_H264_REFS, num_dpb_buffer), max_references); dpb_size = image_size * max_references; - dpb_size += max_references * align(width_in_mb * height_in_mb * 192, alignment); - dpb_size += align(width_in_mb * height_in_mb * 32, alignment); + if ((dec->stream_type != RUVD_CODEC_H264_PERF) || + (((struct r600_common_screen*)dec->screen)->family < CHIP_POLARIS10)) { + dpb_size += max_references * align(width_in_mb * height_in_mb * 192, alignment); + dpb_size += align(width_in_mb * height_in_mb * 32, alignment); + } } else { // the firmware seems to allways assume a minimum of ref frames max_references = MAX2(NUM_H264_REFS, max_references); // reference picture buffer dpb_size = image_size * max_references; - // macroblock context buffer - dpb_size += width_in_mb * height_in_mb * max_references * 192; - // IT surface buffer - dpb_size += width_in_mb * height_in_mb * 32; + if ((dec->stream_type != RUVD_CODEC_H264_PERF) || + (((struct r600_common_screen*)dec->screen)->family < CHIP_POLARIS10)) { + // macroblock context buffer + dpb_size += width_in_mb * height_in_mb * max_references * 192; + // IT surface buffer + dpb_size += width_in_mb * height_in_mb * 32; + } } break; } @@ -305,7 +409,10 @@ static unsigned calc_dpb_size(struct ruvd_decoder *dec) width = align (width, 16); height = align (height, 16); - dpb_size = align((width * height * 3) / 2, 256) * max_references; + if (dec->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) + dpb_size = align((width * height * 9) / 4, 256) * max_references; + else + dpb_size = align((width * height * 3) / 2, 256) * max_references; break; case PIPE_VIDEO_FORMAT_VC1: @@ -402,6 +509,9 @@ static struct ruvd_h264 get_h264_msg(struct ruvd_decoder *dec, struct pipe_h264_ result.log2_max_pic_order_cnt_lsb_minus4 = pic->pps->sps->log2_max_pic_order_cnt_lsb_minus4; switch (dec->base.chroma_format) { + case PIPE_VIDEO_CHROMA_FORMAT_NONE: + /* TODO: assert? */ + break; case PIPE_VIDEO_CHROMA_FORMAT_400: result.chroma_format = 0; break; @@ -478,6 +588,8 @@ static struct ruvd_h265 get_h265_msg(struct ruvd_decoder *dec, struct pipe_video result.sps_info_flags |= pic->pps->sps->separate_colour_plane_flag << 8; if (((struct r600_common_screen*)dec->screen)->family == CHIP_CARRIZO) result.sps_info_flags |= 1 << 9; + if (pic->UseRefPicList == true) + result.sps_info_flags |= 1 << 10; result.chroma_format = pic->pps->sps->chroma_format_idc; result.bit_depth_luma_minus8 = pic->pps->sps->bit_depth_luma_minus8; @@ -586,6 +698,20 @@ static struct ruvd_h265 get_h265_msg(struct ruvd_decoder *dec, struct pipe_video memcpy(dec->it + 480, pic->pps->sps->ScalingList16x16, 6 * 64); memcpy(dec->it + 864, pic->pps->sps->ScalingList32x32, 2 * 64); + for (i = 0 ; i < 2 ; i++) { + for (int j = 0 ; j < 15 ; j++) + result.direct_reflist[i][j] = pic->RefPicList[i][j]; + } + + if ((pic->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) && + (target->buffer_format == PIPE_FORMAT_NV12)) { + result.p010_mode = 0; + result.luma_10to8 = 5; + result.chroma_10to8 = 5; + result.sclr_luma10to8 = 4; + result.sclr_chroma10to8 = 4; + } + /* TODO result.highestTid; result.isNonRef; @@ -811,7 +937,7 @@ static void ruvd_destroy(struct pipe_video_codec *decoder) dec->msg->stream_handle = dec->stream_handle; send_msg_buf(dec); - flush(dec); + flush(dec, 0); dec->ws->cs_destroy(dec->cs); @@ -821,8 +947,8 @@ static void ruvd_destroy(struct pipe_video_codec *decoder) } rvid_destroy_buffer(&dec->dpb); - if (u_reduce_video_profile(dec->base.profile) == PIPE_VIDEO_FORMAT_HEVC) - rvid_destroy_buffer(&dec->ctx); + rvid_destroy_buffer(&dec->ctx); + rvid_destroy_buffer(&dec->sessionctx); FREE(dec); } @@ -845,7 +971,7 @@ static void ruvd_begin_frame(struct pipe_video_codec *decoder, dec->bs_size = 0; dec->bs_ptr = dec->ws->buffer_map( - dec->bs_buffers[dec->cur_buffer].res->cs_buf, + dec->bs_buffers[dec->cur_buffer].res->buf, dec->cs, PIPE_TRANSFER_WRITE); } @@ -885,13 +1011,13 @@ static void ruvd_decode_bitstream(struct pipe_video_codec *decoder, unsigned new_size = dec->bs_size + sizes[i]; if (new_size > buf->res->buf->size) { - dec->ws->buffer_unmap(buf->res->cs_buf); + dec->ws->buffer_unmap(buf->res->buf); if (!rvid_resize_buffer(dec->screen, dec->cs, buf, new_size)) { RVID_ERR("Can't resize bitstream buffer!"); return; } - dec->bs_ptr = dec->ws->buffer_map(buf->res->cs_buf, dec->cs, + dec->bs_ptr = dec->ws->buffer_map(buf->res->buf, dec->cs, PIPE_TRANSFER_WRITE); if (!dec->bs_ptr) return; @@ -913,7 +1039,7 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder, struct pipe_picture_desc *picture) { struct ruvd_decoder *dec = (struct ruvd_decoder*)decoder; - struct radeon_winsys_cs_handle *dt; + struct pb_buffer *dt; struct rvid_buffer *msg_fb_it_buf, *bs_buf; unsigned bs_size; @@ -927,7 +1053,7 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder, bs_size = align(dec->bs_size, 128); memset(dec->bs_ptr, 0, bs_size - dec->bs_size); - dec->ws->buffer_unmap(bs_buf->res->cs_buf); + dec->ws->buffer_unmap(bs_buf->res->buf); map_msg_fb_it_buf(dec); dec->msg->size = sizeof(*dec->msg); @@ -948,9 +1074,15 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder, dec->msg->body.decode.dpb_size = dec->dpb.res->buf->size; dec->msg->body.decode.bsd_size = bs_size; - dec->msg->body.decode.db_pitch = dec->base.width; + dec->msg->body.decode.db_pitch = align(dec->base.width, 16); + + if (dec->stream_type == RUVD_CODEC_H264_PERF && + ((struct r600_common_screen*)dec->screen)->family >= CHIP_POLARIS10) + dec->msg->body.decode.dpb_reserved = dec->ctx.res->buf->size; dt = dec->set_dtb(dec->msg, (struct vl_video_buffer *)target); + if (((struct r600_common_screen*)dec->screen)->family >= CHIP_STONEY) + dec->msg->body.decode.dt_wa_chroma_top_offset = dec->msg->body.decode.dt_pitch / 2; switch (u_reduce_video_profile(picture->profile)) { case PIPE_VIDEO_FORMAT_MPEG4_AVC: @@ -959,6 +1091,20 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder, case PIPE_VIDEO_FORMAT_HEVC: dec->msg->body.decode.codec.h265 = get_h265_msg(dec, target, (struct pipe_h265_picture_desc*)picture); + if (dec->ctx.res == NULL) { + unsigned ctx_size; + if (dec->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) + ctx_size = calc_ctx_size_h265_main10(dec, (struct pipe_h265_picture_desc*)picture); + else + ctx_size = calc_ctx_size_h265_main(dec); + if (!rvid_create_buffer(dec->screen, &dec->ctx, ctx_size, PIPE_USAGE_DEFAULT)) { + RVID_ERR("Can't allocated context buffer.\n"); + } + rvid_clear_buffer(decoder->context, &dec->ctx); + } + + if (dec->ctx.res) + dec->msg->body.decode.dpb_reserved = dec->ctx.res->buf->size; break; case PIPE_VIDEO_FORMAT_VC1: @@ -982,28 +1128,27 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder, dec->msg->body.decode.extension_support = 0x1; /* set at least the feedback buffer size */ - dec->fb[0] = FB_BUFFER_SIZE; + dec->fb[0] = dec->fb_size; send_msg_buf(dec); - send_cmd(dec, RUVD_CMD_DPB_BUFFER, dec->dpb.res->cs_buf, 0, + send_cmd(dec, RUVD_CMD_DPB_BUFFER, dec->dpb.res->buf, 0, RADEON_USAGE_READWRITE, RADEON_DOMAIN_VRAM); - if (u_reduce_video_profile(picture->profile) == PIPE_VIDEO_FORMAT_HEVC) { - send_cmd(dec, RUVD_CMD_CONTEXT_BUFFER, dec->ctx.res->cs_buf, 0, + if (dec->ctx.res) + send_cmd(dec, RUVD_CMD_CONTEXT_BUFFER, dec->ctx.res->buf, 0, RADEON_USAGE_READWRITE, RADEON_DOMAIN_VRAM); - } - send_cmd(dec, RUVD_CMD_BITSTREAM_BUFFER, bs_buf->res->cs_buf, + send_cmd(dec, RUVD_CMD_BITSTREAM_BUFFER, bs_buf->res->buf, 0, RADEON_USAGE_READ, RADEON_DOMAIN_GTT); send_cmd(dec, RUVD_CMD_DECODING_TARGET_BUFFER, dt, 0, RADEON_USAGE_WRITE, RADEON_DOMAIN_VRAM); - send_cmd(dec, RUVD_CMD_FEEDBACK_BUFFER, msg_fb_it_buf->res->cs_buf, + send_cmd(dec, RUVD_CMD_FEEDBACK_BUFFER, msg_fb_it_buf->res->buf, FB_BUFFER_OFFSET, RADEON_USAGE_WRITE, RADEON_DOMAIN_GTT); if (have_it(dec)) - send_cmd(dec, RUVD_CMD_ITSCALING_TABLE_BUFFER, msg_fb_it_buf->res->cs_buf, - FB_BUFFER_OFFSET + FB_BUFFER_SIZE, RADEON_USAGE_READ, RADEON_DOMAIN_GTT); + send_cmd(dec, RUVD_CMD_ITSCALING_TABLE_BUFFER, msg_fb_it_buf->res->buf, + FB_BUFFER_OFFSET + dec->fb_size, RADEON_USAGE_READ, RADEON_DOMAIN_GTT); set_reg(dec, RUVD_ENGINE_CNTL, 1); - flush(dec); + flush(dec, RADEON_FLUSH_ASYNC); next_buffer(dec); } @@ -1028,7 +1173,7 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, unsigned bs_buf_size; struct radeon_info info; struct ruvd_decoder *dec; - int i; + int r, i; ws->query_info(ws, &info); @@ -1039,6 +1184,9 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, /* fall through */ case PIPE_VIDEO_FORMAT_MPEG4: + width = align(width, VL_MACROBLOCK_WIDTH); + height = align(height, VL_MACROBLOCK_HEIGHT); + break; case PIPE_VIDEO_FORMAT_MPEG4_AVC: width = align(width, VL_MACROBLOCK_WIDTH); height = align(height, VL_MACROBLOCK_HEIGHT); @@ -1055,7 +1203,7 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, return NULL; if (info.drm_major < 3) - dec->use_legacy = TRUE; + dec->use_legacy = true; dec->base = *templ; dec->base.context = context; @@ -1074,15 +1222,17 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, dec->stream_handle = rvid_alloc_stream_handle(); dec->screen = context->screen; dec->ws = ws; - dec->cs = ws->cs_create(rctx->ctx, RING_UVD, NULL, NULL, NULL); + dec->cs = ws->cs_create(rctx->ctx, RING_UVD, NULL, NULL); if (!dec->cs) { RVID_ERR("Can't get command submission context.\n"); goto error; } - bs_buf_size = width * height * 512 / (16 * 16); + dec->fb_size = (info.family == CHIP_TONGA) ? FB_BUFFER_SIZE_TONGA : + FB_BUFFER_SIZE; + bs_buf_size = width * height * (512 / (16 * 16)); for (i = 0; i < NUM_BUFFERS; ++i) { - unsigned msg_fb_it_size = FB_BUFFER_OFFSET + FB_BUFFER_SIZE; + unsigned msg_fb_it_size = FB_BUFFER_OFFSET + dec->fb_size; STATIC_ASSERT(sizeof(struct ruvd_msg) <= FB_BUFFER_OFFSET); if (have_it(dec)) msg_fb_it_size += IT_SCALING_TABLE_SIZE; @@ -1111,8 +1261,8 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, rvid_clear_buffer(context, &dec->dpb); - if (u_reduce_video_profile(dec->base.profile) == PIPE_VIDEO_FORMAT_HEVC) { - unsigned ctx_size = calc_ctx_size(dec); + if (dec->stream_type == RUVD_CODEC_H264_PERF && info.family >= CHIP_POLARIS10) { + unsigned ctx_size = calc_ctx_size_h264_perf(dec); if (!rvid_create_buffer(dec->screen, &dec->ctx, ctx_size, PIPE_USAGE_DEFAULT)) { RVID_ERR("Can't allocated context buffer.\n"); goto error; @@ -1120,6 +1270,16 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, rvid_clear_buffer(context, &dec->ctx); } + if (info.family >= CHIP_POLARIS10 && info.drm_minor >= 3) { + if (!rvid_create_buffer(dec->screen, &dec->sessionctx, + UVD_SESSION_CONTEXT_SIZE, + PIPE_USAGE_DEFAULT)) { + RVID_ERR("Can't allocated session ctx.\n"); + goto error; + } + rvid_clear_buffer(context, &dec->sessionctx); + } + map_msg_fb_it_buf(dec); dec->msg->size = sizeof(*dec->msg); dec->msg->msg_type = RUVD_MSG_CREATE; @@ -1129,7 +1289,10 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, dec->msg->body.create.height_in_samples = dec->base.height; dec->msg->body.create.dpb_size = dpb_size; send_msg_buf(dec); - flush(dec); + r = flush(dec, 0); + if (r) + goto error; + next_buffer(dec); return &dec->base; @@ -1143,8 +1306,8 @@ error: } rvid_destroy_buffer(&dec->dpb); - if (u_reduce_video_profile(dec->base.profile) == PIPE_VIDEO_FORMAT_HEVC) - rvid_destroy_buffer(&dec->ctx); + rvid_destroy_buffer(&dec->ctx); + rvid_destroy_buffer(&dec->sessionctx); FREE(dec); diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h b/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h index 452fbd608..e3f8504d8 100644 --- a/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h @@ -38,13 +38,13 @@ #include "vl/vl_video_buffer.h" /* UVD uses PM4 packet type 0 and 2 */ -#define RUVD_PKT_TYPE_S(x) (((x) & 0x3) << 30) +#define RUVD_PKT_TYPE_S(x) (((unsigned)(x) & 0x3) << 30) #define RUVD_PKT_TYPE_G(x) (((x) >> 30) & 0x3) #define RUVD_PKT_TYPE_C 0x3FFFFFFF -#define RUVD_PKT_COUNT_S(x) (((x) & 0x3FFF) << 16) +#define RUVD_PKT_COUNT_S(x) (((unsigned)(x) & 0x3FFF) << 16) #define RUVD_PKT_COUNT_G(x) (((x) >> 16) & 0x3FFF) #define RUVD_PKT_COUNT_C 0xC000FFFF -#define RUVD_PKT0_BASE_INDEX_S(x) (((x) & 0xFFFF) << 0) +#define RUVD_PKT0_BASE_INDEX_S(x) (((unsigned)(x) & 0xFFFF) << 0) #define RUVD_PKT0_BASE_INDEX_G(x) (((x) >> 0) & 0xFFFF) #define RUVD_PKT0_BASE_INDEX_C 0xFFFF0000 #define RUVD_PKT0(index, count) (RUVD_PKT_TYPE_S(0) | RUVD_PKT0_BASE_INDEX_S(index) | RUVD_PKT_COUNT_S(count)) @@ -61,6 +61,7 @@ #define RUVD_CMD_DPB_BUFFER 0x00000001 #define RUVD_CMD_DECODING_TARGET_BUFFER 0x00000002 #define RUVD_CMD_FEEDBACK_BUFFER 0x00000003 +#define RUVD_CMD_SESSION_CONTEXT_BUFFER 0x00000005 #define RUVD_CMD_BITSTREAM_BUFFER 0x00000100 #define RUVD_CMD_ITSCALING_TABLE_BUFFER 0x00000204 #define RUVD_CMD_CONTEXT_BUFFER 0x00000206 @@ -233,6 +234,15 @@ struct ruvd_h265 { uint8_t highestTid; uint8_t isNonRef; + + uint8_t p010_mode; + uint8_t msb_mode; + uint8_t luma_10to8; + uint8_t chroma_10to8; + uint8_t sclr_luma10to8; + uint8_t sclr_chroma10to8; + + uint8_t direct_reflist[2][15]; }; struct ruvd_vc1 { @@ -385,7 +395,10 @@ struct ruvd_msg { uint32_t dt_chroma_top_offset; uint32_t dt_chroma_bottom_offset; uint32_t dt_surf_tile_config; - uint32_t dt_reserved[3]; + uint32_t dt_uv_surf_tile_config; + // re-use dt_wa_chroma_top_offset as dt_ext_info for UV pitch in stoney + uint32_t dt_wa_chroma_top_offset; + uint32_t dt_wa_chroma_bottom_offset; uint32_t reserved[16]; @@ -409,7 +422,7 @@ struct ruvd_msg { }; /* driver dependent callback */ -typedef struct radeon_winsys_cs_handle* (*ruvd_set_dtb) +typedef struct pb_buffer* (*ruvd_set_dtb) (struct ruvd_msg* msg, struct vl_video_buffer *vb); /* create an UVD decode */ diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_vce.c b/lib/mesa/src/gallium/drivers/radeon/radeon_vce.c index 7eab974a3..ef93e46c1 100644 --- a/lib/mesa/src/gallium/drivers/radeon/radeon_vce.c +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_vce.c @@ -49,13 +49,16 @@ #define FW_50_1_2 ((50 << 24) | (1 << 16) | (2 << 8)) #define FW_50_10_2 ((50 << 24) | (10 << 16) | (2 << 8)) #define FW_50_17_3 ((50 << 24) | (17 << 16) | (3 << 8)) +#define FW_52_0_3 ((52 << 24) | (0 << 16) | (3 << 8)) +#define FW_52_4_3 ((52 << 24) | (4 << 16) | (3 << 8)) +#define FW_52_8_3 ((52 << 24) | (8 << 16) | (3 << 8)) /** * flush commands to the hardware */ static void flush(struct rvce_encoder *enc) { - enc->ws->cs_flush(enc->cs, RADEON_FLUSH_ASYNC, NULL, 0); + enc->ws->cs_flush(enc->cs, RADEON_FLUSH_ASYNC, NULL); enc->task_info_idx = 0; enc->bs_idx = 0; } @@ -63,7 +66,7 @@ static void flush(struct rvce_encoder *enc) #if 0 static void dump_feedback(struct rvce_encoder *enc, struct rvid_buffer *fb) { - uint32_t *ptr = enc->ws->buffer_map(fb->res->cs_buf, enc->cs, PIPE_TRANSFER_READ_WRITE); + uint32_t *ptr = enc->ws->buffer_map(fb->res->buf, enc->cs, PIPE_TRANSFER_READ_WRITE); unsigned i = 0; fprintf(stderr, "\n"); fprintf(stderr, "encStatus:\t\t\t%08x\n", ptr[i++]); @@ -82,7 +85,7 @@ static void dump_feedback(struct rvce_encoder *enc, struct rvid_buffer *fb) fprintf(stderr, "seiPrivatePackageOffset:\t%08x\n", ptr[i++]); fprintf(stderr, "seiPrivatePackageSize:\t\t%08x\n", ptr[i++]); fprintf(stderr, "\n"); - enc->ws->buffer_unmap(fb->res->cs_buf); + enc->ws->buffer_unmap(fb->res->buf); } #endif @@ -265,6 +268,7 @@ static void rvce_begin_frame(struct pipe_video_codec *encoder, enc->pic.quant_b_frames != pic->quant_b_frames; enc->pic = *pic; + get_pic_param(enc, pic); enc->get_buffer(vid_buf->resources[0], &enc->handle, &enc->luma); enc->get_buffer(vid_buf->resources[1], NULL, &enc->chroma); @@ -311,7 +315,7 @@ static void rvce_encode_bitstream(struct pipe_video_codec *encoder, RVID_ERR("Can't create feedback buffer.\n"); return; } - if (!enc->cs->cdw) + if (!radeon_emitted(enc->cs, 0)) enc->session(enc); enc->encode(enc); enc->feedback(enc); @@ -345,7 +349,7 @@ static void rvce_get_feedback(struct pipe_video_codec *encoder, struct rvid_buffer *fb = feedback; if (size) { - uint32_t *ptr = enc->ws->buffer_map(fb->res->cs_buf, enc->cs, PIPE_TRANSFER_READ_WRITE); + uint32_t *ptr = enc->ws->buffer_map(fb->res->buf, enc->cs, PIPE_TRANSFER_READ_WRITE); if (ptr[1]) { *size = ptr[4] - ptr[9]; @@ -353,7 +357,7 @@ static void rvce_get_feedback(struct pipe_video_codec *encoder, *size = 0; } - enc->ws->buffer_unmap(fb->res->cs_buf); + enc->ws->buffer_unmap(fb->res->buf); } //dump_feedback(enc, fb); rvid_destroy_buffer(fb); @@ -403,9 +407,12 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, if (rscreen->info.drm_major == 3) enc->use_vm = true; - if ((rscreen->info.drm_major > 2) || (rscreen->info.drm_minor >= 42)) + if ((rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42) || + rscreen->info.drm_major == 3) enc->use_vui = true; - if (rscreen->info.family >= CHIP_TONGA) + if (rscreen->info.family >= CHIP_TONGA && + rscreen->info.family != CHIP_STONEY && + rscreen->info.family != CHIP_POLARIS11) enc->dual_pipe = true; /* TODO enable B frame with dual instance */ if ((rscreen->info.family >= CHIP_TONGA) && @@ -426,7 +433,7 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, enc->screen = context->screen; enc->ws = ws; - enc->cs = ws->cs_create(rctx->ctx, RING_VCE, rvce_cs_flush, enc, NULL); + enc->cs = ws->cs_create(rctx->ctx, RING_VCE, rvce_cs_flush, enc); if (!enc->cs) { RVID_ERR("Can't get command submission context.\n"); goto error; @@ -448,7 +455,7 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, get_buffer(((struct vl_video_buffer *)tmp_buf)->resources[0], NULL, &tmp_surf); cpb_size = align(tmp_surf->level[0].pitch_bytes, 128); - cpb_size = cpb_size * align(tmp_surf->npix_y, 16); + cpb_size = cpb_size * align(tmp_surf->npix_y, 32); cpb_size = cpb_size * 3 / 2; cpb_size = cpb_size * enc->cpb_num; if (enc->dual_pipe) @@ -469,6 +476,7 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, switch (rscreen->info.vce_fw_version) { case FW_40_2_2: radeon_vce_40_2_2_init(enc); + get_pic_param = radeon_vce_40_2_2_get_param; break; case FW_50_0_1: @@ -476,6 +484,14 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, case FW_50_10_2: case FW_50_17_3: radeon_vce_50_init(enc); + get_pic_param = radeon_vce_50_get_param; + break; + + case FW_52_0_3: + case FW_52_4_3: + case FW_52_8_3: + radeon_vce_52_init(enc); + get_pic_param = radeon_vce_52_get_param; break; default: @@ -500,23 +516,32 @@ error: */ bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen) { - return rscreen->info.vce_fw_version == FW_40_2_2 || - rscreen->info.vce_fw_version == FW_50_0_1 || - rscreen->info.vce_fw_version == FW_50_1_2 || - rscreen->info.vce_fw_version == FW_50_10_2 || - rscreen->info.vce_fw_version == FW_50_17_3; + switch (rscreen->info.vce_fw_version) { + case FW_40_2_2: + case FW_50_0_1: + case FW_50_1_2: + case FW_50_10_2: + case FW_50_17_3: + case FW_52_0_3: + case FW_52_4_3: + case FW_52_8_3: + return true; + default: + return false; + } } /** * Add the buffer as relocation to the current command submission */ -void rvce_add_buffer(struct rvce_encoder *enc, struct radeon_winsys_cs_handle *buf, +void rvce_add_buffer(struct rvce_encoder *enc, struct pb_buffer *buf, enum radeon_bo_usage usage, enum radeon_bo_domain domain, signed offset) { int reloc_idx; - reloc_idx = enc->ws->cs_add_reloc(enc->cs, buf, usage, domain, RADEON_PRIO_MIN); + reloc_idx = enc->ws->cs_add_buffer(enc->cs, buf, usage | RADEON_USAGE_SYNCHRONIZED, + domain, RADEON_PRIO_VCE); if (enc->use_vm) { uint64_t addr; addr = enc->ws->buffer_get_virtual_address(buf); @@ -524,6 +549,7 @@ void rvce_add_buffer(struct rvce_encoder *enc, struct radeon_winsys_cs_handle *b RVCE_CS(addr >> 32); RVCE_CS(addr); } else { + offset += enc->ws->buffer_get_reloc_offset(buf); RVCE_CS(reloc_idx * 4); RVCE_CS(offset); } diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_40_2_2.c b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_40_2_2.c index c00565904..fe15ded39 100644 --- a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_40_2_2.c +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_40_2_2.c @@ -59,11 +59,11 @@ static void task_info(struct rvce_encoder *enc, uint32_t op, RVCE_BEGIN(0x00000002); // task info if (op == 0x3) { if (enc->task_info_idx) { - uint32_t offs = enc->cs->cdw - enc->task_info_idx + 3; + uint32_t offs = enc->cs->current.cdw - enc->task_info_idx + 3; // Update offsetOfNextTaskInfo - enc->cs->buf[enc->task_info_idx] = offs; + enc->cs->current.buf[enc->task_info_idx] = offs; } - enc->task_info_idx = enc->cs->cdw; + enc->task_info_idx = enc->cs->current.cdw; } RVCE_CS(0xffffffff); // offsetOfNextTaskInfo RVCE_CS(op); // taskOperation @@ -77,7 +77,7 @@ static void task_info(struct rvce_encoder *enc, uint32_t op, static void feedback(struct rvce_encoder *enc) { RVCE_BEGIN(0x05000005); // feedback buffer - RVCE_WRITE(enc->fb->res->cs_buf, enc->fb->res->domains, 0x0); // feedbackRingAddressHi/Lo + RVCE_WRITE(enc->fb->res->buf, enc->fb->res->domains, 0x0); // feedbackRingAddressHi/Lo RVCE_CS(0x00000001); // feedbackRingSize RVCE_END(); } @@ -303,7 +303,7 @@ static void encode(struct rvce_encoder *enc) enc->task_info(enc, 0x00000003, 0, 0, 0); RVCE_BEGIN(0x05000001); // context buffer - RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains, 0x0); // encodeContextAddressHi/Lo + RVCE_READWRITE(enc->cpb.res->buf, enc->cpb.res->domains, 0x0); // encodeContextAddressHi/Lo RVCE_END(); RVCE_BEGIN(0x05000004); // video bitstream buffer @@ -431,6 +431,10 @@ static void destroy(struct rvce_encoder *enc) RVCE_END(); } +void radeon_vce_40_2_2_get_param(struct rvce_encoder *enc, struct pipe_h264_enc_picture_desc *pic) +{ +} + void radeon_vce_40_2_2_init(struct rvce_encoder *enc) { enc->session = session; diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_50.c b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_50.c index afdab18c0..262e13ba9 100644 --- a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_50.c +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_50.c @@ -95,7 +95,7 @@ static void encode(struct rvce_encoder *enc) enc->task_info(enc, 0x00000003, dep, 0, bs_idx); RVCE_BEGIN(0x05000001); // context buffer - RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains, 0); // encodeContextAddressHi/Lo + RVCE_READWRITE(enc->cpb.res->buf, enc->cpb.res->domains, 0); // encodeContextAddressHi/Lo RVCE_END(); bs_offset = -(signed)(bs_idx * enc->bs_size); @@ -233,6 +233,10 @@ static void encode(struct rvce_encoder *enc) RVCE_END(); } +void radeon_vce_50_get_param(struct rvce_encoder *enc, struct pipe_h264_enc_picture_desc *pic) +{ +} + void radeon_vce_50_init(struct rvce_encoder *enc) { radeon_vce_40_2_2_init(enc); diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_52.c b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_52.c index 3894eea31..5db01fe52 100644 --- a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_52.c +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_52.c @@ -40,27 +40,152 @@ static const unsigned profiles[7] = { 66, 77, 88, 100, 110, 122, 244 }; +static void get_rate_control_param(struct rvce_encoder *enc, struct pipe_h264_enc_picture_desc *pic) +{ + enc->enc_pic.rc.rc_method = pic->rate_ctrl.rate_ctrl_method; + enc->enc_pic.rc.target_bitrate = pic->rate_ctrl.target_bitrate; + enc->enc_pic.rc.peak_bitrate = pic->rate_ctrl.peak_bitrate; + enc->enc_pic.rc.quant_i_frames = pic->quant_i_frames; + enc->enc_pic.rc.quant_p_frames = pic->quant_p_frames; + enc->enc_pic.rc.quant_b_frames = pic->quant_b_frames; + enc->enc_pic.rc.gop_size = pic->gop_size; + enc->enc_pic.rc.frame_rate_num = pic->rate_ctrl.frame_rate_num; + enc->enc_pic.rc.frame_rate_den = pic->rate_ctrl.frame_rate_den; + enc->enc_pic.rc.max_qp = 51; + enc->enc_pic.rc.vbv_buffer_size = pic->rate_ctrl.vbv_buffer_size; + enc->enc_pic.rc.vbv_buf_lv = pic->rate_ctrl.vbv_buf_lv; + enc->enc_pic.rc.fill_data_enable = pic->rate_ctrl.fill_data_enable; + enc->enc_pic.rc.enforce_hrd = pic->rate_ctrl.enforce_hrd; + enc->enc_pic.rc.target_bits_picture = pic->rate_ctrl.target_bits_picture; + enc->enc_pic.rc.peak_bits_picture_integer = pic->rate_ctrl.peak_bits_picture_integer; + enc->enc_pic.rc.peak_bits_picture_fraction = pic->rate_ctrl.peak_bits_picture_fraction; +} + +static void get_motion_estimation_param(struct rvce_encoder *enc, struct pipe_h264_enc_picture_desc *pic) +{ + enc->enc_pic.me.motion_est_quarter_pixel = pic->motion_est.motion_est_quarter_pixel; + enc->enc_pic.me.enc_disable_sub_mode = pic->motion_est.enc_disable_sub_mode; + enc->enc_pic.me.lsmvert = pic->motion_est.lsmvert; + enc->enc_pic.me.enc_en_ime_overw_dis_subm = pic->motion_est.enc_en_ime_overw_dis_subm; + enc->enc_pic.me.enc_ime_overw_dis_subm_no = pic->motion_est.enc_ime_overw_dis_subm_no; + enc->enc_pic.me.enc_ime2_search_range_x = pic->motion_est.enc_ime2_search_range_x; + enc->enc_pic.me.enc_ime2_search_range_y = pic->motion_est.enc_ime2_search_range_y; + enc->enc_pic.me.enc_ime_decimation_search = 0x00000001; + enc->enc_pic.me.motion_est_half_pixel = 0x00000001; + enc->enc_pic.me.enc_search_range_x = 0x00000010; + enc->enc_pic.me.enc_search_range_y = 0x00000010; + enc->enc_pic.me.enc_search1_range_x = 0x00000010; + enc->enc_pic.me.enc_search1_range_y = 0x00000010; +} + +static void get_pic_control_param(struct rvce_encoder *enc, struct pipe_h264_enc_picture_desc *pic) +{ + unsigned encNumMBsPerSlice; + encNumMBsPerSlice = align(enc->base.width, 16) / 16; + encNumMBsPerSlice *= align(enc->base.height, 16) / 16; + enc->enc_pic.pc.enc_crop_right_offset = (align(enc->base.width, 16) - enc->base.width) >> 1; + enc->enc_pic.pc.enc_crop_bottom_offset = (align(enc->base.height, 16) - enc->base.height) >> 1; + enc->enc_pic.pc.enc_num_mbs_per_slice = encNumMBsPerSlice; + enc->enc_pic.pc.enc_b_pic_pattern = MAX2(enc->base.max_references, 1) - 1; + enc->enc_pic.pc.enc_number_of_reference_frames = MIN2(enc->base.max_references, 2); + enc->enc_pic.pc.enc_max_num_ref_frames = enc->base.max_references + 1; + enc->enc_pic.pc.enc_num_default_active_ref_l0 = 0x00000001; + enc->enc_pic.pc.enc_num_default_active_ref_l1 = 0x00000001; + enc->enc_pic.pc.enc_cabac_enable = pic->pic_ctrl.enc_cabac_enable; + enc->enc_pic.pc.enc_constraint_set_flags = pic->pic_ctrl.enc_constraint_set_flags; + enc->enc_pic.pc.enc_num_default_active_ref_l0 = 0x00000001; + enc->enc_pic.pc.enc_num_default_active_ref_l1 = 0x00000001; +} + +static void get_task_info_param(struct rvce_encoder *enc) +{ + enc->enc_pic.ti.offset_of_next_task_info = 0xffffffff; +} + +static void get_feedback_buffer_param(struct rvce_encoder *enc) +{ + enc->enc_pic.fb.feedback_ring_size = 0x00000001; +} + +static void get_config_ext_param(struct rvce_encoder *enc) +{ + enc->enc_pic.ce.enc_enable_perf_logging = 0x00000003; +} + +static void get_vui_param(struct rvce_encoder *enc, struct pipe_h264_enc_picture_desc *pic) +{ + enc->enc_pic.enable_vui = pic->enable_vui; + enc->enc_pic.vui.video_format = 0x00000005; + enc->enc_pic.vui.color_prim = 0x00000002; + enc->enc_pic.vui.transfer_char = 0x00000002; + enc->enc_pic.vui.matrix_coef = 0x00000002; + enc->enc_pic.vui.timing_info_present_flag = 0x00000001; + enc->enc_pic.vui.num_units_in_tick = pic->rate_ctrl.frame_rate_den; + enc->enc_pic.vui.time_scale = pic->rate_ctrl.frame_rate_num * 2; + enc->enc_pic.vui.fixed_frame_rate_flag = 0x00000001; + enc->enc_pic.vui.bit_rate_scale = 0x00000004; + enc->enc_pic.vui.cpb_size_scale = 0x00000006; + enc->enc_pic.vui.initial_cpb_removal_delay_length_minus1 = 0x00000017; + enc->enc_pic.vui.cpb_removal_delay_length_minus1 = 0x00000017; + enc->enc_pic.vui.dpb_output_delay_length_minus1 = 0x00000017; + enc->enc_pic.vui.time_offset_length = 0x00000018; + enc->enc_pic.vui.motion_vectors_over_pic_boundaries_flag = 0x00000001; + enc->enc_pic.vui.max_bytes_per_pic_denom = 0x00000002; + enc->enc_pic.vui.max_bits_per_mb_denom = 0x00000001; + enc->enc_pic.vui.log2_max_mv_length_hori = 0x00000010; + enc->enc_pic.vui.log2_max_mv_length_vert = 0x00000010; + enc->enc_pic.vui.num_reorder_frames = 0x00000003; + enc->enc_pic.vui.max_dec_frame_buffering = 0x00000003; +} + +void radeon_vce_52_get_param(struct rvce_encoder *enc, struct pipe_h264_enc_picture_desc *pic) +{ + get_rate_control_param(enc, pic); + get_motion_estimation_param(enc, pic); + get_pic_control_param(enc, pic); + get_task_info_param(enc); + get_feedback_buffer_param(enc); + get_vui_param(enc, pic); + get_config_ext_param(enc); + + enc->enc_pic.picture_type = pic->picture_type; + enc->enc_pic.frame_num = pic->frame_num; + enc->enc_pic.frame_num_cnt = pic->frame_num_cnt; + enc->enc_pic.p_remain = pic->p_remain; + enc->enc_pic.i_remain = pic->i_remain; + enc->enc_pic.gop_cnt = pic->gop_cnt; + enc->enc_pic.pic_order_cnt = pic->pic_order_cnt; + enc->enc_pic.ref_idx_l0 = pic->ref_idx_l0; + enc->enc_pic.ref_idx_l1 = pic->ref_idx_l1; + enc->enc_pic.not_referenced = pic->not_referenced; + if (enc->dual_inst) + enc->enc_pic.addrmode_arraymode_disrdo_distwoinstants = 0x00000201; + else + enc->enc_pic.addrmode_arraymode_disrdo_distwoinstants = 0x01000201; + enc->enc_pic.is_idr = pic->is_idr; +} + static void create(struct rvce_encoder *enc) { enc->task_info(enc, 0x00000000, 0, 0, 0); RVCE_BEGIN(0x01000001); // create cmd - RVCE_CS(0x00000000); // encUseCircularBuffer + RVCE_CS(enc->enc_pic.ec.enc_use_circular_buffer); RVCE_CS(profiles[enc->base.profile - PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE]); // encProfile RVCE_CS(enc->base.level); // encLevel - RVCE_CS(0x00000000); // encPicStructRestriction + RVCE_CS(enc->enc_pic.ec.enc_pic_struct_restriction); RVCE_CS(enc->base.width); // encImageWidth RVCE_CS(enc->base.height); // encImageHeight RVCE_CS(enc->luma->level[0].pitch_bytes); // encRefPicLumaPitch RVCE_CS(enc->chroma->level[0].pitch_bytes); // encRefPicChromaPitch RVCE_CS(align(enc->luma->npix_y, 16) / 8); // encRefYHeightInQw - RVCE_CS(0x00000000); // encRefPic(Addr|Array)Mode, encPicStructRestriction, disableRDO + RVCE_CS(enc->enc_pic.addrmode_arraymode_disrdo_distwoinstants); - RVCE_CS(0x00000000); // encPreEncodeContextBufferOffset - RVCE_CS(0x00000000); // encPreEncodeInputLumaBufferOffset - RVCE_CS(0x00000000); // encPreEncodeInputChromaBufferOffs - RVCE_CS(0x00000000); // encPreEncodeMode|ChromaFlag|VBAQMode|SceneChangeSensitivity + RVCE_CS(enc->enc_pic.ec.enc_pre_encode_context_buffer_offset); + RVCE_CS(enc->enc_pic.ec.enc_pre_encode_input_luma_buffer_offset); + RVCE_CS(enc->enc_pic.ec.enc_pre_encode_input_chroma_buffer_offset); + RVCE_CS(enc->enc_pic.ec.enc_pre_encode_mode_chromaflag_vbaqmode_scenechangesensitivity); RVCE_END(); } @@ -73,7 +198,7 @@ static void encode(struct rvce_encoder *enc) if (enc->dual_inst) { if (bs_idx == 0) dep = 1; - else if (enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR) + else if (enc->enc_pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR) dep = 0; else dep = 2; @@ -107,13 +232,13 @@ static void encode(struct rvce_encoder *enc) } RVCE_BEGIN(0x03000001); // encode - RVCE_CS(enc->pic.frame_num ? 0x0 : 0x11); // insertHeaders - RVCE_CS(0x00000000); // pictureStructure + RVCE_CS(enc->enc_pic.frame_num ? 0x0 : 0x11); // insertHeaders + RVCE_CS(enc->enc_pic.eo.picture_structure); RVCE_CS(enc->bs_size); // allowedMaxBitstreamSize - RVCE_CS(0x00000000); // forceRefreshMap - RVCE_CS(0x00000000); // insertAUD - RVCE_CS(0x00000000); // endOfSequence - RVCE_CS(0x00000000); // endOfStream + RVCE_CS(enc->enc_pic.eo.force_refresh_map); + RVCE_CS(enc->enc_pic.eo.insert_aud); + RVCE_CS(enc->enc_pic.eo.end_of_sequence); + RVCE_CS(enc->enc_pic.eo.end_of_stream); RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM, enc->luma->level[0].offset); // inputPictureLumaAddressHi/Lo RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM, @@ -122,121 +247,396 @@ static void encode(struct rvce_encoder *enc) RVCE_CS(enc->luma->level[0].pitch_bytes); // encInputPicLumaPitch RVCE_CS(enc->chroma->level[0].pitch_bytes); // encInputPicChromaPitch if (enc->dual_pipe) - RVCE_CS(0x00000000); // encInputPic(Addr|Array)Mode,encDisable(TwoPipeMode|MBOffloading) + enc->enc_pic.eo.enc_input_pic_addr_array_disable2pipe_disablemboffload = 0x00000000; else - RVCE_CS(0x00010000); // encInputPic(Addr|Array)Mode,encDisable(TwoPipeMode|MBOffloading) - RVCE_CS(0x00000000); // encInputPicTileConfig - RVCE_CS(enc->pic.picture_type); // encPicType - RVCE_CS(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR); // encIdrFlag - RVCE_CS(0x00000000); // encIdrPicId - RVCE_CS(0x00000000); // encMGSKeyPic - RVCE_CS(!enc->pic.not_referenced); // encReferenceFlag - RVCE_CS(0x00000000); // encTemporalLayerIndex - RVCE_CS(0x00000000); // num_ref_idx_active_override_flag - RVCE_CS(0x00000000); // num_ref_idx_l0_active_minus1 - RVCE_CS(0x00000000); // num_ref_idx_l1_active_minus1 - - i = enc->pic.frame_num - enc->pic.ref_idx_l0; - if (i > 1 && enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_P) { - RVCE_CS(0x00000001); // encRefListModificationOp - RVCE_CS(i - 1); // encRefListModificationNum + enc->enc_pic.eo.enc_input_pic_addr_array_disable2pipe_disablemboffload = 0x00010000; + RVCE_CS(enc->enc_pic.eo.enc_input_pic_addr_array_disable2pipe_disablemboffload); + RVCE_CS(enc->enc_pic.eo.enc_input_pic_tile_config); + RVCE_CS(enc->enc_pic.picture_type); // encPicType + RVCE_CS(enc->enc_pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR); // encIdrFlag + if ((enc->enc_pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR) && (enc->enc_pic.eo.enc_idr_pic_id !=0)) + enc->enc_pic.eo.enc_idr_pic_id = enc->enc_pic.idr_pic_id - 1; + else + enc->enc_pic.eo.enc_idr_pic_id = 0x00000000; + RVCE_CS(enc->enc_pic.eo.enc_idr_pic_id); + RVCE_CS(enc->enc_pic.eo.enc_mgs_key_pic); + RVCE_CS(!enc->enc_pic.not_referenced); + RVCE_CS(enc->enc_pic.eo.enc_temporal_layer_index); + RVCE_CS(enc->enc_pic.eo.num_ref_idx_active_override_flag); + RVCE_CS(enc->enc_pic.eo.num_ref_idx_l0_active_minus1); + RVCE_CS(enc->enc_pic.eo.num_ref_idx_l1_active_minus1); + + i = enc->enc_pic.frame_num - enc->enc_pic.ref_idx_l0; + if (i > 1 && enc->enc_pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_P) { + enc->enc_pic.eo.enc_ref_list_modification_op = 0x00000001; + enc->enc_pic.eo.enc_ref_list_modification_num = i - 1; + RVCE_CS(enc->enc_pic.eo.enc_ref_list_modification_op); + RVCE_CS(enc->enc_pic.eo.enc_ref_list_modification_num); } else { - RVCE_CS(0x00000000); // encRefListModificationOp - RVCE_CS(0x00000000); // encRefListModificationNum + enc->enc_pic.eo.enc_ref_list_modification_op = 0x00000000; + enc->enc_pic.eo.enc_ref_list_modification_num = 0x00000000; + RVCE_CS(enc->enc_pic.eo.enc_ref_list_modification_op); + RVCE_CS(enc->enc_pic.eo.enc_ref_list_modification_num); } for (i = 0; i < 3; ++i) { - RVCE_CS(0x00000000); // encRefListModificationOp - RVCE_CS(0x00000000); // encRefListModificationNum + enc->enc_pic.eo.enc_ref_list_modification_op = 0x00000000; + enc->enc_pic.eo.enc_ref_list_modification_num = 0x00000000; + RVCE_CS(enc->enc_pic.eo.enc_ref_list_modification_op); + RVCE_CS(enc->enc_pic.eo.enc_ref_list_modification_num); } for (i = 0; i < 4; ++i) { - RVCE_CS(0x00000000); // encDecodedPictureMarkingOp - RVCE_CS(0x00000000); // encDecodedPictureMarkingNum - RVCE_CS(0x00000000); // encDecodedPictureMarkingIdx - RVCE_CS(0x00000000); // encDecodedRefBasePictureMarkingOp - RVCE_CS(0x00000000); // encDecodedRefBasePictureMarkingNum + RVCE_CS(enc->enc_pic.eo.enc_decoded_picture_marking_op); + RVCE_CS(enc->enc_pic.eo.enc_decoded_picture_marking_num); + RVCE_CS(enc->enc_pic.eo.enc_decoded_picture_marking_idx); + RVCE_CS(enc->enc_pic.eo.enc_decoded_ref_base_picture_marking_op); + RVCE_CS(enc->enc_pic.eo.enc_decoded_ref_base_picture_marking_num); } // encReferencePictureL0[0] RVCE_CS(0x00000000); // pictureStructure - if(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_P || - enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) { + if(enc->enc_pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_P || + enc->enc_pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) { struct rvce_cpb_slot *l0 = l0_slot(enc); rvce_frame_offset(enc, l0, &luma_offset, &chroma_offset); - RVCE_CS(l0->picture_type); // encPicType - RVCE_CS(l0->frame_num); // frameNumber - RVCE_CS(l0->pic_order_cnt); // pictureOrderCount - RVCE_CS(luma_offset); // lumaOffset - RVCE_CS(chroma_offset); // chromaOffset + RVCE_CS(l0->picture_type); + RVCE_CS(l0->frame_num); + RVCE_CS(l0->pic_order_cnt); + RVCE_CS(luma_offset); + RVCE_CS(chroma_offset); } else { - RVCE_CS(0x00000000); // encPicType - RVCE_CS(0x00000000); // frameNumber - RVCE_CS(0x00000000); // pictureOrderCount - RVCE_CS(0xffffffff); // lumaOffset - RVCE_CS(0xffffffff); // chromaOffset + enc->enc_pic.eo.l0_enc_pic_type = 0x00000000; + enc->enc_pic.eo.l0_frame_number = 0x00000000; + enc->enc_pic.eo.l0_picture_order_count = 0x00000000; + enc->enc_pic.eo.l0_luma_offset = 0xffffffff; + enc->enc_pic.eo.l0_chroma_offset = 0xffffffff; + RVCE_CS(enc->enc_pic.eo.l0_enc_pic_type); + RVCE_CS(enc->enc_pic.eo.l0_frame_number); + RVCE_CS(enc->enc_pic.eo.l0_picture_order_count); + RVCE_CS(enc->enc_pic.eo.l0_luma_offset); + RVCE_CS(enc->enc_pic.eo.l0_chroma_offset); } // encReferencePictureL0[1] - RVCE_CS(0x00000000); // pictureStructure - RVCE_CS(0x00000000); // encPicType - RVCE_CS(0x00000000); // frameNumber - RVCE_CS(0x00000000); // pictureOrderCount - RVCE_CS(0xffffffff); // lumaOffset - RVCE_CS(0xffffffff); // chromaOffset + enc->enc_pic.eo.l0_picture_structure = 0x00000000; + enc->enc_pic.eo.l0_enc_pic_type = 0x00000000; + enc->enc_pic.eo.l0_frame_number = 0x00000000; + enc->enc_pic.eo.l0_picture_order_count = 0x00000000; + enc->enc_pic.eo.l0_luma_offset = 0xffffffff; + enc->enc_pic.eo.l0_chroma_offset = 0xffffffff; + RVCE_CS(enc->enc_pic.eo.l0_picture_structure); + RVCE_CS(enc->enc_pic.eo.l0_enc_pic_type); + RVCE_CS(enc->enc_pic.eo.l0_frame_number); + RVCE_CS(enc->enc_pic.eo.l0_picture_order_count); + RVCE_CS(enc->enc_pic.eo.l0_luma_offset); + RVCE_CS(enc->enc_pic.eo.l0_chroma_offset); // encReferencePictureL1[0] RVCE_CS(0x00000000); // pictureStructure - if(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) { + if(enc->enc_pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) { struct rvce_cpb_slot *l1 = l1_slot(enc); rvce_frame_offset(enc, l1, &luma_offset, &chroma_offset); - RVCE_CS(l1->picture_type); // encPicType - RVCE_CS(l1->frame_num); // frameNumber - RVCE_CS(l1->pic_order_cnt); // pictureOrderCount - RVCE_CS(luma_offset); // lumaOffset - RVCE_CS(chroma_offset); // chromaOffset + RVCE_CS(l1->picture_type); + RVCE_CS(l1->frame_num); + RVCE_CS(l1->pic_order_cnt); + RVCE_CS(luma_offset); + RVCE_CS(chroma_offset); } else { - RVCE_CS(0x00000000); // encPicType - RVCE_CS(0x00000000); // frameNumber - RVCE_CS(0x00000000); // pictureOrderCount - RVCE_CS(0xffffffff); // lumaOffset - RVCE_CS(0xffffffff); // chromaOffset + enc->enc_pic.eo.l1_enc_pic_type = 0x00000000; + enc->enc_pic.eo.l1_frame_number = 0x00000000; + enc->enc_pic.eo.l1_picture_order_count = 0x00000000; + enc->enc_pic.eo.l1_luma_offset = 0xffffffff; + enc->enc_pic.eo.l1_chroma_offset = 0xffffffff; + RVCE_CS(enc->enc_pic.eo.l1_enc_pic_type); + RVCE_CS(enc->enc_pic.eo.l1_frame_number); + RVCE_CS(enc->enc_pic.eo.l1_picture_order_count); + RVCE_CS(enc->enc_pic.eo.l1_luma_offset); + RVCE_CS(enc->enc_pic.eo.l1_chroma_offset); } rvce_frame_offset(enc, current_slot(enc), &luma_offset, &chroma_offset); - RVCE_CS(luma_offset); // encReconstructedLumaOffset - RVCE_CS(chroma_offset); // encReconstructedChromaOffset - RVCE_CS(0x00000000); // encColocBufferOffset - RVCE_CS(0x00000000); // encReconstructedRefBasePictureLumaOffset - RVCE_CS(0x00000000); // encReconstructedRefBasePictureChromaOffset - RVCE_CS(0x00000000); // encReferenceRefBasePictureLumaOffset - RVCE_CS(0x00000000); // encReferenceRefBasePictureChromaOffset - RVCE_CS(0x00000000); // pictureCount - RVCE_CS(enc->pic.frame_num); // frameNumber - RVCE_CS(enc->pic.pic_order_cnt); // pictureOrderCount - RVCE_CS(0x00000000); // numIPicRemainInRCGOP - RVCE_CS(0x00000000); // numPPicRemainInRCGOP - RVCE_CS(0x00000000); // numBPicRemainInRCGOP - RVCE_CS(0x00000000); // numIRPicRemainInRCGOP - RVCE_CS(0x00000000); // enableIntraRefresh - - RVCE_CS(0x00000000); // aq_variance_en - RVCE_CS(0x00000000); // aq_block_size - RVCE_CS(0x00000000); // aq_mb_variance_sel - RVCE_CS(0x00000000); // aq_frame_variance_sel - RVCE_CS(0x00000000); // aq_param_a - RVCE_CS(0x00000000); // aq_param_b - RVCE_CS(0x00000000); // aq_param_c - RVCE_CS(0x00000000); // aq_param_d - RVCE_CS(0x00000000); // aq_param_e - - RVCE_CS(0x00000000); // contextInSFB + RVCE_CS(luma_offset); + RVCE_CS(chroma_offset); + RVCE_CS(enc->enc_pic.eo.enc_coloc_buffer_offset); + RVCE_CS(enc->enc_pic.eo.enc_reconstructed_ref_base_picture_luma_offset); + RVCE_CS(enc->enc_pic.eo.enc_reconstructed_ref_base_picture_chroma_offset); + RVCE_CS(enc->enc_pic.eo.enc_reference_ref_base_picture_luma_offset); + RVCE_CS(enc->enc_pic.eo.enc_reference_ref_base_picture_chroma_offset); + RVCE_CS(enc->enc_pic.frame_num_cnt-1); + RVCE_CS(enc->enc_pic.frame_num); + RVCE_CS(enc->enc_pic.pic_order_cnt); + RVCE_CS(enc->enc_pic.i_remain); + RVCE_CS(enc->enc_pic.p_remain); + RVCE_CS(enc->enc_pic.eo.num_b_pic_remain_in_rcgop); + RVCE_CS(enc->enc_pic.eo.num_ir_pic_remain_in_rcgop); + RVCE_CS(enc->enc_pic.eo.enable_intra_refresh); + + RVCE_CS(enc->enc_pic.eo.aq_variance_en); + RVCE_CS(enc->enc_pic.eo.aq_block_size); + RVCE_CS(enc->enc_pic.eo.aq_mb_variance_sel); + RVCE_CS(enc->enc_pic.eo.aq_frame_variance_sel); + RVCE_CS(enc->enc_pic.eo.aq_param_a); + RVCE_CS(enc->enc_pic.eo.aq_param_b); + RVCE_CS(enc->enc_pic.eo.aq_param_c); + RVCE_CS(enc->enc_pic.eo.aq_param_d); + RVCE_CS(enc->enc_pic.eo.aq_param_e); + + RVCE_CS(enc->enc_pic.eo.context_in_sfb); RVCE_END(); } -void radeon_vce_52_init(struct rvce_encoder *enc) +static void rate_control(struct rvce_encoder *enc) +{ + RVCE_BEGIN(0x04000005); // rate control + RVCE_CS(enc->enc_pic.rc.rc_method); + RVCE_CS(enc->enc_pic.rc.target_bitrate); + RVCE_CS(enc->enc_pic.rc.peak_bitrate); + RVCE_CS(enc->enc_pic.rc.frame_rate_num); + RVCE_CS(enc->enc_pic.rc.gop_size); + RVCE_CS(enc->enc_pic.rc.quant_i_frames); + RVCE_CS(enc->enc_pic.rc.quant_p_frames); + RVCE_CS(enc->enc_pic.rc.quant_b_frames); + RVCE_CS(enc->enc_pic.rc.vbv_buffer_size); + RVCE_CS(enc->enc_pic.rc.frame_rate_den); + RVCE_CS(enc->enc_pic.rc.vbv_buf_lv); + RVCE_CS(enc->enc_pic.rc.max_au_size); + RVCE_CS(enc->enc_pic.rc.qp_initial_mode); + RVCE_CS(enc->enc_pic.rc.target_bits_picture); + RVCE_CS(enc->enc_pic.rc.peak_bits_picture_integer); + RVCE_CS(enc->enc_pic.rc.peak_bits_picture_fraction); + RVCE_CS(enc->enc_pic.rc.min_qp); + RVCE_CS(enc->enc_pic.rc.max_qp); + RVCE_CS(enc->enc_pic.rc.skip_frame_enable); + RVCE_CS(enc->enc_pic.rc.fill_data_enable); + RVCE_CS(enc->enc_pic.rc.enforce_hrd); + RVCE_CS(enc->enc_pic.rc.b_pics_delta_qp); + RVCE_CS(enc->enc_pic.rc.ref_b_pics_delta_qp); + RVCE_CS(enc->enc_pic.rc.rc_reinit_disable); + RVCE_CS(enc->enc_pic.rc.enc_lcvbr_init_qp_flag); + RVCE_CS(enc->enc_pic.rc.lcvbrsatd_based_nonlinear_bit_budget_flag); + RVCE_END(); +} + +static void config(struct rvce_encoder *enc) { - radeon_vce_50_init(enc); + enc->task_info(enc, 0x00000002, 0, 0xffffffff, 0); + enc->rate_control(enc); + enc->config_extension(enc); + enc->motion_estimation(enc); + enc->rdo(enc); + if (enc->use_vui) + enc->vui(enc); + enc->pic_control(enc); +} + +static void config_extension(struct rvce_encoder *enc) +{ + RVCE_BEGIN(0x04000001); // config extension + RVCE_CS(enc->enc_pic.ce.enc_enable_perf_logging); + RVCE_END(); +} +static void destroy(struct rvce_encoder *enc) +{ + enc->task_info(enc, 0x00000001, 0, 0, 0); + + RVCE_BEGIN(0x02000001); // destroy + RVCE_END(); +} + +static void feedback(struct rvce_encoder *enc) +{ + RVCE_BEGIN(0x05000005); // feedback buffer + RVCE_WRITE(enc->fb->res->buf, enc->fb->res->domains, 0x0); // feedbackRingAddressHi/Lo + RVCE_CS(enc->enc_pic.fb.feedback_ring_size); + RVCE_END(); +} + +static void motion_estimation(struct rvce_encoder *enc) +{ + RVCE_BEGIN(0x04000007); // motion estimation + RVCE_CS(enc->enc_pic.me.enc_ime_decimation_search); + RVCE_CS(enc->enc_pic.me.motion_est_half_pixel); + RVCE_CS(enc->enc_pic.me.motion_est_quarter_pixel); + RVCE_CS(enc->enc_pic.me.disable_favor_pmv_point); + RVCE_CS(enc->enc_pic.me.force_zero_point_center); + RVCE_CS(enc->enc_pic.me.lsmvert); + RVCE_CS(enc->enc_pic.me.enc_search_range_x); + RVCE_CS(enc->enc_pic.me.enc_search_range_y); + RVCE_CS(enc->enc_pic.me.enc_search1_range_x); + RVCE_CS(enc->enc_pic.me.enc_search1_range_y); + RVCE_CS(enc->enc_pic.me.disable_16x16_frame1); + RVCE_CS(enc->enc_pic.me.disable_satd); + RVCE_CS(enc->enc_pic.me.enable_amd); + RVCE_CS(enc->enc_pic.me.enc_disable_sub_mode); + RVCE_CS(enc->enc_pic.me.enc_ime_skip_x); + RVCE_CS(enc->enc_pic.me.enc_ime_skip_y); + RVCE_CS(enc->enc_pic.me.enc_en_ime_overw_dis_subm); + RVCE_CS(enc->enc_pic.me.enc_ime_overw_dis_subm_no); + RVCE_CS(enc->enc_pic.me.enc_ime2_search_range_x); + RVCE_CS(enc->enc_pic.me.enc_ime2_search_range_y); + RVCE_CS(enc->enc_pic.me.parallel_mode_speedup_enable); + RVCE_CS(enc->enc_pic.me.fme0_enc_disable_sub_mode); + RVCE_CS(enc->enc_pic.me.fme1_enc_disable_sub_mode); + RVCE_CS(enc->enc_pic.me.ime_sw_speedup_enable); + RVCE_END(); +} + +static void pic_control(struct rvce_encoder *enc) +{ + RVCE_BEGIN(0x04000002); // pic control + RVCE_CS(enc->enc_pic.pc.enc_use_constrained_intra_pred); + RVCE_CS(enc->enc_pic.pc.enc_cabac_enable); + RVCE_CS(enc->enc_pic.pc.enc_cabac_idc); + RVCE_CS(enc->enc_pic.pc.enc_loop_filter_disable); + RVCE_CS(enc->enc_pic.pc.enc_lf_beta_offset); + RVCE_CS(enc->enc_pic.pc.enc_lf_alpha_c0_offset); + RVCE_CS(enc->enc_pic.pc.enc_crop_left_offset); + RVCE_CS(enc->enc_pic.pc.enc_crop_right_offset); + RVCE_CS(enc->enc_pic.pc.enc_crop_top_offset); + RVCE_CS(enc->enc_pic.pc.enc_crop_bottom_offset); + RVCE_CS(enc->enc_pic.pc.enc_num_mbs_per_slice); + RVCE_CS(enc->enc_pic.pc.enc_intra_refresh_num_mbs_per_slot); + RVCE_CS(enc->enc_pic.pc.enc_force_intra_refresh); + RVCE_CS(enc->enc_pic.pc.enc_force_imb_period); + RVCE_CS(enc->enc_pic.pc.enc_pic_order_cnt_type); + RVCE_CS(enc->enc_pic.pc.log2_max_pic_order_cnt_lsb_minus4); + RVCE_CS(enc->enc_pic.pc.enc_sps_id); + RVCE_CS(enc->enc_pic.pc.enc_pps_id); + RVCE_CS(enc->enc_pic.pc.enc_constraint_set_flags); + RVCE_CS(enc->enc_pic.pc.enc_b_pic_pattern); + RVCE_CS(enc->enc_pic.pc.weight_pred_mode_b_picture); + RVCE_CS(enc->enc_pic.pc.enc_number_of_reference_frames); + RVCE_CS(enc->enc_pic.pc.enc_max_num_ref_frames); + RVCE_CS(enc->enc_pic.pc.enc_num_default_active_ref_l0); + RVCE_CS(enc->enc_pic.pc.enc_num_default_active_ref_l1); + RVCE_CS(enc->enc_pic.pc.enc_slice_mode); + RVCE_CS(enc->enc_pic.pc.enc_max_slice_size); + RVCE_END(); +} + +static void rdo(struct rvce_encoder *enc) +{ + RVCE_BEGIN(0x04000008); // rdo + RVCE_CS(enc->enc_pic.rdo.enc_disable_tbe_pred_i_frame); + RVCE_CS(enc->enc_pic.rdo.enc_disable_tbe_pred_p_frame); + RVCE_CS(enc->enc_pic.rdo.use_fme_interpol_y); + RVCE_CS(enc->enc_pic.rdo.use_fme_interpol_uv); + RVCE_CS(enc->enc_pic.rdo.use_fme_intrapol_y); + RVCE_CS(enc->enc_pic.rdo.use_fme_intrapol_uv); + RVCE_CS(enc->enc_pic.rdo.use_fme_interpol_y_1); + RVCE_CS(enc->enc_pic.rdo.use_fme_interpol_uv_1); + RVCE_CS(enc->enc_pic.rdo.use_fme_intrapol_y_1); + RVCE_CS(enc->enc_pic.rdo.use_fme_intrapol_uv_1); + RVCE_CS(enc->enc_pic.rdo.enc_16x16_cost_adj); + RVCE_CS(enc->enc_pic.rdo.enc_skip_cost_adj); + RVCE_CS(enc->enc_pic.rdo.enc_force_16x16_skip); + RVCE_CS(enc->enc_pic.rdo.enc_disable_threshold_calc_a); + RVCE_CS(enc->enc_pic.rdo.enc_luma_coeff_cost); + RVCE_CS(enc->enc_pic.rdo.enc_luma_mb_coeff_cost); + RVCE_CS(enc->enc_pic.rdo.enc_chroma_coeff_cost); + RVCE_END(); +} + +static void session(struct rvce_encoder *enc) +{ + RVCE_BEGIN(0x00000001); // session cmd + RVCE_CS(enc->stream_handle); + RVCE_END(); +} + +static void task_info(struct rvce_encoder *enc, uint32_t op, + uint32_t dep, uint32_t fb_idx, uint32_t ring_idx) +{ + RVCE_BEGIN(0x00000002); // task info + if (op == 0x3) { + if (enc->task_info_idx) { + uint32_t offs = enc->cs->current.cdw - enc->task_info_idx + 3; + // Update offsetOfNextTaskInfo + enc->cs->current.buf[enc->task_info_idx] = offs; + } + enc->task_info_idx = enc->cs->current.cdw; + } + enc->enc_pic.ti.task_operation = op; + enc->enc_pic.ti.reference_picture_dependency = dep; + enc->enc_pic.ti.feedback_index = fb_idx; + enc->enc_pic.ti.video_bitstream_ring_index = ring_idx; + RVCE_CS(enc->enc_pic.ti.offset_of_next_task_info); + RVCE_CS(enc->enc_pic.ti.task_operation); + RVCE_CS(enc->enc_pic.ti.reference_picture_dependency); + RVCE_CS(enc->enc_pic.ti.collocate_flag_dependency); + RVCE_CS(enc->enc_pic.ti.feedback_index); + RVCE_CS(enc->enc_pic.ti.video_bitstream_ring_index); + RVCE_END(); +} + +static void vui(struct rvce_encoder *enc) +{ + int i; + + if (!enc->enc_pic.enable_vui) + return; + + RVCE_BEGIN(0x04000009); // vui + RVCE_CS(enc->enc_pic.vui.aspect_ratio_info_present_flag); + RVCE_CS(enc->enc_pic.vui.aspect_ratio_idc); + RVCE_CS(enc->enc_pic.vui.sar_width); + RVCE_CS(enc->enc_pic.vui.sar_height); + RVCE_CS(enc->enc_pic.vui.overscan_info_present_flag); + RVCE_CS(enc->enc_pic.vui.overscan_Approp_flag); + RVCE_CS(enc->enc_pic.vui.video_signal_type_present_flag); + RVCE_CS(enc->enc_pic.vui.video_format); + RVCE_CS(enc->enc_pic.vui.video_full_range_flag); + RVCE_CS(enc->enc_pic.vui.color_description_present_flag); + RVCE_CS(enc->enc_pic.vui.color_prim); + RVCE_CS(enc->enc_pic.vui.transfer_char); + RVCE_CS(enc->enc_pic.vui.matrix_coef); + RVCE_CS(enc->enc_pic.vui.chroma_loc_info_present_flag); + RVCE_CS(enc->enc_pic.vui.chroma_loc_top); + RVCE_CS(enc->enc_pic.vui.chroma_loc_bottom); + RVCE_CS(enc->enc_pic.vui.timing_info_present_flag); + RVCE_CS(enc->enc_pic.vui.num_units_in_tick); + RVCE_CS(enc->enc_pic.vui.time_scale); + RVCE_CS(enc->enc_pic.vui.fixed_frame_rate_flag); + RVCE_CS(enc->enc_pic.vui.nal_hrd_parameters_present_flag); + RVCE_CS(enc->enc_pic.vui.cpb_cnt_minus1); + RVCE_CS(enc->enc_pic.vui.bit_rate_scale); + RVCE_CS(enc->enc_pic.vui.cpb_size_scale); + for (i = 0; i < 32; i++) { + RVCE_CS(enc->enc_pic.vui.bit_rate_value_minus); + RVCE_CS(enc->enc_pic.vui.cpb_size_value_minus); + RVCE_CS(enc->enc_pic.vui.cbr_flag); + } + RVCE_CS(enc->enc_pic.vui.initial_cpb_removal_delay_length_minus1); + RVCE_CS(enc->enc_pic.vui.cpb_removal_delay_length_minus1); + RVCE_CS(enc->enc_pic.vui.dpb_output_delay_length_minus1); + RVCE_CS(enc->enc_pic.vui.time_offset_length); + RVCE_CS(enc->enc_pic.vui.low_delay_hrd_flag); + RVCE_CS(enc->enc_pic.vui.pic_struct_present_flag); + RVCE_CS(enc->enc_pic.vui.bitstream_restriction_present_flag); + RVCE_CS(enc->enc_pic.vui.motion_vectors_over_pic_boundaries_flag); + RVCE_CS(enc->enc_pic.vui.max_bytes_per_pic_denom); + RVCE_CS(enc->enc_pic.vui.max_bits_per_mb_denom); + RVCE_CS(enc->enc_pic.vui.log2_max_mv_length_hori); + RVCE_CS(enc->enc_pic.vui.log2_max_mv_length_vert); + RVCE_CS(enc->enc_pic.vui.num_reorder_frames); + RVCE_CS(enc->enc_pic.vui.max_dec_frame_buffering); + RVCE_END(); +} + +void radeon_vce_52_init(struct rvce_encoder *enc) +{ + enc->session = session; + enc->task_info = task_info; enc->create = create; + enc->feedback = feedback; + enc->rate_control = rate_control; + enc->config_extension = config_extension; + enc->pic_control = pic_control; + enc->motion_estimation = motion_estimation; + enc->rdo = rdo; + enc->vui = vui; + enc->config = config; enc->encode = encode; + enc->destroy = destroy; } diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_video.c b/lib/mesa/src/gallium/drivers/radeon/radeon_video.c index f56c6cf6c..de8e11cd8 100644 --- a/lib/mesa/src/gallium/drivers/radeon/radeon_video.c +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_video.c @@ -43,6 +43,8 @@ #include "radeon_video.h" #include "radeon_vce.h" +#define UVD_FW_1_66_16 ((1 << 24) | (66 << 16) | (16 << 8)) + /* generate an stream handle */ unsigned rvid_alloc_stream_handle() { @@ -64,8 +66,14 @@ bool rvid_create_buffer(struct pipe_screen *screen, struct rvid_buffer *buffer, { memset(buffer, 0, sizeof(*buffer)); buffer->usage = usage; + + /* Hardware buffer placement restrictions require the kernel to be + * able to move buffers around individually, so request a + * non-sub-allocated buffer. + */ buffer->res = (struct r600_resource *) - pipe_buffer_create(screen, PIPE_BIND_CUSTOM, usage, size); + pipe_buffer_create(screen, PIPE_BIND_CUSTOM | PIPE_BIND_SHARED, + usage, size); return buffer->res != NULL; } @@ -73,7 +81,7 @@ bool rvid_create_buffer(struct pipe_screen *screen, struct rvid_buffer *buffer, /* destroy a buffer */ void rvid_destroy_buffer(struct rvid_buffer *buffer) { - pipe_resource_reference((struct pipe_resource **)&buffer->res, NULL); + r600_resource_reference(&buffer->res, NULL); } /* reallocate a buffer, preserving its content */ @@ -89,11 +97,11 @@ bool rvid_resize_buffer(struct pipe_screen *screen, struct radeon_winsys_cs *cs, if (!rvid_create_buffer(screen, new_buf, new_size, new_buf->usage)) goto error; - src = ws->buffer_map(old_buf.res->cs_buf, cs, PIPE_TRANSFER_READ); + src = ws->buffer_map(old_buf.res->buf, cs, PIPE_TRANSFER_READ); if (!src) goto error; - dst = ws->buffer_map(new_buf->res->cs_buf, cs, PIPE_TRANSFER_WRITE); + dst = ws->buffer_map(new_buf->res->buf, cs, PIPE_TRANSFER_WRITE); if (!dst) goto error; @@ -103,14 +111,14 @@ bool rvid_resize_buffer(struct pipe_screen *screen, struct radeon_winsys_cs *cs, dst += bytes; memset(dst, 0, new_size); } - ws->buffer_unmap(new_buf->res->cs_buf); - ws->buffer_unmap(old_buf.res->cs_buf); + ws->buffer_unmap(new_buf->res->buf); + ws->buffer_unmap(old_buf.res->buf); rvid_destroy_buffer(&old_buf); return true; error: if (src) - ws->buffer_unmap(old_buf.res->cs_buf); + ws->buffer_unmap(old_buf.res->buf); rvid_destroy_buffer(new_buf); *new_buf = old_buf; return false; @@ -122,7 +130,7 @@ void rvid_clear_buffer(struct pipe_context *context, struct rvid_buffer* buffer) struct r600_common_context *rctx = (struct r600_common_context*)context; rctx->clear_buffer(context, &buffer->res->b.b, 0, buffer->res->buf->size, - 0, false); + 0, R600_COHERENCY_NONE); context->flush(context, NULL, 0); } @@ -130,7 +138,7 @@ void rvid_clear_buffer(struct pipe_context *context, struct rvid_buffer* buffer) * join surfaces into the same buffer with identical tiling params * sumup their sizes and replace the backend buffers with a single bo */ -void rvid_join_surfaces(struct radeon_winsys* ws, unsigned bind, +void rvid_join_surfaces(struct radeon_winsys* ws, struct pb_buffer** buffers[VL_NUM_COMPONENTS], struct radeon_surf *surfaces[VL_NUM_COMPONENTS]) { @@ -165,7 +173,7 @@ void rvid_join_surfaces(struct radeon_winsys* ws, unsigned bind, /* adjust the texture layer offsets */ off = align(off, surfaces[i]->bo_alignment); - for (j = 0; j < Elements(surfaces[i]->level); ++j) + for (j = 0; j < ARRAY_SIZE(surfaces[i]->level); ++j) surfaces[i]->level[j].offset += off; off += surfaces[i]->bo_size; } @@ -185,7 +193,7 @@ void rvid_join_surfaces(struct radeon_winsys* ws, unsigned bind, /* TODO: 2D tiling workaround */ alignment *= 2; - pb = ws->buffer_create(ws, size, alignment, bind, RADEON_DOMAIN_VRAM, 0); + pb = ws->buffer_create(ws, size, alignment, RADEON_DOMAIN_VRAM, 0); if (!pb) return; @@ -206,30 +214,33 @@ int rvid_get_video_param(struct pipe_screen *screen, { struct r600_common_screen *rscreen = (struct r600_common_screen *)screen; enum pipe_video_format codec = u_reduce_video_profile(profile); + struct radeon_info info; + + rscreen->ws->query_info(rscreen->ws, &info); if (entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) { switch (param) { case PIPE_VIDEO_CAP_SUPPORTED: return codec == PIPE_VIDEO_FORMAT_MPEG4_AVC && rvce_is_fw_version_supported(rscreen); - case PIPE_VIDEO_CAP_NPOT_TEXTURES: - return 1; - case PIPE_VIDEO_CAP_MAX_WIDTH: + case PIPE_VIDEO_CAP_NPOT_TEXTURES: + return 1; + case PIPE_VIDEO_CAP_MAX_WIDTH: return (rscreen->family < CHIP_TONGA) ? 2048 : 4096; - case PIPE_VIDEO_CAP_MAX_HEIGHT: + case PIPE_VIDEO_CAP_MAX_HEIGHT: return (rscreen->family < CHIP_TONGA) ? 1152 : 2304; - case PIPE_VIDEO_CAP_PREFERED_FORMAT: - return PIPE_FORMAT_NV12; - case PIPE_VIDEO_CAP_PREFERS_INTERLACED: - return false; - case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: - return false; - case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE: - return true; - case PIPE_VIDEO_CAP_STACKED_FRAMES: + case PIPE_VIDEO_CAP_PREFERED_FORMAT: + return PIPE_FORMAT_NV12; + case PIPE_VIDEO_CAP_PREFERS_INTERLACED: + return false; + case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED: + return false; + case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE: + return true; + case PIPE_VIDEO_CAP_STACKED_FRAMES: return (rscreen->family < CHIP_TONGA) ? 1 : 2; - default: - return 0; + default: + return 0; } } @@ -237,18 +248,27 @@ int rvid_get_video_param(struct pipe_screen *screen, case PIPE_VIDEO_CAP_SUPPORTED: switch (codec) { case PIPE_VIDEO_FORMAT_MPEG12: + return profile != PIPE_VIDEO_PROFILE_MPEG1; case PIPE_VIDEO_FORMAT_MPEG4: + /* no support for MPEG4 on older hw */ + return rscreen->family >= CHIP_PALM; case PIPE_VIDEO_FORMAT_MPEG4_AVC: - if (rscreen->family < CHIP_PALM) - /* no support for MPEG4 */ - return codec != PIPE_VIDEO_FORMAT_MPEG4; + if ((rscreen->family == CHIP_POLARIS10 || + rscreen->family == CHIP_POLARIS11) && + info.uvd_fw_version < UVD_FW_1_66_16 ) { + RVID_ERR("POLARIS10/11 firmware version need to be updated.\n"); + return false; + } return true; case PIPE_VIDEO_FORMAT_VC1: return true; case PIPE_VIDEO_FORMAT_HEVC: /* Carrizo only supports HEVC Main */ - return rscreen->family >= CHIP_CARRIZO && - profile == PIPE_VIDEO_PROFILE_HEVC_MAIN; + if (rscreen->family >= CHIP_STONEY) + return (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN || + profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10); + else if (rscreen->family >= CHIP_CARRIZO) + return profile == PIPE_VIDEO_PROFILE_HEVC_MAIN; default: return false; } @@ -257,7 +277,7 @@ int rvid_get_video_param(struct pipe_screen *screen, case PIPE_VIDEO_CAP_MAX_WIDTH: return (rscreen->family < CHIP_TONGA) ? 2048 : 4096; case PIPE_VIDEO_CAP_MAX_HEIGHT: - return (rscreen->family < CHIP_TONGA) ? 1152 : 2304; + return (rscreen->family < CHIP_TONGA) ? 1152 : 4096; case PIPE_VIDEO_CAP_PREFERED_FORMAT: return PIPE_FORMAT_NV12; case PIPE_VIDEO_CAP_PREFERS_INTERLACED: @@ -294,8 +314,9 @@ int rvid_get_video_param(struct pipe_screen *screen, case PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE: case PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN: case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH: - return 41; + return (rscreen->family < CHIP_TONGA) ? 41 : 52; case PIPE_VIDEO_PROFILE_HEVC_MAIN: + case PIPE_VIDEO_PROFILE_HEVC_MAIN_10: return 186; default: return 0; diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_video.h b/lib/mesa/src/gallium/drivers/radeon/radeon_video.h index c9ee67f07..39305b4fd 100644 --- a/lib/mesa/src/gallium/drivers/radeon/radeon_video.h +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_video.h @@ -66,7 +66,7 @@ void rvid_clear_buffer(struct pipe_context *context, struct rvid_buffer* buffer) /* join surfaces into the same buffer with identical tiling params sumup their sizes and replace the backend buffers with a single bo */ -void rvid_join_surfaces(struct radeon_winsys* ws, unsigned bind, +void rvid_join_surfaces(struct radeon_winsys* ws, struct pb_buffer** buffers[VL_NUM_COMPONENTS], struct radeon_surf *surfaces[VL_NUM_COMPONENTS]); diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_winsys.h b/lib/mesa/src/gallium/drivers/radeon/radeon_winsys.h index f9a7f878f..8946209d3 100644 --- a/lib/mesa/src/gallium/drivers/radeon/radeon_winsys.h +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_winsys.h @@ -26,25 +26,12 @@ /* The public winsys interface header for the radeon driver. */ -/* R300 features in DRM. - * - * 2.6.0: - * - Hyper-Z - * - GB_Z_PEQ_CONFIG on rv350->r4xx - * - R500 FG_ALPHA_VALUE - * - * 2.8.0: - * - R500 US_FORMAT regs - * - R500 ARGB2101010 colorbuffer - * - CMask and AA regs - * - R16F/RG16F - */ - #include "pipebuffer/pb_buffer.h" +#include "amd/common/amd_family.h" + #define RADEON_FLUSH_ASYNC (1 << 0) -#define RADEON_FLUSH_KEEP_TILING_FLAGS (1 << 1) /* needs DRM 2.12.0 */ -#define RADEON_FLUSH_END_OF_FRAME (1 << 2) +#define RADEON_FLUSH_END_OF_FRAME (1 << 1) /* Tiling flags. */ enum radeon_bo_layout { @@ -65,94 +52,18 @@ enum radeon_bo_flag { /* bitfield */ RADEON_FLAG_GTT_WC = (1 << 0), RADEON_FLAG_CPU_ACCESS = (1 << 1), RADEON_FLAG_NO_CPU_ACCESS = (1 << 2), + RADEON_FLAG_HANDLE = (1 << 3), /* the buffer most not be suballocated */ }; enum radeon_bo_usage { /* bitfield */ RADEON_USAGE_READ = 2, RADEON_USAGE_WRITE = 4, - RADEON_USAGE_READWRITE = RADEON_USAGE_READ | RADEON_USAGE_WRITE -}; + RADEON_USAGE_READWRITE = RADEON_USAGE_READ | RADEON_USAGE_WRITE, -enum radeon_family { - CHIP_UNKNOWN = 0, - CHIP_R300, /* R3xx-based cores. */ - CHIP_R350, - CHIP_RV350, - CHIP_RV370, - CHIP_RV380, - CHIP_RS400, - CHIP_RC410, - CHIP_RS480, - CHIP_R420, /* R4xx-based cores. */ - CHIP_R423, - CHIP_R430, - CHIP_R480, - CHIP_R481, - CHIP_RV410, - CHIP_RS600, - CHIP_RS690, - CHIP_RS740, - CHIP_RV515, /* R5xx-based cores. */ - CHIP_R520, - CHIP_RV530, - CHIP_R580, - CHIP_RV560, - CHIP_RV570, - CHIP_R600, - CHIP_RV610, - CHIP_RV630, - CHIP_RV670, - CHIP_RV620, - CHIP_RV635, - CHIP_RS780, - CHIP_RS880, - CHIP_RV770, - CHIP_RV730, - CHIP_RV710, - CHIP_RV740, - CHIP_CEDAR, - CHIP_REDWOOD, - CHIP_JUNIPER, - CHIP_CYPRESS, - CHIP_HEMLOCK, - CHIP_PALM, - CHIP_SUMO, - CHIP_SUMO2, - CHIP_BARTS, - CHIP_TURKS, - CHIP_CAICOS, - CHIP_CAYMAN, - CHIP_ARUBA, - CHIP_TAHITI, - CHIP_PITCAIRN, - CHIP_VERDE, - CHIP_OLAND, - CHIP_HAINAN, - CHIP_BONAIRE, - CHIP_KAVERI, - CHIP_KABINI, - CHIP_HAWAII, - CHIP_MULLINS, - CHIP_TONGA, - CHIP_ICELAND, - CHIP_CARRIZO, - CHIP_FIJI, - CHIP_STONEY, - CHIP_LAST, -}; - -enum chip_class { - CLASS_UNKNOWN = 0, - R300, - R400, - R500, - R600, - R700, - EVERGREEN, - CAYMAN, - SI, - CIK, - VI, + /* The winsys ensures that the CS submission will be scheduled after + * previously flushed CSs referencing this BO in a conflicting way. + */ + RADEON_USAGE_SYNCHRONIZED = 8 }; enum ring_type { @@ -167,10 +78,13 @@ enum ring_type { enum radeon_value_id { RADEON_REQUESTED_VRAM_MEMORY, RADEON_REQUESTED_GTT_MEMORY, + RADEON_MAPPED_VRAM, + RADEON_MAPPED_GTT, RADEON_BUFFER_WAIT_TIME_NS, RADEON_TIMESTAMP, RADEON_NUM_CS_FLUSHES, RADEON_NUM_BYTES_MOVED, + RADEON_NUM_EVICTIONS, RADEON_VRAM_USAGE, RADEON_GTT_USAGE, RADEON_GPU_TEMPERATURE, /* DRM 2.42.0 */ @@ -179,73 +93,161 @@ enum radeon_value_id { RADEON_GPU_RESET_COUNTER, /* DRM 2.43.0 */ }; +/* Each group of four has the same priority. */ enum radeon_bo_priority { - RADEON_PRIO_MIN, - RADEON_PRIO_SHADER_DATA, /* shader code, resource descriptors */ - RADEON_PRIO_SHADER_BUFFER_RO, /* read-only */ - RADEON_PRIO_SHADER_TEXTURE_RO, /* read-only */ - RADEON_PRIO_SHADER_RESOURCE_RW, /* buffers, textures, streamout, GS rings, RATs; read/write */ - RADEON_PRIO_COLOR_BUFFER, - RADEON_PRIO_DEPTH_BUFFER, - RADEON_PRIO_SHADER_TEXTURE_MSAA, - RADEON_PRIO_COLOR_BUFFER_MSAA, - RADEON_PRIO_DEPTH_BUFFER_MSAA, - RADEON_PRIO_COLOR_META, - RADEON_PRIO_DEPTH_META, - RADEON_PRIO_MAX /* must be <= 15 */ + RADEON_PRIO_FENCE = 0, + RADEON_PRIO_TRACE, + RADEON_PRIO_SO_FILLED_SIZE, + RADEON_PRIO_QUERY, + + RADEON_PRIO_IB1 = 4, /* main IB submitted to the kernel */ + RADEON_PRIO_IB2, /* IB executed with INDIRECT_BUFFER */ + RADEON_PRIO_DRAW_INDIRECT, + RADEON_PRIO_INDEX_BUFFER, + + RADEON_PRIO_VCE = 8, + RADEON_PRIO_UVD, + RADEON_PRIO_SDMA_BUFFER, + RADEON_PRIO_SDMA_TEXTURE, + + RADEON_PRIO_CP_DMA = 12, + + RADEON_PRIO_CONST_BUFFER = 16, + RADEON_PRIO_DESCRIPTORS, + RADEON_PRIO_BORDER_COLORS, + + RADEON_PRIO_SAMPLER_BUFFER = 20, + RADEON_PRIO_VERTEX_BUFFER, + + RADEON_PRIO_SHADER_RW_BUFFER = 24, + RADEON_PRIO_COMPUTE_GLOBAL, + + RADEON_PRIO_SAMPLER_TEXTURE = 28, + RADEON_PRIO_SHADER_RW_IMAGE, + + RADEON_PRIO_SAMPLER_TEXTURE_MSAA = 32, + + RADEON_PRIO_COLOR_BUFFER = 36, + + RADEON_PRIO_DEPTH_BUFFER = 40, + + RADEON_PRIO_COLOR_BUFFER_MSAA = 44, + + RADEON_PRIO_DEPTH_BUFFER_MSAA = 48, + + RADEON_PRIO_CMASK = 52, + RADEON_PRIO_DCC, + RADEON_PRIO_HTILE, + RADEON_PRIO_SHADER_BINARY, /* the hw can't hide instruction cache misses */ + + RADEON_PRIO_SHADER_RINGS = 56, + + RADEON_PRIO_SCRATCH_BUFFER = 60, + /* 63 is the maximum value */ }; struct winsys_handle; -struct radeon_winsys_cs_handle; struct radeon_winsys_ctx; +struct radeon_winsys_cs_chunk { + unsigned cdw; /* Number of used dwords. */ + unsigned max_dw; /* Maximum number of dwords. */ + uint32_t *buf; /* The base pointer of the chunk. */ +}; + struct radeon_winsys_cs { - unsigned cdw; /* Number of used dwords. */ - unsigned max_dw; /* Maximum number of dwords. */ - uint32_t *buf; /* The command buffer. */ - enum ring_type ring_type; + struct radeon_winsys_cs_chunk current; + struct radeon_winsys_cs_chunk *prev; + unsigned num_prev; /* Number of previous chunks. */ + unsigned max_prev; /* Space in array pointed to by prev. */ + unsigned prev_dw; /* Total number of dwords in previous chunks. */ + + /* Memory usage of the buffer list. These are always 0 for CE and preamble + * IBs. */ + uint64_t used_vram; + uint64_t used_gart; }; struct radeon_info { + /* PCI info: domain:bus:dev:func */ + uint32_t pci_domain; + uint32_t pci_bus; + uint32_t pci_dev; + uint32_t pci_func; + + /* Device info. */ uint32_t pci_id; enum radeon_family family; enum chip_class chip_class; + uint32_t gart_page_size; uint64_t gart_size; uint64_t vram_size; - uint32_t max_sclk; - uint32_t max_compute_units; - uint32_t max_se; - uint32_t max_sh_per_se; + uint64_t max_alloc_size; + uint32_t min_alloc_size; + bool has_dedicated_vram; + bool has_virtual_memory; + bool gfx_ib_pad_with_type2; + bool has_sdma; + bool has_uvd; + uint32_t uvd_fw_version; + uint32_t vce_fw_version; + uint32_t me_fw_version; + uint32_t pfp_fw_version; + uint32_t ce_fw_version; + uint32_t vce_harvest_config; + uint32_t clock_crystal_freq; + /* Kernel info. */ uint32_t drm_major; /* version */ uint32_t drm_minor; uint32_t drm_patchlevel; + bool has_userptr; - boolean has_uvd; - uint32_t vce_fw_version; - boolean has_userptr; + /* Shader cores. */ + uint32_t r600_max_quad_pipes; /* wave size / 16 */ + uint32_t max_shader_clock; + uint32_t num_good_compute_units; + uint32_t max_se; /* shader engines */ + uint32_t max_sh_per_se; /* shader arrays per shader engine */ + /* Render backends (color + depth blocks). */ uint32_t r300_num_gb_pipes; uint32_t r300_num_z_pipes; - - uint32_t r600_num_backends; - uint32_t r600_clock_crystal_freq; - uint32_t r600_tiling_config; - uint32_t r600_num_tile_pipes; - uint32_t r600_max_pipes; - boolean r600_virtual_address; - boolean r600_has_dma; - - uint32_t r600_backend_map; - boolean r600_backend_map_valid; - - boolean si_tile_mode_array_valid; + uint32_t r600_gb_backend_map; /* R600 harvest config */ + bool r600_gb_backend_map_valid; + uint32_t r600_num_banks; + uint32_t num_render_backends; + uint32_t num_tile_pipes; /* pipe count from PIPE_CONFIG */ + uint32_t pipe_interleave_bytes; + uint32_t enabled_rb_mask; /* GCN harvest config */ + + /* Tile modes. */ uint32_t si_tile_mode_array[32]; - uint32_t si_backend_enabled_mask; - - boolean cik_macrotile_mode_array_valid; uint32_t cik_macrotile_mode_array[16]; - uint32_t vce_harvest_config; +}; + +/* Tiling info for display code, DRI sharing, and other data. */ +struct radeon_bo_metadata { + /* Tiling flags describing the texture layout for display code + * and DRI sharing. + */ + enum radeon_bo_layout microtile; + enum radeon_bo_layout macrotile; + unsigned pipe_config; + unsigned bankw; + unsigned bankh; + unsigned tile_split; + unsigned mtilea; + unsigned num_banks; + unsigned stride; + bool scanout; + + /* Additional metadata associated with the buffer, in bytes. + * The maximum size is 64 * 4. This is opaque for the winsys & kernel. + * Supported by amdgpu only. + */ + uint32_t size_metadata; + uint32_t metadata[64]; }; enum radeon_feature_id { @@ -265,7 +267,6 @@ enum radeon_feature_id { #define RADEON_SURF_TYPE_2D_ARRAY 5 #define RADEON_SURF_MODE_MASK 0xFF #define RADEON_SURF_MODE_SHIFT 8 -#define RADEON_SURF_MODE_LINEAR 0 #define RADEON_SURF_MODE_LINEAR_ALIGNED 1 #define RADEON_SURF_MODE_1D 2 #define RADEON_SURF_MODE_2D 3 @@ -276,6 +277,8 @@ enum radeon_feature_id { #define RADEON_SURF_HAS_SBUFFER_MIPTREE (1 << 19) #define RADEON_SURF_HAS_TILE_MODE_INDEX (1 << 20) #define RADEON_SURF_FMASK (1 << 21) +#define RADEON_SURF_DISABLE_DCC (1 << 22) +#define RADEON_SURF_TC_COMPATIBLE_HTILE (1 << 23) #define RADEON_SURF_GET(v, field) (((v) >> RADEON_SURF_ ## field ## _SHIFT) & RADEON_SURF_ ## field ## _MASK) #define RADEON_SURF_SET(v, field) (((v) & RADEON_SURF_ ## field ## _MASK) << RADEON_SURF_ ## field ## _SHIFT) @@ -292,6 +295,9 @@ struct radeon_surf_level { uint32_t nblk_z; uint32_t pitch_bytes; uint32_t mode; + uint64_t dcc_offset; + uint64_t dcc_fast_clear_size; + bool dcc_enabled; }; struct radeon_surf { @@ -320,13 +326,34 @@ struct radeon_surf { uint32_t mtilea; uint32_t tile_split; uint32_t stencil_tile_split; - uint64_t stencil_offset; struct radeon_surf_level level[RADEON_SURF_MAX_LEVEL]; struct radeon_surf_level stencil_level[RADEON_SURF_MAX_LEVEL]; uint32_t tiling_index[RADEON_SURF_MAX_LEVEL]; uint32_t stencil_tiling_index[RADEON_SURF_MAX_LEVEL]; uint32_t pipe_config; uint32_t num_banks; + uint32_t macro_tile_index; + uint32_t micro_tile_mode; /* displayable, thin, depth, rotated */ + + /* Whether the depth miptree or stencil miptree as used by the DB are + * adjusted from their TC compatible form to ensure depth/stencil + * compatibility. If either is true, the corresponding plane cannot be + * sampled from. + */ + bool depth_adjusted; + bool stencil_adjusted; + + uint64_t dcc_size; + uint64_t dcc_alignment; + /* TC-compatible HTILE only. */ + uint64_t htile_size; + uint64_t htile_alignment; +}; + +struct radeon_bo_list_item { + uint64_t bo_size; + uint64_t vm_address; + uint64_t priority_usage; /* mask of (1 << RADEON_PRIO_*) */ }; struct radeon_winsys { @@ -378,15 +405,11 @@ struct radeon_winsys { * \return The created buffer object. */ struct pb_buffer *(*buffer_create)(struct radeon_winsys *ws, - unsigned size, + uint64_t size, unsigned alignment, - boolean use_reusable_pool, enum radeon_bo_domain domain, enum radeon_bo_flag flags); - struct radeon_winsys_cs_handle *(*buffer_get_cs_handle)( - struct pb_buffer *buf); - /** * Map the entire data store of a buffer object into the client's address * space. @@ -396,7 +419,7 @@ struct radeon_winsys { * \param usage A bitmask of the PIPE_TRANSFER_* flags. * \return The pointer at the beginning of the buffer. */ - void *(*buffer_map)(struct radeon_winsys_cs_handle *buf, + void *(*buffer_map)(struct pb_buffer *buf, struct radeon_winsys_cs *cs, enum pipe_transfer_usage usage); @@ -405,7 +428,7 @@ struct radeon_winsys { * * \param buf A winsys buffer object to unmap. */ - void (*buffer_unmap)(struct radeon_winsys_cs_handle *buf); + void (*buffer_unmap)(struct pb_buffer *buf); /** * Wait for the buffer and return true if the buffer is not used @@ -419,45 +442,24 @@ struct radeon_winsys { enum radeon_bo_usage usage); /** - * Return tiling flags describing a memory layout of a buffer object. + * Return buffer metadata. + * (tiling info for display code, DRI sharing, and other data) * * \param buf A winsys buffer object to get the flags from. - * \param macrotile A pointer to the return value of the microtile flag. - * \param microtile A pointer to the return value of the macrotile flag. - * - * \note microtile and macrotile are not bitmasks! + * \param md Metadata */ - void (*buffer_get_tiling)(struct pb_buffer *buf, - enum radeon_bo_layout *microtile, - enum radeon_bo_layout *macrotile, - unsigned *bankw, unsigned *bankh, - unsigned *tile_split, - unsigned *stencil_tile_split, - unsigned *mtilea, - bool *scanout); + void (*buffer_get_metadata)(struct pb_buffer *buf, + struct radeon_bo_metadata *md); /** - * Set tiling flags describing a memory layout of a buffer object. + * Set buffer metadata. + * (tiling info for display code, DRI sharing, and other data) * * \param buf A winsys buffer object to set the flags for. - * \param cs A command stream to flush if the buffer is referenced by it. - * \param macrotile A macrotile flag. - * \param microtile A microtile flag. - * \param stride A stride of the buffer in bytes, for texturing. - * - * \note microtile and macrotile are not bitmasks! + * \param md Metadata */ - void (*buffer_set_tiling)(struct pb_buffer *buf, - struct radeon_winsys_cs *rcs, - enum radeon_bo_layout microtile, - enum radeon_bo_layout macrotile, - unsigned pipe_config, - unsigned bankw, unsigned bankh, - unsigned tile_split, - unsigned stencil_tile_split, - unsigned mtilea, unsigned num_banks, - unsigned stride, - bool scanout); + void (*buffer_set_metadata)(struct pb_buffer *buf, + struct radeon_bo_metadata *md); /** * Get a winsys buffer from a winsys handle. The internal structure @@ -470,7 +472,7 @@ struct radeon_winsys { */ struct pb_buffer *(*buffer_from_handle)(struct radeon_winsys *ws, struct winsys_handle *whandle, - unsigned *stride); + unsigned *stride, unsigned *offset); /** * Get a winsys buffer from a user pointer. The resulting buffer can't @@ -481,7 +483,15 @@ struct radeon_winsys { * \param Size Size in bytes for the new buffer. */ struct pb_buffer *(*buffer_from_ptr)(struct radeon_winsys *ws, - void *pointer, unsigned size); + void *pointer, uint64_t size); + + /** + * Whether the buffer was created from a user pointer. + * + * \param buf A winsys buffer object + * \return whether \p buf was created via buffer_from_ptr + */ + bool (*buffer_is_user_ptr)(struct pb_buffer *buf); /** * Get a winsys handle from a winsys buffer. The internal structure @@ -490,24 +500,40 @@ struct radeon_winsys { * \param buf A winsys buffer object to get the handle from. * \param whandle A winsys handle pointer. * \param stride A stride of the buffer in bytes, for texturing. - * \return TRUE on success. + * \return true on success. */ - boolean (*buffer_get_handle)(struct pb_buffer *buf, - unsigned stride, - struct winsys_handle *whandle); + bool (*buffer_get_handle)(struct pb_buffer *buf, + unsigned stride, unsigned offset, + unsigned slice_size, + struct winsys_handle *whandle); /** * Return the virtual address of a buffer. * + * When virtual memory is not in use, this is the offset relative to the + * relocation base (non-zero for sub-allocated buffers). + * * \param buf A winsys buffer object * \return virtual address */ - uint64_t (*buffer_get_virtual_address)(struct radeon_winsys_cs_handle *buf); + uint64_t (*buffer_get_virtual_address)(struct pb_buffer *buf); + + /** + * Return the offset of this buffer relative to the relocation base. + * This is only non-zero for sub-allocated buffers. + * + * This is only supported in the radeon winsys, since amdgpu uses virtual + * addresses in submissions even for the video engines. + * + * \param buf A winsys buffer object + * \return the offset for relocations + */ + unsigned (*buffer_get_reloc_offset)(struct pb_buffer *buf); /** * Query the initial placement of the buffer from the kernel driver. */ - enum radeon_bo_domain (*buffer_get_initial_domain)(struct radeon_winsys_cs_handle *buf); + enum radeon_bo_domain (*buffer_get_initial_domain)(struct pb_buffer *buf); /************************************************************************** * Command submission. @@ -539,15 +565,43 @@ struct radeon_winsys { * \param ring_type The ring type (GFX, DMA, UVD) * \param flush Flush callback function associated with the command stream. * \param user User pointer that will be passed to the flush callback. - * \param trace_buf Trace buffer when tracing is enabled */ struct radeon_winsys_cs *(*cs_create)(struct radeon_winsys_ctx *ctx, enum ring_type ring_type, void (*flush)(void *ctx, unsigned flags, struct pipe_fence_handle **fence), - void *flush_ctx, - struct radeon_winsys_cs_handle *trace_buf); + void *flush_ctx); + + /** + * Add a constant engine IB to a graphics CS. This makes the graphics CS + * from "cs_create" a group of two IBs that share a buffer list and are + * flushed together. + * + * The returned constant CS is only a stream for writing packets to the new + * IB. Calling other winsys functions with it is not allowed, not even + * "cs_destroy". + * + * In order to add buffers and check memory usage, use the graphics CS. + * In order to flush it, use the graphics CS, which will flush both IBs. + * Destroying the graphics CS will destroy both of them. + * + * \param cs The graphics CS from "cs_create" that will hold the buffer + * list and will be used for flushing. + */ + struct radeon_winsys_cs *(*cs_add_const_ib)(struct radeon_winsys_cs *cs); + /** + * Add a constant engine preamble IB to a graphics CS. This add an extra IB + * in similar manner to cs_add_const_ib. This should always be called after + * cs_add_const_ib. + * + * The returned IB is a constant engine IB that only gets flushed if the + * context changed. + * + * \param cs The graphics CS from "cs_create" that will hold the buffer + * list and will be used for flushing. + */ + struct radeon_winsys_cs *(*cs_add_const_preamble_ib)(struct radeon_winsys_cs *cs); /** * Destroy a command stream. * @@ -556,19 +610,18 @@ struct radeon_winsys { void (*cs_destroy)(struct radeon_winsys_cs *cs); /** - * Add a new buffer relocation. Every relocation must first be added - * before it can be written. + * Add a buffer. Each buffer used by a CS must be added using this function. * - * \param cs A command stream to add buffer for validation against. - * \param buf A winsys buffer to validate. + * \param cs Command stream + * \param buf Buffer * \param usage Whether the buffer is used for read and/or write. * \param domain Bitmask of the RADEON_DOMAIN_* flags. * \param priority A higher number means a greater chance of being * placed in the requested domain. 15 is the maximum. - * \return Relocation index. + * \return Buffer index. */ - unsigned (*cs_add_reloc)(struct radeon_winsys_cs *cs, - struct radeon_winsys_cs_handle *buf, + unsigned (*cs_add_buffer)(struct radeon_winsys_cs *cs, + struct pb_buffer *buf, enum radeon_bo_usage usage, enum radeon_bo_domain domain, enum radeon_bo_priority priority); @@ -576,32 +629,47 @@ struct radeon_winsys { /** * Return the index of an already-added buffer. * + * Not supported on amdgpu. Drivers with GPUVM should not care about + * buffer indices. + * * \param cs Command stream * \param buf Buffer * \return The buffer index, or -1 if the buffer has not been added. */ - int (*cs_get_reloc)(struct radeon_winsys_cs *cs, - struct radeon_winsys_cs_handle *buf); + int (*cs_lookup_buffer)(struct radeon_winsys_cs *cs, + struct pb_buffer *buf); /** - * Return TRUE if there is enough memory in VRAM and GTT for the relocs - * added so far. If the validation fails, all the relocations which have + * Return true if there is enough memory in VRAM and GTT for the buffers + * added so far. If the validation fails, all buffers which have * been added since the last call of cs_validate will be removed and - * the CS will be flushed (provided there are still any relocations). + * the CS will be flushed (provided there are still any buffers). * * \param cs A command stream to validate. */ - boolean (*cs_validate)(struct radeon_winsys_cs *cs); + bool (*cs_validate)(struct radeon_winsys_cs *cs); /** - * Return TRUE if there is enough memory in VRAM and GTT for the relocs - * added so far. + * Check whether the given number of dwords is available in the IB. + * Optionally chain a new chunk of the IB if necessary and supported. * - * \param cs A command stream to validate. - * \param vram VRAM memory size pending to be use - * \param gtt GTT memory size pending to be use + * \param cs A command stream. + * \param dw Number of CS dwords requested by the caller. */ - boolean (*cs_memory_below_limit)(struct radeon_winsys_cs *cs, uint64_t vram, uint64_t gtt); + bool (*cs_check_space)(struct radeon_winsys_cs *cs, unsigned dw); + + /** + * Return the buffer list. + * + * This is the buffer list as passed to the kernel, i.e. it only contains + * the parent buffers of sub-allocated buffers. + * + * \param cs Command stream + * \param list Returned buffer list. Set to NULL to query the count only. + * \return The buffer count. + */ + unsigned (*cs_get_buffer_list)(struct radeon_winsys_cs *cs, + struct radeon_bo_list_item *list); /** * Flush a command stream. @@ -610,22 +678,29 @@ struct radeon_winsys { * \param flags, RADEON_FLUSH_ASYNC or 0. * \param fence Pointer to a fence. If non-NULL, a fence is inserted * after the CS and is returned through this parameter. - * \param cs_trace_id A unique identifier of the cs, used for tracing. + * \return Negative POSIX error code or 0 for success. + * Asynchronous submissions never return an error. */ - void (*cs_flush)(struct radeon_winsys_cs *cs, - unsigned flags, - struct pipe_fence_handle **fence, - uint32_t cs_trace_id); + int (*cs_flush)(struct radeon_winsys_cs *cs, + unsigned flags, + struct pipe_fence_handle **fence); /** - * Return TRUE if a buffer is referenced by a command stream. + * Create a fence before the CS is flushed. + * The user must flush manually to complete the initializaton of the fence. + * The fence must not be used before the flush. + */ + struct pipe_fence_handle *(*cs_get_next_fence)(struct radeon_winsys_cs *cs); + + /** + * Return true if a buffer is referenced by a command stream. * * \param cs A command stream. * \param buf A winsys buffer. */ - boolean (*cs_is_buffer_referenced)(struct radeon_winsys_cs *cs, - struct radeon_winsys_cs_handle *buf, - enum radeon_bo_usage usage); + bool (*cs_is_buffer_referenced)(struct radeon_winsys_cs *cs, + struct pb_buffer *buf, + enum radeon_bo_usage usage); /** * Request access to a feature for a command stream. @@ -634,9 +709,9 @@ struct radeon_winsys { * \param fid Feature ID, one of RADEON_FID_* * \param enable Whether to enable or disable the feature. */ - boolean (*cs_request_feature)(struct radeon_winsys_cs *cs, - enum radeon_feature_id fid, - boolean enable); + bool (*cs_request_feature)(struct radeon_winsys_cs *cs, + enum radeon_feature_id fid, + bool enable); /** * Make sure all asynchronous flush of the cs have completed * @@ -681,21 +756,25 @@ struct radeon_winsys { uint64_t (*query_value)(struct radeon_winsys *ws, enum radeon_value_id value); - void (*read_registers)(struct radeon_winsys *ws, unsigned reg_offset, + bool (*read_registers)(struct radeon_winsys *ws, unsigned reg_offset, unsigned num_registers, uint32_t *out); }; +static inline bool radeon_emitted(struct radeon_winsys_cs *cs, unsigned num_dw) +{ + return cs && (cs->prev_dw + cs->current.cdw > num_dw); +} static inline void radeon_emit(struct radeon_winsys_cs *cs, uint32_t value) { - cs->buf[cs->cdw++] = value; + cs->current.buf[cs->current.cdw++] = value; } static inline void radeon_emit_array(struct radeon_winsys_cs *cs, const uint32_t *values, unsigned count) { - memcpy(cs->buf+cs->cdw, values, count * 4); - cs->cdw += count; + memcpy(cs->current.buf + cs->current.cdw, values, count * 4); + cs->current.cdw += count; } #endif |