summaryrefslogtreecommitdiff
path: root/lib/mesa/src/gallium/drivers/radeon
diff options
context:
space:
mode:
Diffstat (limited to 'lib/mesa/src/gallium/drivers/radeon')
-rw-r--r--lib/mesa/src/gallium/drivers/radeon/Makefile.am5
-rw-r--r--lib/mesa/src/gallium/drivers/radeon/Makefile.in98
-rw-r--r--lib/mesa/src/gallium/drivers/radeon/Makefile.sources14
-rw-r--r--lib/mesa/src/gallium/drivers/radeon/r600_buffer_common.c343
-rw-r--r--lib/mesa/src/gallium/drivers/radeon/r600_perfcounter.c56
-rw-r--r--lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c953
-rw-r--r--lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.h542
-rw-r--r--lib/mesa/src/gallium/drivers/radeon/r600_query.c1863
-rw-r--r--lib/mesa/src/gallium/drivers/radeon/r600_query.h111
-rw-r--r--lib/mesa/src/gallium/drivers/radeon/r600_streamout.c59
-rw-r--r--lib/mesa/src/gallium/drivers/radeon/r600_texture.c1770
-rw-r--r--lib/mesa/src/gallium/drivers/radeon/radeon_elf_util.c197
-rw-r--r--lib/mesa/src/gallium/drivers/radeon/radeon_elf_util.h50
-rw-r--r--lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c263
-rw-r--r--lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h23
-rw-r--r--lib/mesa/src/gallium/drivers/radeon/radeon_vce.c60
-rw-r--r--lib/mesa/src/gallium/drivers/radeon/radeon_vce_40_2_2.c14
-rw-r--r--lib/mesa/src/gallium/drivers/radeon/radeon_vce_50.c6
-rw-r--r--lib/mesa/src/gallium/drivers/radeon/radeon_vce_52.c596
-rw-r--r--lib/mesa/src/gallium/drivers/radeon/radeon_video.c87
-rw-r--r--lib/mesa/src/gallium/drivers/radeon/radeon_video.h2
-rw-r--r--lib/mesa/src/gallium/drivers/radeon/radeon_winsys.h531
22 files changed, 5757 insertions, 1886 deletions
diff --git a/lib/mesa/src/gallium/drivers/radeon/Makefile.am b/lib/mesa/src/gallium/drivers/radeon/Makefile.am
index 13d8976de..a6fc145cb 100644
--- a/lib/mesa/src/gallium/drivers/radeon/Makefile.am
+++ b/lib/mesa/src/gallium/drivers/radeon/Makefile.am
@@ -16,7 +16,8 @@ libradeon_la_SOURCES = \
if NEED_RADEON_LLVM
AM_CFLAGS += \
- $(LLVM_CFLAGS)
+ $(LLVM_CFLAGS) \
+ $(LIBELF_CFLAGS)
libradeon_la_SOURCES += \
$(LLVM_C_FILES)
@@ -24,7 +25,7 @@ libradeon_la_SOURCES += \
libradeon_la_LIBADD = \
$(CLOCK_LIB) \
$(LLVM_LIBS) \
- $(ELF_LIB)
+ $(LIBELF_LIBS)
libradeon_la_LDFLAGS = \
$(LLVM_LDFLAGS)
diff --git a/lib/mesa/src/gallium/drivers/radeon/Makefile.in b/lib/mesa/src/gallium/drivers/radeon/Makefile.in
index f9faa3eef..d720beb87 100644
--- a/lib/mesa/src/gallium/drivers/radeon/Makefile.in
+++ b/lib/mesa/src/gallium/drivers/radeon/Makefile.in
@@ -54,18 +54,19 @@ target_triplet = @target@
DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \
$(srcdir)/Makefile.sources $(top_srcdir)/bin/depcomp \
$(top_srcdir)/src/gallium/Automake.inc
-@HAVE_LIBDRM_TRUE@am__append_1 = \
-@HAVE_LIBDRM_TRUE@ $(LIBDRM_LIBS)
-
-@HAVE_DRISW_TRUE@am__append_2 = \
+@HAVE_DRISW_TRUE@am__append_1 = \
@HAVE_DRISW_TRUE@ $(top_builddir)/src/gallium/winsys/sw/dri/libswdri.la
-@HAVE_DRISW_KMS_TRUE@am__append_3 = \
+@HAVE_DRISW_KMS_TRUE@am__append_2 = \
@HAVE_DRISW_KMS_TRUE@ $(top_builddir)/src/gallium/winsys/sw/kms-dri/libswkmsdri.la \
@HAVE_DRISW_KMS_TRUE@ $(LIBDRM_LIBS)
-@HAVE_GALLIUM_LLVM_TRUE@am__append_4 = \
-@HAVE_GALLIUM_LLVM_TRUE@ $(LLVM_CFLAGS)
+@NEED_RADEON_LLVM_TRUE@am__append_3 = \
+@NEED_RADEON_LLVM_TRUE@ $(LLVM_CFLAGS) \
+@NEED_RADEON_LLVM_TRUE@ $(LIBELF_CFLAGS)
+
+@NEED_RADEON_LLVM_TRUE@am__append_4 = \
+@NEED_RADEON_LLVM_TRUE@ $(LLVM_C_FILES)
subdir = src/gallium/drivers/radeon
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
@@ -86,16 +87,27 @@ CONFIG_CLEAN_FILES =
CONFIG_CLEAN_VPATH_FILES =
LTLIBRARIES = $(noinst_LTLIBRARIES)
am__DEPENDENCIES_1 =
-@HAVE_GALLIUM_LLVM_TRUE@libradeon_la_DEPENDENCIES = \
-@HAVE_GALLIUM_LLVM_TRUE@ $(am__DEPENDENCIES_1) \
-@HAVE_GALLIUM_LLVM_TRUE@ $(am__DEPENDENCIES_1)
+@NEED_RADEON_LLVM_TRUE@libradeon_la_DEPENDENCIES = \
+@NEED_RADEON_LLVM_TRUE@ $(am__DEPENDENCIES_1) \
+@NEED_RADEON_LLVM_TRUE@ $(am__DEPENDENCIES_1) \
+@NEED_RADEON_LLVM_TRUE@ $(am__DEPENDENCIES_1)
+am__libradeon_la_SOURCES_DIST = cayman_msaa.c r600_buffer_common.c \
+ r600_cs.h r600_gpu_load.c r600_perfcounter.c \
+ r600_pipe_common.c r600_pipe_common.h r600_query.c \
+ r600_query.h r600_streamout.c r600_test_dma.c r600_texture.c \
+ r600_viewport.c radeon_uvd.c radeon_uvd.h radeon_vce_40_2_2.c \
+ radeon_vce_50.c radeon_vce_52.c radeon_vce.c radeon_vce.h \
+ radeon_video.c radeon_video.h radeon_winsys.h \
+ radeon_elf_util.c radeon_elf_util.h
am__objects_1 = cayman_msaa.lo r600_buffer_common.lo r600_gpu_load.lo \
r600_perfcounter.lo r600_pipe_common.lo r600_query.lo \
r600_streamout.lo r600_test_dma.lo r600_texture.lo \
r600_viewport.lo radeon_uvd.lo radeon_vce_40_2_2.lo \
radeon_vce_50.lo radeon_vce_52.lo radeon_vce.lo \
radeon_video.lo
-am_libradeon_la_OBJECTS = $(am__objects_1)
+am__objects_2 = radeon_elf_util.lo
+@NEED_RADEON_LLVM_TRUE@am__objects_3 = $(am__objects_2)
+am_libradeon_la_OBJECTS = $(am__objects_1) $(am__objects_3)
libradeon_la_OBJECTS = $(am_libradeon_la_OBJECTS)
AM_V_lt = $(am__v_lt_@AM_V@)
am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
@@ -139,7 +151,7 @@ am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
am__v_CCLD_0 = @echo " CCLD " $@;
am__v_CCLD_1 =
SOURCES = $(libradeon_la_SOURCES)
-DIST_SOURCES = $(libradeon_la_SOURCES)
+DIST_SOURCES = $(am__libradeon_la_SOURCES_DIST)
am__can_run_installinfo = \
case $$AM_UPDATE_INFO_DIR in \
n|no|NO) false;; \
@@ -153,8 +165,6 @@ AMDGPU_CFLAGS = @AMDGPU_CFLAGS@
AMDGPU_LIBS = @AMDGPU_LIBS@
AMTAR = @AMTAR@
AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
-ANDROID_CFLAGS = @ANDROID_CFLAGS@
-ANDROID_LIBS = @ANDROID_LIBS@
AR = @AR@
AUTOCONF = @AUTOCONF@
AUTOHEADER = @AUTOHEADER@
@@ -185,6 +195,8 @@ DLLTOOL = @DLLTOOL@
DLOPEN_LIBS = @DLOPEN_LIBS@
DRI2PROTO_CFLAGS = @DRI2PROTO_CFLAGS@
DRI2PROTO_LIBS = @DRI2PROTO_LIBS@
+DRI3PROTO_CFLAGS = @DRI3PROTO_CFLAGS@
+DRI3PROTO_LIBS = @DRI3PROTO_LIBS@
DRIGL_CFLAGS = @DRIGL_CFLAGS@
DRIGL_LIBS = @DRIGL_LIBS@
DRI_DRIVER_INSTALL_DIR = @DRI_DRIVER_INSTALL_DIR@
@@ -197,11 +209,10 @@ ECHO_C = @ECHO_C@
ECHO_N = @ECHO_N@
ECHO_T = @ECHO_T@
EGL_CFLAGS = @EGL_CFLAGS@
+EGL_CLIENT_APIS = @EGL_CLIENT_APIS@
EGL_LIB_DEPS = @EGL_LIB_DEPS@
EGL_NATIVE_PLATFORM = @EGL_NATIVE_PLATFORM@
EGREP = @EGREP@
-ETNAVIV_CFLAGS = @ETNAVIV_CFLAGS@
-ETNAVIV_LIBS = @ETNAVIV_LIBS@
EXEEXT = @EXEEXT@
EXPAT_CFLAGS = @EXPAT_CFLAGS@
EXPAT_LIBS = @EXPAT_LIBS@
@@ -249,27 +260,31 @@ LIBDRM_CFLAGS = @LIBDRM_CFLAGS@
LIBDRM_LIBS = @LIBDRM_LIBS@
LIBELF_CFLAGS = @LIBELF_CFLAGS@
LIBELF_LIBS = @LIBELF_LIBS@
-LIBGLVND_DATADIR = @LIBGLVND_DATADIR@
LIBOBJS = @LIBOBJS@
LIBS = @LIBS@
-LIBSENSORS_LIBS = @LIBSENSORS_LIBS@
+LIBSENSORS_LDFLAGS = @LIBSENSORS_LDFLAGS@
+LIBSHA1_CFLAGS = @LIBSHA1_CFLAGS@
+LIBSHA1_LIBS = @LIBSHA1_LIBS@
LIBTOOL = @LIBTOOL@
-LIBUNWIND_CFLAGS = @LIBUNWIND_CFLAGS@
-LIBUNWIND_LIBS = @LIBUNWIND_LIBS@
LIB_DIR = @LIB_DIR@
LIB_EXT = @LIB_EXT@
LIPO = @LIPO@
+LLVM_BINDIR = @LLVM_BINDIR@
LLVM_CFLAGS = @LLVM_CFLAGS@
LLVM_CONFIG = @LLVM_CONFIG@
+LLVM_CPPFLAGS = @LLVM_CPPFLAGS@
LLVM_CXXFLAGS = @LLVM_CXXFLAGS@
LLVM_INCLUDEDIR = @LLVM_INCLUDEDIR@
LLVM_LDFLAGS = @LLVM_LDFLAGS@
+LLVM_LIBDIR = @LLVM_LIBDIR@
LLVM_LIBS = @LLVM_LIBS@
+LLVM_VERSION = @LLVM_VERSION@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
MAINT = @MAINT@
MAKEINFO = @MAKEINFO@
MANIFEST_TOOL = @MANIFEST_TOOL@
+MESA_LLVM = @MESA_LLVM@
MKDIR_P = @MKDIR_P@
MSVC2013_COMPAT_CFLAGS = @MSVC2013_COMPAT_CFLAGS@
MSVC2013_COMPAT_CXXFLAGS = @MSVC2013_COMPAT_CXXFLAGS@
@@ -290,6 +305,8 @@ OMX_LIBS = @OMX_LIBS@
OMX_LIB_INSTALL_DIR = @OMX_LIB_INSTALL_DIR@
OPENCL_LIBNAME = @OPENCL_LIBNAME@
OPENCL_VERSION = @OPENCL_VERSION@
+OPENSSL_CFLAGS = @OPENSSL_CFLAGS@
+OPENSSL_LIBS = @OPENSSL_LIBS@
OSMESA_LIB = @OSMESA_LIB@
OSMESA_LIB_DEPS = @OSMESA_LIB_DEPS@
OSMESA_PC_LIB_PRIV = @OSMESA_PC_LIB_PRIV@
@@ -309,6 +326,8 @@ PKG_CONFIG = @PKG_CONFIG@
PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@
PKG_CONFIG_PATH = @PKG_CONFIG_PATH@
POSIX_SHELL = @POSIX_SHELL@
+PRESENTPROTO_CFLAGS = @PRESENTPROTO_CFLAGS@
+PRESENTPROTO_LIBS = @PRESENTPROTO_LIBS@
PTHREADSTUBS_CFLAGS = @PTHREADSTUBS_CFLAGS@
PTHREADSTUBS_LIBS = @PTHREADSTUBS_LIBS@
PTHREAD_CC = @PTHREAD_CC@
@@ -324,6 +343,8 @@ SED = @SED@
SELINUX_CFLAGS = @SELINUX_CFLAGS@
SELINUX_LIBS = @SELINUX_LIBS@
SET_MAKE = @SET_MAKE@
+SHA1_CFLAGS = @SHA1_CFLAGS@
+SHA1_LIBS = @SHA1_LIBS@
SHELL = @SHELL@
SIMPENROSE_CFLAGS = @SIMPENROSE_CFLAGS@
SIMPENROSE_LIBS = @SIMPENROSE_LIBS@
@@ -332,6 +353,7 @@ STRIP = @STRIP@
SWR_AVX2_CXXFLAGS = @SWR_AVX2_CXXFLAGS@
SWR_AVX_CXXFLAGS = @SWR_AVX_CXXFLAGS@
SWR_CXX11_CXXFLAGS = @SWR_CXX11_CXXFLAGS@
+TIMESTAMP_CMD = @TIMESTAMP_CMD@
VALGRIND_CFLAGS = @VALGRIND_CFLAGS@
VALGRIND_LIBS = @VALGRIND_LIBS@
VA_CFLAGS = @VA_CFLAGS@
@@ -347,6 +369,7 @@ VDPAU_LIB_INSTALL_DIR = @VDPAU_LIB_INSTALL_DIR@
VDPAU_MAJOR = @VDPAU_MAJOR@
VDPAU_MINOR = @VDPAU_MINOR@
VERSION = @VERSION@
+VG_LIB_DEPS = @VG_LIB_DEPS@
VISIBILITY_CFLAGS = @VISIBILITY_CFLAGS@
VISIBILITY_CXXFLAGS = @VISIBILITY_CXXFLAGS@
VL_CFLAGS = @VL_CFLAGS@
@@ -375,10 +398,9 @@ XVMC_LIBS = @XVMC_LIBS@
XVMC_LIB_INSTALL_DIR = @XVMC_LIB_INSTALL_DIR@
XVMC_MAJOR = @XVMC_MAJOR@
XVMC_MINOR = @XVMC_MINOR@
+XXD = @XXD@
YACC = @YACC@
YFLAGS = @YFLAGS@
-ZLIB_CFLAGS = @ZLIB_CFLAGS@
-ZLIB_LIBS = @ZLIB_LIBS@
abs_builddir = @abs_builddir@
abs_srcdir = @abs_srcdir@
abs_top_builddir = @abs_top_builddir@
@@ -464,6 +486,10 @@ C_SOURCES := \
radeon_video.h \
radeon_winsys.h
+LLVM_C_FILES := \
+ radeon_elf_util.c \
+ radeon_elf_util.h
+
GALLIUM_CFLAGS = \
-I$(top_srcdir)/include \
-I$(top_srcdir)/src \
@@ -511,8 +537,12 @@ GALLIUM_TARGET_CFLAGS = \
$(LIBDRM_CFLAGS) \
$(VISIBILITY_CFLAGS)
-GALLIUM_COMMON_LIB_DEPS = -lm $(LIBUNWIND_LIBS) $(LIBSENSORS_LIBS) \
- $(CLOCK_LIB) $(PTHREAD_LIBS) $(DLOPEN_LIBS) $(am__append_1)
+GALLIUM_COMMON_LIB_DEPS = \
+ -lm \
+ $(CLOCK_LIB) \
+ $(PTHREAD_LIBS) \
+ $(DLOPEN_LIBS)
+
GALLIUM_WINSYS_CFLAGS = \
-I$(top_srcdir)/src \
-I$(top_srcdir)/include \
@@ -524,20 +554,19 @@ GALLIUM_WINSYS_CFLAGS = \
GALLIUM_PIPE_LOADER_WINSYS_LIBS = \
$(top_builddir)/src/gallium/winsys/sw/null/libws_null.la \
$(top_builddir)/src/gallium/winsys/sw/wrapper/libwsw.la \
- $(am__append_2) $(am__append_3)
+ $(am__append_1) $(am__append_2)
AM_CFLAGS = $(GALLIUM_DRIVER_CFLAGS) $(RADEON_CFLAGS) \
- -Wstrict-overflow=0 $(am__append_4)
+ -Wstrict-overflow=0 $(am__append_3)
# ^^ disable warnings about overflows (os_time_timeout)
noinst_LTLIBRARIES = libradeon.la
-libradeon_la_SOURCES = \
- $(C_SOURCES)
-
-@HAVE_GALLIUM_LLVM_TRUE@libradeon_la_LIBADD = \
-@HAVE_GALLIUM_LLVM_TRUE@ $(CLOCK_LIB) \
-@HAVE_GALLIUM_LLVM_TRUE@ $(LLVM_LIBS)
+libradeon_la_SOURCES = $(C_SOURCES) $(am__append_4)
+@NEED_RADEON_LLVM_TRUE@libradeon_la_LIBADD = \
+@NEED_RADEON_LLVM_TRUE@ $(CLOCK_LIB) \
+@NEED_RADEON_LLVM_TRUE@ $(LLVM_LIBS) \
+@NEED_RADEON_LLVM_TRUE@ $(LIBELF_LIBS)
-@HAVE_GALLIUM_LLVM_TRUE@libradeon_la_LDFLAGS = \
-@HAVE_GALLIUM_LLVM_TRUE@ $(LLVM_LDFLAGS)
+@NEED_RADEON_LLVM_TRUE@libradeon_la_LDFLAGS = \
+@NEED_RADEON_LLVM_TRUE@ $(LLVM_LDFLAGS)
EXTRA_DIST = \
LLVM_REVISION.txt
@@ -607,6 +636,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r600_test_dma.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r600_texture.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/r600_viewport.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/radeon_elf_util.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/radeon_uvd.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/radeon_vce.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/radeon_vce_40_2_2.Plo@am__quote@
diff --git a/lib/mesa/src/gallium/drivers/radeon/Makefile.sources b/lib/mesa/src/gallium/drivers/radeon/Makefile.sources
index f63790c32..3e13dae3c 100644
--- a/lib/mesa/src/gallium/drivers/radeon/Makefile.sources
+++ b/lib/mesa/src/gallium/drivers/radeon/Makefile.sources
@@ -2,17 +2,21 @@ C_SOURCES := \
cayman_msaa.c \
r600_buffer_common.c \
r600_cs.h \
- r600d_common.h \
r600_gpu_load.c \
+ r600_perfcounter.c \
r600_pipe_common.c \
r600_pipe_common.h \
r600_query.c \
+ r600_query.h \
r600_streamout.c \
+ r600_test_dma.c \
r600_texture.c \
+ r600_viewport.c \
radeon_uvd.c \
radeon_uvd.h \
radeon_vce_40_2_2.c \
radeon_vce_50.c \
+ radeon_vce_52.c \
radeon_vce.c \
radeon_vce.h \
radeon_video.c \
@@ -21,10 +25,4 @@ C_SOURCES := \
LLVM_C_FILES := \
radeon_elf_util.c \
- radeon_elf_util.h \
- radeon_llvm_emit.c \
- radeon_llvm_emit.h \
- radeon_llvm.h \
- radeon_llvm_util.c \
- radeon_llvm_util.h \
- radeon_setup_tgsi_llvm.c
+ radeon_elf_util.h
diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_buffer_common.c b/lib/mesa/src/gallium/drivers/radeon/r600_buffer_common.c
index 2d1058479..bbab58946 100644
--- a/lib/mesa/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/lib/mesa/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -30,18 +30,18 @@
#include <inttypes.h>
#include <stdio.h>
-boolean r600_rings_is_buffer_referenced(struct r600_common_context *ctx,
- struct radeon_winsys_cs_handle *buf,
- enum radeon_bo_usage usage)
+bool r600_rings_is_buffer_referenced(struct r600_common_context *ctx,
+ struct pb_buffer *buf,
+ enum radeon_bo_usage usage)
{
- if (ctx->ws->cs_is_buffer_referenced(ctx->rings.gfx.cs, buf, usage)) {
- return TRUE;
+ if (ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, buf, usage)) {
+ return true;
}
- if (ctx->rings.dma.cs && ctx->rings.dma.cs->cdw &&
- ctx->ws->cs_is_buffer_referenced(ctx->rings.dma.cs, buf, usage)) {
- return TRUE;
+ if (radeon_emitted(ctx->dma.cs, 0) &&
+ ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, buf, usage)) {
+ return true;
}
- return FALSE;
+ return false;
}
void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
@@ -52,7 +52,7 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
bool busy = false;
if (usage & PIPE_TRANSFER_UNSYNCHRONIZED) {
- return ctx->ws->buffer_map(resource->cs_buf, NULL, usage);
+ return ctx->ws->buffer_map(resource->buf, NULL, usage);
}
if (!(usage & PIPE_TRANSFER_WRITE)) {
@@ -60,26 +60,25 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
rusage = RADEON_USAGE_WRITE;
}
- if (ctx->rings.gfx.cs->cdw != ctx->initial_gfx_cs_size &&
- ctx->ws->cs_is_buffer_referenced(ctx->rings.gfx.cs,
- resource->cs_buf, rusage)) {
+ if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
+ ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs,
+ resource->buf, rusage)) {
if (usage & PIPE_TRANSFER_DONTBLOCK) {
- ctx->rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+ ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
return NULL;
} else {
- ctx->rings.gfx.flush(ctx, 0, NULL);
+ ctx->gfx.flush(ctx, 0, NULL);
busy = true;
}
}
- if (ctx->rings.dma.cs &&
- ctx->rings.dma.cs->cdw &&
- ctx->ws->cs_is_buffer_referenced(ctx->rings.dma.cs,
- resource->cs_buf, rusage)) {
+ if (radeon_emitted(ctx->dma.cs, 0) &&
+ ctx->ws->cs_is_buffer_referenced(ctx->dma.cs,
+ resource->buf, rusage)) {
if (usage & PIPE_TRANSFER_DONTBLOCK) {
- ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+ ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
return NULL;
} else {
- ctx->rings.dma.flush(ctx, 0, NULL);
+ ctx->dma.flush(ctx, 0, NULL);
busy = true;
}
}
@@ -90,31 +89,33 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
} else {
 /* We will wait for the GPU. Wait for any offloaded
* CS flush to complete to avoid busy-waiting in the winsys. */
- ctx->ws->cs_sync_flush(ctx->rings.gfx.cs);
- if (ctx->rings.dma.cs)
- ctx->ws->cs_sync_flush(ctx->rings.dma.cs);
+ ctx->ws->cs_sync_flush(ctx->gfx.cs);
+ if (ctx->dma.cs)
+ ctx->ws->cs_sync_flush(ctx->dma.cs);
}
}
/* Setting the CS to NULL will prevent doing checks we have done already. */
- return ctx->ws->buffer_map(resource->cs_buf, NULL, usage);
+ return ctx->ws->buffer_map(resource->buf, NULL, usage);
}
-bool r600_init_resource(struct r600_common_screen *rscreen,
- struct r600_resource *res,
- unsigned size, unsigned alignment,
- bool use_reusable_pool)
+void r600_init_resource_fields(struct r600_common_screen *rscreen,
+ struct r600_resource *res,
+ uint64_t size, unsigned alignment)
{
struct r600_texture *rtex = (struct r600_texture*)res;
- struct pb_buffer *old_buf, *new_buf;
- enum radeon_bo_flag flags = 0;
+
+ res->bo_size = size;
+ res->bo_alignment = alignment;
+ res->flags = 0;
switch (res->b.b.usage) {
case PIPE_USAGE_STREAM:
- flags = RADEON_FLAG_GTT_WC;
+ res->flags = RADEON_FLAG_GTT_WC;
/* fall through */
case PIPE_USAGE_STAGING:
- /* Transfers are likely to occur more often with these resources. */
+ /* Transfers are likely to occur more often with these
+ * resources. */
res->domains = RADEON_DOMAIN_GTT;
break;
case PIPE_USAGE_DYNAMIC:
@@ -124,52 +125,78 @@ bool r600_init_resource(struct r600_common_screen *rscreen,
if (rscreen->info.drm_major == 2 &&
rscreen->info.drm_minor < 40) {
res->domains = RADEON_DOMAIN_GTT;
- flags |= RADEON_FLAG_GTT_WC;
+ res->flags |= RADEON_FLAG_GTT_WC;
break;
}
- flags |= RADEON_FLAG_CPU_ACCESS;
+ res->flags |= RADEON_FLAG_CPU_ACCESS;
/* fall through */
case PIPE_USAGE_DEFAULT:
case PIPE_USAGE_IMMUTABLE:
default:
- /* Not listing GTT here improves performance in some apps. */
+ /* Not listing GTT here improves performance in some
+ * apps. */
res->domains = RADEON_DOMAIN_VRAM;
- flags |= RADEON_FLAG_GTT_WC;
+ res->flags |= RADEON_FLAG_GTT_WC;
break;
}
if (res->b.b.target == PIPE_BUFFER &&
res->b.b.flags & (PIPE_RESOURCE_FLAG_MAP_PERSISTENT |
PIPE_RESOURCE_FLAG_MAP_COHERENT)) {
- /* Use GTT for all persistent mappings with older kernels,
- * because they didn't always flush the HDP cache before CS
- * execution.
+ /* Use GTT for all persistent mappings with older
+ * kernels, because they didn't always flush the HDP
+ * cache before CS execution.
*
- * Write-combined CPU mappings are fine, the kernel ensures all CPU
- * writes finish before the GPU executes a command stream.
+ * Write-combined CPU mappings are fine, the kernel
+ * ensures all CPU writes finish before the GPU
+ * executes a command stream.
*/
if (rscreen->info.drm_major == 2 &&
rscreen->info.drm_minor < 40)
res->domains = RADEON_DOMAIN_GTT;
else if (res->domains & RADEON_DOMAIN_VRAM)
- flags |= RADEON_FLAG_CPU_ACCESS;
+ res->flags |= RADEON_FLAG_CPU_ACCESS;
}
/* Tiled textures are unmappable. Always put them in VRAM. */
if (res->b.b.target != PIPE_BUFFER &&
rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D) {
res->domains = RADEON_DOMAIN_VRAM;
- flags &= ~RADEON_FLAG_CPU_ACCESS;
- flags |= RADEON_FLAG_NO_CPU_ACCESS;
+ res->flags &= ~RADEON_FLAG_CPU_ACCESS;
+ res->flags |= RADEON_FLAG_NO_CPU_ACCESS |
+ RADEON_FLAG_GTT_WC;
}
+ /* If VRAM is just stolen system memory, allow both VRAM and
+ * GTT, whichever has free space. If a buffer is evicted from
+ * VRAM to GTT, it will stay there.
+ */
+ if (!rscreen->info.has_dedicated_vram &&
+ res->domains == RADEON_DOMAIN_VRAM)
+ res->domains = RADEON_DOMAIN_VRAM_GTT;
+
if (rscreen->debug_flags & DBG_NO_WC)
- flags &= ~RADEON_FLAG_GTT_WC;
+ res->flags &= ~RADEON_FLAG_GTT_WC;
+
+ /* Set expected VRAM and GART usage for the buffer. */
+ res->vram_usage = 0;
+ res->gart_usage = 0;
+
+ if (res->domains & RADEON_DOMAIN_VRAM)
+ res->vram_usage = size;
+ else if (res->domains & RADEON_DOMAIN_GTT)
+ res->gart_usage = size;
+}
+
+bool r600_alloc_resource(struct r600_common_screen *rscreen,
+ struct r600_resource *res)
+{
+ struct pb_buffer *old_buf, *new_buf;
/* Allocate a new resource. */
- new_buf = rscreen->ws->buffer_create(rscreen->ws, size, alignment,
- use_reusable_pool,
- res->domains, flags);
+ new_buf = rscreen->ws->buffer_create(rscreen->ws, res->bo_size,
+ res->bo_alignment,
+ res->domains, res->flags);
if (!new_buf) {
return false;
}
@@ -179,11 +206,10 @@ bool r600_init_resource(struct r600_common_screen *rscreen,
* the same buffer where one of the contexts invalidates it while
* the others are using it. */
old_buf = res->buf;
- res->cs_buf = rscreen->ws->buffer_get_cs_handle(new_buf); /* should be atomic */
res->buf = new_buf; /* should be atomic */
- if (rscreen->info.r600_virtual_address)
- res->gpu_address = rscreen->ws->buffer_get_virtual_address(res->cs_buf);
+ if (rscreen->info.has_virtual_memory)
+ res->gpu_address = rscreen->ws->buffer_get_virtual_address(res->buf);
else
res->gpu_address = 0;
@@ -192,8 +218,9 @@ bool r600_init_resource(struct r600_common_screen *rscreen,
util_range_set_empty(&res->valid_buffer_range);
res->TC_L2_dirty = false;
+ /* Print debug information. */
if (rscreen->debug_flags & DBG_VM && res->b.b.target == PIPE_BUFFER) {
- fprintf(stderr, "VM start=0x%"PRIX64" end=0x%"PRIX64" | Buffer %u bytes\n",
+ fprintf(stderr, "VM start=0x%"PRIX64" end=0x%"PRIX64" | Buffer %"PRIu64" bytes\n",
res->gpu_address, res->gpu_address + res->buf->size,
res->buf->size);
}
@@ -210,6 +237,42 @@ static void r600_buffer_destroy(struct pipe_screen *screen,
FREE(rbuffer);
}
+static bool
+r600_invalidate_buffer(struct r600_common_context *rctx,
+ struct r600_resource *rbuffer)
+{
+ /* Shared buffers can't be reallocated. */
+ if (rbuffer->is_shared)
+ return false;
+
+ /* In AMD_pinned_memory, the user pointer association only gets
+ * broken when the buffer is explicitly re-allocated.
+ */
+ if (rctx->ws->buffer_is_user_ptr(rbuffer->buf))
+ return false;
+
+ /* Check if mapping this buffer would cause waiting for the GPU. */
+ if (r600_rings_is_buffer_referenced(rctx, rbuffer->buf, RADEON_USAGE_READWRITE) ||
+ !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
+ rctx->invalidate_buffer(&rctx->b, &rbuffer->b.b);
+ } else {
+ util_range_set_empty(&rbuffer->valid_buffer_range);
+ }
+
+ return true;
+}
+
+void r600_invalidate_resource(struct pipe_context *ctx,
+ struct pipe_resource *resource)
+{
+ struct r600_common_context *rctx = (struct r600_common_context*)ctx;
+ struct r600_resource *rbuffer = r600_resource(resource);
+
+ /* We currently only do anything here for buffers */
+ if (resource->target == PIPE_BUFFER)
+ (void)r600_invalidate_buffer(rctx, rbuffer);
+}
+
static void *r600_buffer_get_transfer(struct pipe_context *ctx,
struct pipe_resource *resource,
unsigned level,
@@ -220,7 +283,7 @@ static void *r600_buffer_get_transfer(struct pipe_context *ctx,
unsigned offset)
{
struct r600_common_context *rctx = (struct r600_common_context*)ctx;
- struct r600_transfer *transfer = util_slab_alloc(&rctx->pool_transfers);
+ struct r600_transfer *transfer = slab_alloc(&rctx->pool_transfers);
transfer->transfer.resource = resource;
transfer->transfer.level = level;
@@ -240,7 +303,7 @@ static bool r600_can_dma_copy_buffer(struct r600_common_context *rctx,
bool dword_aligned = !(dstx % 4) && !(srcx % 4) && !(size % 4);
return rctx->screen->has_cp_dma ||
- (dword_aligned && (rctx->rings.dma.cs ||
+ (dword_aligned && (rctx->dma.cs ||
rctx->screen->has_streamout));
}
@@ -263,6 +326,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
* in which case it can be mapped unsynchronized. */
if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
usage & PIPE_TRANSFER_WRITE &&
+ !rbuffer->is_shared &&
!util_ranges_intersect(&rbuffer->valid_buffer_range, box->x, box->x + box->width)) {
usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
}
@@ -277,29 +341,31 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
assert(usage & PIPE_TRANSFER_WRITE);
- /* Check if mapping this buffer would cause waiting for the GPU. */
- if (r600_rings_is_buffer_referenced(rctx, rbuffer->cs_buf, RADEON_USAGE_READWRITE) ||
- !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
- rctx->invalidate_buffer(&rctx->b, &rbuffer->b.b);
+ if (r600_invalidate_buffer(rctx, rbuffer)) {
+ /* At this point, the buffer is always idle. */
+ usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
+ } else {
+ /* Fall back to a temporary buffer. */
+ usage |= PIPE_TRANSFER_DISCARD_RANGE;
}
- /* At this point, the buffer is always idle. */
- usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
}
- else if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
- !(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
- !(rscreen->debug_flags & DBG_NO_DISCARD_RANGE) &&
- r600_can_dma_copy_buffer(rctx, box->x, 0, box->width)) {
+
+ if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
+ !(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
+ PIPE_TRANSFER_PERSISTENT)) &&
+ !(rscreen->debug_flags & DBG_NO_DISCARD_RANGE) &&
+ r600_can_dma_copy_buffer(rctx, box->x, 0, box->width)) {
assert(usage & PIPE_TRANSFER_WRITE);
/* Check if mapping this buffer would cause waiting for the GPU. */
- if (r600_rings_is_buffer_referenced(rctx, rbuffer->cs_buf, RADEON_USAGE_READWRITE) ||
+ if (r600_rings_is_buffer_referenced(rctx, rbuffer->buf, RADEON_USAGE_READWRITE) ||
!rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
/* Do a wait-free write-only transfer using a temporary buffer. */
unsigned offset;
struct r600_resource *staging = NULL;
u_upload_alloc(rctx->uploader, 0, box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT),
- &offset, (struct pipe_resource**)&staging, (void**)&data);
+ 256, &offset, (struct pipe_resource**)&staging, (void**)&data);
if (staging) {
data += box->x % R600_MAP_BUFFER_ALIGNMENT;
@@ -311,23 +377,29 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
}
}
- /* Using a staging buffer in GTT for larger reads is much faster. */
+ /* Use a staging buffer in cached GTT for reads. */
else if ((usage & PIPE_TRANSFER_READ) &&
- !(usage & PIPE_TRANSFER_WRITE) &&
- rbuffer->domains == RADEON_DOMAIN_VRAM &&
+ !(usage & PIPE_TRANSFER_PERSISTENT) &&
+ (rbuffer->domains & RADEON_DOMAIN_VRAM ||
+ rbuffer->flags & RADEON_FLAG_GTT_WC) &&
r600_can_dma_copy_buffer(rctx, 0, box->x, box->width)) {
struct r600_resource *staging;
staging = (struct r600_resource*) pipe_buffer_create(
- ctx->screen, PIPE_BIND_TRANSFER_READ, PIPE_USAGE_STAGING,
+ ctx->screen, 0, PIPE_USAGE_STAGING,
box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT));
if (staging) {
/* Copy the VRAM buffer to the staging buffer. */
rctx->dma_copy(ctx, &staging->b.b, 0,
box->x % R600_MAP_BUFFER_ALIGNMENT,
- 0, 0, resource, level, box);
+ 0, 0, resource, 0, box);
- data = r600_buffer_map_sync_with_rings(rctx, staging, PIPE_TRANSFER_READ);
+ data = r600_buffer_map_sync_with_rings(rctx, staging,
+ usage & ~PIPE_TRANSFER_UNSYNCHRONIZED);
+ if (!data) {
+ r600_resource_reference(&staging, NULL);
+ return NULL;
+ }
data += box->x % R600_MAP_BUFFER_ALIGNMENT;
return r600_buffer_get_transfer(ctx, resource, level, usage, box,
@@ -345,38 +417,81 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
ptransfer, data, NULL, 0);
}
-static void r600_buffer_transfer_unmap(struct pipe_context *ctx,
- struct pipe_transfer *transfer)
+static void r600_buffer_do_flush_region(struct pipe_context *ctx,
+ struct pipe_transfer *transfer,
+ const struct pipe_box *box)
{
- struct r600_common_context *rctx = (struct r600_common_context*)ctx;
struct r600_transfer *rtransfer = (struct r600_transfer*)transfer;
struct r600_resource *rbuffer = r600_resource(transfer->resource);
if (rtransfer->staging) {
- if (rtransfer->transfer.usage & PIPE_TRANSFER_WRITE) {
- struct pipe_resource *dst, *src;
- unsigned soffset, doffset, size;
- struct pipe_box box;
+ struct pipe_resource *dst, *src;
+ unsigned soffset;
+ struct pipe_box dma_box;
- dst = transfer->resource;
- src = &rtransfer->staging->b.b;
- size = transfer->box.width;
- doffset = transfer->box.x;
- soffset = rtransfer->offset + transfer->box.x % R600_MAP_BUFFER_ALIGNMENT;
+ dst = transfer->resource;
+ src = &rtransfer->staging->b.b;
+ soffset = rtransfer->offset + box->x % R600_MAP_BUFFER_ALIGNMENT;
- u_box_1d(soffset, size, &box);
+ u_box_1d(soffset, box->width, &dma_box);
- /* Copy the staging buffer into the original one. */
- rctx->dma_copy(ctx, dst, 0, doffset, 0, 0, src, 0, &box);
- }
- pipe_resource_reference((struct pipe_resource**)&rtransfer->staging, NULL);
+ /* Copy the staging buffer into the original one. */
+ ctx->resource_copy_region(ctx, dst, 0, box->x, 0, 0, src, 0, &dma_box);
}
- if (transfer->usage & PIPE_TRANSFER_WRITE) {
- util_range_add(&rbuffer->valid_buffer_range, transfer->box.x,
- transfer->box.x + transfer->box.width);
+ util_range_add(&rbuffer->valid_buffer_range, box->x,
+ box->x + box->width);
+}
+
+static void r600_buffer_flush_region(struct pipe_context *ctx,
+ struct pipe_transfer *transfer,
+ const struct pipe_box *rel_box)
+{
+ if (transfer->usage & (PIPE_TRANSFER_WRITE |
+ PIPE_TRANSFER_FLUSH_EXPLICIT)) {
+ struct pipe_box box;
+
+ u_box_1d(transfer->box.x + rel_box->x, rel_box->width, &box);
+ r600_buffer_do_flush_region(ctx, transfer, &box);
}
- util_slab_free(&rctx->pool_transfers, transfer);
+}
+
+static void r600_buffer_transfer_unmap(struct pipe_context *ctx,
+ struct pipe_transfer *transfer)
+{
+ struct r600_common_context *rctx = (struct r600_common_context*)ctx;
+ struct r600_transfer *rtransfer = (struct r600_transfer*)transfer;
+
+ if (transfer->usage & PIPE_TRANSFER_WRITE &&
+ !(transfer->usage & PIPE_TRANSFER_FLUSH_EXPLICIT))
+ r600_buffer_do_flush_region(ctx, transfer, &transfer->box);
+
+ if (rtransfer->staging)
+ r600_resource_reference(&rtransfer->staging, NULL);
+
+ slab_free(&rctx->pool_transfers, transfer);
+}
+
+void r600_buffer_subdata(struct pipe_context *ctx,
+ struct pipe_resource *buffer,
+ unsigned usage, unsigned offset,
+ unsigned size, const void *data)
+{
+ struct pipe_transfer *transfer = NULL;
+ struct pipe_box box;
+ uint8_t *map = NULL;
+
+ u_box_1d(offset, size, &box);
+ map = r600_buffer_transfer_map(ctx, buffer, 0,
+ PIPE_TRANSFER_WRITE |
+ PIPE_TRANSFER_DISCARD_RANGE |
+ usage,
+ &box, &transfer);
+ if (!map)
+ return;
+
+ memcpy(map, data, size);
+ r600_buffer_transfer_unmap(ctx, transfer);
}
static const struct u_resource_vtbl r600_buffer_vtbl =
@@ -384,9 +499,8 @@ static const struct u_resource_vtbl r600_buffer_vtbl =
NULL, /* get_handle */
r600_buffer_destroy, /* resource_destroy */
r600_buffer_transfer_map, /* transfer_map */
- NULL, /* transfer_flush_region */
+ r600_buffer_flush_region, /* transfer_flush_region */
r600_buffer_transfer_unmap, /* transfer_unmap */
- NULL /* transfer_inline_write */
};
static struct r600_resource *
@@ -398,11 +512,14 @@ r600_alloc_buffer_struct(struct pipe_screen *screen,
rbuffer = MALLOC_STRUCT(r600_resource);
rbuffer->b.b = *templ;
+ rbuffer->b.b.next = NULL;
pipe_reference_init(&rbuffer->b.b.reference, 1);
rbuffer->b.b.screen = screen;
rbuffer->b.vtbl = &r600_buffer_vtbl;
rbuffer->buf = NULL;
+ rbuffer->bind_history = 0;
rbuffer->TC_L2_dirty = false;
+ rbuffer->is_shared = false;
util_range_init(&rbuffer->valid_buffer_range);
return rbuffer;
}
@@ -414,13 +531,39 @@ struct pipe_resource *r600_buffer_create(struct pipe_screen *screen,
struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
struct r600_resource *rbuffer = r600_alloc_buffer_struct(screen, templ);
- if (!r600_init_resource(rscreen, rbuffer, templ->width0, alignment, TRUE)) {
+ r600_init_resource_fields(rscreen, rbuffer, templ->width0, alignment);
+
+ if (templ->bind & PIPE_BIND_SHARED)
+ rbuffer->flags |= RADEON_FLAG_HANDLE;
+
+ if (!r600_alloc_resource(rscreen, rbuffer)) {
FREE(rbuffer);
return NULL;
}
return &rbuffer->b.b;
}
+struct pipe_resource *r600_aligned_buffer_create(struct pipe_screen *screen,
+ unsigned bind,
+ unsigned usage,
+ unsigned size,
+ unsigned alignment)
+{
+ struct pipe_resource buffer;
+
+ memset(&buffer, 0, sizeof buffer);
+ buffer.target = PIPE_BUFFER;
+ buffer.format = PIPE_FORMAT_R8_UNORM;
+ buffer.bind = bind;
+ buffer.usage = usage;
+ buffer.flags = 0;
+ buffer.width0 = size;
+ buffer.height0 = 1;
+ buffer.depth0 = 1;
+ buffer.array_size = 1;
+ return r600_buffer_create(screen, &buffer, alignment);
+}
+
struct pipe_resource *
r600_buffer_from_user_memory(struct pipe_screen *screen,
const struct pipe_resource *templ,
@@ -440,11 +583,9 @@ r600_buffer_from_user_memory(struct pipe_screen *screen,
return NULL;
}
- rbuffer->cs_buf = ws->buffer_get_cs_handle(rbuffer->buf);
-
- if (rscreen->info.r600_virtual_address)
+ if (rscreen->info.has_virtual_memory)
rbuffer->gpu_address =
- ws->buffer_get_virtual_address(rbuffer->cs_buf);
+ ws->buffer_get_virtual_address(rbuffer->buf);
else
rbuffer->gpu_address = 0;
diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_perfcounter.c b/lib/mesa/src/gallium/drivers/radeon/r600_perfcounter.c
index f3529a1fe..0c55fc2a2 100644
--- a/lib/mesa/src/gallium/drivers/radeon/r600_perfcounter.c
+++ b/lib/mesa/src/gallium/drivers/radeon/r600_perfcounter.c
@@ -28,7 +28,7 @@
#include "util/u_memory.h"
#include "r600_query.h"
#include "r600_pipe_common.h"
-#include "r600d_common.h"
+#include "amd/common/r600d_common.h"
/* Max counters per HW block */
#define R600_QUERY_MAX_COUNTERS 16
@@ -84,8 +84,8 @@ struct r600_pc_group {
struct r600_pc_counter {
unsigned base; /* index of this counter's first slot in the result array */
- unsigned dwords;
- unsigned stride;
+ unsigned qwords; /* number of uint64 result slots summed for this counter */
+ unsigned stride; /* in uint64s */
};
#define R600_PC_SHADERS_WINDOWING (1 << 31)
@@ -115,6 +115,14 @@ static void r600_pc_query_destroy(struct r600_common_context *ctx,
r600_query_hw_destroy(ctx, rquery);
}
+static bool r600_pc_query_prepare_buffer(struct r600_common_context *ctx,
+ struct r600_query_hw *hwquery,
+ struct r600_resource *buffer)
+{
+ /* no-op: none of the arguments are touched and success is always
+ * reported. Installed as batch_query_hw_ops.prepare_buffer. */
+ return true;
+}
+
static void r600_pc_query_emit_start(struct r600_common_context *ctx,
struct r600_query_hw *hwquery,
struct r600_resource *buffer, uint64_t va)
@@ -172,7 +180,7 @@ static void r600_pc_query_emit_stop(struct r600_common_context *ctx,
pc->emit_read(ctx, block,
group->num_counters, group->selectors,
buffer, va);
- va += 4 * group->num_counters;
+ va += sizeof(uint64_t) * group->num_counters;
} while (group->instance < 0 && ++instance < block->num_instances);
} while (++se < se_end);
}
@@ -194,15 +202,15 @@ static void r600_pc_query_add_result(struct r600_common_context *ctx,
union pipe_query_result *result)
{
struct r600_query_pc *query = (struct r600_query_pc *)hwquery;
- uint32_t *results = buffer;
+ uint64_t *results = buffer;
unsigned i, j;
for (i = 0; i < query->num_counters; ++i) {
struct r600_pc_counter *counter = &query->counters[i];
- for (j = 0; j < counter->dwords; ++j) {
+ for (j = 0; j < counter->qwords; ++j) {
uint32_t value = results[counter->base + j * counter->stride];
- result->batch[i].u32 += value;
+ result->batch[i].u64 += value;
}
}
}
@@ -215,6 +223,7 @@ static struct r600_query_ops batch_query_ops = {
};
static struct r600_query_hw_ops batch_query_hw_ops = {
+ .prepare_buffer = r600_pc_query_prepare_buffer,
.emit_start = r600_pc_query_emit_start,
.emit_stop = r600_pc_query_emit_stop,
.clear_result = r600_pc_query_clear_result,
@@ -310,7 +319,6 @@ struct pipe_query *r600_create_batch_query(struct pipe_context *ctx,
query->b.b.ops = &batch_query_ops;
query->b.ops = &batch_query_hw_ops;
- query->b.flags = R600_QUERY_HW_FLAG_TIMER;
query->num_counters = num_queries;
@@ -362,7 +370,7 @@ struct pipe_query *r600_create_batch_query(struct pipe_context *ctx,
instances *= block->num_instances;
group->result_base = i;
- query->b.result_size += 4 * instances * group->num_counters;
+ query->b.result_size += sizeof(uint64_t) * instances * group->num_counters;
i += instances * group->num_counters;
pc->get_size(block, group->num_counters, group->selectors,
@@ -402,11 +410,11 @@ struct pipe_query *r600_create_batch_query(struct pipe_context *ctx,
counter->base = group->result_base + j;
counter->stride = group->num_counters;
- counter->dwords = 1;
+ counter->qwords = 1;
if ((block->flags & R600_PC_BLOCK_SE) && group->se < 0)
- counter->dwords = screen->info.max_se;
+ counter->qwords = screen->info.max_se;
if (group->instance < 0)
- counter->dwords *= block->num_instances;
+ counter->qwords *= block->num_instances;
}
if (!r600_query_hw_init(rctx, &query->b))
@@ -419,8 +427,8 @@ error:
return NULL;
}
-static boolean r600_init_block_names(struct r600_common_screen *screen,
- struct r600_perfcounter_block *block)
+static bool r600_init_block_names(struct r600_common_screen *screen,
+ struct r600_perfcounter_block *block)
{
unsigned i, j, k;
unsigned groups_shader = 1, groups_se = 1, groups_instance = 1;
@@ -453,7 +461,7 @@ static boolean r600_init_block_names(struct r600_common_screen *screen,
block->group_names = MALLOC(block->num_groups * block->group_name_stride);
if (!block->group_names)
- return FALSE;
+ return false;
groupname = block->group_names;
for (i = 0; i < groups_shader; ++i) {
@@ -488,7 +496,7 @@ static boolean r600_init_block_names(struct r600_common_screen *screen,
block->selector_names = MALLOC(block->num_groups * block->num_selectors *
block->selector_name_stride);
if (!block->selector_names)
- return FALSE;
+ return false;
groupname = block->group_names;
p = block->selector_names;
@@ -500,7 +508,7 @@ static boolean r600_init_block_names(struct r600_common_screen *screen,
groupname += block->group_name_stride;
}
- return TRUE;
+ return true;
}
int r600_get_perfcounter_info(struct r600_common_screen *screen,
@@ -536,7 +544,7 @@ int r600_get_perfcounter_info(struct r600_common_screen *screen,
info->name = block->selector_names + sub * block->selector_name_stride;
info->query_type = R600_QUERY_FIRST_PERFCOUNTER + index;
info->max_value.u64 = 0;
- info->type = PIPE_DRIVER_QUERY_TYPE_UINT;
+ info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE;
info->group_id = base_gid + sub / block->num_selectors;
info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
@@ -578,17 +586,17 @@ void r600_perfcounters_destroy(struct r600_common_screen *rscreen)
rscreen->perfcounters->cleanup(rscreen);
}
-boolean r600_perfcounters_init(struct r600_perfcounters *pc,
- unsigned num_blocks)
+bool r600_perfcounters_init(struct r600_perfcounters *pc,
+ unsigned num_blocks)
{ /* Allocates a zeroed array of num_blocks perfcounter blocks in pc->blocks
   * and reads the two RADEON_PC_SEPARATE_* debug options (default false).
   * Returns false only when the allocation fails. */
pc->blocks = CALLOC(num_blocks, sizeof(struct r600_perfcounter_block));
if (!pc->blocks)
- return FALSE;
+ return false;
- pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", FALSE);
- pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", FALSE);
+ pc->separate_se = debug_get_bool_option("RADEON_PC_SEPARATE_SE", false);
+ pc->separate_instance = debug_get_bool_option("RADEON_PC_SEPARATE_INSTANCE", false);
- return TRUE;
+ return true;
}
void r600_perfcounters_add_block(struct r600_common_screen *rscreen,
diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c b/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c
index 495fda0a8..f62bbf2e0 100644
--- a/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -27,23 +27,118 @@
#include "r600_pipe_common.h"
#include "r600_cs.h"
#include "tgsi/tgsi_parse.h"
+#include "util/list.h"
#include "util/u_draw_quad.h"
#include "util/u_memory.h"
#include "util/u_format_s3tc.h"
#include "util/u_upload_mgr.h"
+#include "os/os_time.h"
#include "vl/vl_decoder.h"
#include "vl/vl_video_buffer.h"
#include "radeon/radeon_video.h"
#include <inttypes.h>
+#include <sys/utsname.h>
#ifndef HAVE_LLVM
#define HAVE_LLVM 0
#endif
+struct r600_multi_fence { /* Fence object handed out by r600_flush_from_st(). */
+ struct pipe_reference reference; /* count is set to 1 at creation */
+ struct pipe_fence_handle *gfx; /* fence of the GFX IB, may be NULL */
+ struct pipe_fence_handle *sdma; /* fence of the SDMA IB, may be NULL */
+
+ /* If the context wasn't flushed at fence creation, this is non-NULL. */
+ struct {
+ struct r600_common_context *ctx;
+ unsigned ib_index; /* ctx->num_gfx_cs_flushes sampled at fence creation */
+ } gfx_unflushed;
+};
+
+/*
+ * shader binary helpers.
+ */
+void radeon_shader_binary_init(struct radeon_shader_binary *b)
+{ /* Zero-fills *b so every pointer member starts out NULL; b must not be NULL. */
+ memset(b, 0, sizeof(*b));
+}
+
+void radeon_shader_binary_clean(struct radeon_shader_binary *b)
+{ /* Frees the arrays and strings held by *b. The struct itself is not
+ * freed and the freed members are not reset to NULL here. */
+ if (!b) /* a NULL binary is accepted and ignored */
+ return;
+ FREE(b->code);
+ FREE(b->config);
+ FREE(b->rodata);
+ FREE(b->global_symbol_offsets);
+ FREE(b->relocs);
+ FREE(b->disasm_string);
+ FREE(b->llvm_ir_string);
+}
+
/*
* pipe_context
*/
+void r600_gfx_write_fence(struct r600_common_context *ctx, struct r600_resource *buf,
+ uint64_t va, uint32_t old_value, uint32_t new_value)
+{ /* Emits EVENT_WRITE_EOP packet(s) on the gfx CS that write the immediate
+ * value new_value to address va with a bottom-of-pipe timestamp event.
+ * On CIK and VI an additional, identical packet carrying old_value is
+ * emitted first. buf is then added to the CS with write usage. */
+ struct radeon_winsys_cs *cs = ctx->gfx.cs;
+
+ if (ctx->chip_class == CIK ||
+ ctx->chip_class == VI) {
+ /* Two EOP events are required to make all engines go idle
+ * (and optional cache flushes executed) before the timestamp
+ * is written.
+ */
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
+ radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_BOTTOM_OF_PIPE_TS) |
+ EVENT_INDEX(5));
+ radeon_emit(cs, va); /* low 32 bits of the address */
+ radeon_emit(cs, (va >> 32) | EOP_DATA_SEL(1)); /* high address bits + data select */
+ radeon_emit(cs, old_value); /* immediate data */
+ radeon_emit(cs, 0); /* unused */
+ }
+
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
+ radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_BOTTOM_OF_PIPE_TS) |
+ EVENT_INDEX(5));
+ radeon_emit(cs, va); /* low 32 bits of the address */
+ radeon_emit(cs, (va >> 32) | EOP_DATA_SEL(1)); /* high address bits + data select */
+ radeon_emit(cs, new_value); /* immediate data */
+ radeon_emit(cs, 0); /* unused */
+
+ r600_emit_reloc(ctx, &ctx->gfx, buf, RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
+}
+
+unsigned r600_gfx_write_fence_dwords(struct r600_common_screen *screen)
+{ /* Returns the command-stream space, in dwords, to budget for one
+ * r600_gfx_write_fence() call on this screen's chip. */
+ unsigned dwords = 6; /* one EVENT_WRITE_EOP packet: six emitted dwords */
+
+ if (screen->chip_class == CIK ||
+ screen->chip_class == VI)
+ dwords *= 2; /* r600_gfx_write_fence() emits two EOP packets on CIK/VI */
+
+ if (!screen->info.has_virtual_memory)
+ dwords += 2; /* NOTE(review): presumably the relocation emitted without GPUVM - confirm */
+
+ return dwords;
+}
+
+void r600_gfx_wait_fence(struct r600_common_context *ctx,
+ uint64_t va, uint32_t ref, uint32_t mask)
+{ /* Emits one 7-dword PKT3_WAIT_REG_MEM packet on the gfx CS for address
+ * va with reference value ref and mask.
+ * NOTE(review): going by the macro names this makes the CP poll memory
+ * until the masked value equals ref - confirm against the packet spec. */
+ struct radeon_winsys_cs *cs = ctx->gfx.cs;
+
+ radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
+ radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1));
+ radeon_emit(cs, va); /* low 32 bits of the address */
+ radeon_emit(cs, va >> 32); /* high 32 bits of the address */
+ radeon_emit(cs, ref); /* reference value */
+ radeon_emit(cs, mask); /* mask */
+ radeon_emit(cs, 4); /* poll interval */
+}
+
void r600_draw_rectangle(struct blitter_context *blitter,
int x1, int y1, int x2, int y2, float depth,
enum blitter_attrib_type type,
@@ -77,7 +172,7 @@ void r600_draw_rectangle(struct blitter_context *blitter,
/* Upload vertices. The hw rectangle has only 3 vertices,
* I guess the 4th one is derived from the first 3.
* The vertex specification should match u_blitter's vertex element state. */
- u_upload_alloc(rctx->uploader, 0, sizeof(float) * 24, &offset, &buf, (void**)&vb);
+ u_upload_alloc(rctx->uploader, 0, sizeof(float) * 24, 256, &offset, &buf, (void**)&vb);
if (!buf)
return;
@@ -108,12 +203,89 @@ void r600_draw_rectangle(struct blitter_context *blitter,
pipe_resource_reference(&buf, NULL);
}
-void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw)
+void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
+ struct r600_resource *dst, struct r600_resource *src)
+{ /* Prepares the DMA CS for a packet of num_dw dwords that writes dst and
+ * reads src (either may be NULL): flushes the GFX IB if it still
+ * references the buffers, flushes the DMA IB if space or the memory
+ * limit would be exceeded, then adds the buffers to the DMA buffer
+ * list when GPUVM is available. */
+ uint64_t vram = 0, gtt = 0; /* memory usage this packet adds to the IB */
+
+ if (dst) {
+ vram += dst->vram_usage;
+ gtt += dst->gart_usage;
+ }
+ if (src) {
+ vram += src->vram_usage;
+ gtt += src->gart_usage;
+ }
+
+ /* Flush the GFX IB if DMA depends on it. */
+ if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
+ ((dst &&
+ ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, dst->buf,
+ RADEON_USAGE_READWRITE)) || /* dst: any pending GFX use counts */
+ (src &&
+ ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, src->buf,
+ RADEON_USAGE_WRITE)))) /* src: only pending GFX writes count */
+ ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+
+ /* Flush if there's not enough space, or if the memory usage per IB
+ * is too large.
+ */
+ if (!ctx->ws->cs_check_space(ctx->dma.cs, num_dw) ||
+ !radeon_cs_memory_below_limit(ctx->screen, ctx->dma.cs, vram, gtt)) {
+ ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+ assert((num_dw + ctx->dma.cs->current.cdw) <= ctx->dma.cs->current.max_dw); /* must fit once flushed */
+ }
+
+ /* If GPUVM is not supported, the CS checker needs 2 entries
+ * in the buffer list per packet, which has to be done manually.
+ */
+ if (ctx->screen->info.has_virtual_memory) { /* only the GPUVM case is handled here */
+ if (dst)
+ radeon_add_to_buffer_list(ctx, &ctx->dma, dst,
+ RADEON_USAGE_WRITE,
+ RADEON_PRIO_SDMA_BUFFER);
+ if (src)
+ radeon_add_to_buffer_list(ctx, &ctx->dma, src,
+ RADEON_USAGE_READ,
+ RADEON_PRIO_SDMA_BUFFER);
+ }
+}
+
+/* This is required to prevent read-after-write hazards.
+ * Counts the DMA call, then either flushes the DMA IB (when it already
+ * references more than 64 MiB) or appends a NOP packet to it; nothing is
+ * appended to an empty IB or on chips older than Evergreen. */
+void r600_dma_emit_wait_idle(struct r600_common_context *rctx)
{
- /* Flush if there's not enough space. */
- if ((num_dw + ctx->rings.dma.cs->cdw) > ctx->rings.dma.cs->max_dw) {
- ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
- assert((num_dw + ctx->rings.dma.cs->cdw) <= ctx->rings.dma.cs->max_dw);
+ struct radeon_winsys_cs *cs = rctx->dma.cs;
+
+ /* done at the end of DMA calls, so increment this. */
+ rctx->num_dma_calls++;
+
+ /* IBs using too little memory are limited by the IB submission overhead.
+ * IBs using too much memory are limited by the kernel/TTM overhead.
+ * Too long IBs create CPU-GPU pipeline bubbles and add latency.
+ *
+ * This heuristic makes sure that DMA requests are executed
+ * very soon after the call is made and lowers memory usage.
+ * It improves texture upload performance by keeping the DMA
+ * engine busy while uploads are being submitted.
+ */
+ if (cs->used_vram + cs->used_gart > 64 * 1024 * 1024) { /* 64 MiB threshold */
+ rctx->dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
+ return; /* no NOP is emitted after the flush */
+ }
+
+ r600_need_dma_space(rctx, 1, NULL, NULL); /* room for the single NOP dword below */
+
+ if (!radeon_emitted(cs, 0)) /* empty queue */
+ return;
+
+ /* NOP waits for idle on Evergreen and later. */
+ if (rctx->chip_class >= CIK)
+ radeon_emit(cs, 0x00000000); /* NOP */
+ else if (rctx->chip_class >= EVERGREEN)
+ radeon_emit(cs, 0xf0000000); /* NOP */
+ else {
+ /* TODO: R600-R700 should use the FENCE packet.
+ * CS checker support is required. */
}
}
@@ -123,24 +295,9 @@ static void r600_memory_barrier(struct pipe_context *ctx, unsigned flags)
void r600_preflush_suspend_features(struct r600_common_context *ctx)
{
- /* Disable render condition. */
- ctx->saved_render_cond = NULL;
- ctx->saved_render_cond_cond = FALSE;
- ctx->saved_render_cond_mode = 0;
- if (ctx->current_render_cond) {
- ctx->saved_render_cond = ctx->current_render_cond;
- ctx->saved_render_cond_cond = ctx->current_render_cond_cond;
- ctx->saved_render_cond_mode = ctx->current_render_cond_mode;
- ctx->b.render_condition(&ctx->b, NULL, FALSE, 0);
- }
-
/* suspend queries */
- ctx->queries_suspended_for_flush = false;
- if (ctx->num_cs_dw_nontimer_queries_suspend) {
- r600_suspend_nontimer_queries(ctx);
- r600_suspend_timer_queries(ctx);
- ctx->queries_suspended_for_flush = true;
- }
+ if (!LIST_IS_EMPTY(&ctx->active_queries))
+ r600_suspend_queries(ctx);
ctx->streamout.suspended = false;
if (ctx->streamout.begin_emitted) {
@@ -157,48 +314,152 @@ void r600_postflush_resume_features(struct r600_common_context *ctx)
}
/* resume queries */
- if (ctx->queries_suspended_for_flush) {
- r600_resume_nontimer_queries(ctx);
- r600_resume_timer_queries(ctx);
- }
-
- /* Re-enable render condition. */
- if (ctx->saved_render_cond) {
- ctx->b.render_condition(&ctx->b, ctx->saved_render_cond,
- ctx->saved_render_cond_cond,
- ctx->saved_render_cond_mode);
- }
+ if (!LIST_IS_EMPTY(&ctx->active_queries))
+ r600_resume_queries(ctx);
}
+/**
+ * pipe_context::flush implementation.
+ *
+ * Flushes the SDMA IB (if the context has one) and then the GFX IB. When
+ * PIPE_FLUSH_DEFERRED is set and a fence is requested, the GFX IB is not
+ * flushed; a fence for the next submission is taken instead and the
+ * returned fence remembers the context and IB index it belongs to.
+ *
+ * \param ctx    the pipe context
+ * \param fence  if non-NULL, receives a new r600_multi_fence wrapping the
+ *               GFX and SDMA fences; on allocation failure *fence is left
+ *               untouched
+ * \param flags  PIPE_FLUSH_* flags
+ */
static void r600_flush_from_st(struct pipe_context *ctx,
struct pipe_fence_handle **fence,
unsigned flags)
{
+ struct pipe_screen *screen = ctx->screen;
struct r600_common_context *rctx = (struct r600_common_context *)ctx;
+ struct radeon_winsys *ws = rctx->ws;
unsigned rflags = 0;
+ struct pipe_fence_handle *gfx_fence = NULL;
+ struct pipe_fence_handle *sdma_fence = NULL;
+ bool deferred_fence = false;
if (flags & PIPE_FLUSH_END_OF_FRAME)
rflags |= RADEON_FLUSH_END_OF_FRAME;
+ if (flags & PIPE_FLUSH_DEFERRED)
+ rflags |= RADEON_FLUSH_ASYNC;
+
+ if (rctx->dma.cs) {
+ rctx->dma.flush(rctx, rflags, fence ? &sdma_fence : NULL);
+ }
+
+ if (!radeon_emitted(rctx->gfx.cs, rctx->initial_gfx_cs_size)) {
+ /* Nothing new in the GFX IB: reuse the fence of the last flush. */
+ if (fence)
+ ws->fence_reference(&gfx_fence, rctx->last_gfx_fence);
+ if (!(rflags & RADEON_FLUSH_ASYNC))
+ ws->cs_sync_flush(rctx->gfx.cs);
+ } else {
+ /* Instead of flushing, create a deferred fence. Constraints:
+ * - The state tracker must allow a deferred flush.
+ * - The state tracker must request a fence.
+ * Thread safety in fence_finish must be ensured by the state tracker.
+ */
+ if (flags & PIPE_FLUSH_DEFERRED && fence) {
+ gfx_fence = rctx->ws->cs_get_next_fence(rctx->gfx.cs);
+ deferred_fence = true;
+ } else {
+ rctx->gfx.flush(rctx, rflags, fence ? &gfx_fence : NULL);
+ }
+ }
+
+ /* Both engines can signal out of order, so we need to keep both fences. */
+ if (fence) {
+ struct r600_multi_fence *multi_fence =
+ CALLOC_STRUCT(r600_multi_fence);
+ if (!multi_fence) {
+ /* Nobody will own the fences obtained above, so drop
+ * our references instead of leaking them.
+ */
+ ws->fence_reference(&sdma_fence, NULL);
+ ws->fence_reference(&gfx_fence, NULL);
+ return;
+ }
+
+ multi_fence->reference.count = 1;
+ /* If both fences are NULL, fence_finish will always return true. */
+ multi_fence->gfx = gfx_fence;
+ multi_fence->sdma = sdma_fence;
+
+ if (deferred_fence) {
+ multi_fence->gfx_unflushed.ctx = rctx;
+ multi_fence->gfx_unflushed.ib_index = rctx->num_gfx_cs_flushes;
+ }
- if (rctx->rings.dma.cs) {
- rctx->rings.dma.flush(rctx, rflags, NULL);
+ screen->fence_reference(screen, fence, NULL);
+ *fence = (struct pipe_fence_handle*)multi_fence;
}
- rctx->rings.gfx.flush(rctx, rflags, fence);
}
static void r600_flush_dma_ring(void *ctx, unsigned flags,
struct pipe_fence_handle **fence)
{ /* Flushes the DMA CS and records its fence in rctx->last_sdma_fence; if
   * fence is non-NULL it receives a reference to that fence. With the
   * DBG_CHECK_VM debug flag and a check_vm_faults hook, the IB is saved
   * before the flush, waited on, and passed to the hook afterwards. */
struct r600_common_context *rctx = (struct r600_common_context *)ctx;
- struct radeon_winsys_cs *cs = rctx->rings.dma.cs;
-
- if (!cs->cdw) {
+ struct radeon_winsys_cs *cs = rctx->dma.cs;
+ struct radeon_saved_cs saved;
+ bool check_vm =
+ (rctx->screen->debug_flags & DBG_CHECK_VM) &&
+ rctx->check_vm_faults;
+
+ if (!radeon_emitted(cs, 0)) { /* empty IB: just hand back the previous fence */
+ if (fence)
+ rctx->ws->fence_reference(fence, rctx->last_sdma_fence);
return;
}
- rctx->rings.dma.flushing = true;
- rctx->ws->cs_flush(cs, flags, fence, 0);
- rctx->rings.dma.flushing = false;
+ if (check_vm)
+ radeon_save_cs(rctx->ws, cs, &saved); /* copy taken before the flush */
+
+ rctx->ws->cs_flush(cs, flags, &rctx->last_sdma_fence);
+ if (fence)
+ rctx->ws->fence_reference(fence, rctx->last_sdma_fence);
+
+ if (check_vm) {
+ /* Use conservative timeout 800ms, after which we won't wait any
+ * longer and assume the GPU is hung.
+ */
+ rctx->ws->fence_wait(rctx->ws, rctx->last_sdma_fence, 800*1000*1000);
+
+ rctx->check_vm_faults(rctx, &saved, RING_DMA);
+ radeon_clear_saved_cs(&saved);
+ }
+}
+
+/**
+ * Store a linearized copy of all chunks of \p cs together with the buffer
+ * list in \p saved.
+ *
+ * The previous chunks are copied first, in order, followed by the current
+ * chunk. On allocation failure a message is printed to stderr and \p saved
+ * is zeroed, so it is still safe to pass to radeon_clear_saved_cs().
+ */
+void radeon_save_cs(struct radeon_winsys *ws, struct radeon_winsys_cs *cs,
+ struct radeon_saved_cs *saved)
+{
+ /* Chunk sizes (cdw) are counted in dwords, so the write cursor has to
+ * be a dword pointer: advancing a void pointer by cdw would move it by
+ * bytes and make successive chunks overlap.
+ */
+ uint32_t *buf;
+ unsigned i;
+
+ /* Save the IB chunks. */
+ saved->num_dw = cs->prev_dw + cs->current.cdw;
+ saved->ib = MALLOC(4 * saved->num_dw);
+ if (!saved->ib)
+ goto oom;
+
+ buf = saved->ib;
+ for (i = 0; i < cs->num_prev; ++i) {
+ memcpy(buf, cs->prev[i].buf, cs->prev[i].cdw * 4);
+ buf += cs->prev[i].cdw;
+ }
+ memcpy(buf, cs->current.buf, cs->current.cdw * 4);
+
+ /* Save the buffer list. */
+ saved->bo_count = ws->cs_get_buffer_list(cs, NULL); /* NULL list: query the count only */
+ saved->bo_list = CALLOC(saved->bo_count,
+ sizeof(saved->bo_list[0]));
+ if (!saved->bo_list) {
+ FREE(saved->ib);
+ goto oom;
+ }
+ ws->cs_get_buffer_list(cs, saved->bo_list);
+
+ return;
+
+oom:
+ fprintf(stderr, "%s: out of memory\n", __func__);
+ memset(saved, 0, sizeof(*saved));
+}
+
+void radeon_clear_saved_cs(struct radeon_saved_cs *saved)
+{ /* Frees the IB copy and buffer list held by *saved, then zeroes the
+ * struct so no stale pointers remain. */
+ FREE(saved->ib);
+ FREE(saved->bo_list);
+
+ memset(saved, 0, sizeof(*saved));
}
static enum pipe_reset_status r600_get_reset_status(struct pipe_context *ctx)
@@ -214,31 +475,82 @@ static enum pipe_reset_status r600_get_reset_status(struct pipe_context *ctx)
return PIPE_UNKNOWN_CONTEXT_RESET;
}
+static void r600_set_debug_callback(struct pipe_context *ctx,
+ const struct pipe_debug_callback *cb)
+{ /* Stores a copy of *cb in the context, or clears the stored callback
+ * when cb is NULL. */
+ struct r600_common_context *rctx = (struct r600_common_context *)ctx;
+
+ if (cb)
+ rctx->debug = *cb; /* copied by value; cb need not outlive this call */
+ else
+ memset(&rctx->debug, 0, sizeof(rctx->debug));
+}
+
+static void r600_set_device_reset_callback(struct pipe_context *ctx,
+ const struct pipe_device_reset_callback *cb)
+{ /* Stores a copy of *cb in the context, or clears the stored callback
+ * when cb is NULL. The stored callback is what r600_check_device_reset()
+ * invokes. */
+ struct r600_common_context *rctx = (struct r600_common_context *)ctx;
+
+ if (cb)
+ rctx->device_reset_callback = *cb; /* copied by value */
+ else
+ memset(&rctx->device_reset_callback, 0,
+ sizeof(rctx->device_reset_callback));
+}
+
+bool r600_check_device_reset(struct r600_common_context *rctx)
+{ /* Queries the device reset status and, if a reset happened, reports it
+ * through the registered reset callback. Returns true only when the
+ * callback was invoked. */
+ enum pipe_reset_status status;
+
+ if (!rctx->device_reset_callback.reset) /* nobody to notify */
+ return false;
+
+ if (!rctx->b.get_device_reset_status) /* status query not installed on this context */
+ return false;
+
+ status = rctx->b.get_device_reset_status(&rctx->b);
+ if (status == PIPE_NO_RESET)
+ return false;
+
+ rctx->device_reset_callback.reset(rctx->device_reset_callback.data, status);
+ return true;
+}
+
bool r600_common_context_init(struct r600_common_context *rctx,
- struct r600_common_screen *rscreen)
+ struct r600_common_screen *rscreen,
+ unsigned context_flags)
{
- util_slab_create(&rctx->pool_transfers,
- sizeof(struct r600_transfer), 64,
- UTIL_SLAB_SINGLETHREADED);
+ slab_create_child(&rctx->pool_transfers, &rscreen->pool_transfers);
rctx->screen = rscreen;
rctx->ws = rscreen->ws;
rctx->family = rscreen->family;
rctx->chip_class = rscreen->chip_class;
- if (rscreen->family == CHIP_HAWAII)
- rctx->max_db = 16;
+ if (rscreen->chip_class >= CIK)
+ rctx->max_db = MAX2(8, rscreen->info.num_render_backends);
else if (rscreen->chip_class >= EVERGREEN)
rctx->max_db = 8;
else
rctx->max_db = 4;
+ rctx->b.invalidate_resource = r600_invalidate_resource;
rctx->b.transfer_map = u_transfer_map_vtbl;
- rctx->b.transfer_flush_region = u_default_transfer_flush_region;
+ rctx->b.transfer_flush_region = u_transfer_flush_region_vtbl;
rctx->b.transfer_unmap = u_transfer_unmap_vtbl;
- rctx->b.transfer_inline_write = u_default_transfer_inline_write;
- rctx->b.memory_barrier = r600_memory_barrier;
+ rctx->b.texture_subdata = u_default_texture_subdata;
+ rctx->b.memory_barrier = r600_memory_barrier;
rctx->b.flush = r600_flush_from_st;
+ rctx->b.set_debug_callback = r600_set_debug_callback;
+
+ /* evergreen_compute.c has a special codepath for global buffers.
+ * Everything else can use the direct path.
+ */
+ if ((rscreen->chip_class == EVERGREEN || rscreen->chip_class == CAYMAN) &&
+ (context_flags & PIPE_CONTEXT_COMPUTE_ONLY))
+ rctx->b.buffer_subdata = u_default_buffer_subdata;
+ else
+ rctx->b.buffer_subdata = r600_buffer_subdata;
if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 43) {
rctx->b.get_device_reset_status = r600_get_reset_status;
@@ -247,21 +559,23 @@ bool r600_common_context_init(struct r600_common_context *rctx,
RADEON_GPU_RESET_COUNTER);
}
- LIST_INITHEAD(&rctx->texture_buffers);
+ rctx->b.set_device_reset_callback = r600_set_device_reset_callback;
r600_init_context_texture_functions(rctx);
+ r600_init_viewport_functions(rctx);
r600_streamout_init(rctx);
r600_query_init(rctx);
cayman_init_msaa(&rctx->b);
- rctx->allocator_so_filled_size = u_suballocator_create(&rctx->b, 4096, 4,
- 0, PIPE_USAGE_DEFAULT, TRUE);
- if (!rctx->allocator_so_filled_size)
+ rctx->allocator_zeroed_memory =
+ u_suballocator_create(&rctx->b, rscreen->info.gart_page_size,
+ 0, PIPE_USAGE_DEFAULT, true);
+ if (!rctx->allocator_zeroed_memory)
return false;
- rctx->uploader = u_upload_create(&rctx->b, 1024 * 1024, 256,
+ rctx->uploader = u_upload_create(&rctx->b, 1024 * 1024,
PIPE_BIND_INDEX_BUFFER |
- PIPE_BIND_CONSTANT_BUFFER);
+ PIPE_BIND_CONSTANT_BUFFER, PIPE_USAGE_STREAM);
if (!rctx->uploader)
return false;
@@ -269,11 +583,11 @@ bool r600_common_context_init(struct r600_common_context *rctx,
if (!rctx->ctx)
return false;
- if (rscreen->info.r600_has_dma && !(rscreen->debug_flags & DBG_NO_ASYNC_DMA)) {
- rctx->rings.dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA,
- r600_flush_dma_ring,
- rctx, NULL);
- rctx->rings.dma.flush = r600_flush_dma_ring;
+ if (rscreen->info.has_sdma && !(rscreen->debug_flags & DBG_NO_ASYNC_DMA)) {
+ rctx->dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA,
+ r600_flush_dma_ring,
+ rctx);
+ rctx->dma.flush = r600_flush_dma_ring;
}
return true;
@@ -281,46 +595,41 @@ bool r600_common_context_init(struct r600_common_context *rctx,
void r600_common_context_cleanup(struct r600_common_context *rctx)
{ /* After this patch: releases the DCC statistics queries and textures,
   * the query result shader, both command streams, the winsys context,
   * the uploader, the transfer slab child and the zeroed-memory
   * suballocator, and finally drops the last GFX/SDMA fence references.
   * Members that may be unset are checked for NULL before destruction. */
- if (rctx->rings.gfx.cs)
- rctx->ws->cs_destroy(rctx->rings.gfx.cs);
- if (rctx->rings.dma.cs)
- rctx->ws->cs_destroy(rctx->rings.dma.cs);
- if (rctx->ctx)
- rctx->ws->ctx_destroy(rctx->ctx);
+ unsigned i,j;
- if (rctx->uploader) {
- u_upload_destroy(rctx->uploader);
- }
+ /* Release DCC stats. */
+ for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats); i++) {
+ assert(!rctx->dcc_stats[i].query_active);
- util_slab_destroy(&rctx->pool_transfers);
+ for (j = 0; j < ARRAY_SIZE(rctx->dcc_stats[i].ps_stats); j++)
+ if (rctx->dcc_stats[i].ps_stats[j])
+ rctx->b.destroy_query(&rctx->b,
+ rctx->dcc_stats[i].ps_stats[j]);
- if (rctx->allocator_so_filled_size) {
- u_suballocator_destroy(rctx->allocator_so_filled_size);
+ r600_texture_reference(&rctx->dcc_stats[i].tex, NULL);
}
-}
-void r600_context_add_resource_size(struct pipe_context *ctx, struct pipe_resource *r)
-{
- struct r600_common_context *rctx = (struct r600_common_context *)ctx;
- struct r600_resource *rr = (struct r600_resource *)r;
+ if (rctx->query_result_shader)
+ rctx->b.delete_compute_state(&rctx->b, rctx->query_result_shader);
- if (r == NULL) {
- return;
- }
+ if (rctx->gfx.cs)
+ rctx->ws->cs_destroy(rctx->gfx.cs);
+ if (rctx->dma.cs)
+ rctx->ws->cs_destroy(rctx->dma.cs);
+ if (rctx->ctx)
+ rctx->ws->ctx_destroy(rctx->ctx);
- /*
- * The idea is to compute a gross estimate of memory requirement of
- * each draw call. After each draw call, memory will be precisely
- * accounted. So the uncertainty is only on the current draw call.
- * In practice this gave very good estimate (+/- 10% of the target
- * memory limit).
- */
- if (rr->domains & RADEON_DOMAIN_GTT) {
- rctx->gtt += rr->buf->size;
+ if (rctx->uploader) {
+ u_upload_destroy(rctx->uploader);
}
- if (rr->domains & RADEON_DOMAIN_VRAM) {
- rctx->vram += rr->buf->size;
+
+ slab_destroy_child(&rctx->pool_transfers);
+
+ if (rctx->allocator_zeroed_memory) {
+ u_suballocator_destroy(rctx->allocator_zeroed_memory);
}
+ rctx->ws->fence_reference(&rctx->last_gfx_fence, NULL);
+ rctx->ws->fence_reference(&rctx->last_sdma_fence, NULL);
}
/*
@@ -330,10 +639,8 @@ void r600_context_add_resource_size(struct pipe_context *ctx, struct pipe_resour
static const struct debug_named_value common_debug_options[] = {
/* logging */
{ "tex", DBG_TEX, "Print texture info" },
- { "texmip", DBG_TEXMIP, "Print texture info (mipmapped only)" },
{ "compute", DBG_COMPUTE, "Print compute info" },
{ "vm", DBG_VM, "Print virtual addresses when creating resources" },
- { "trace_cs", DBG_TRACE_CS, "Trace cs and write rlockup_<csid>.c file with faulty cs" },
{ "info", DBG_INFO, "Print driver information" },
/* shaders */
@@ -347,6 +654,10 @@ static const struct debug_named_value common_debug_options[] = {
{ "noir", DBG_NO_IR, "Don't print the LLVM IR"},
{ "notgsi", DBG_NO_TGSI, "Don't print the TGSI"},
{ "noasm", DBG_NO_ASM, "Don't print disassembled shaders"},
+ { "preoptir", DBG_PREOPT_IR, "Print the LLVM IR before initial optimizations" },
+ { "checkir", DBG_CHECK_IR, "Enable additional sanity checks on shader IR" },
+
+ { "testdma", DBG_TEST_DMA, "Invoke SDMA tests and exit." },
/* features */
{ "nodma", DBG_NO_ASYNC_DMA, "Disable asynchronous DMA" },
@@ -359,6 +670,15 @@ static const struct debug_named_value common_debug_options[] = {
{ "forcedma", DBG_FORCE_DMA, "Use asynchronous DMA for all operations when possible." },
{ "precompile", DBG_PRECOMPILE, "Compile one shader variant at shader creation." },
{ "nowc", DBG_NO_WC, "Disable GTT write combining" },
+ { "check_vm", DBG_CHECK_VM, "Check VM faults and dump debug info." },
+ { "nodcc", DBG_NO_DCC, "Disable DCC." },
+ { "nodccclear", DBG_NO_DCC_CLEAR, "Disable DCC fast clear." },
+ { "norbplus", DBG_NO_RB_PLUS, "Disable RB+ on Stoney." },
+ { "sisched", DBG_SI_SCHED, "Enable LLVM SI Machine Instruction Scheduler." },
+ { "mono", DBG_MONOLITHIC_SHADERS, "Use old-style monolithic shaders compiled on demand" },
+ { "noce", DBG_NO_CE, "Disable the constant engine"},
+ { "unsafemath", DBG_UNSAFE_MATH, "Enable unsafe math shader optimizations" },
+ { "nodccfb", DBG_NO_DCC_FB, "Disable separate DCC on the main framebuffer" },
DEBUG_NAMED_VALUE_END /* must be last */
};
@@ -415,6 +735,8 @@ static const char* r600_get_chip_name(struct r600_common_screen *rscreen)
case CHIP_ICELAND: return "AMD ICELAND";
case CHIP_CARRIZO: return "AMD CARRIZO";
case CHIP_FIJI: return "AMD FIJI";
+ case CHIP_POLARIS10: return "AMD POLARIS10";
+ case CHIP_POLARIS11: return "AMD POLARIS11";
case CHIP_STONEY: return "AMD STONEY";
default: return "AMD unknown";
}
@@ -535,25 +857,30 @@ const char *r600_get_llvm_processor_name(enum radeon_family family)
case CHIP_KAVERI: return "kaveri";
case CHIP_HAWAII: return "hawaii";
case CHIP_MULLINS:
-#if HAVE_LLVM >= 0x0305
return "mullins";
-#else
- return "kabini";
-#endif
case CHIP_TONGA: return "tonga";
case CHIP_ICELAND: return "iceland";
case CHIP_CARRIZO: return "carrizo";
- case CHIP_FIJI: return "fiji";
#if HAVE_LLVM <= 0x0307
+ case CHIP_FIJI: return "tonga";
case CHIP_STONEY: return "carrizo";
#else
+ case CHIP_FIJI: return "fiji";
case CHIP_STONEY: return "stoney";
#endif
+#if HAVE_LLVM <= 0x0308
+ case CHIP_POLARIS10: return "tonga";
+ case CHIP_POLARIS11: return "tonga";
+#else
+ case CHIP_POLARIS10: return "polaris10";
+ case CHIP_POLARIS11: return "polaris11";
+#endif
default: return "";
}
}
static int r600_get_compute_param(struct pipe_screen *screen,
+ enum pipe_shader_ir ir_type,
enum pipe_compute_cap param,
void *ret)
{
@@ -564,20 +891,19 @@ static int r600_get_compute_param(struct pipe_screen *screen,
case PIPE_COMPUTE_CAP_IR_TARGET: {
const char *gpu;
const char *triple;
- if (rscreen->family <= CHIP_ARUBA || HAVE_LLVM < 0x0306) {
+ if (rscreen->family <= CHIP_ARUBA) {
triple = "r600--";
} else {
- triple = "amdgcn--";
+ if (HAVE_LLVM < 0x0400) {
+ triple = "amdgcn--";
+ } else {
+ triple = "amdgcn-mesa-mesa3d";
+ }
}
switch(rscreen->family) {
/* Clang < 3.6 is missing Hainan in its list of
* GPUs, so we need to use the name of a similar GPU.
*/
-#if HAVE_LLVM < 0x0306
- case CHIP_HAINAN:
- gpu = "oland";
- break;
-#endif
default:
gpu = r600_get_llvm_processor_name(rscreen->family);
break;
@@ -600,32 +926,51 @@ static int r600_get_compute_param(struct pipe_screen *screen,
uint64_t *grid_size = ret;
grid_size[0] = 65535;
grid_size[1] = 65535;
- grid_size[2] = 1;
+ grid_size[2] = 65535;
}
return 3 * sizeof(uint64_t) ;
case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
if (ret) {
uint64_t *block_size = ret;
- block_size[0] = 256;
- block_size[1] = 256;
- block_size[2] = 256;
+ if (rscreen->chip_class >= SI && HAVE_LLVM >= 0x309 &&
+ ir_type == PIPE_SHADER_IR_TGSI) {
+ block_size[0] = 2048;
+ block_size[1] = 2048;
+ block_size[2] = 2048;
+ } else {
+ block_size[0] = 256;
+ block_size[1] = 256;
+ block_size[2] = 256;
+ }
}
return 3 * sizeof(uint64_t);
case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
if (ret) {
uint64_t *max_threads_per_block = ret;
- *max_threads_per_block = 256;
+ if (rscreen->chip_class >= SI && HAVE_LLVM >= 0x309 &&
+ ir_type == PIPE_SHADER_IR_TGSI)
+ *max_threads_per_block = 2048;
+ else
+ *max_threads_per_block = 256;
}
return sizeof(uint64_t);
+ case PIPE_COMPUTE_CAP_ADDRESS_BITS:
+ if (ret) {
+ uint32_t *address_bits = ret;
+ address_bits[0] = 32;
+ if (rscreen->chip_class >= SI)
+ address_bits[0] = 64;
+ }
+ return 1 * sizeof(uint32_t);
case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE:
if (ret) {
uint64_t *max_global_size = ret;
uint64_t max_mem_alloc_size;
- r600_get_compute_param(screen,
+ r600_get_compute_param(screen, ir_type,
PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE,
&max_mem_alloc_size);
@@ -636,8 +981,8 @@ static int r600_get_compute_param(struct pipe_screen *screen,
* 4 * MAX_MEM_ALLOC_SIZE.
*/
*max_global_size = MIN2(4 * max_mem_alloc_size,
- rscreen->info.gart_size +
- rscreen->info.vram_size);
+ MAX2(rscreen->info.gart_size,
+ rscreen->info.vram_size));
}
return sizeof(uint64_t);
@@ -661,24 +1006,21 @@ static int r600_get_compute_param(struct pipe_screen *screen,
if (ret) {
uint64_t *max_mem_alloc_size = ret;
- /* XXX: The limit in older kernels is 256 MB. We
- * should add a query here for newer kernels.
- */
- *max_mem_alloc_size = 256 * 1024 * 1024;
+ *max_mem_alloc_size = rscreen->info.max_alloc_size;
}
return sizeof(uint64_t);
case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY:
if (ret) {
uint32_t *max_clock_frequency = ret;
- *max_clock_frequency = rscreen->info.max_sclk;
+ *max_clock_frequency = rscreen->info.max_shader_clock;
}
return sizeof(uint32_t);
case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS:
if (ret) {
uint32_t *max_compute_units = ret;
- *max_compute_units = rscreen->info.max_compute_units;
+ *max_compute_units = rscreen->info.num_good_compute_units;
}
return sizeof(uint32_t);
@@ -696,6 +1038,16 @@ static int r600_get_compute_param(struct pipe_screen *screen,
*subgroup_size = r600_wavefront_size(rscreen->family);
}
return sizeof(uint32_t);
+ case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK:
+ if (ret) {
+ uint64_t *max_variable_threads_per_block = ret;
+ if (rscreen->chip_class >= SI && HAVE_LLVM >= 0x309 &&
+ ir_type == PIPE_SHADER_IR_TGSI)
+ *max_variable_threads_per_block = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
+ else
+ *max_variable_threads_per_block = 0;
+ }
+ return sizeof(uint64_t);
}
fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param);
@@ -707,188 +1059,116 @@ static uint64_t r600_get_timestamp(struct pipe_screen *screen)
struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
return 1000000 * rscreen->ws->query_value(rscreen->ws, RADEON_TIMESTAMP) /
- rscreen->info.r600_clock_crystal_freq;
-}
-
-static int r600_get_driver_query_info(struct pipe_screen *screen,
- unsigned index,
- struct pipe_driver_query_info *info)
-{
- struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
- struct pipe_driver_query_info list[] = {
- {"num-compilations", R600_QUERY_NUM_COMPILATIONS, {0}, PIPE_DRIVER_QUERY_TYPE_UINT64,
- PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE},
- {"num-shaders-created", R600_QUERY_NUM_SHADERS_CREATED, {0}, PIPE_DRIVER_QUERY_TYPE_UINT64,
- PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE},
- {"draw-calls", R600_QUERY_DRAW_CALLS, {0}},
- {"requested-VRAM", R600_QUERY_REQUESTED_VRAM, {rscreen->info.vram_size}, PIPE_DRIVER_QUERY_TYPE_BYTES},
- {"requested-GTT", R600_QUERY_REQUESTED_GTT, {rscreen->info.gart_size}, PIPE_DRIVER_QUERY_TYPE_BYTES},
- {"buffer-wait-time", R600_QUERY_BUFFER_WAIT_TIME, {0}, PIPE_DRIVER_QUERY_TYPE_MICROSECONDS,
- PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE},
- {"num-cs-flushes", R600_QUERY_NUM_CS_FLUSHES, {0}},
- {"num-bytes-moved", R600_QUERY_NUM_BYTES_MOVED, {0}, PIPE_DRIVER_QUERY_TYPE_BYTES,
- PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE},
- {"VRAM-usage", R600_QUERY_VRAM_USAGE, {rscreen->info.vram_size}, PIPE_DRIVER_QUERY_TYPE_BYTES},
- {"GTT-usage", R600_QUERY_GTT_USAGE, {rscreen->info.gart_size}, PIPE_DRIVER_QUERY_TYPE_BYTES},
- {"GPU-load", R600_QUERY_GPU_LOAD, {100}},
- {"temperature", R600_QUERY_GPU_TEMPERATURE, {100}},
- {"shader-clock", R600_QUERY_CURRENT_GPU_SCLK, {0}, PIPE_DRIVER_QUERY_TYPE_HZ},
- {"memory-clock", R600_QUERY_CURRENT_GPU_MCLK, {0}, PIPE_DRIVER_QUERY_TYPE_HZ},
- };
- unsigned num_queries;
-
- if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42)
- num_queries = Elements(list);
- else if (rscreen->info.drm_major == 3)
- num_queries = Elements(list) - 3;
- else
- num_queries = Elements(list) - 4;
-
- if (!info)
- return num_queries;
-
- if (index >= num_queries)
- return 0;
-
- *info = list[index];
- return 1;
+ rscreen->info.clock_crystal_freq;
}
static void r600_fence_reference(struct pipe_screen *screen,
- struct pipe_fence_handle **ptr,
- struct pipe_fence_handle *fence)
+ struct pipe_fence_handle **dst,
+ struct pipe_fence_handle *src)
{
- struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws;
-
- rws->fence_reference(ptr, fence);
+ struct radeon_winsys *ws = ((struct r600_common_screen*)screen)->ws;
+ struct r600_multi_fence **rdst = (struct r600_multi_fence **)dst;
+ struct r600_multi_fence *rsrc = (struct r600_multi_fence *)src;
+
+ if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) {
+ ws->fence_reference(&(*rdst)->gfx, NULL);
+ ws->fence_reference(&(*rdst)->sdma, NULL);
+ FREE(*rdst);
+ }
+ *rdst = rsrc;
}
static boolean r600_fence_finish(struct pipe_screen *screen,
+ struct pipe_context *ctx,
struct pipe_fence_handle *fence,
uint64_t timeout)
{
struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws;
+ struct r600_multi_fence *rfence = (struct r600_multi_fence *)fence;
+ struct r600_common_context *rctx =
+ ctx ? (struct r600_common_context*)ctx : NULL;
+ int64_t abs_timeout = os_time_get_absolute_timeout(timeout);
- return rws->fence_wait(rws, fence, timeout);
-}
+ if (rfence->sdma) {
+ if (!rws->fence_wait(rws, rfence->sdma, timeout))
+ return false;
-static bool r600_interpret_tiling(struct r600_common_screen *rscreen,
- uint32_t tiling_config)
-{
- switch ((tiling_config & 0xe) >> 1) {
- case 0:
- rscreen->tiling_info.num_channels = 1;
- break;
- case 1:
- rscreen->tiling_info.num_channels = 2;
- break;
- case 2:
- rscreen->tiling_info.num_channels = 4;
- break;
- case 3:
- rscreen->tiling_info.num_channels = 8;
- break;
- default:
- return false;
+ /* Recompute the timeout after waiting. */
+ if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
+ int64_t time = os_time_get_nano();
+ timeout = abs_timeout > time ? abs_timeout - time : 0;
+ }
}
- switch ((tiling_config & 0x30) >> 4) {
- case 0:
- rscreen->tiling_info.num_banks = 4;
- break;
- case 1:
- rscreen->tiling_info.num_banks = 8;
- break;
- default:
- return false;
+ if (!rfence->gfx)
+ return true;
- }
- switch ((tiling_config & 0xc0) >> 6) {
- case 0:
- rscreen->tiling_info.group_bytes = 256;
- break;
- case 1:
- rscreen->tiling_info.group_bytes = 512;
- break;
- default:
- return false;
- }
- return true;
-}
+ /* Flush the gfx IB if it hasn't been flushed yet. */
+ if (rctx &&
+ rfence->gfx_unflushed.ctx == rctx &&
+ rfence->gfx_unflushed.ib_index == rctx->num_gfx_cs_flushes) {
+ rctx->gfx.flush(rctx, timeout ? 0 : RADEON_FLUSH_ASYNC, NULL);
+ rfence->gfx_unflushed.ctx = NULL;
-static bool evergreen_interpret_tiling(struct r600_common_screen *rscreen,
- uint32_t tiling_config)
-{
- switch (tiling_config & 0xf) {
- case 0:
- rscreen->tiling_info.num_channels = 1;
- break;
- case 1:
- rscreen->tiling_info.num_channels = 2;
- break;
- case 2:
- rscreen->tiling_info.num_channels = 4;
- break;
- case 3:
- rscreen->tiling_info.num_channels = 8;
- break;
- default:
- return false;
- }
+ if (!timeout)
+ return false;
- switch ((tiling_config & 0xf0) >> 4) {
- case 0:
- rscreen->tiling_info.num_banks = 4;
- break;
- case 1:
- rscreen->tiling_info.num_banks = 8;
- break;
- case 2:
- rscreen->tiling_info.num_banks = 16;
- break;
- default:
- return false;
+ /* Recompute the timeout after all that. */
+ if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
+ int64_t time = os_time_get_nano();
+ timeout = abs_timeout > time ? abs_timeout - time : 0;
+ }
}
- switch ((tiling_config & 0xf00) >> 8) {
- case 0:
- rscreen->tiling_info.group_bytes = 256;
- break;
- case 1:
- rscreen->tiling_info.group_bytes = 512;
- break;
- default:
- return false;
- }
- return true;
+ return rws->fence_wait(rws, rfence->gfx, timeout);
}
-static bool r600_init_tiling(struct r600_common_screen *rscreen)
+static void r600_query_memory_info(struct pipe_screen *screen,
+ struct pipe_memory_info *info)
{
- uint32_t tiling_config = rscreen->info.r600_tiling_config;
-
- /* set default group bytes, overridden by tiling info ioctl */
- if (rscreen->chip_class <= R700) {
- rscreen->tiling_info.group_bytes = 256;
- } else {
- rscreen->tiling_info.group_bytes = 512;
- }
-
- if (!tiling_config)
- return true;
-
- if (rscreen->chip_class <= R700) {
- return r600_interpret_tiling(rscreen, tiling_config);
- } else {
- return evergreen_interpret_tiling(rscreen, tiling_config);
- }
+ struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
+ struct radeon_winsys *ws = rscreen->ws;
+ unsigned vram_usage, gtt_usage;
+
+ info->total_device_memory = rscreen->info.vram_size / 1024;
+ info->total_staging_memory = rscreen->info.gart_size / 1024;
+
+ /* The real TTM memory usage is somewhat random, because:
+ *
+ * 1) TTM delays freeing memory, because it can only free it after
+ * fences expire.
+ *
+ * 2) The memory usage can be really low if big VRAM evictions are
+ * taking place, but the real usage is well above the size of VRAM.
+ *
+ * Instead, return statistics of this process.
+ */
+ vram_usage = ws->query_value(ws, RADEON_REQUESTED_VRAM_MEMORY) / 1024;
+ gtt_usage = ws->query_value(ws, RADEON_REQUESTED_GTT_MEMORY) / 1024;
+
+ info->avail_device_memory =
+ vram_usage <= info->total_device_memory ?
+ info->total_device_memory - vram_usage : 0;
+ info->avail_staging_memory =
+ gtt_usage <= info->total_staging_memory ?
+ info->total_staging_memory - gtt_usage : 0;
+
+ info->device_memory_evicted =
+ ws->query_value(ws, RADEON_NUM_BYTES_MOVED) / 1024;
+
+ if (rscreen->info.drm_major == 3 && rscreen->info.drm_minor >= 4)
+ info->nr_device_memory_evictions =
+ ws->query_value(ws, RADEON_NUM_EVICTIONS);
+ else
+ /* Just return the number of evicted 64KB pages. */
+ info->nr_device_memory_evictions = info->device_memory_evicted / 64;
}
struct pipe_resource *r600_resource_create_common(struct pipe_screen *screen,
const struct pipe_resource *templ)
{
if (templ->target == PIPE_BUFFER) {
- return r600_buffer_create(screen, templ, 4096);
+ return r600_buffer_create(screen, templ, 256);
} else {
return r600_texture_create(screen, templ);
}
@@ -897,10 +1177,15 @@ struct pipe_resource *r600_resource_create_common(struct pipe_screen *screen,
bool r600_common_screen_init(struct r600_common_screen *rscreen,
struct radeon_winsys *ws)
{
- char llvm_string[32] = {};
+ char llvm_string[32] = {}, kernel_version[128] = {};
+ struct utsname uname_data;
ws->query_info(ws, &rscreen->info);
+ if (uname(&uname_data) == 0)
+ snprintf(kernel_version, sizeof(kernel_version),
+ " / %s", uname_data.release);
+
#if HAVE_LLVM
snprintf(llvm_string, sizeof(llvm_string),
", LLVM %i.%i.%i", (HAVE_LLVM >> 8) & 0xff,
@@ -908,22 +1193,22 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen,
#endif
snprintf(rscreen->renderer_string, sizeof(rscreen->renderer_string),
- "%s (DRM %i.%i.%i%s)",
+ "%s (DRM %i.%i.%i%s%s)",
r600_get_chip_name(rscreen), rscreen->info.drm_major,
rscreen->info.drm_minor, rscreen->info.drm_patchlevel,
- llvm_string);
+ kernel_version, llvm_string);
rscreen->b.get_name = r600_get_name;
rscreen->b.get_vendor = r600_get_vendor;
rscreen->b.get_device_vendor = r600_get_device_vendor;
rscreen->b.get_compute_param = r600_get_compute_param;
rscreen->b.get_paramf = r600_get_paramf;
- rscreen->b.get_driver_query_info = r600_get_driver_query_info;
rscreen->b.get_timestamp = r600_get_timestamp;
rscreen->b.fence_finish = r600_fence_finish;
rscreen->b.fence_reference = r600_fence_reference;
rscreen->b.resource_destroy = u_resource_destroy_vtbl;
rscreen->b.resource_from_user_memory = r600_buffer_from_user_memory;
+ rscreen->b.query_memory_info = r600_query_memory_info;
if (rscreen->info.has_uvd) {
rscreen->b.get_video_param = rvid_get_video_param;
@@ -934,109 +1219,115 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen,
}
r600_init_screen_texture_functions(rscreen);
+ r600_init_screen_query_functions(rscreen);
rscreen->ws = ws;
rscreen->family = rscreen->info.family;
rscreen->chip_class = rscreen->info.chip_class;
rscreen->debug_flags = debug_get_flags_option("R600_DEBUG", common_debug_options, 0);
- if (!r600_init_tiling(rscreen)) {
- return false;
+ slab_create_parent(&rscreen->pool_transfers, sizeof(struct r600_transfer), 64);
+
+ rscreen->force_aniso = MIN2(16, debug_get_num_option("R600_TEX_ANISO", -1));
+ if (rscreen->force_aniso >= 0) {
+ printf("radeon: Forcing anisotropy filter to %ix\n",
+ /* round down to a power of two */
+ 1 << util_logbase2(rscreen->force_aniso));
}
+
util_format_s3tc_init();
pipe_mutex_init(rscreen->aux_context_lock);
pipe_mutex_init(rscreen->gpu_load_mutex);
- if (((rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 28) ||
- rscreen->info.drm_major == 3) &&
- (rscreen->debug_flags & DBG_TRACE_CS)) {
- rscreen->trace_bo = (struct r600_resource*)pipe_buffer_create(&rscreen->b,
- PIPE_BIND_CUSTOM,
- PIPE_USAGE_STAGING,
- 4096);
- if (rscreen->trace_bo) {
- rscreen->trace_ptr = rscreen->ws->buffer_map(rscreen->trace_bo->cs_buf, NULL,
- PIPE_TRANSFER_UNSYNCHRONIZED);
- }
- }
-
if (rscreen->debug_flags & DBG_INFO) {
printf("pci_id = 0x%x\n", rscreen->info.pci_id);
- printf("family = %i\n", rscreen->info.family);
+ printf("family = %i (%s)\n", rscreen->info.family,
+ r600_get_chip_name(rscreen));
printf("chip_class = %i\n", rscreen->info.chip_class);
- printf("gart_size = %i MB\n", (int)(rscreen->info.gart_size >> 20));
- printf("vram_size = %i MB\n", (int)(rscreen->info.vram_size >> 20));
- printf("max_sclk = %i\n", rscreen->info.max_sclk);
- printf("max_compute_units = %i\n", rscreen->info.max_compute_units);
- printf("max_se = %i\n", rscreen->info.max_se);
- printf("max_sh_per_se = %i\n", rscreen->info.max_sh_per_se);
- printf("drm = %i.%i.%i\n", rscreen->info.drm_major,
- rscreen->info.drm_minor, rscreen->info.drm_patchlevel);
+ printf("gart_size = %i MB\n", (int)DIV_ROUND_UP(rscreen->info.gart_size, 1024*1024));
+ printf("vram_size = %i MB\n", (int)DIV_ROUND_UP(rscreen->info.vram_size, 1024*1024));
+ printf("max_alloc_size = %i MB\n",
+ (int)DIV_ROUND_UP(rscreen->info.max_alloc_size, 1024*1024));
+ printf("has_virtual_memory = %i\n", rscreen->info.has_virtual_memory);
+ printf("gfx_ib_pad_with_type2 = %i\n", rscreen->info.gfx_ib_pad_with_type2);
+ printf("has_sdma = %i\n", rscreen->info.has_sdma);
printf("has_uvd = %i\n", rscreen->info.has_uvd);
+ printf("me_fw_version = %i\n", rscreen->info.me_fw_version);
+ printf("pfp_fw_version = %i\n", rscreen->info.pfp_fw_version);
+ printf("ce_fw_version = %i\n", rscreen->info.ce_fw_version);
printf("vce_fw_version = %i\n", rscreen->info.vce_fw_version);
- printf("r600_num_backends = %i\n", rscreen->info.r600_num_backends);
- printf("r600_clock_crystal_freq = %i\n", rscreen->info.r600_clock_crystal_freq);
- printf("r600_tiling_config = 0x%x\n", rscreen->info.r600_tiling_config);
- printf("r600_num_tile_pipes = %i\n", rscreen->info.r600_num_tile_pipes);
- printf("r600_max_pipes = %i\n", rscreen->info.r600_max_pipes);
- printf("r600_virtual_address = %i\n", rscreen->info.r600_virtual_address);
- printf("r600_has_dma = %i\n", rscreen->info.r600_has_dma);
- printf("r600_backend_map = %i\n", rscreen->info.r600_backend_map);
- printf("r600_backend_map_valid = %i\n", rscreen->info.r600_backend_map_valid);
- printf("si_tile_mode_array_valid = %i\n", rscreen->info.si_tile_mode_array_valid);
- printf("cik_macrotile_mode_array_valid = %i\n", rscreen->info.cik_macrotile_mode_array_valid);
+ printf("vce_harvest_config = %i\n", rscreen->info.vce_harvest_config);
+ printf("clock_crystal_freq = %i\n", rscreen->info.clock_crystal_freq);
+ printf("drm = %i.%i.%i\n", rscreen->info.drm_major,
+ rscreen->info.drm_minor, rscreen->info.drm_patchlevel);
+ printf("has_userptr = %i\n", rscreen->info.has_userptr);
+
+ printf("r600_max_quad_pipes = %i\n", rscreen->info.r600_max_quad_pipes);
+ printf("max_shader_clock = %i\n", rscreen->info.max_shader_clock);
+ printf("num_good_compute_units = %i\n", rscreen->info.num_good_compute_units);
+ printf("max_se = %i\n", rscreen->info.max_se);
+ printf("max_sh_per_se = %i\n", rscreen->info.max_sh_per_se);
+
+ printf("r600_gb_backend_map = %i\n", rscreen->info.r600_gb_backend_map);
+ printf("r600_gb_backend_map_valid = %i\n", rscreen->info.r600_gb_backend_map_valid);
+ printf("r600_num_banks = %i\n", rscreen->info.r600_num_banks);
+ printf("num_render_backends = %i\n", rscreen->info.num_render_backends);
+ printf("num_tile_pipes = %i\n", rscreen->info.num_tile_pipes);
+ printf("pipe_interleave_bytes = %i\n", rscreen->info.pipe_interleave_bytes);
}
return true;
}
void r600_destroy_common_screen(struct r600_common_screen *rscreen)
{
+ r600_perfcounters_destroy(rscreen);
r600_gpu_load_kill_thread(rscreen);
pipe_mutex_destroy(rscreen->gpu_load_mutex);
pipe_mutex_destroy(rscreen->aux_context_lock);
rscreen->aux_context->destroy(rscreen->aux_context);
- if (rscreen->trace_bo)
- pipe_resource_reference((struct pipe_resource**)&rscreen->trace_bo, NULL);
+ slab_destroy_parent(&rscreen->pool_transfers);
rscreen->ws->destroy(rscreen->ws);
FREE(rscreen);
}
bool r600_can_dump_shader(struct r600_common_screen *rscreen,
- const struct tgsi_token *tokens)
+ unsigned processor)
{
- /* Compute shader don't have tgsi_tokens */
- if (!tokens)
- return (rscreen->debug_flags & DBG_CS) != 0;
-
- switch (tgsi_get_processor_type(tokens)) {
- case TGSI_PROCESSOR_VERTEX:
+ switch (processor) {
+ case PIPE_SHADER_VERTEX:
return (rscreen->debug_flags & DBG_VS) != 0;
- case TGSI_PROCESSOR_TESS_CTRL:
+ case PIPE_SHADER_TESS_CTRL:
return (rscreen->debug_flags & DBG_TCS) != 0;
- case TGSI_PROCESSOR_TESS_EVAL:
+ case PIPE_SHADER_TESS_EVAL:
return (rscreen->debug_flags & DBG_TES) != 0;
- case TGSI_PROCESSOR_GEOMETRY:
+ case PIPE_SHADER_GEOMETRY:
return (rscreen->debug_flags & DBG_GS) != 0;
- case TGSI_PROCESSOR_FRAGMENT:
+ case PIPE_SHADER_FRAGMENT:
return (rscreen->debug_flags & DBG_PS) != 0;
- case TGSI_PROCESSOR_COMPUTE:
+ case PIPE_SHADER_COMPUTE:
return (rscreen->debug_flags & DBG_CS) != 0;
default:
return false;
}
}
+bool r600_extra_shader_checks(struct r600_common_screen *rscreen, unsigned processor)
+{
+ return (rscreen->debug_flags & DBG_CHECK_IR) ||
+ r600_can_dump_shader(rscreen, processor);
+}
+
void r600_screen_clear_buffer(struct r600_common_screen *rscreen, struct pipe_resource *dst,
- unsigned offset, unsigned size, unsigned value,
- bool is_framebuffer)
+ uint64_t offset, uint64_t size, unsigned value,
+ enum r600_coherency coher)
{
struct r600_common_context *rctx = (struct r600_common_context*)rscreen->aux_context;
pipe_mutex_lock(rscreen->aux_context_lock);
- rctx->clear_buffer(&rctx->b, dst, offset, size, value, is_framebuffer);
+ rctx->clear_buffer(&rctx->b, dst, offset, size, value, coher);
rscreen->aux_context->flush(rscreen->aux_context, NULL, 0);
pipe_mutex_unlock(rscreen->aux_context_lock);
}
diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.h b/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.h
index 29db1cc4e..86772c0af 100644
--- a/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -39,31 +39,22 @@
#include "util/u_blitter.h"
#include "util/list.h"
#include "util/u_range.h"
-#include "util/u_slab.h"
+#include "util/slab.h"
#include "util/u_suballoc.h"
#include "util/u_transfer.h"
+#define ATI_VENDOR_ID 0x1002
+
#define R600_RESOURCE_FLAG_TRANSFER (PIPE_RESOURCE_FLAG_DRV_PRIV << 0)
#define R600_RESOURCE_FLAG_FLUSHED_DEPTH (PIPE_RESOURCE_FLAG_DRV_PRIV << 1)
#define R600_RESOURCE_FLAG_FORCE_TILING (PIPE_RESOURCE_FLAG_DRV_PRIV << 2)
-
-#define R600_QUERY_DRAW_CALLS (PIPE_QUERY_DRIVER_SPECIFIC + 0)
-#define R600_QUERY_REQUESTED_VRAM (PIPE_QUERY_DRIVER_SPECIFIC + 1)
-#define R600_QUERY_REQUESTED_GTT (PIPE_QUERY_DRIVER_SPECIFIC + 2)
-#define R600_QUERY_BUFFER_WAIT_TIME (PIPE_QUERY_DRIVER_SPECIFIC + 3)
-#define R600_QUERY_NUM_CS_FLUSHES (PIPE_QUERY_DRIVER_SPECIFIC + 4)
-#define R600_QUERY_NUM_BYTES_MOVED (PIPE_QUERY_DRIVER_SPECIFIC + 5)
-#define R600_QUERY_VRAM_USAGE (PIPE_QUERY_DRIVER_SPECIFIC + 6)
-#define R600_QUERY_GTT_USAGE (PIPE_QUERY_DRIVER_SPECIFIC + 7)
-#define R600_QUERY_GPU_TEMPERATURE (PIPE_QUERY_DRIVER_SPECIFIC + 8)
-#define R600_QUERY_CURRENT_GPU_SCLK (PIPE_QUERY_DRIVER_SPECIFIC + 9)
-#define R600_QUERY_CURRENT_GPU_MCLK (PIPE_QUERY_DRIVER_SPECIFIC + 10)
-#define R600_QUERY_GPU_LOAD (PIPE_QUERY_DRIVER_SPECIFIC + 11)
-#define R600_QUERY_NUM_COMPILATIONS (PIPE_QUERY_DRIVER_SPECIFIC + 12)
-#define R600_QUERY_NUM_SHADERS_CREATED (PIPE_QUERY_DRIVER_SPECIFIC + 13)
+#define R600_RESOURCE_FLAG_DISABLE_DCC (PIPE_RESOURCE_FLAG_DRV_PRIV << 3)
#define R600_CONTEXT_STREAMOUT_FLUSH (1u << 0)
-#define R600_CONTEXT_PRIVATE_FLAG (1u << 1)
+/* Pipeline & streamout query controls. */
+#define R600_CONTEXT_START_PIPELINE_STATS (1u << 1)
+#define R600_CONTEXT_STOP_PIPELINE_STATS (1u << 2)
+#define R600_CONTEXT_PRIVATE_FLAG (1u << 3)
/* special primitive types */
#define R600_PRIM_RECTANGLE_LIST PIPE_PRIM_MAX
@@ -71,10 +62,10 @@
/* Debug flags. */
/* logging */
#define DBG_TEX (1 << 0)
-#define DBG_TEXMIP (1 << 1)
+/* gap - reuse */
#define DBG_COMPUTE (1 << 2)
#define DBG_VM (1 << 3)
-#define DBG_TRACE_CS (1 << 4)
+/* gap - reuse */
/* shader logging */
#define DBG_FS (1 << 5)
#define DBG_VS (1 << 6)
@@ -86,6 +77,10 @@
#define DBG_NO_IR (1 << 12)
#define DBG_NO_TGSI (1 << 13)
#define DBG_NO_ASM (1 << 14)
+#define DBG_PREOPT_IR (1 << 15)
+#define DBG_CHECK_IR (1 << 16)
+/* gaps */
+#define DBG_TEST_DMA (1 << 20)
/* Bits 21-31 are reserved for the r600g driver. */
/* features */
#define DBG_NO_ASYNC_DMA (1llu << 32)
@@ -98,13 +93,40 @@
#define DBG_PRECOMPILE (1llu << 39)
#define DBG_INFO (1llu << 40)
#define DBG_NO_WC (1llu << 41)
+#define DBG_CHECK_VM (1llu << 42)
+#define DBG_NO_DCC (1llu << 43)
+#define DBG_NO_DCC_CLEAR (1llu << 44)
+#define DBG_NO_RB_PLUS (1llu << 45)
+#define DBG_SI_SCHED (1llu << 46)
+#define DBG_MONOLITHIC_SHADERS (1llu << 47)
+#define DBG_NO_CE (1llu << 48)
+#define DBG_UNSAFE_MATH (1llu << 49)
+#define DBG_NO_DCC_FB (1llu << 50)
#define R600_MAP_BUFFER_ALIGNMENT 64
+#define R600_MAX_VIEWPORTS 16
+
+#define SI_MAX_VARIABLE_THREADS_PER_BLOCK 1024
+
+enum r600_coherency {
+ R600_COHERENCY_NONE, /* no cache flushes needed */
+ R600_COHERENCY_SHADER,
+ R600_COHERENCY_CB_META,
+};
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+#define R600_BIG_ENDIAN 1
+#else
+#define R600_BIG_ENDIAN 0
+#endif
struct r600_common_context;
+struct r600_perfcounters;
+struct tgsi_shader_info;
+struct r600_qbo_state;
struct radeon_shader_reloc {
- char *name;
+ char name[32];
uint64_t offset;
};
@@ -137,18 +159,31 @@ struct radeon_shader_binary {
/** Disassembled shader in a string. */
char *disasm_string;
+ char *llvm_ir_string;
};
+void radeon_shader_binary_init(struct radeon_shader_binary *b);
+void radeon_shader_binary_clean(struct radeon_shader_binary *b);
+
+/* Only 32-bit buffer allocations are supported, gallium doesn't support more
+ * at the moment.
+ */
struct r600_resource {
struct u_resource b;
/* Winsys objects. */
struct pb_buffer *buf;
- struct radeon_winsys_cs_handle *cs_buf;
uint64_t gpu_address;
+ /* Memory usage if the buffer placement is optimal. */
+ uint64_t vram_usage;
+ uint64_t gart_usage;
- /* Resource state. */
+ /* Resource properties. */
+ uint64_t bo_size;
+ unsigned bo_alignment;
enum radeon_bo_domain domains;
+ enum radeon_bo_flag flags;
+ unsigned bind_history;
/* The buffer range which is initialized (with a write transfer,
* streamout, DMA, or as a random access target). The rest of
@@ -171,6 +206,10 @@ struct r600_resource {
* use TC L2.
*/
bool TC_L2_dirty;
+
+ /* Whether the resource has been exported via resource_get_handle. */
+ bool is_shared;
+ unsigned external_usage; /* PIPE_HANDLE_USAGE_* */
};
struct r600_transfer {
@@ -180,51 +219,107 @@ struct r600_transfer {
};
struct r600_fmask_info {
- unsigned offset;
- unsigned size;
+ uint64_t offset;
+ uint64_t size;
unsigned alignment;
- unsigned pitch;
+ unsigned pitch_in_pixels;
unsigned bank_height;
unsigned slice_tile_max;
unsigned tile_mode_index;
};
struct r600_cmask_info {
- unsigned offset;
- unsigned size;
+ uint64_t offset;
+ uint64_t size;
unsigned alignment;
+ unsigned pitch;
+ unsigned height;
+ unsigned xalign;
+ unsigned yalign;
unsigned slice_tile_max;
unsigned base_address_reg;
};
+struct r600_htile_info {
+ unsigned pitch;
+ unsigned height;
+ unsigned xalign;
+ unsigned yalign;
+ unsigned alignment;
+};
+
struct r600_texture {
struct r600_resource resource;
- unsigned size;
- unsigned pitch_override;
+ uint64_t size;
+ unsigned num_level0_transfers;
+ enum pipe_format db_render_format;
bool is_depth;
+ bool db_compatible;
+ bool can_sample_z;
+ bool can_sample_s;
unsigned dirty_level_mask; /* each bit says if that mipmap is compressed */
+ unsigned stencil_dirty_level_mask; /* each bit says if that mipmap is compressed */
struct r600_texture *flushed_depth_texture;
- boolean is_flushing_texture;
struct radeon_surf surface;
/* Colorbuffer compression and fast clear. */
struct r600_fmask_info fmask;
struct r600_cmask_info cmask;
struct r600_resource *cmask_buffer;
+ uint64_t dcc_offset; /* 0 = disabled */
unsigned cb_color_info; /* fast clear enable bit */
unsigned color_clear_value[2];
+ unsigned last_msaa_resolve_target_micro_mode;
/* Depth buffer compression and fast clear. */
+ struct r600_htile_info htile;
struct r600_resource *htile_buffer;
+ bool tc_compatible_htile;
bool depth_cleared; /* if it was cleared at least once */
float depth_clear_value;
+ bool stencil_cleared; /* if it was cleared at least once */
+ uint8_t stencil_clear_value;
bool non_disp_tiling; /* R600-Cayman only */
+
+ /* Whether the texture is a displayable back buffer and needs DCC
+ * decompression, which is expensive. Therefore, it's enabled only
+ * if statistics suggest that it will pay off and it's allocated
+ * separately. It can't be bound as a sampler by apps. Limited to
+ * target == 2D and last_level == 0. If enabled, dcc_offset contains
+ * the absolute GPUVM address, not the relative one.
+ */
+ struct r600_resource *dcc_separate_buffer;
+ /* When DCC is temporarily disabled, the separate buffer is here. */
+ struct r600_resource *last_dcc_separate_buffer;
+ /* We need to track DCC dirtiness, because st/dri usually calls
+ * flush_resource twice per frame (not a bug) and we don't wanna
+ * decompress DCC twice. Also, the dirty tracking must be done even
+ * if DCC isn't used, because it's required by the DCC usage analysis
+ * for a possible future enablement.
+ */
+ bool separate_dcc_dirty;
+ /* Statistics gathering for the DCC enablement heuristic. */
+ bool dcc_gather_statistics;
+ /* Estimate of how much this color buffer is written to in units of
+ * full-screen draws: ps_invocations / (width * height)
+ * Shader kills, late Z, and blending with trivial discards make it
+ * inaccurate (we need to count CB updates, not PS invocations).
+ */
+ unsigned ps_draw_ratio;
+ /* The number of clears since the last DCC usage analysis. */
+ unsigned num_slow_clears;
+
+ /* Counter that should be non-zero if the texture is bound to a
+ * framebuffer. Implemented in radeonsi only.
+ */
+ uint32_t framebuffers_bound;
};
struct r600_surface {
struct pipe_surface base;
+ const struct radeon_surf_level *level_info;
bool color_initialized;
bool depth_initialized;
@@ -232,6 +327,8 @@ struct r600_surface {
/* Misc. color flags. */
bool alphatest_bypass;
bool export_16bpc;
+ bool color_is_int8;
+ bool color_is_int10;
/* Color registers. */
unsigned cb_color_info;
@@ -247,6 +344,10 @@ struct r600_surface {
unsigned cb_color_fmask_slice; /* EG and later */
unsigned cb_color_cmask; /* CB_COLORn_TILE (r600 only) */
unsigned cb_color_mask; /* R600 only */
+ unsigned spi_shader_col_format; /* SI+, no blending, no alpha-to-coverage. */
+ unsigned spi_shader_col_format_alpha; /* SI+, alpha-to-coverage */
+ unsigned spi_shader_col_format_blend; /* SI+, blending without alpha. */
+ unsigned spi_shader_col_format_blend_alpha; /* SI+, blending with alpha. */
struct r600_resource *cb_buffer_fmask; /* Used for FMASK relocations. R600 only */
struct r600_resource *cb_buffer_cmask; /* Used for CMASK relocations. R600 only */
@@ -263,13 +364,6 @@ struct r600_surface {
unsigned db_htile_surface;
unsigned db_htile_data_base;
unsigned db_preload_control; /* EG and later */
- unsigned pa_su_poly_offset_db_fmt_cntl;
-};
-
-struct r600_tiling_info {
- unsigned num_channels;
- unsigned num_banks;
- unsigned group_bytes;
};
struct r600_common_screen {
@@ -278,20 +372,20 @@ struct r600_common_screen {
enum radeon_family family;
enum chip_class chip_class;
struct radeon_info info;
- struct r600_tiling_info tiling_info;
uint64_t debug_flags;
bool has_cp_dma;
bool has_streamout;
+ struct slab_parent_pool pool_transfers;
+
+ /* Texture filter settings. */
+ int force_aniso; /* -1 = disabled */
+
/* Auxiliary context. Mainly used to initialize resources.
* It must be locked prior to using and flushed before unlocking. */
struct pipe_context *aux_context;
pipe_mutex aux_context_lock;
- struct r600_resource *trace_bo;
- uint32_t *trace_ptr;
- unsigned cs_count;
-
/* This must be in the screen, because UE4 uses one context for
* compilation and another one for rendering.
*/
@@ -308,7 +402,49 @@ struct r600_common_screen {
unsigned gpu_load_counter_idle;
volatile unsigned gpu_load_stop_thread; /* bool */
- char renderer_string[64];
+ char renderer_string[100];
+
+ /* Performance counters. */
+ struct r600_perfcounters *perfcounters;
+
+ /* If pipe_screen wants to re-emit the framebuffer state of all
+ * contexts, it should atomically increment this. Each context will
+ * compare this with its own last known value of the counter before
+ * drawing and re-emit the framebuffer state accordingly.
+ */
+ unsigned dirty_fb_counter;
+
+ /* Atomically increment this counter when an existing texture's
+ * metadata is enabled or disabled in a way that requires changing
+ * contexts' compressed texture binding masks.
+ */
+ unsigned compressed_colortex_counter;
+
+ /* Atomically increment this counter when an existing texture's
+ * backing buffer or tile mode parameters have changed that requires
+ * recomputation of shader descriptors.
+ */
+ unsigned dirty_tex_descriptor_counter;
+
+ struct {
+ /* Context flags to set so that all writes from earlier jobs
+ * in the CP are seen by L2 clients.
+ */
+ unsigned cp_to_L2;
+
+ /* Context flags to set so that all writes from earlier
+ * compute jobs are seen by L2 clients.
+ */
+ unsigned compute_to_L2;
+ } barrier_flags;
+
+ void (*query_opaque_metadata)(struct r600_common_screen *rscreen,
+ struct r600_texture *rtex,
+ struct radeon_bo_metadata *md);
+
+ void (*apply_opaque_metadata)(struct r600_common_screen *rscreen,
+ struct r600_texture *rtex,
+ struct radeon_bo_metadata *md);
};
/* This encapsulates a state or an operation which can emitted into the GPU
@@ -316,8 +452,7 @@ struct r600_common_screen {
struct r600_atom {
void (*emit)(struct r600_common_context *ctx, struct r600_atom *state);
unsigned num_dw;
- unsigned short id; /* used by r600 only */
- bool dirty;
+ unsigned short id;
};
struct r600_so_target {
@@ -358,16 +493,40 @@ struct r600_streamout {
int num_prims_gen_queries;
};
+struct r600_signed_scissor {
+ int minx;
+ int miny;
+ int maxx;
+ int maxy;
+};
+
+struct r600_scissors {
+ struct r600_atom atom;
+ unsigned dirty_mask;
+ struct pipe_scissor_state states[R600_MAX_VIEWPORTS];
+};
+
+struct r600_viewports {
+ struct r600_atom atom;
+ unsigned dirty_mask;
+ unsigned depth_range_dirty_mask;
+ struct pipe_viewport_state states[R600_MAX_VIEWPORTS];
+ struct r600_signed_scissor as_scissor[R600_MAX_VIEWPORTS];
+};
+
struct r600_ring {
struct radeon_winsys_cs *cs;
- bool flushing;
void (*flush)(void *ctx, unsigned flags,
struct pipe_fence_handle **fence);
};
-struct r600_rings {
- struct r600_ring gfx;
- struct r600_ring dma;
+/* Saved CS data for debugging features. */
+struct radeon_saved_cs {
+ uint32_t *ib;
+ unsigned num_dw;
+
+ struct radeon_bo_list_item *bo_list;
+ unsigned bo_count;
};
struct r600_common_context {
@@ -378,13 +537,20 @@ struct r600_common_context {
struct radeon_winsys_ctx *ctx;
enum radeon_family family;
enum chip_class chip_class;
- struct r600_rings rings;
+ struct r600_ring gfx;
+ struct r600_ring dma;
+ struct pipe_fence_handle *last_gfx_fence;
+ struct pipe_fence_handle *last_sdma_fence;
+ unsigned num_gfx_cs_flushes;
unsigned initial_gfx_cs_size;
unsigned gpu_reset_counter;
+ unsigned last_dirty_fb_counter;
+ unsigned last_compressed_colortex_counter;
+ unsigned last_dirty_tex_descriptor_counter;
struct u_upload_mgr *uploader;
- struct u_suballocator *allocator_so_filled_size;
- struct util_slab_mempool pool_transfers;
+ struct u_suballocator *allocator_zeroed_memory;
+ struct slab_child_pool pool_transfers;
/* Current unaccounted memory usage. */
uint64_t vram;
@@ -392,38 +558,43 @@ struct r600_common_context {
/* States. */
struct r600_streamout streamout;
+ struct r600_scissors scissors;
+ struct r600_viewports viewports;
+ bool scissor_enabled;
+ bool clip_halfz;
+ bool vs_writes_viewport_index;
+ bool vs_disables_clipping_viewport;
/* Additional context states. */
unsigned flags; /* flush flags */
/* Queries. */
- /* The list of active queries. Only one query of each type can be active. */
+ /* Maintain the list of active queries for pausing between IBs. */
int num_occlusion_queries;
- /* Keep track of non-timer queries, because they should be suspended
- * during context flushing.
- * The timer queries (TIME_ELAPSED) shouldn't be suspended for blits,
- * but they should be suspended between IBs. */
- struct list_head active_nontimer_queries;
- struct list_head active_timer_queries;
- unsigned num_cs_dw_nontimer_queries_suspend;
- unsigned num_cs_dw_timer_queries_suspend;
- /* If queries have been suspended. */
- bool queries_suspended_for_flush;
+ int num_perfect_occlusion_queries;
+ struct list_head active_queries;
+ unsigned num_cs_dw_queries_suspend;
/* Additional hardware info. */
unsigned backend_mask;
unsigned max_db; /* for OQ */
/* Misc stats. */
unsigned num_draw_calls;
+ unsigned num_spill_draw_calls;
+ unsigned num_compute_calls;
+ unsigned num_spill_compute_calls;
+ unsigned num_dma_calls;
+ unsigned num_vs_flushes;
+ unsigned num_ps_flushes;
+ unsigned num_cs_flushes;
+ uint64_t num_alloc_tex_transfer_bytes;
+ unsigned last_tex_ps_draw_ratio; /* for query */
/* Render condition. */
- struct pipe_query *current_render_cond;
- unsigned current_render_cond_mode;
- boolean current_render_cond_cond;
- boolean predicate_drawing;
- /* For context flushing. */
- struct pipe_query *saved_render_cond;
- boolean saved_render_cond_cond;
- unsigned saved_render_cond_mode;
+ struct r600_atom render_cond_atom;
+ struct pipe_query *render_cond;
+ unsigned render_cond_mode;
+ bool render_cond_invert;
+ bool render_cond_force_off; /* for u_blitter */
/* MSAA sample locations.
* The first index is the sample index.
@@ -434,10 +605,29 @@ struct r600_common_context {
float sample_locations_8x[8][2];
float sample_locations_16x[16][2];
- /* The list of all texture buffer objects in this context.
- * This list is walked when a buffer is invalidated/reallocated and
- * the GPU addresses are updated. */
- struct list_head texture_buffers;
+ /* Statistics gathering for the DCC enablement heuristic. It can't be
+ * in r600_texture because r600_texture can be shared by multiple
+ * contexts. This is for back buffers only. We shouldn't get too many
+ * of those.
+ *
+ * X11 DRI3 rotates among a finite set of back buffers. They should
+ * all fit in this array. If they don't, separate DCC might never be
+ * enabled by DCC stat gathering.
+ */
+ struct {
+ struct r600_texture *tex;
+ /* Query queue: 0 = usually active, 1 = waiting, 2 = readback. */
+ struct pipe_query *ps_stats[3];
+ /* If all slots are used and another slot is needed,
+ * the least recently used slot is evicted based on this. */
+ int64_t last_use_timestamp;
+ bool query_active;
+ } dcc_stats[5];
+
+ struct pipe_debug_callback debug;
+ struct pipe_device_reset_callback device_reset_callback;
+
+ void *query_result_shader;
/* Copy one resource to another using async DMA. */
void (*dma_copy)(struct pipe_context *ctx,
@@ -449,8 +639,8 @@ struct r600_common_context {
const struct pipe_box *src_box);
void (*clear_buffer)(struct pipe_context *ctx, struct pipe_resource *dst,
- unsigned offset, unsigned size, unsigned value,
- bool is_framebuffer);
+ uint64_t offset, uint64_t size, unsigned value,
+ enum r600_coherency coher);
void (*blit_decompress_depth)(struct pipe_context *ctx,
struct r600_texture *texture,
@@ -459,6 +649,9 @@ struct r600_common_context {
unsigned first_layer, unsigned last_layer,
unsigned first_sample, unsigned last_sample);
+ void (*decompress_dcc)(struct pipe_context *ctx,
+ struct r600_texture *rtex);
+
/* Reallocate the buffer and update all resource bindings where
* the buffer is bound, including all resource descriptors. */
void (*invalidate_buffer)(struct pipe_context *ctx, struct pipe_resource *buf);
@@ -466,34 +659,58 @@ struct r600_common_context {
/* Enable or disable occlusion queries. */
void (*set_occlusion_query_state)(struct pipe_context *ctx, bool enable);
+ void (*save_qbo_state)(struct pipe_context *ctx, struct r600_qbo_state *st);
+
/* This ensures there is enough space in the command stream. */
void (*need_gfx_cs_space)(struct pipe_context *ctx, unsigned num_dw,
bool include_draw_vbo);
void (*set_atom_dirty)(struct r600_common_context *ctx,
struct r600_atom *atom, bool dirty);
+
+ void (*check_vm_faults)(struct r600_common_context *ctx,
+ struct radeon_saved_cs *saved,
+ enum ring_type ring);
};
/* r600_buffer.c */
-boolean r600_rings_is_buffer_referenced(struct r600_common_context *ctx,
- struct radeon_winsys_cs_handle *buf,
- enum radeon_bo_usage usage);
+bool r600_rings_is_buffer_referenced(struct r600_common_context *ctx,
+ struct pb_buffer *buf,
+ enum radeon_bo_usage usage);
void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
struct r600_resource *resource,
unsigned usage);
-bool r600_init_resource(struct r600_common_screen *rscreen,
- struct r600_resource *res,
- unsigned size, unsigned alignment,
- bool use_reusable_pool);
+void r600_buffer_subdata(struct pipe_context *ctx,
+ struct pipe_resource *buffer,
+ unsigned usage, unsigned offset,
+ unsigned size, const void *data);
+void r600_init_resource_fields(struct r600_common_screen *rscreen,
+ struct r600_resource *res,
+ uint64_t size, unsigned alignment);
+bool r600_alloc_resource(struct r600_common_screen *rscreen,
+ struct r600_resource *res);
struct pipe_resource *r600_buffer_create(struct pipe_screen *screen,
const struct pipe_resource *templ,
unsigned alignment);
+struct pipe_resource * r600_aligned_buffer_create(struct pipe_screen *screen,
+ unsigned bind,
+ unsigned usage,
+ unsigned size,
+ unsigned alignment);
struct pipe_resource *
r600_buffer_from_user_memory(struct pipe_screen *screen,
const struct pipe_resource *templ,
void *user_memory);
+void
+r600_invalidate_resource(struct pipe_context *ctx,
+ struct pipe_resource *resource);
/* r600_common_pipe.c */
+void r600_gfx_write_fence(struct r600_common_context *ctx, struct r600_resource *buf,
+ uint64_t va, uint32_t old_value, uint32_t new_value);
+unsigned r600_gfx_write_fence_dwords(struct r600_common_screen *screen);
+void r600_gfx_wait_fence(struct r600_common_context *ctx,
+ uint64_t va, uint32_t ref, uint32_t mask);
void r600_draw_rectangle(struct blitter_context *blitter,
int x1, int y1, int x2, int y2, float depth,
enum blitter_attrib_type type,
@@ -504,30 +721,40 @@ void r600_destroy_common_screen(struct r600_common_screen *rscreen);
void r600_preflush_suspend_features(struct r600_common_context *ctx);
void r600_postflush_resume_features(struct r600_common_context *ctx);
bool r600_common_context_init(struct r600_common_context *rctx,
- struct r600_common_screen *rscreen);
+ struct r600_common_screen *rscreen,
+ unsigned context_flags);
void r600_common_context_cleanup(struct r600_common_context *rctx);
-void r600_context_add_resource_size(struct pipe_context *ctx, struct pipe_resource *r);
bool r600_can_dump_shader(struct r600_common_screen *rscreen,
- const struct tgsi_token *tokens);
+ unsigned processor);
+bool r600_extra_shader_checks(struct r600_common_screen *rscreen,
+ unsigned processor);
void r600_screen_clear_buffer(struct r600_common_screen *rscreen, struct pipe_resource *dst,
- unsigned offset, unsigned size, unsigned value,
- bool is_framebuffer);
+ uint64_t offset, uint64_t size, unsigned value,
+ enum r600_coherency coher);
struct pipe_resource *r600_resource_create_common(struct pipe_screen *screen,
const struct pipe_resource *templ);
const char *r600_get_llvm_processor_name(enum radeon_family family);
-void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw);
+void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
+ struct r600_resource *dst, struct r600_resource *src);
+void r600_dma_emit_wait_idle(struct r600_common_context *rctx);
+void radeon_save_cs(struct radeon_winsys *ws, struct radeon_winsys_cs *cs,
+ struct radeon_saved_cs *saved);
+void radeon_clear_saved_cs(struct radeon_saved_cs *saved);
+bool r600_check_device_reset(struct r600_common_context *rctx);
/* r600_gpu_load.c */
void r600_gpu_load_kill_thread(struct r600_common_screen *rscreen);
uint64_t r600_gpu_load_begin(struct r600_common_screen *rscreen);
unsigned r600_gpu_load_end(struct r600_common_screen *rscreen, uint64_t begin);
+/* r600_perfcounters.c */
+void r600_perfcounters_destroy(struct r600_common_screen *rscreen);
+
/* r600_query.c */
+void r600_init_screen_query_functions(struct r600_common_screen *rscreen);
void r600_query_init(struct r600_common_context *rctx);
-void r600_suspend_nontimer_queries(struct r600_common_context *ctx);
-void r600_resume_nontimer_queries(struct r600_common_context *ctx);
-void r600_suspend_timer_queries(struct r600_common_context *ctx);
-void r600_resume_timer_queries(struct r600_common_context *ctx);
+void r600_suspend_queries(struct r600_common_context *ctx);
+void r600_resume_queries(struct r600_common_context *ctx);
void r600_query_init_backend_mask(struct r600_common_context *ctx);
/* r600_streamout.c */
@@ -541,7 +768,17 @@ void r600_update_prims_generated_query_state(struct r600_common_context *rctx,
unsigned type, int diff);
void r600_streamout_init(struct r600_common_context *rctx);
+/* r600_test_dma.c */
+void r600_test_dma(struct r600_common_screen *rscreen);
+
/* r600_texture.c */
+bool r600_prepare_for_dma_blit(struct r600_common_context *rctx,
+ struct r600_texture *rdst,
+ unsigned dst_level, unsigned dstx,
+ unsigned dsty, unsigned dstz,
+ struct r600_texture *rsrc,
+ unsigned src_level,
+ const struct pipe_box *src_box);
void r600_texture_get_fmask_info(struct r600_common_screen *rscreen,
struct r600_texture *rtex,
unsigned nr_samples,
@@ -552,21 +789,48 @@ void r600_texture_get_cmask_info(struct r600_common_screen *rscreen,
bool r600_init_flushed_depth_texture(struct pipe_context *ctx,
struct pipe_resource *texture,
struct r600_texture **staging);
+void r600_print_texture_info(struct r600_texture *rtex, FILE *f);
struct pipe_resource *r600_texture_create(struct pipe_screen *screen,
const struct pipe_resource *templ);
+bool vi_dcc_formats_compatible(enum pipe_format format1,
+ enum pipe_format format2);
+void vi_dcc_disable_if_incompatible_format(struct r600_common_context *rctx,
+ struct pipe_resource *tex,
+ unsigned level,
+ enum pipe_format view_format);
struct pipe_surface *r600_create_surface_custom(struct pipe_context *pipe,
struct pipe_resource *texture,
const struct pipe_surface *templ,
unsigned width, unsigned height);
-unsigned r600_translate_colorswap(enum pipe_format format);
+unsigned r600_translate_colorswap(enum pipe_format format, bool do_endian_swap);
+void vi_separate_dcc_start_query(struct pipe_context *ctx,
+ struct r600_texture *tex);
+void vi_separate_dcc_stop_query(struct pipe_context *ctx,
+ struct r600_texture *tex);
+void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx,
+ struct r600_texture *tex);
+void vi_dcc_clear_level(struct r600_common_context *rctx,
+ struct r600_texture *rtex,
+ unsigned level, unsigned clear_value);
void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
struct pipe_framebuffer_state *fb,
struct r600_atom *fb_state,
- unsigned *buffers,
+ unsigned *buffers, unsigned *dirty_cbufs,
const union pipe_color_union *color);
+bool r600_texture_disable_dcc(struct r600_common_context *rctx,
+ struct r600_texture *rtex);
void r600_init_screen_texture_functions(struct r600_common_screen *rscreen);
void r600_init_context_texture_functions(struct r600_common_context *rctx);
+/* r600_viewport.c */
+void evergreen_apply_scissor_bug_workaround(struct r600_common_context *rctx,
+ struct pipe_scissor_state *scissor);
+void r600_viewport_set_rast_deps(struct r600_common_context *rctx,
+ bool scissor_enable, bool clip_halfz);
+void r600_update_vs_writes_viewport_index(struct r600_common_context *rctx,
+ struct tgsi_shader_info *info);
+void r600_init_viewport_functions(struct r600_common_context *rctx);
+
/* cayman_msaa.c */
extern const uint32_t eg_sample_locs_2x[4];
extern const unsigned eg_max_dist_2x;
@@ -577,7 +841,8 @@ void cayman_get_sample_position(struct pipe_context *ctx, unsigned sample_count,
void cayman_init_msaa(struct pipe_context *ctx);
void cayman_emit_msaa_sample_locs(struct radeon_winsys_cs *cs, int nr_samples);
void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples,
- int ps_iter_samples, int overrast_samples);
+ int ps_iter_samples, int overrast_samples,
+ unsigned sc_mode_cntl_1);
/* Inline helpers. */
@@ -594,13 +859,57 @@ r600_resource_reference(struct r600_resource **ptr, struct r600_resource *res)
(struct pipe_resource *)res);
}
+static inline void
+r600_texture_reference(struct r600_texture **ptr, struct r600_texture *res)
+{ /* Texture-typed front end for pipe_resource_reference(): forwards ptr
+ * (cast to struct pipe_resource **) and the address of res->resource.b.b.
+ * NOTE(review): if res can be NULL, &res->resource.b.b is formed from a
+ * null pointer and only yields NULL when that member sits at offset 0 -
+ * confirm callers and the struct layout. */
+ pipe_resource_reference((struct pipe_resource **)ptr, &res->resource.b.b);
+}
+/* Accumulate a resource's memory footprint into the context's running
+ * totals: casts ctx to r600_common_context and r to r600_resource, then adds
+ * res->vram_usage to rctx->vram and res->gart_usage to rctx->gtt. A NULL
+ * resource is ignored. */
+static inline void
+r600_context_add_resource_size(struct pipe_context *ctx, struct pipe_resource *r)
+{
+ struct r600_common_context *rctx = (struct r600_common_context *)ctx;
+ struct r600_resource *res = (struct r600_resource *)r;
+
+ if (res) {
+ /* Add memory usage for need_gfx_cs_space */
+ rctx->vram += res->vram_usage;
+ rctx->gtt += res->gart_usage;
+ }
+}
+/* True when either streamout itself or the primitives-generated query is
+ * flagged as enabled in rctx->streamout. */
+static inline bool r600_get_strmout_en(struct r600_common_context *rctx)
+{
+ return rctx->streamout.streamout_enabled ||
+ rctx->streamout.prims_gen_query_enabled;
+}
+
+#define SQ_TEX_XY_FILTER_POINT 0x00
+#define SQ_TEX_XY_FILTER_BILINEAR 0x01
+#define SQ_TEX_XY_FILTER_ANISO_POINT 0x02
+#define SQ_TEX_XY_FILTER_ANISO_BILINEAR 0x03
+/* Choose one of the SQ_TEX_XY_FILTER_* codes: bilinear when filter is
+ * PIPE_TEX_FILTER_LINEAR and point otherwise, using the ANISO variant of
+ * either whenever max_aniso is greater than 1. */
+static inline unsigned eg_tex_filter(unsigned filter, unsigned max_aniso)
+{
+ if (filter == PIPE_TEX_FILTER_LINEAR)
+ return max_aniso > 1 ? SQ_TEX_XY_FILTER_ANISO_BILINEAR
+ : SQ_TEX_XY_FILTER_BILINEAR;
+ else
+ return max_aniso > 1 ? SQ_TEX_XY_FILTER_ANISO_POINT
+ : SQ_TEX_XY_FILTER_POINT;
+}
+/* Bucket the argument into a code from 0 to 4. With the added (+) lines:
+ * 0-1 -> 0, 2-3 -> 1, 4-7 -> 2, 8-15 -> 3, 16 and above -> 4.
+ * The removed (-) lines used inclusive upper bounds 1/2/4/8, so exact
+ * powers of two map the same as before while the values in between
+ * (3, 5-7, 9-15) now land one bucket lower. */
static inline unsigned r600_tex_aniso_filter(unsigned filter)
{
- if (filter <= 1) return 0;
- if (filter <= 2) return 1;
- if (filter <= 4) return 2;
- if (filter <= 8) return 3;
- /* else */ return 4;
+ if (filter < 2)
+ return 0;
+ if (filter < 4)
+ return 1;
+ if (filter < 8)
+ return 2;
+ if (filter < 16)
+ return 3;
+ return 4;
}
static inline unsigned r600_wavefront_size(enum radeon_family family)
@@ -623,19 +932,38 @@ static inline unsigned r600_wavefront_size(enum radeon_family family)
}
}
+static inline enum radeon_bo_priority
+r600_get_sampler_view_priority(struct r600_resource *res)
+{
+ if (res->b.b.target == PIPE_BUFFER)
+ return RADEON_PRIO_SAMPLER_BUFFER;
+
+ if (res->b.b.nr_samples > 1)
+ return RADEON_PRIO_SAMPLER_TEXTURE_MSAA;
+
+ return RADEON_PRIO_SAMPLER_TEXTURE;
+}
+/* Report whether the texture can be sampled for the requested aspect:
+ * tex->can_sample_s when stencil_sampler is set, tex->can_sample_z
+ * otherwise. */
+static inline bool
+r600_can_sample_zs(struct r600_texture *tex, bool stencil_sampler)
+{
+ return (stencil_sampler && tex->can_sample_s) ||
+ (!stencil_sampler && tex->can_sample_z);
+}
+/* Print to stderr only when DBG_COMPUTE is set in rscreen->b.debug_flags.
+ * NOTE(review): the expansion already ends in ';' after while (0), so
+ * "if (x) COMPUTE_DBG(...); else ..." will not compile, and the rscreen
+ * argument is expanded without parentheses. */
#define COMPUTE_DBG(rscreen, fmt, args...) \
do { \
if ((rscreen->b.debug_flags & DBG_COMPUTE)) fprintf(stderr, fmt, ##args); \
} while (0);
#define R600_ERR(fmt, args...) \
- fprintf(stderr, "EE %s:%d %s - "fmt, __FILE__, __LINE__, __func__, ##args)
+ fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args)
/* For MSAA sample positions. */
#define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y) \
- (((s0x) & 0xf) | (((s0y) & 0xf) << 4) | \
- (((s1x) & 0xf) << 8) | (((s1y) & 0xf) << 12) | \
- (((s2x) & 0xf) << 16) | (((s2y) & 0xf) << 20) | \
- (((s3x) & 0xf) << 24) | (((s3y) & 0xf) << 28))
+ (((s0x) & 0xf) | (((unsigned)(s0y) & 0xf) << 4) | \
+ (((unsigned)(s1x) & 0xf) << 8) | (((unsigned)(s1y) & 0xf) << 12) | \
+ (((unsigned)(s2x) & 0xf) << 16) | (((unsigned)(s2y) & 0xf) << 20) | \
+ (((unsigned)(s3x) & 0xf) << 24) | (((unsigned)(s3y) & 0xf) << 28))
#endif
diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_query.c b/lib/mesa/src/gallium/drivers/radeon/r600_query.c
index 65339bbb6..4b6767dd3 100644
--- a/lib/mesa/src/gallium/drivers/radeon/r600_query.c
+++ b/lib/mesa/src/gallium/drivers/radeon/r600_query.c
@@ -22,81 +22,317 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
+#include "r600_query.h"
#include "r600_cs.h"
#include "util/u_memory.h"
+#include "util/u_upload_mgr.h"
+#include "tgsi/tgsi_text.h"
-struct r600_query_buffer {
- /* The buffer where query results are stored. */
- struct r600_resource *buf;
- /* Offset of the next free result after current query data */
- unsigned results_end;
- /* If a query buffer is full, a new buffer is created and the old one
- * is put in here. When we calculate the result, we sum up the samples
- * from all buffers. */
- struct r600_query_buffer *previous;
+struct r600_hw_query_params {
+ unsigned start_offset;
+ unsigned end_offset;
+ unsigned fence_offset;
+ unsigned pair_stride;
+ unsigned pair_count;
};
-struct r600_query {
- /* The query buffer and how many results are in it. */
- struct r600_query_buffer buffer;
- /* The type of query */
- unsigned type;
- /* Size of the result in memory for both begin_query and end_query,
- * this can be one or two numbers, or it could even be a size of a structure. */
- unsigned result_size;
- /* The number of dwords for begin_query or end_query. */
- unsigned num_cs_dw;
- /* linked list of queries */
- struct list_head list;
- /* for custom non-GPU queries */
+/* Queries without buffer handling or suspend/resume. */
+struct r600_query_sw {
+ struct r600_query b;
+
uint64_t begin_result;
uint64_t end_result;
/* Fence for GPU_FINISHED. */
struct pipe_fence_handle *fence;
- /* For transform feedback: which stream the query is for */
- unsigned stream;
};
-
-static bool r600_is_timer_query(unsigned type)
+static void r600_query_sw_destroy(struct r600_common_context *rctx,
+ struct r600_query *rquery)
{
- return type == PIPE_QUERY_TIME_ELAPSED ||
- type == PIPE_QUERY_TIMESTAMP;
+ struct pipe_screen *screen = rctx->b.screen;
+ struct r600_query_sw *query = (struct r600_query_sw *)rquery;
+
+ screen->fence_reference(screen, &query->fence, NULL); /* drop the fence
+ * reference the query may hold, leaving
+ * query->fence NULL */
+ FREE(query); /* the software query owns nothing else */
}
-static bool r600_query_needs_begin(unsigned type)
+static enum radeon_value_id winsys_id_from_type(unsigned type)
{
- return type != PIPE_QUERY_GPU_FINISHED &&
- type != PIPE_QUERY_TIMESTAMP;
+ switch (type) { /* Map each winsys-backed R600_QUERY_* type to the
+ * RADEON_* value id handed to ws->query_value();
+ * any other type is treated as unreachable. */
+ case R600_QUERY_REQUESTED_VRAM: return RADEON_REQUESTED_VRAM_MEMORY;
+ case R600_QUERY_REQUESTED_GTT: return RADEON_REQUESTED_GTT_MEMORY;
+ case R600_QUERY_MAPPED_VRAM: return RADEON_MAPPED_VRAM;
+ case R600_QUERY_MAPPED_GTT: return RADEON_MAPPED_GTT;
+ case R600_QUERY_BUFFER_WAIT_TIME: return RADEON_BUFFER_WAIT_TIME_NS;
+ case R600_QUERY_NUM_CTX_FLUSHES: return RADEON_NUM_CS_FLUSHES;
+ case R600_QUERY_NUM_BYTES_MOVED: return RADEON_NUM_BYTES_MOVED;
+ case R600_QUERY_NUM_EVICTIONS: return RADEON_NUM_EVICTIONS;
+ case R600_QUERY_VRAM_USAGE: return RADEON_VRAM_USAGE;
+ case R600_QUERY_GTT_USAGE: return RADEON_GTT_USAGE;
+ case R600_QUERY_GPU_TEMPERATURE: return RADEON_GPU_TEMPERATURE;
+ case R600_QUERY_CURRENT_GPU_SCLK: return RADEON_CURRENT_SCLK;
+ case R600_QUERY_CURRENT_GPU_MCLK: return RADEON_CURRENT_MCLK;
+ default: unreachable("query type does not correspond to winsys id");
+ }
}
-static struct r600_resource *r600_new_query_buffer(struct r600_common_context *ctx, unsigned type)
+static bool r600_query_sw_begin(struct r600_common_context *rctx,
+ struct r600_query *rquery)
{
- unsigned j, i, num_results, buf_size = 4096;
- uint32_t *results;
+ struct r600_query_sw *query = (struct r600_query_sw *)rquery;
- /* Non-GPU queries. */
- switch (type) {
+ switch(query->b.type) { /* Capture the start value of a software query
+ * in begin_result; every handled type falls
+ * through to "return true". */
case PIPE_QUERY_TIMESTAMP_DISJOINT:
case PIPE_QUERY_GPU_FINISHED:
+ break; /* nothing to record at begin time */
case R600_QUERY_DRAW_CALLS:
+ query->begin_result = rctx->num_draw_calls;
+ break;
+ case R600_QUERY_SPILL_DRAW_CALLS:
+ query->begin_result = rctx->num_spill_draw_calls;
+ break;
+ case R600_QUERY_COMPUTE_CALLS:
+ query->begin_result = rctx->num_compute_calls;
+ break;
+ case R600_QUERY_SPILL_COMPUTE_CALLS:
+ query->begin_result = rctx->num_spill_compute_calls;
+ break;
+ case R600_QUERY_DMA_CALLS:
+ query->begin_result = rctx->num_dma_calls;
+ break;
+ case R600_QUERY_NUM_VS_FLUSHES:
+ query->begin_result = rctx->num_vs_flushes;
+ break;
+ case R600_QUERY_NUM_PS_FLUSHES:
+ query->begin_result = rctx->num_ps_flushes;
+ break;
+ case R600_QUERY_NUM_CS_FLUSHES:
+ query->begin_result = rctx->num_cs_flushes;
+ break;
case R600_QUERY_REQUESTED_VRAM:
case R600_QUERY_REQUESTED_GTT:
+ case R600_QUERY_MAPPED_VRAM:
+ case R600_QUERY_MAPPED_GTT:
+ case R600_QUERY_VRAM_USAGE:
+ case R600_QUERY_GTT_USAGE:
+ case R600_QUERY_GPU_TEMPERATURE:
+ case R600_QUERY_CURRENT_GPU_SCLK:
+ case R600_QUERY_CURRENT_GPU_MCLK:
+ case R600_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
+ query->begin_result = 0; /* with a zero start, the end - begin
+ * subtraction in r600_query_sw_get_result
+ * leaves the end reading unchanged */
+ break;
case R600_QUERY_BUFFER_WAIT_TIME:
- case R600_QUERY_NUM_CS_FLUSHES:
+ case R600_QUERY_NUM_CTX_FLUSHES:
case R600_QUERY_NUM_BYTES_MOVED:
+ case R600_QUERY_NUM_EVICTIONS: {
+ enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
+ query->begin_result = rctx->ws->query_value(rctx->ws, ws_id); /* start
+ * reading taken from the winsys */
+ break;
+ }
+ case R600_QUERY_GPU_LOAD:
+ query->begin_result = r600_gpu_load_begin(rctx->screen);
+ break;
+ case R600_QUERY_NUM_COMPILATIONS:
+ query->begin_result = p_atomic_read(&rctx->screen->num_compilations);
+ break;
+ case R600_QUERY_NUM_SHADERS_CREATED:
+ query->begin_result = p_atomic_read(&rctx->screen->num_shaders_created);
+ break;
+ case R600_QUERY_GPIN_ASIC_ID:
+ case R600_QUERY_GPIN_NUM_SIMD:
+ case R600_QUERY_GPIN_NUM_RB:
+ case R600_QUERY_GPIN_NUM_SPI:
+ case R600_QUERY_GPIN_NUM_SE:
+ break; /* answered directly in r600_query_sw_get_result */
+ default:
+ unreachable("r600_query_sw_begin: bad query type");
+ }
+
+ return true;
+}
+/* Finish a software query: store the closing value in query->end_result
+ * (or, for PIPE_QUERY_GPU_FINISHED, obtain a fence from a deferred flush).
+ * Context counters are copied directly and winsys-backed values are read
+ * through rctx->ws->query_value(). Every handled type returns true. */
+static bool r600_query_sw_end(struct r600_common_context *rctx,
+ struct r600_query *rquery)
+{
+ struct r600_query_sw *query = (struct r600_query_sw *)rquery;
+
+ switch(query->b.type) {
+ case PIPE_QUERY_TIMESTAMP_DISJOINT:
+ break;
+ case PIPE_QUERY_GPU_FINISHED:
+ rctx->b.flush(&rctx->b, &query->fence, PIPE_FLUSH_DEFERRED); /* the fence
+ * is what r600_query_sw_get_result waits on */
+ break;
+ case R600_QUERY_DRAW_CALLS:
+ query->end_result = rctx->num_draw_calls;
+ break;
+ case R600_QUERY_SPILL_DRAW_CALLS:
+ query->end_result = rctx->num_spill_draw_calls;
+ break;
+ case R600_QUERY_COMPUTE_CALLS:
+ query->end_result = rctx->num_compute_calls;
+ break;
+ case R600_QUERY_SPILL_COMPUTE_CALLS:
+ query->end_result = rctx->num_spill_compute_calls;
+ break;
+ case R600_QUERY_DMA_CALLS:
+ query->end_result = rctx->num_dma_calls;
+ break;
+ case R600_QUERY_NUM_VS_FLUSHES:
+ query->end_result = rctx->num_vs_flushes;
+ break;
+ case R600_QUERY_NUM_PS_FLUSHES:
+ query->end_result = rctx->num_ps_flushes;
+ break;
+ case R600_QUERY_NUM_CS_FLUSHES:
+ query->end_result = rctx->num_cs_flushes;
+ break;
+ case R600_QUERY_REQUESTED_VRAM:
+ case R600_QUERY_REQUESTED_GTT:
+ case R600_QUERY_MAPPED_VRAM:
+ case R600_QUERY_MAPPED_GTT:
case R600_QUERY_VRAM_USAGE:
case R600_QUERY_GTT_USAGE:
case R600_QUERY_GPU_TEMPERATURE:
case R600_QUERY_CURRENT_GPU_SCLK:
case R600_QUERY_CURRENT_GPU_MCLK:
+ case R600_QUERY_BUFFER_WAIT_TIME:
+ case R600_QUERY_NUM_CTX_FLUSHES:
+ case R600_QUERY_NUM_BYTES_MOVED:
+ case R600_QUERY_NUM_EVICTIONS: {
+ enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
+ query->end_result = rctx->ws->query_value(rctx->ws, ws_id);
+ break;
+ }
case R600_QUERY_GPU_LOAD:
+ query->end_result = r600_gpu_load_end(rctx->screen,
+ query->begin_result);
+ query->begin_result = 0; /* begin_result was only an input to
+ * r600_gpu_load_end(); zero it so the
+ * end - begin subtraction in get_result
+ * returns end_result unchanged */
+ break;
case R600_QUERY_NUM_COMPILATIONS:
+ query->end_result = p_atomic_read(&rctx->screen->num_compilations);
+ break;
case R600_QUERY_NUM_SHADERS_CREATED:
+ query->end_result = p_atomic_read(&rctx->screen->num_shaders_created);
+ break;
+ case R600_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
+ query->end_result = rctx->last_tex_ps_draw_ratio;
+ break;
+ case R600_QUERY_GPIN_ASIC_ID:
+ case R600_QUERY_GPIN_NUM_SIMD:
+ case R600_QUERY_GPIN_NUM_RB:
+ case R600_QUERY_GPIN_NUM_SPI:
+ case R600_QUERY_GPIN_NUM_SE:
+ break;
+ default:
+ unreachable("r600_query_sw_end: bad query type");
+ }
+
+ return true;
+}
+/* Produce the result of a software query.
+ * The first switch answers the types that do not use begin/end values and
+ * returns immediately; for PIPE_QUERY_GPU_FINISHED the return value is the
+ * outcome of fence_finish(), which is called with an infinite timeout when
+ * wait is set and a zero timeout otherwise. Every other type yields
+ * end_result - begin_result, rescaled for a few types by the second
+ * switch, and returns true. */
+static bool r600_query_sw_get_result(struct r600_common_context *rctx,
+ struct r600_query *rquery,
+ bool wait,
+ union pipe_query_result *result)
+{
+ struct r600_query_sw *query = (struct r600_query_sw *)rquery;
+
+ switch (query->b.type) {
+ case PIPE_QUERY_TIMESTAMP_DISJOINT:
+ /* Convert from cycles per millisecond to cycles per second (Hz). */
+ result->timestamp_disjoint.frequency =
+ (uint64_t)rctx->screen->info.clock_crystal_freq * 1000;
+ result->timestamp_disjoint.disjoint = false;
+ return true;
+ case PIPE_QUERY_GPU_FINISHED: {
+ struct pipe_screen *screen = rctx->b.screen;
+ result->b = screen->fence_finish(screen, &rctx->b, query->fence,
+ wait ? PIPE_TIMEOUT_INFINITE : 0);
+ return result->b;
+ }
+
+ case R600_QUERY_GPIN_ASIC_ID:
+ result->u32 = 0;
+ return true;
+ case R600_QUERY_GPIN_NUM_SIMD:
+ result->u32 = rctx->screen->info.num_good_compute_units;
+ return true;
+ case R600_QUERY_GPIN_NUM_RB:
+ result->u32 = rctx->screen->info.num_render_backends;
+ return true;
+ case R600_QUERY_GPIN_NUM_SPI:
+ result->u32 = 1; /* all supported chips have one SPI per SE */
+ return true;
+ case R600_QUERY_GPIN_NUM_SE:
+ result->u32 = rctx->screen->info.max_se;
+ return true;
+ }
+
+ result->u64 = query->end_result - query->begin_result;
+
+ switch (query->b.type) { /* unit conversion for selected types only */
+ case R600_QUERY_BUFFER_WAIT_TIME:
+ case R600_QUERY_GPU_TEMPERATURE:
+ result->u64 /= 1000;
+ break;
+ case R600_QUERY_CURRENT_GPU_SCLK:
+ case R600_QUERY_CURRENT_GPU_MCLK:
+ result->u64 *= 1000000;
+ break;
+ }
+
+ return true;
+}
+
+/* Operation table that r600_query_sw_create installs on software queries;
+ * no get_result_resource handler is provided. */
+static struct r600_query_ops sw_query_ops = {
+ .destroy = r600_query_sw_destroy,
+ .begin = r600_query_sw_begin,
+ .end = r600_query_sw_end,
+ .get_result = r600_query_sw_get_result,
+ .get_result_resource = NULL
+};
+/* Allocate a software query with CALLOC_STRUCT, record query_type in
+ * b.type and point b.ops at sw_query_ops. Returns the query cast to
+ * struct pipe_query *, or NULL when the allocation fails. The ctx argument
+ * is not used here. */
+static struct pipe_query *r600_query_sw_create(struct pipe_context *ctx,
+ unsigned query_type)
+{
+ struct r600_query_sw *query;
+
+ query = CALLOC_STRUCT(r600_query_sw);
+ if (!query)
return NULL;
+
+ query->b.type = query_type;
+ query->b.ops = &sw_query_ops;
+
+ return (struct pipe_query *)query;
+}
+/* Destroy a hardware query: walk the chain hanging off buffer.previous,
+ * dropping each node's buffer reference and freeing the node, then drop
+ * the reference held by the query's own embedded buffer entry and free the
+ * query. The rctx argument is not used here. */
+void r600_query_hw_destroy(struct r600_common_context *rctx,
+ struct r600_query *rquery)
+{
+ struct r600_query_hw *query = (struct r600_query_hw *)rquery;
+ struct r600_query_buffer *prev = query->buffer.previous;
+
+ /* Release all query buffers. */
+ while (prev) {
+ struct r600_query_buffer *qbuf = prev;
+ prev = prev->previous;
+ r600_resource_reference(&qbuf->buf, NULL);
+ FREE(qbuf);
}
+ r600_resource_reference(&query->buffer.buf, NULL);
+ FREE(rquery);
+}
+
+static struct r600_resource *r600_new_query_buffer(struct r600_common_context *ctx,
+ struct r600_query_hw *query)
+{
+ unsigned buf_size = MAX2(query->result_size,
+ ctx->screen->info.min_alloc_size);
+
/* Queries are normally read by the CPU after
* being written by the gpu, hence staging is probably a good
* usage pattern.
@@ -104,15 +340,37 @@ static struct r600_resource *r600_new_query_buffer(struct r600_common_context *c
struct r600_resource *buf = (struct r600_resource*)
pipe_buffer_create(ctx->b.screen, PIPE_BIND_CUSTOM,
PIPE_USAGE_STAGING, buf_size);
+ if (!buf)
+ return NULL;
- switch (type) {
- case PIPE_QUERY_OCCLUSION_COUNTER:
- case PIPE_QUERY_OCCLUSION_PREDICATE:
- results = r600_buffer_map_sync_with_rings(ctx, buf, PIPE_TRANSFER_WRITE);
- memset(results, 0, buf_size);
+ if (!query->ops->prepare_buffer(ctx, query, buf)) {
+ r600_resource_reference(&buf, NULL);
+ return NULL;
+ }
+
+ return buf;
+}
+
+static bool r600_query_hw_prepare_buffer(struct r600_common_context *ctx,
+ struct r600_query_hw *query,
+ struct r600_resource *buffer)
+{
+ /* Callers ensure that the buffer is currently unused by the GPU. */
+ uint32_t *results = ctx->ws->buffer_map(buffer->buf, NULL,
+ PIPE_TRANSFER_WRITE |
+ PIPE_TRANSFER_UNSYNCHRONIZED);
+ if (!results)
+ return false;
+
+ memset(results, 0, buffer->b.b.width0);
+
+ if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
+ query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE) {
+ unsigned num_results;
+ unsigned i, j;
/* Set top bits for unused backends. */
- num_results = buf_size / (16 * ctx->max_db);
+ num_results = buffer->b.b.width0 / query->result_size;
for (j = 0; j < num_results; j++) {
for (i = 0; i < ctx->max_db; i++) {
if (!(ctx->backend_mask & (1<<i))) {
@@ -122,22 +380,118 @@ static struct r600_resource *r600_new_query_buffer(struct r600_common_context *c
}
results += 4 * ctx->max_db;
}
+ }
+
+ return true;
+}
+
+static void r600_query_hw_get_result_resource(struct r600_common_context *rctx,
+ struct r600_query *rquery,
+ bool wait,
+ enum pipe_query_value_type result_type,
+ int index,
+ struct pipe_resource *resource,
+ unsigned offset);
+/* Operation table that r600_query_hw_create installs in b.ops of hardware
+ * queries. */
+static struct r600_query_ops query_hw_ops = {
+ .destroy = r600_query_hw_destroy,
+ .begin = r600_query_hw_begin,
+ .end = r600_query_hw_end,
+ .get_result = r600_query_hw_get_result,
+ .get_result_resource = r600_query_hw_get_result_resource,
+};
+
+static void r600_query_hw_do_emit_start(struct r600_common_context *ctx,
+ struct r600_query_hw *query,
+ struct r600_resource *buffer,
+ uint64_t va);
+static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
+ struct r600_query_hw *query,
+ struct r600_resource *buffer,
+ uint64_t va);
+static void r600_query_hw_add_result(struct r600_common_context *ctx,
+ struct r600_query_hw *, void *buffer,
+ union pipe_query_result *result);
+static void r600_query_hw_clear_result(struct r600_query_hw *,
+ union pipe_query_result *);
+/* Default hardware-level callbacks; r600_query_hw_create stores this table
+ * in query->ops, and r600_new_query_buffer reaches prepare_buffer through
+ * it. */
+static struct r600_query_hw_ops query_hw_default_hw_ops = {
+ .prepare_buffer = r600_query_hw_prepare_buffer,
+ .emit_start = r600_query_hw_do_emit_start,
+ .emit_stop = r600_query_hw_do_emit_stop,
+ .clear_result = r600_query_hw_clear_result,
+ .add_result = r600_query_hw_add_result,
+};
+/* Give a hardware query its first result buffer by storing the result of
+ * r600_new_query_buffer() in query->buffer.buf. Returns false when no
+ * buffer could be obtained, true otherwise. */
+bool r600_query_hw_init(struct r600_common_context *rctx,
+ struct r600_query_hw *query)
+{
+ query->buffer.buf = r600_new_query_buffer(rctx, query);
+ if (!query->buffer.buf)
+ return false;
+
+ return true;
+}
+/* Create a hardware query of the given type.
+ * Allocates the query, installs query_hw_ops and query_hw_default_hw_ops,
+ * fills in the per-type result size and begin/end command-stream dword
+ * counts, then calls r600_query_hw_init() for the first buffer. index is
+ * only consumed by the streamout/primitive types, where it becomes
+ * query->stream. Returns NULL on allocation failure, on an unhandled type
+ * or when r600_query_hw_init() fails. */
+static struct pipe_query *r600_query_hw_create(struct r600_common_context *rctx,
+ unsigned query_type,
+ unsigned index)
+{
+ struct r600_query_hw *query = CALLOC_STRUCT(r600_query_hw);
+ if (!query)
+ return NULL;
+
+ query->b.type = query_type;
+ query->b.ops = &query_hw_ops;
+ query->ops = &query_hw_default_hw_ops;
+
+ switch (query_type) {
+ case PIPE_QUERY_OCCLUSION_COUNTER:
+ case PIPE_QUERY_OCCLUSION_PREDICATE:
+ query->result_size = 16 * rctx->max_db;
+ query->result_size += 16; /* for the fence + alignment */
+ query->num_cs_dw_begin = 6;
+ query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rctx->screen);
break;
case PIPE_QUERY_TIME_ELAPSED:
+ query->result_size = 24;
+ query->num_cs_dw_begin = 8;
+ query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rctx->screen);
+ break;
case PIPE_QUERY_TIMESTAMP:
+ query->result_size = 16;
+ query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rctx->screen);
+ query->flags = R600_QUERY_HW_FLAG_NO_START; /* num_cs_dw_begin is not
+ * assigned for this type */
break;
case PIPE_QUERY_PRIMITIVES_EMITTED:
case PIPE_QUERY_PRIMITIVES_GENERATED:
case PIPE_QUERY_SO_STATISTICS:
case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+ /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
+ query->result_size = 32;
+ query->num_cs_dw_begin = 6;
+ query->num_cs_dw_end = 6; /* NOTE(review): unlike the other types, no
+ * fence dwords are added here - presumably
+ * intentional, confirm against emit_stop */
+ query->stream = index;
+ break;
case PIPE_QUERY_PIPELINE_STATISTICS:
- results = r600_buffer_map_sync_with_rings(ctx, buf, PIPE_TRANSFER_WRITE);
- memset(results, 0, buf_size);
+ /* 11 values on EG, 8 on R600. */
+ query->result_size = (rctx->chip_class >= EVERGREEN ? 11 : 8) * 16;
+ query->result_size += 8; /* for the fence + alignment */
+ query->num_cs_dw_begin = 6;
+ query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rctx->screen);
break;
default:
assert(0);
+ FREE(query); /* reached only when assert() is compiled out */
+ return NULL;
}
- return buf;
+
+ if (!r600_query_hw_init(rctx, query)) {
+ FREE(query); /* init fails only with a NULL buffer.buf, so there is
+ * no buffer to release */
+ return NULL;
+ }
+
+ return (struct pipe_query *)query;
}
static void r600_update_occlusion_query_state(struct r600_common_context *rctx,
@@ -146,20 +500,28 @@ static void r600_update_occlusion_query_state(struct r600_common_context *rctx,
if (type == PIPE_QUERY_OCCLUSION_COUNTER ||
type == PIPE_QUERY_OCCLUSION_PREDICATE) {
bool old_enable = rctx->num_occlusion_queries != 0;
- bool enable;
+ bool old_perfect_enable =
+ rctx->num_perfect_occlusion_queries != 0;
+ bool enable, perfect_enable;
rctx->num_occlusion_queries += diff;
assert(rctx->num_occlusion_queries >= 0);
+ if (type == PIPE_QUERY_OCCLUSION_COUNTER) {
+ rctx->num_perfect_occlusion_queries += diff;
+ assert(rctx->num_perfect_occlusion_queries >= 0);
+ }
+
enable = rctx->num_occlusion_queries != 0;
+ perfect_enable = rctx->num_perfect_occlusion_queries != 0;
- if (enable != old_enable) {
+ if (enable != old_enable || perfect_enable != old_perfect_enable) {
rctx->set_occlusion_query_state(&rctx->b, enable);
}
}
}
-static unsigned event_type_for_stream(struct r600_query *query)
+static unsigned event_type_for_stream(struct r600_query_hw *query)
{
switch (query->stream) {
default:
@@ -170,28 +532,14 @@ static unsigned event_type_for_stream(struct r600_query *query)
}
}
-static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_query *query)
+static void r600_query_hw_do_emit_start(struct r600_common_context *ctx,
+ struct r600_query_hw *query,
+ struct r600_resource *buffer,
+ uint64_t va)
{
- struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;
- uint64_t va;
-
- r600_update_occlusion_query_state(ctx, query->type, 1);
- r600_update_prims_generated_query_state(ctx, query->type, 1);
- ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw * 2, TRUE);
-
- /* Get a new query buffer if needed. */
- if (query->buffer.results_end + query->result_size > query->buffer.buf->b.b.width0) {
- struct r600_query_buffer *qbuf = MALLOC_STRUCT(r600_query_buffer);
- *qbuf = query->buffer;
- query->buffer.buf = r600_new_query_buffer(ctx, query->type);
- query->buffer.results_end = 0;
- query->buffer.previous = qbuf;
- }
+ struct radeon_winsys_cs *cs = ctx->gfx.cs;
- /* emit begin query */
- va = query->buffer.buf->gpu_address + query->buffer.results_end;
-
- switch (query->type) {
+ switch (query->b.type) {
case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
@@ -210,7 +558,7 @@ static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_q
break;
case PIPE_QUERY_TIME_ELAPSED:
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
- radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5));
+ radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_BOTTOM_OF_PIPE_TS) | EVENT_INDEX(5));
radeon_emit(cs, va);
radeon_emit(cs, (3 << 29) | ((va >> 32) & 0xFFFF));
radeon_emit(cs, 0);
@@ -225,226 +573,210 @@ static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_q
default:
assert(0);
}
- r600_emit_reloc(ctx, &ctx->rings.gfx, query->buffer.buf, RADEON_USAGE_WRITE,
- RADEON_PRIO_MIN);
-
- if (r600_is_timer_query(query->type))
- ctx->num_cs_dw_timer_queries_suspend += query->num_cs_dw;
- else
- ctx->num_cs_dw_nontimer_queries_suspend += query->num_cs_dw;
+ r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE,
+ RADEON_PRIO_QUERY);
}
-static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_query *query)
+static void r600_query_hw_emit_start(struct r600_common_context *ctx,
+ struct r600_query_hw *query)
{
- struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;
uint64_t va;
- /* The queries which need begin already called this in begin_query. */
- if (!r600_query_needs_begin(query->type)) {
- ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw, FALSE);
+ if (!query->buffer.buf)
+ return; // previous buffer allocation failure
+
+ r600_update_occlusion_query_state(ctx, query->b.type, 1);
+ r600_update_prims_generated_query_state(ctx, query->b.type, 1);
+
+ ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw_begin + query->num_cs_dw_end,
+ true);
+
+ /* Get a new query buffer if needed. */
+ if (query->buffer.results_end + query->result_size > query->buffer.buf->b.b.width0) {
+ struct r600_query_buffer *qbuf = MALLOC_STRUCT(r600_query_buffer);
+ *qbuf = query->buffer;
+ query->buffer.results_end = 0;
+ query->buffer.previous = qbuf;
+ query->buffer.buf = r600_new_query_buffer(ctx, query);
+ if (!query->buffer.buf)
+ return;
}
- va = query->buffer.buf->gpu_address;
+ /* emit begin query */
+ va = query->buffer.buf->gpu_address + query->buffer.results_end;
- /* emit end query */
- switch (query->type) {
+ query->ops->emit_start(ctx, query, query->buffer.buf, va);
+
+ ctx->num_cs_dw_queries_suspend += query->num_cs_dw_end;
+}
+
+static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
+ struct r600_query_hw *query,
+ struct r600_resource *buffer,
+ uint64_t va)
+{
+ struct radeon_winsys_cs *cs = ctx->gfx.cs;
+ uint64_t fence_va = 0;
+
+ switch (query->b.type) {
case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
- va += query->buffer.results_end + 8;
+ va += 8;
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
radeon_emit(cs, va);
radeon_emit(cs, (va >> 32) & 0xFFFF);
+
+ fence_va = va + ctx->max_db * 16 - 8;
break;
case PIPE_QUERY_PRIMITIVES_EMITTED:
case PIPE_QUERY_PRIMITIVES_GENERATED:
case PIPE_QUERY_SO_STATISTICS:
case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
- va += query->buffer.results_end + query->result_size/2;
+ va += query->result_size/2;
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
radeon_emit(cs, EVENT_TYPE(event_type_for_stream(query)) | EVENT_INDEX(3));
radeon_emit(cs, va);
radeon_emit(cs, (va >> 32) & 0xFFFF);
break;
case PIPE_QUERY_TIME_ELAPSED:
- va += query->buffer.results_end + query->result_size/2;
+ va += 8;
/* fall through */
case PIPE_QUERY_TIMESTAMP:
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
- radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5));
+ radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_BOTTOM_OF_PIPE_TS) | EVENT_INDEX(5));
radeon_emit(cs, va);
radeon_emit(cs, (3 << 29) | ((va >> 32) & 0xFFFF));
radeon_emit(cs, 0);
radeon_emit(cs, 0);
+
+ fence_va = va + 8;
break;
- case PIPE_QUERY_PIPELINE_STATISTICS:
- va += query->buffer.results_end + query->result_size/2;
+ case PIPE_QUERY_PIPELINE_STATISTICS: {
+ unsigned sample_size = (query->result_size - 8) / 2;
+
+ va += sample_size;
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
radeon_emit(cs, va);
radeon_emit(cs, (va >> 32) & 0xFFFF);
+
+ fence_va = va + sample_size;
break;
+ }
default:
assert(0);
}
- r600_emit_reloc(ctx, &ctx->rings.gfx, query->buffer.buf, RADEON_USAGE_WRITE,
- RADEON_PRIO_MIN);
+ r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE,
+ RADEON_PRIO_QUERY);
- query->buffer.results_end += query->result_size;
+ if (fence_va)
+ r600_gfx_write_fence(ctx, query->buffer.buf, fence_va, 0, 0x80000000);
+}
- if (r600_query_needs_begin(query->type)) {
- if (r600_is_timer_query(query->type))
- ctx->num_cs_dw_timer_queries_suspend -= query->num_cs_dw;
- else
- ctx->num_cs_dw_nontimer_queries_suspend -= query->num_cs_dw;
+static void r600_query_hw_emit_stop(struct r600_common_context *ctx,
+ struct r600_query_hw *query)
+{
+ uint64_t va;
+
+ if (!query->buffer.buf)
+ return; // previous buffer allocation failure
+
+ /* The queries which need begin already called this in begin_query. */
+ if (query->flags & R600_QUERY_HW_FLAG_NO_START) {
+ ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw_end, false);
}
- r600_update_occlusion_query_state(ctx, query->type, -1);
- r600_update_prims_generated_query_state(ctx, query->type, -1);
-}
+ /* emit end query */
+ va = query->buffer.buf->gpu_address + query->buffer.results_end;
-static void r600_emit_query_predication(struct r600_common_context *ctx, struct r600_query *query,
- int operation, bool flag_wait)
-{
- struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;
- uint32_t op = PRED_OP(operation);
+ query->ops->emit_stop(ctx, query, query->buffer.buf, va);
- /* if true then invert, see GL_ARB_conditional_render_inverted */
- if (ctx->current_render_cond_cond)
- op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visable/overflow */
- else
- op |= PREDICATION_DRAW_VISIBLE; /* Draw if visable/overflow */
+ query->buffer.results_end += query->result_size;
- if (operation == PREDICATION_OP_CLEAR) {
- ctx->need_gfx_cs_space(&ctx->b, 3, FALSE);
+ if (!(query->flags & R600_QUERY_HW_FLAG_NO_START))
+ ctx->num_cs_dw_queries_suspend -= query->num_cs_dw_end;
- radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
- radeon_emit(cs, 0);
- radeon_emit(cs, PRED_OP(PREDICATION_OP_CLEAR));
- } else {
- struct r600_query_buffer *qbuf;
- unsigned count;
- /* Find how many results there are. */
- count = 0;
- for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
- count += qbuf->results_end / query->result_size;
- }
-
- ctx->need_gfx_cs_space(&ctx->b, 5 * count, TRUE);
-
- op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
-
- /* emit predicate packets for all data blocks */
- for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
- unsigned results_base = 0;
- uint64_t va = qbuf->buf->gpu_address;
-
- while (results_base < qbuf->results_end) {
- radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
- radeon_emit(cs, (va + results_base) & 0xFFFFFFFFUL);
- radeon_emit(cs, op | (((va + results_base) >> 32UL) & 0xFF));
- r600_emit_reloc(ctx, &ctx->rings.gfx, qbuf->buf, RADEON_USAGE_READ,
- RADEON_PRIO_MIN);
- results_base += query->result_size;
-
- /* set CONTINUE bit for all packets except the first */
- op |= PREDICATION_CONTINUE;
- }
- }
- }
+ r600_update_occlusion_query_state(ctx, query->b.type, -1);
+ r600_update_prims_generated_query_state(ctx, query->b.type, -1);
}
-static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
+static void r600_emit_query_predication(struct r600_common_context *ctx,
+ struct r600_atom *atom)
{
- struct r600_common_context *rctx = (struct r600_common_context *)ctx;
- struct r600_query *query;
- bool skip_allocation = false;
+ struct radeon_winsys_cs *cs = ctx->gfx.cs;
+ struct r600_query_hw *query = (struct r600_query_hw *)ctx->render_cond;
+ struct r600_query_buffer *qbuf;
+ uint32_t op;
+ bool flag_wait;
- query = CALLOC_STRUCT(r600_query);
- if (query == NULL)
- return NULL;
+ if (!query)
+ return;
- query->type = query_type;
+ flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
+ ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
- switch (query_type) {
+ switch (query->b.type) {
case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
- query->result_size = 16 * rctx->max_db;
- query->num_cs_dw = 6;
- break;
- break;
- case PIPE_QUERY_TIME_ELAPSED:
- query->result_size = 16;
- query->num_cs_dw = 8;
- break;
- case PIPE_QUERY_TIMESTAMP:
- query->result_size = 8;
- query->num_cs_dw = 8;
+ op = PRED_OP(PREDICATION_OP_ZPASS);
break;
case PIPE_QUERY_PRIMITIVES_EMITTED:
case PIPE_QUERY_PRIMITIVES_GENERATED:
case PIPE_QUERY_SO_STATISTICS:
case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
- /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
- query->result_size = 32;
- query->num_cs_dw = 6;
- query->stream = index;
- break;
- case PIPE_QUERY_PIPELINE_STATISTICS:
- /* 11 values on EG, 8 on R600. */
- query->result_size = (rctx->chip_class >= EVERGREEN ? 11 : 8) * 16;
- query->num_cs_dw = 6;
- break;
- /* Non-GPU queries and queries not requiring a buffer. */
- case PIPE_QUERY_TIMESTAMP_DISJOINT:
- case PIPE_QUERY_GPU_FINISHED:
- case R600_QUERY_DRAW_CALLS:
- case R600_QUERY_REQUESTED_VRAM:
- case R600_QUERY_REQUESTED_GTT:
- case R600_QUERY_BUFFER_WAIT_TIME:
- case R600_QUERY_NUM_CS_FLUSHES:
- case R600_QUERY_NUM_BYTES_MOVED:
- case R600_QUERY_VRAM_USAGE:
- case R600_QUERY_GTT_USAGE:
- case R600_QUERY_GPU_TEMPERATURE:
- case R600_QUERY_CURRENT_GPU_SCLK:
- case R600_QUERY_CURRENT_GPU_MCLK:
- case R600_QUERY_GPU_LOAD:
- case R600_QUERY_NUM_COMPILATIONS:
- case R600_QUERY_NUM_SHADERS_CREATED:
- skip_allocation = true;
+ op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
break;
default:
assert(0);
- FREE(query);
- return NULL;
+ return;
}
- if (!skip_allocation) {
- query->buffer.buf = r600_new_query_buffer(rctx, query_type);
- if (!query->buffer.buf) {
- FREE(query);
- return NULL;
+ /* if true then invert, see GL_ARB_conditional_render_inverted */
+ if (ctx->render_cond_invert)
+ op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible/overflow */
+ else
+ op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible/overflow */
+
+ op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
+
+ /* emit predicate packets for all data blocks */
+ for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
+ unsigned results_base = 0;
+ uint64_t va = qbuf->buf->gpu_address;
+
+ while (results_base < qbuf->results_end) {
+ radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
+ radeon_emit(cs, va + results_base);
+ radeon_emit(cs, op | (((va + results_base) >> 32) & 0xFF));
+ r600_emit_reloc(ctx, &ctx->gfx, qbuf->buf, RADEON_USAGE_READ,
+ RADEON_PRIO_QUERY);
+ results_base += query->result_size;
+
+ /* set CONTINUE bit for all packets except the first */
+ op |= PREDICATION_CONTINUE;
}
}
- return (struct pipe_query*)query;
}
-static void r600_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
+static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
{
- struct r600_query *rquery = (struct r600_query*)query;
- struct r600_query_buffer *prev = rquery->buffer.previous;
+ struct r600_common_context *rctx = (struct r600_common_context *)ctx;
- /* Release all query buffers. */
- while (prev) {
- struct r600_query_buffer *qbuf = prev;
- prev = prev->previous;
- pipe_resource_reference((struct pipe_resource**)&qbuf->buf, NULL);
- FREE(qbuf);
- }
+ if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT ||
+ query_type == PIPE_QUERY_GPU_FINISHED ||
+ query_type >= PIPE_QUERY_DRIVER_SPECIFIC)
+ return r600_query_sw_create(ctx, query_type);
- pipe_resource_reference((struct pipe_resource**)&rquery->buffer.buf, NULL);
- FREE(query);
+ return r600_query_hw_create(rctx, query_type, index);
+}
+
+static void r600_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
+{
+ struct r600_common_context *rctx = (struct r600_common_context *)ctx;
+ struct r600_query *rquery = (struct r600_query *)query;
+
+ rquery->ops->destroy(rctx, rquery);
}
static boolean r600_begin_query(struct pipe_context *ctx,
@@ -452,139 +784,141 @@ static boolean r600_begin_query(struct pipe_context *ctx,
{
struct r600_common_context *rctx = (struct r600_common_context *)ctx;
struct r600_query *rquery = (struct r600_query *)query;
- struct r600_query_buffer *prev = rquery->buffer.previous;
- if (!r600_query_needs_begin(rquery->type)) {
- assert(0);
- return false;
- }
+ return rquery->ops->begin(rctx, rquery);
+}
- /* Non-GPU queries. */
- switch (rquery->type) {
- case PIPE_QUERY_TIMESTAMP_DISJOINT:
- return true;
- case R600_QUERY_DRAW_CALLS:
- rquery->begin_result = rctx->num_draw_calls;
- return true;
- case R600_QUERY_REQUESTED_VRAM:
- case R600_QUERY_REQUESTED_GTT:
- case R600_QUERY_VRAM_USAGE:
- case R600_QUERY_GTT_USAGE:
- case R600_QUERY_GPU_TEMPERATURE:
- case R600_QUERY_CURRENT_GPU_SCLK:
- case R600_QUERY_CURRENT_GPU_MCLK:
- rquery->begin_result = 0;
- return true;
- case R600_QUERY_BUFFER_WAIT_TIME:
- rquery->begin_result = rctx->ws->query_value(rctx->ws, RADEON_BUFFER_WAIT_TIME_NS) / 1000;
- return true;
- case R600_QUERY_NUM_CS_FLUSHES:
- rquery->begin_result = rctx->ws->query_value(rctx->ws, RADEON_NUM_CS_FLUSHES);
- return true;
- case R600_QUERY_NUM_BYTES_MOVED:
- rquery->begin_result = rctx->ws->query_value(rctx->ws, RADEON_NUM_BYTES_MOVED);
- return true;
- case R600_QUERY_GPU_LOAD:
- rquery->begin_result = r600_gpu_load_begin(rctx->screen);
- return true;
- case R600_QUERY_NUM_COMPILATIONS:
- rquery->begin_result = p_atomic_read(&rctx->screen->num_compilations);
- return true;
- case R600_QUERY_NUM_SHADERS_CREATED:
- rquery->begin_result = p_atomic_read(&rctx->screen->num_shaders_created);
- return true;
- }
+void r600_query_hw_reset_buffers(struct r600_common_context *rctx,
+ struct r600_query_hw *query)
+{
+ struct r600_query_buffer *prev = query->buffer.previous;
/* Discard the old query buffers. */
while (prev) {
struct r600_query_buffer *qbuf = prev;
prev = prev->previous;
- pipe_resource_reference((struct pipe_resource**)&qbuf->buf, NULL);
+ r600_resource_reference(&qbuf->buf, NULL);
FREE(qbuf);
}
+ query->buffer.results_end = 0;
+ query->buffer.previous = NULL;
+
/* Obtain a new buffer if the current one can't be mapped without a stall. */
- if (r600_rings_is_buffer_referenced(rctx, rquery->buffer.buf->cs_buf, RADEON_USAGE_READWRITE) ||
- !rctx->ws->buffer_wait(rquery->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) {
- pipe_resource_reference((struct pipe_resource**)&rquery->buffer.buf, NULL);
- rquery->buffer.buf = r600_new_query_buffer(rctx, rquery->type);
+ if (r600_rings_is_buffer_referenced(rctx, query->buffer.buf->buf, RADEON_USAGE_READWRITE) ||
+ !rctx->ws->buffer_wait(query->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) {
+ r600_resource_reference(&query->buffer.buf, NULL);
+ query->buffer.buf = r600_new_query_buffer(rctx, query);
+ } else {
+ if (!query->ops->prepare_buffer(rctx, query, query->buffer.buf))
+ r600_resource_reference(&query->buffer.buf, NULL);
}
+}
+
+bool r600_query_hw_begin(struct r600_common_context *rctx,
+ struct r600_query *rquery)
+{
+ struct r600_query_hw *query = (struct r600_query_hw *)rquery;
- rquery->buffer.results_end = 0;
- rquery->buffer.previous = NULL;
+ if (query->flags & R600_QUERY_HW_FLAG_NO_START) {
+ assert(0);
+ return false;
+ }
- r600_emit_query_begin(rctx, rquery);
+ if (!(query->flags & R600_QUERY_HW_FLAG_BEGIN_RESUMES))
+ r600_query_hw_reset_buffers(rctx, query);
- if (r600_is_timer_query(rquery->type))
- LIST_ADDTAIL(&rquery->list, &rctx->active_timer_queries);
- else
- LIST_ADDTAIL(&rquery->list, &rctx->active_nontimer_queries);
- return true;
+ r600_query_hw_emit_start(rctx, query);
+ if (!query->buffer.buf)
+ return false;
+
+ LIST_ADDTAIL(&query->list, &rctx->active_queries);
+ return true;
}
-static void r600_end_query(struct pipe_context *ctx, struct pipe_query *query)
+static bool r600_end_query(struct pipe_context *ctx, struct pipe_query *query)
{
struct r600_common_context *rctx = (struct r600_common_context *)ctx;
struct r600_query *rquery = (struct r600_query *)query;
- /* Non-GPU queries. */
- switch (rquery->type) {
- case PIPE_QUERY_TIMESTAMP_DISJOINT:
- return;
- case PIPE_QUERY_GPU_FINISHED:
- rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC, &rquery->fence);
- return;
- case R600_QUERY_DRAW_CALLS:
- rquery->end_result = rctx->num_draw_calls;
- return;
- case R600_QUERY_REQUESTED_VRAM:
- rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_REQUESTED_VRAM_MEMORY);
- return;
- case R600_QUERY_REQUESTED_GTT:
- rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_REQUESTED_GTT_MEMORY);
- return;
- case R600_QUERY_BUFFER_WAIT_TIME:
- rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_BUFFER_WAIT_TIME_NS) / 1000;
- return;
- case R600_QUERY_NUM_CS_FLUSHES:
- rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_NUM_CS_FLUSHES);
- return;
- case R600_QUERY_NUM_BYTES_MOVED:
- rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_NUM_BYTES_MOVED);
- return;
- case R600_QUERY_VRAM_USAGE:
- rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_VRAM_USAGE);
- return;
- case R600_QUERY_GTT_USAGE:
- rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_GTT_USAGE);
- return;
- case R600_QUERY_GPU_TEMPERATURE:
- rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_GPU_TEMPERATURE) / 1000;
- return;
- case R600_QUERY_CURRENT_GPU_SCLK:
- rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_CURRENT_SCLK) * 1000000;
- return;
- case R600_QUERY_CURRENT_GPU_MCLK:
- rquery->end_result = rctx->ws->query_value(rctx->ws, RADEON_CURRENT_MCLK) * 1000000;
- return;
- case R600_QUERY_GPU_LOAD:
- rquery->end_result = r600_gpu_load_end(rctx->screen, rquery->begin_result);
- return;
- case R600_QUERY_NUM_COMPILATIONS:
- rquery->end_result = p_atomic_read(&rctx->screen->num_compilations);
- return;
- case R600_QUERY_NUM_SHADERS_CREATED:
- rquery->end_result = p_atomic_read(&rctx->screen->num_shaders_created);
- return;
- }
+ return rquery->ops->end(rctx, rquery);
+}
- r600_emit_query_end(rctx, rquery);
+bool r600_query_hw_end(struct r600_common_context *rctx,
+ struct r600_query *rquery)
+{
+ struct r600_query_hw *query = (struct r600_query_hw *)rquery;
+
+ if (query->flags & R600_QUERY_HW_FLAG_NO_START)
+ r600_query_hw_reset_buffers(rctx, query);
+
+ r600_query_hw_emit_stop(rctx, query);
+
+ if (!(query->flags & R600_QUERY_HW_FLAG_NO_START))
+ LIST_DELINIT(&query->list);
+
+ if (!query->buffer.buf)
+ return false;
+
+ return true;
+}
+
+static void r600_get_hw_query_params(struct r600_common_context *rctx,
+ struct r600_query_hw *rquery, int index,
+ struct r600_hw_query_params *params)
+{
+ params->pair_stride = 0;
+ params->pair_count = 1;
- if (r600_query_needs_begin(rquery->type))
- LIST_DELINIT(&rquery->list);
+ switch (rquery->b.type) {
+ case PIPE_QUERY_OCCLUSION_COUNTER:
+ case PIPE_QUERY_OCCLUSION_PREDICATE:
+ params->start_offset = 0;
+ params->end_offset = 8;
+ params->fence_offset = rctx->max_db * 16;
+ params->pair_stride = 16;
+ params->pair_count = rctx->max_db;
+ break;
+ case PIPE_QUERY_TIME_ELAPSED:
+ params->start_offset = 0;
+ params->end_offset = 8;
+ params->fence_offset = 16;
+ break;
+ case PIPE_QUERY_TIMESTAMP:
+ params->start_offset = 0;
+ params->end_offset = 0;
+ params->fence_offset = 8;
+ break;
+ case PIPE_QUERY_PRIMITIVES_EMITTED:
+ params->start_offset = 8;
+ params->end_offset = 24;
+ params->fence_offset = params->end_offset + 4;
+ break;
+ case PIPE_QUERY_PRIMITIVES_GENERATED:
+ params->start_offset = 0;
+ params->end_offset = 16;
+ params->fence_offset = params->end_offset + 4;
+ break;
+ case PIPE_QUERY_SO_STATISTICS:
+ params->start_offset = 8 - index * 8;
+ params->end_offset = 24 - index * 8;
+ params->fence_offset = params->end_offset + 4;
+ break;
+ case PIPE_QUERY_PIPELINE_STATISTICS:
+ {
+ /* Offsets apply to EG+ */
+ static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80};
+ params->start_offset = offsets[index];
+ params->end_offset = 88 + offsets[index];
+ params->fence_offset = 2 * 88;
+ break;
+ }
+ default:
+ unreachable("r600_get_hw_query_params unsupported");
+ }
}
-static unsigned r600_query_read_result(char *map, unsigned start_index, unsigned end_index,
+static unsigned r600_query_read_result(void *map, unsigned start_index, unsigned end_index,
bool test_status_bit)
{
uint32_t *current_result = (uint32_t*)map;
@@ -602,84 +936,34 @@ static unsigned r600_query_read_result(char *map, unsigned start_index, unsigned
return 0;
}
-static boolean r600_get_query_buffer_result(struct r600_common_context *ctx,
- struct r600_query *query,
- struct r600_query_buffer *qbuf,
- boolean wait,
- union pipe_query_result *result)
+static void r600_query_hw_add_result(struct r600_common_context *ctx,
+ struct r600_query_hw *query,
+ void *buffer,
+ union pipe_query_result *result)
{
- struct pipe_screen *screen = ctx->b.screen;
- unsigned results_base = 0;
- char *map;
-
- /* Non-GPU queries. */
- switch (query->type) {
- case PIPE_QUERY_TIMESTAMP_DISJOINT:
- /* Convert from cycles per millisecond to cycles per second (Hz). */
- result->timestamp_disjoint.frequency =
- (uint64_t)ctx->screen->info.r600_clock_crystal_freq * 1000;
- result->timestamp_disjoint.disjoint = FALSE;
- return TRUE;
- case PIPE_QUERY_GPU_FINISHED:
- result->b = screen->fence_finish(screen, query->fence,
- wait ? PIPE_TIMEOUT_INFINITE : 0);
- return result->b;
- case R600_QUERY_DRAW_CALLS:
- case R600_QUERY_REQUESTED_VRAM:
- case R600_QUERY_REQUESTED_GTT:
- case R600_QUERY_BUFFER_WAIT_TIME:
- case R600_QUERY_NUM_CS_FLUSHES:
- case R600_QUERY_NUM_BYTES_MOVED:
- case R600_QUERY_VRAM_USAGE:
- case R600_QUERY_GTT_USAGE:
- case R600_QUERY_GPU_TEMPERATURE:
- case R600_QUERY_CURRENT_GPU_SCLK:
- case R600_QUERY_CURRENT_GPU_MCLK:
- case R600_QUERY_NUM_COMPILATIONS:
- case R600_QUERY_NUM_SHADERS_CREATED:
- result->u64 = query->end_result - query->begin_result;
- return TRUE;
- case R600_QUERY_GPU_LOAD:
- result->u64 = query->end_result;
- return TRUE;
- }
-
- map = r600_buffer_map_sync_with_rings(ctx, qbuf->buf,
- PIPE_TRANSFER_READ |
- (wait ? 0 : PIPE_TRANSFER_DONTBLOCK));
- if (!map)
- return FALSE;
-
- /* count all results across all data blocks */
- switch (query->type) {
- case PIPE_QUERY_OCCLUSION_COUNTER:
- while (results_base != qbuf->results_end) {
+ switch (query->b.type) {
+ case PIPE_QUERY_OCCLUSION_COUNTER: {
+ for (unsigned i = 0; i < ctx->max_db; ++i) {
+ unsigned results_base = i * 16;
result->u64 +=
- r600_query_read_result(map + results_base, 0, 2, true);
- results_base += 16;
+ r600_query_read_result(buffer + results_base, 0, 2, true);
}
break;
- case PIPE_QUERY_OCCLUSION_PREDICATE:
- while (results_base != qbuf->results_end) {
+ }
+ case PIPE_QUERY_OCCLUSION_PREDICATE: {
+ for (unsigned i = 0; i < ctx->max_db; ++i) {
+ unsigned results_base = i * 16;
result->b = result->b ||
- r600_query_read_result(map + results_base, 0, 2, true) != 0;
- results_base += 16;
+ r600_query_read_result(buffer + results_base, 0, 2, true) != 0;
}
break;
+ }
case PIPE_QUERY_TIME_ELAPSED:
- while (results_base != qbuf->results_end) {
- result->u64 +=
- r600_query_read_result(map + results_base, 0, 2, false);
- results_base += query->result_size;
- }
+ result->u64 += r600_query_read_result(buffer, 0, 2, false);
break;
case PIPE_QUERY_TIMESTAMP:
- {
- uint32_t *current_result = (uint32_t*)map;
- result->u64 = (uint64_t)current_result[0] |
- (uint64_t)current_result[1] << 32;
+ result->u64 = *(uint64_t*)buffer;
break;
- }
case PIPE_QUERY_PRIMITIVES_EMITTED:
/* SAMPLE_STREAMOUTSTATS stores this structure:
* {
@@ -687,84 +971,64 @@ static boolean r600_get_query_buffer_result(struct r600_common_context *ctx,
* u64 PrimitiveStorageNeeded;
* }
* We only need NumPrimitivesWritten here. */
- while (results_base != qbuf->results_end) {
- result->u64 +=
- r600_query_read_result(map + results_base, 2, 6, true);
- results_base += query->result_size;
- }
+ result->u64 += r600_query_read_result(buffer, 2, 6, true);
break;
case PIPE_QUERY_PRIMITIVES_GENERATED:
/* Here we read PrimitiveStorageNeeded. */
- while (results_base != qbuf->results_end) {
- result->u64 +=
- r600_query_read_result(map + results_base, 0, 4, true);
- results_base += query->result_size;
- }
+ result->u64 += r600_query_read_result(buffer, 0, 4, true);
break;
case PIPE_QUERY_SO_STATISTICS:
- while (results_base != qbuf->results_end) {
- result->so_statistics.num_primitives_written +=
- r600_query_read_result(map + results_base, 2, 6, true);
- result->so_statistics.primitives_storage_needed +=
- r600_query_read_result(map + results_base, 0, 4, true);
- results_base += query->result_size;
- }
+ result->so_statistics.num_primitives_written +=
+ r600_query_read_result(buffer, 2, 6, true);
+ result->so_statistics.primitives_storage_needed +=
+ r600_query_read_result(buffer, 0, 4, true);
break;
case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
- while (results_base != qbuf->results_end) {
- result->b = result->b ||
- r600_query_read_result(map + results_base, 2, 6, true) !=
- r600_query_read_result(map + results_base, 0, 4, true);
- results_base += query->result_size;
- }
+ result->b = result->b ||
+ r600_query_read_result(buffer, 2, 6, true) !=
+ r600_query_read_result(buffer, 0, 4, true);
break;
case PIPE_QUERY_PIPELINE_STATISTICS:
if (ctx->chip_class >= EVERGREEN) {
- while (results_base != qbuf->results_end) {
- result->pipeline_statistics.ps_invocations +=
- r600_query_read_result(map + results_base, 0, 22, false);
- result->pipeline_statistics.c_primitives +=
- r600_query_read_result(map + results_base, 2, 24, false);
- result->pipeline_statistics.c_invocations +=
- r600_query_read_result(map + results_base, 4, 26, false);
- result->pipeline_statistics.vs_invocations +=
- r600_query_read_result(map + results_base, 6, 28, false);
- result->pipeline_statistics.gs_invocations +=
- r600_query_read_result(map + results_base, 8, 30, false);
- result->pipeline_statistics.gs_primitives +=
- r600_query_read_result(map + results_base, 10, 32, false);
- result->pipeline_statistics.ia_primitives +=
- r600_query_read_result(map + results_base, 12, 34, false);
- result->pipeline_statistics.ia_vertices +=
- r600_query_read_result(map + results_base, 14, 36, false);
- result->pipeline_statistics.hs_invocations +=
- r600_query_read_result(map + results_base, 16, 38, false);
- result->pipeline_statistics.ds_invocations +=
- r600_query_read_result(map + results_base, 18, 40, false);
- result->pipeline_statistics.cs_invocations +=
- r600_query_read_result(map + results_base, 20, 42, false);
- results_base += query->result_size;
- }
+ result->pipeline_statistics.ps_invocations +=
+ r600_query_read_result(buffer, 0, 22, false);
+ result->pipeline_statistics.c_primitives +=
+ r600_query_read_result(buffer, 2, 24, false);
+ result->pipeline_statistics.c_invocations +=
+ r600_query_read_result(buffer, 4, 26, false);
+ result->pipeline_statistics.vs_invocations +=
+ r600_query_read_result(buffer, 6, 28, false);
+ result->pipeline_statistics.gs_invocations +=
+ r600_query_read_result(buffer, 8, 30, false);
+ result->pipeline_statistics.gs_primitives +=
+ r600_query_read_result(buffer, 10, 32, false);
+ result->pipeline_statistics.ia_primitives +=
+ r600_query_read_result(buffer, 12, 34, false);
+ result->pipeline_statistics.ia_vertices +=
+ r600_query_read_result(buffer, 14, 36, false);
+ result->pipeline_statistics.hs_invocations +=
+ r600_query_read_result(buffer, 16, 38, false);
+ result->pipeline_statistics.ds_invocations +=
+ r600_query_read_result(buffer, 18, 40, false);
+ result->pipeline_statistics.cs_invocations +=
+ r600_query_read_result(buffer, 20, 42, false);
} else {
- while (results_base != qbuf->results_end) {
- result->pipeline_statistics.ps_invocations +=
- r600_query_read_result(map + results_base, 0, 16, false);
- result->pipeline_statistics.c_primitives +=
- r600_query_read_result(map + results_base, 2, 18, false);
- result->pipeline_statistics.c_invocations +=
- r600_query_read_result(map + results_base, 4, 20, false);
- result->pipeline_statistics.vs_invocations +=
- r600_query_read_result(map + results_base, 6, 22, false);
- result->pipeline_statistics.gs_invocations +=
- r600_query_read_result(map + results_base, 8, 24, false);
- result->pipeline_statistics.gs_primitives +=
- r600_query_read_result(map + results_base, 10, 26, false);
- result->pipeline_statistics.ia_primitives +=
- r600_query_read_result(map + results_base, 12, 28, false);
- result->pipeline_statistics.ia_vertices +=
- r600_query_read_result(map + results_base, 14, 30, false);
- results_base += query->result_size;
- }
+ result->pipeline_statistics.ps_invocations +=
+ r600_query_read_result(buffer, 0, 16, false);
+ result->pipeline_statistics.c_primitives +=
+ r600_query_read_result(buffer, 2, 18, false);
+ result->pipeline_statistics.c_invocations +=
+ r600_query_read_result(buffer, 4, 20, false);
+ result->pipeline_statistics.vs_invocations +=
+ r600_query_read_result(buffer, 6, 22, false);
+ result->pipeline_statistics.gs_invocations +=
+ r600_query_read_result(buffer, 8, 24, false);
+ result->pipeline_statistics.gs_primitives +=
+ r600_query_read_result(buffer, 10, 26, false);
+ result->pipeline_statistics.ia_primitives +=
+ r600_query_read_result(buffer, 12, 28, false);
+ result->pipeline_statistics.ia_vertices +=
+ r600_query_read_result(buffer, 14, 30, false);
}
#if 0 /* for testing */
printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, "
@@ -786,118 +1050,482 @@ static boolean r600_get_query_buffer_result(struct r600_common_context *ctx,
default:
assert(0);
}
-
- return TRUE;
}
static boolean r600_get_query_result(struct pipe_context *ctx,
- struct pipe_query *query,
- boolean wait, union pipe_query_result *result)
+ struct pipe_query *query, boolean wait,
+ union pipe_query_result *result)
{
struct r600_common_context *rctx = (struct r600_common_context *)ctx;
struct r600_query *rquery = (struct r600_query *)query;
+
+ return rquery->ops->get_result(rctx, rquery, wait, result);
+}
+
+static void r600_get_query_result_resource(struct pipe_context *ctx,
+ struct pipe_query *query,
+ boolean wait,
+ enum pipe_query_value_type result_type,
+ int index,
+ struct pipe_resource *resource,
+ unsigned offset)
+{
+ struct r600_common_context *rctx = (struct r600_common_context *)ctx;
+ struct r600_query *rquery = (struct r600_query *)query;
+
+ rquery->ops->get_result_resource(rctx, rquery, wait, result_type, index,
+ resource, offset);
+}
+
+static void r600_query_hw_clear_result(struct r600_query_hw *query,
+ union pipe_query_result *result)
+{
+ util_query_clear_result(result, query->b.type);
+}
+
+bool r600_query_hw_get_result(struct r600_common_context *rctx,
+ struct r600_query *rquery,
+ bool wait, union pipe_query_result *result)
+{
+ struct r600_query_hw *query = (struct r600_query_hw *)rquery;
struct r600_query_buffer *qbuf;
- util_query_clear_result(result, rquery->type);
+ query->ops->clear_result(query, result);
+
+ for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
+ unsigned results_base = 0;
+ void *map;
- for (qbuf = &rquery->buffer; qbuf; qbuf = qbuf->previous) {
- if (!r600_get_query_buffer_result(rctx, rquery, qbuf, wait, result)) {
- return FALSE;
+ map = r600_buffer_map_sync_with_rings(rctx, qbuf->buf,
+ PIPE_TRANSFER_READ |
+ (wait ? 0 : PIPE_TRANSFER_DONTBLOCK));
+ if (!map)
+ return false;
+
+ while (results_base != qbuf->results_end) {
+ query->ops->add_result(rctx, query, map + results_base,
+ result);
+ results_base += query->result_size;
}
}
/* Convert the time to expected units. */
if (rquery->type == PIPE_QUERY_TIME_ELAPSED ||
rquery->type == PIPE_QUERY_TIMESTAMP) {
- result->u64 = (1000000 * result->u64) / rctx->screen->info.r600_clock_crystal_freq;
+ result->u64 = (1000000 * result->u64) / rctx->screen->info.clock_crystal_freq;
}
- return TRUE;
+ return true;
}
-static void r600_render_condition(struct pipe_context *ctx,
- struct pipe_query *query,
- boolean condition,
- uint mode)
+/* Create the compute shader that is used to collect the results.
+ *
+ * One compute grid with a single thread is launched for every query result
+ * buffer. The thread (optionally) reads a previous summary buffer, then
+ * accumulates data from the query result buffer, and writes the result either
+ * to a summary buffer to be consumed by the next grid invocation or to the
+ * user-supplied buffer.
+ *
+ * Data layout:
+ *
+ * CONST
+ * 0.x = end_offset
+ * 0.y = result_stride
+ * 0.z = result_count
+ * 0.w = bit field:
+ * 1: read previously accumulated values
+ * 2: write accumulated values for chaining
+ * 4: write result available
+ * 8: convert result to boolean (0/1)
+ * 16: only read one dword and use that as result
+ * 32: apply timestamp conversion
+ * 64: store full 64 bits result
+ * 128: store signed 32 bits result
+ * 1.x = fence_offset
+ * 1.y = pair_stride
+ * 1.z = pair_count
+ *
+ * BUFFER[0] = query result buffer
+ * BUFFER[1] = previous summary buffer
+ * BUFFER[2] = next summary buffer or user-supplied buffer
+ */
+static void r600_create_query_result_shader(struct r600_common_context *rctx)
{
- struct r600_common_context *rctx = (struct r600_common_context *)ctx;
- struct r600_query *rquery = (struct r600_query *)query;
- bool wait_flag = false;
-
- rctx->current_render_cond = query;
- rctx->current_render_cond_cond = condition;
- rctx->current_render_cond_mode = mode;
+ /* TEMP[0].xy = accumulated result so far
+ * TEMP[0].z = result not available
+ *
+ * TEMP[1].x = current result index
+ * TEMP[1].y = current pair index
+ */
+ static const char text_tmpl[] =
+ "COMP\n"
+ "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
+ "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
+ "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+ "DCL BUFFER[0]\n"
+ "DCL BUFFER[1]\n"
+ "DCL BUFFER[2]\n"
+ "DCL CONST[0..1]\n"
+ "DCL TEMP[0..5]\n"
+ "IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n"
+ "IMM[1] UINT32 {1, 2, 4, 8}\n"
+ "IMM[2] UINT32 {16, 32, 64, 128}\n"
+ "IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */
+
+ "AND TEMP[5], CONST[0].wwww, IMM[2].xxxx\n"
+ "UIF TEMP[5]\n"
+ /* Check result availability. */
+ "LOAD TEMP[1].x, BUFFER[0], CONST[1].xxxx\n"
+ "ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n"
+ "MOV TEMP[1], TEMP[0].zzzz\n"
+ "NOT TEMP[0].z, TEMP[0].zzzz\n"
+
+ /* Load result if available. */
+ "UIF TEMP[1]\n"
+ "LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n"
+ "ENDIF\n"
+ "ELSE\n"
+ /* Load previously accumulated result if requested. */
+ "MOV TEMP[0], IMM[0].xxxx\n"
+ "AND TEMP[4], CONST[0].wwww, IMM[1].xxxx\n"
+ "UIF TEMP[4]\n"
+ "LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n"
+ "ENDIF\n"
+
+ "MOV TEMP[1].x, IMM[0].xxxx\n"
+ "BGNLOOP\n"
+ /* Break if accumulated result so far is not available. */
+ "UIF TEMP[0].zzzz\n"
+ "BRK\n"
+ "ENDIF\n"
+
+ /* Break if result_index >= result_count. */
+ "USGE TEMP[5], TEMP[1].xxxx, CONST[0].zzzz\n"
+ "UIF TEMP[5]\n"
+ "BRK\n"
+ "ENDIF\n"
+
+ /* Load fence and check result availability */
+ "UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0].yyyy, CONST[1].xxxx\n"
+ "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
+ "ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n"
+ "NOT TEMP[0].z, TEMP[0].zzzz\n"
+ "UIF TEMP[0].zzzz\n"
+ "BRK\n"
+ "ENDIF\n"
+
+ "MOV TEMP[1].y, IMM[0].xxxx\n"
+ "BGNLOOP\n"
+ /* Load start and end. */
+ "UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0].yyyy\n"
+ "UMAD TEMP[5].x, TEMP[1].yyyy, CONST[1].yyyy, TEMP[5].xxxx\n"
+ "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
+
+ "UADD TEMP[5].x, TEMP[5].xxxx, CONST[0].xxxx\n"
+ "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].xxxx\n"
+
+ "U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n"
+ "U64ADD TEMP[0].xy, TEMP[0], TEMP[3]\n"
+
+ /* Increment pair index */
+ "UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n"
+ "USGE TEMP[5], TEMP[1].yyyy, CONST[1].zzzz\n"
+ "UIF TEMP[5]\n"
+ "BRK\n"
+ "ENDIF\n"
+ "ENDLOOP\n"
+
+ /* Increment result index */
+ "UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n"
+ "ENDLOOP\n"
+ "ENDIF\n"
+
+ "AND TEMP[4], CONST[0].wwww, IMM[1].yyyy\n"
+ "UIF TEMP[4]\n"
+ /* Store accumulated data for chaining. */
+ "STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n"
+ "ELSE\n"
+ "AND TEMP[4], CONST[0].wwww, IMM[1].zzzz\n"
+ "UIF TEMP[4]\n"
+ /* Store result availability. */
+ "NOT TEMP[0].z, TEMP[0]\n"
+ "AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n"
+ "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n"
+
+ "AND TEMP[4], CONST[0].wwww, IMM[2].zzzz\n"
+ "UIF TEMP[4]\n"
+ "STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n"
+ "ENDIF\n"
+ "ELSE\n"
+ /* Store result if it is available. */
+ "NOT TEMP[4], TEMP[0].zzzz\n"
+ "UIF TEMP[4]\n"
+ /* Apply timestamp conversion */
+ "AND TEMP[4], CONST[0].wwww, IMM[2].yyyy\n"
+ "UIF TEMP[4]\n"
+ "U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n"
+ "U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n"
+ "ENDIF\n"
+
+ /* Convert to boolean */
+ "AND TEMP[4], CONST[0].wwww, IMM[1].wwww\n"
+ "UIF TEMP[4]\n"
+ "U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[0].xxxx\n"
+ "AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n"
+ "MOV TEMP[0].y, IMM[0].xxxx\n"
+ "ENDIF\n"
+
+ "AND TEMP[4], CONST[0].wwww, IMM[2].zzzz\n"
+ "UIF TEMP[4]\n"
+ "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n"
+ "ELSE\n"
+ /* Clamping */
+ "UIF TEMP[0].yyyy\n"
+ "MOV TEMP[0].x, IMM[0].wwww\n"
+ "ENDIF\n"
+
+ "AND TEMP[4], CONST[0].wwww, IMM[2].wwww\n"
+ "UIF TEMP[4]\n"
+ "UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n"
+ "ENDIF\n"
+
+ "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
+ "ENDIF\n"
+ "ENDIF\n"
+ "ENDIF\n"
+ "ENDIF\n"
+
+ "END\n";
+
+ char text[sizeof(text_tmpl) + 32];
+ struct tgsi_token tokens[1024];
+ struct pipe_compute_state state = {};
+
+ /* Hard code the frequency into the shader so that the backend can
+ * use the full range of optimizations for divide-by-constant.
+ */
+ snprintf(text, sizeof(text), text_tmpl,
+ rctx->screen->info.clock_crystal_freq);
- if (query == NULL) {
- if (rctx->predicate_drawing) {
- rctx->predicate_drawing = false;
- r600_emit_query_predication(rctx, NULL, PREDICATION_OP_CLEAR, false);
- }
+ if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
+ assert(false);
return;
}
- if (mode == PIPE_RENDER_COND_WAIT ||
- mode == PIPE_RENDER_COND_BY_REGION_WAIT) {
- wait_flag = true;
+ state.ir_type = PIPE_SHADER_IR_TGSI;
+ state.prog = tokens;
+
+ rctx->query_result_shader = rctx->b.create_compute_state(&rctx->b, &state);
+}
+
+static void r600_restore_qbo_state(struct r600_common_context *rctx,
+ struct r600_qbo_state *st)
+{
+ rctx->b.bind_compute_state(&rctx->b, st->saved_compute);
+
+ rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &st->saved_const0);
+ pipe_resource_reference(&st->saved_const0.buffer, NULL);
+
+ rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo);
+ for (unsigned i = 0; i < 3; ++i)
+ pipe_resource_reference(&st->saved_ssbo[i].buffer, NULL);
+}
+
+static void r600_query_hw_get_result_resource(struct r600_common_context *rctx,
+ struct r600_query *rquery,
+ bool wait,
+ enum pipe_query_value_type result_type,
+ int index,
+ struct pipe_resource *resource,
+ unsigned offset)
+{
+ struct r600_query_hw *query = (struct r600_query_hw *)rquery;
+ struct r600_query_buffer *qbuf;
+ struct r600_query_buffer *qbuf_prev;
+ struct pipe_resource *tmp_buffer = NULL;
+ unsigned tmp_buffer_offset = 0;
+ struct r600_qbo_state saved_state = {};
+ struct pipe_grid_info grid = {};
+ struct pipe_constant_buffer constant_buffer = {};
+ struct pipe_shader_buffer ssbo[3];
+ struct r600_hw_query_params params;
+ struct {
+ uint32_t end_offset;
+ uint32_t result_stride;
+ uint32_t result_count;
+ uint32_t config;
+ uint32_t fence_offset;
+ uint32_t pair_stride;
+ uint32_t pair_count;
+ } consts;
+
+ if (!rctx->query_result_shader) {
+ r600_create_query_result_shader(rctx);
+ if (!rctx->query_result_shader)
+ return;
}
- rctx->predicate_drawing = true;
+ if (query->buffer.previous) {
+ u_suballocator_alloc(rctx->allocator_zeroed_memory, 16, 16,
+ &tmp_buffer_offset, &tmp_buffer);
+ if (!tmp_buffer)
+ return;
+ }
- switch (rquery->type) {
- case PIPE_QUERY_OCCLUSION_COUNTER:
- case PIPE_QUERY_OCCLUSION_PREDICATE:
- r600_emit_query_predication(rctx, rquery, PREDICATION_OP_ZPASS, wait_flag);
+ rctx->save_qbo_state(&rctx->b, &saved_state);
+
+ r600_get_hw_query_params(rctx, query, index >= 0 ? index : 0, &params);
+ consts.end_offset = params.end_offset - params.start_offset;
+ consts.fence_offset = params.fence_offset - params.start_offset;
+ consts.result_stride = query->result_size;
+ consts.pair_stride = params.pair_stride;
+ consts.pair_count = params.pair_count;
+
+ constant_buffer.buffer_size = sizeof(consts);
+ constant_buffer.user_buffer = &consts;
+
+ ssbo[1].buffer = tmp_buffer;
+ ssbo[1].buffer_offset = tmp_buffer_offset;
+ ssbo[1].buffer_size = 16;
+
+ ssbo[2] = ssbo[1];
+
+ rctx->b.bind_compute_state(&rctx->b, rctx->query_result_shader);
+
+ grid.block[0] = 1;
+ grid.block[1] = 1;
+ grid.block[2] = 1;
+ grid.grid[0] = 1;
+ grid.grid[1] = 1;
+ grid.grid[2] = 1;
+
+ consts.config = 0;
+ if (index < 0)
+ consts.config |= 4;
+ if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
+ query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE)
+ consts.config |= 8;
+ else if (query->b.type == PIPE_QUERY_TIMESTAMP ||
+ query->b.type == PIPE_QUERY_TIME_ELAPSED)
+ consts.config |= 32;
+
+ switch (result_type) {
+ case PIPE_QUERY_TYPE_U64:
+ case PIPE_QUERY_TYPE_I64:
+ consts.config |= 64;
break;
- case PIPE_QUERY_PRIMITIVES_EMITTED:
- case PIPE_QUERY_PRIMITIVES_GENERATED:
- case PIPE_QUERY_SO_STATISTICS:
- case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
- r600_emit_query_predication(rctx, rquery, PREDICATION_OP_PRIMCOUNT, wait_flag);
+ case PIPE_QUERY_TYPE_I32:
+ consts.config |= 128;
+ break;
+ case PIPE_QUERY_TYPE_U32:
break;
- default:
- assert(0);
}
-}
-static void r600_suspend_queries(struct r600_common_context *ctx,
- struct list_head *query_list,
- unsigned *num_cs_dw_queries_suspend)
-{
- struct r600_query *query;
+ rctx->flags |= rctx->screen->barrier_flags.cp_to_L2;
+
+ for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
+ if (query->b.type != PIPE_QUERY_TIMESTAMP) {
+ qbuf_prev = qbuf->previous;
+ consts.result_count = qbuf->results_end / query->result_size;
+ consts.config &= ~3;
+ if (qbuf != &query->buffer)
+ consts.config |= 1;
+ if (qbuf->previous)
+ consts.config |= 2;
+ } else {
+ /* Only read the last timestamp. */
+ qbuf_prev = NULL;
+ consts.result_count = 0;
+ consts.config |= 16;
+ params.start_offset += qbuf->results_end - query->result_size;
+ }
- LIST_FOR_EACH_ENTRY(query, query_list, list) {
- r600_emit_query_end(ctx, query);
+ rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
+
+ ssbo[0].buffer = &qbuf->buf->b.b;
+ ssbo[0].buffer_offset = params.start_offset;
+ ssbo[0].buffer_size = qbuf->results_end - params.start_offset;
+
+ if (!qbuf->previous) {
+ ssbo[2].buffer = resource;
+ ssbo[2].buffer_offset = offset;
+ ssbo[2].buffer_size = 8;
+
+ ((struct r600_resource *)resource)->TC_L2_dirty = true;
+ }
+
+ rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo);
+
+ if (wait && qbuf == &query->buffer) {
+ uint64_t va;
+
+ /* Wait for result availability. Wait only for readiness
+ * of the last entry, since the fence writes should be
+ * serialized in the CP.
+ */
+ va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size;
+ va += params.fence_offset;
+
+ r600_gfx_wait_fence(rctx, va, 0x80000000, 0x80000000);
+ }
+
+ rctx->b.launch_grid(&rctx->b, &grid);
+ rctx->flags |= rctx->screen->barrier_flags.compute_to_L2;
}
- assert(*num_cs_dw_queries_suspend == 0);
+
+ r600_restore_qbo_state(rctx, &saved_state);
+ pipe_resource_reference(&tmp_buffer, NULL);
}
-void r600_suspend_nontimer_queries(struct r600_common_context *ctx)
+static void r600_render_condition(struct pipe_context *ctx,
+ struct pipe_query *query,
+ boolean condition,
+ uint mode)
{
- r600_suspend_queries(ctx, &ctx->active_nontimer_queries,
- &ctx->num_cs_dw_nontimer_queries_suspend);
+ struct r600_common_context *rctx = (struct r600_common_context *)ctx;
+ struct r600_query_hw *rquery = (struct r600_query_hw *)query;
+ struct r600_query_buffer *qbuf;
+ struct r600_atom *atom = &rctx->render_cond_atom;
+
+ rctx->render_cond = query;
+ rctx->render_cond_invert = condition;
+ rctx->render_cond_mode = mode;
+
+ /* Compute the size of SET_PREDICATION packets. */
+ atom->num_dw = 0;
+ if (query) {
+ for (qbuf = &rquery->buffer; qbuf; qbuf = qbuf->previous)
+ atom->num_dw += (qbuf->results_end / rquery->result_size) * 5;
+ }
+
+ rctx->set_atom_dirty(rctx, atom, query != NULL);
}
-void r600_suspend_timer_queries(struct r600_common_context *ctx)
+void r600_suspend_queries(struct r600_common_context *ctx)
{
- r600_suspend_queries(ctx, &ctx->active_timer_queries,
- &ctx->num_cs_dw_timer_queries_suspend);
+ struct r600_query_hw *query;
+
+ LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) {
+ r600_query_hw_emit_stop(ctx, query);
+ }
+ assert(ctx->num_cs_dw_queries_suspend == 0);
}
static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context *ctx,
struct list_head *query_list)
{
- struct r600_query *query;
+ struct r600_query_hw *query;
unsigned num_dw = 0;
LIST_FOR_EACH_ENTRY(query, query_list, list) {
/* begin + end */
- num_dw += query->num_cs_dw * 2;
+ num_dw += query->num_cs_dw_begin + query->num_cs_dw_end;
/* Workaround for the fact that
* num_cs_dw_nontimer_queries_suspend is incremented for every
* resumed query, which raises the bar in need_cs_space for
* queries about to be resumed.
*/
- num_dw += query->num_cs_dw;
+ num_dw += query->num_cs_dw_end;
}
/* primitives generated query */
num_dw += ctx->streamout.enable_atom.num_dw;
@@ -907,48 +1535,34 @@ static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context *
return num_dw;
}
-static void r600_resume_queries(struct r600_common_context *ctx,
- struct list_head *query_list,
- unsigned *num_cs_dw_queries_suspend)
+void r600_resume_queries(struct r600_common_context *ctx)
{
- struct r600_query *query;
- unsigned num_cs_dw = r600_queries_num_cs_dw_for_resuming(ctx, query_list);
+ struct r600_query_hw *query;
+ unsigned num_cs_dw = r600_queries_num_cs_dw_for_resuming(ctx, &ctx->active_queries);
- assert(*num_cs_dw_queries_suspend == 0);
+ assert(ctx->num_cs_dw_queries_suspend == 0);
/* Check CS space here. Resuming must not be interrupted by flushes. */
- ctx->need_gfx_cs_space(&ctx->b, num_cs_dw, TRUE);
+ ctx->need_gfx_cs_space(&ctx->b, num_cs_dw, true);
- LIST_FOR_EACH_ENTRY(query, query_list, list) {
- r600_emit_query_begin(ctx, query);
+ LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) {
+ r600_query_hw_emit_start(ctx, query);
}
}
-void r600_resume_nontimer_queries(struct r600_common_context *ctx)
-{
- r600_resume_queries(ctx, &ctx->active_nontimer_queries,
- &ctx->num_cs_dw_nontimer_queries_suspend);
-}
-
-void r600_resume_timer_queries(struct r600_common_context *ctx)
-{
- r600_resume_queries(ctx, &ctx->active_timer_queries,
- &ctx->num_cs_dw_timer_queries_suspend);
-}
-
/* Get backends mask */
void r600_query_init_backend_mask(struct r600_common_context *ctx)
{
- struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;
+ struct radeon_winsys_cs *cs = ctx->gfx.cs;
struct r600_resource *buffer;
uint32_t *results;
- unsigned num_backends = ctx->screen->info.r600_num_backends;
+ unsigned num_backends = ctx->screen->info.num_render_backends;
unsigned i, mask = 0;
/* if backend_map query is supported by the kernel */
- if (ctx->screen->info.r600_backend_map_valid) {
- unsigned num_tile_pipes = ctx->screen->info.r600_num_tile_pipes;
- unsigned backend_map = ctx->screen->info.r600_backend_map;
+ if (ctx->screen->info.r600_gb_backend_map_valid) {
+ unsigned num_tile_pipes = ctx->screen->info.num_tile_pipes;
+ unsigned backend_map = ctx->screen->info.r600_gb_backend_map;
unsigned item_width, item_mask;
if (ctx->chip_class >= EVERGREEN) {
@@ -959,7 +1573,7 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx)
item_mask = 0x3;
}
- while(num_tile_pipes--) {
+ while (num_tile_pipes--) {
i = backend_map & item_mask;
mask |= (1<<i);
backend_map >>= item_width;
@@ -990,7 +1604,8 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx)
radeon_emit(cs, buffer->gpu_address);
radeon_emit(cs, buffer->gpu_address >> 32);
- r600_emit_reloc(ctx, &ctx->rings.gfx, buffer, RADEON_USAGE_WRITE, RADEON_PRIO_MIN);
+ r600_emit_reloc(ctx, &ctx->gfx, buffer,
+ RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
/* analyze results */
results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_TRANSFER_READ);
@@ -1003,7 +1618,7 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx)
}
}
- pipe_resource_reference((struct pipe_resource**)&buffer, NULL);
+ r600_resource_reference(&buffer, NULL);
if (mask != 0) {
ctx->backend_mask = mask;
@@ -1016,17 +1631,167 @@ err:
return;
}
+#define XFULL(name_, query_type_, type_, result_type_, group_id_) \
+ { \
+ .name = name_, \
+ .query_type = R600_QUERY_##query_type_, \
+ .type = PIPE_DRIVER_QUERY_TYPE_##type_, \
+ .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, \
+ .group_id = group_id_ \
+ }
+
+#define X(name_, query_type_, type_, result_type_) \
+ XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0)
+
+#define XG(group_, name_, query_type_, type_, result_type_) \
+ XFULL(name_, query_type_, type_, result_type_, R600_QUERY_GROUP_##group_)
+
+static struct pipe_driver_query_info r600_driver_query_list[] = {
+ X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE),
+ X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE),
+ X("draw-calls", DRAW_CALLS, UINT64, AVERAGE),
+ X("spill-draw-calls", SPILL_DRAW_CALLS, UINT64, AVERAGE),
+ X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE),
+ X("spill-compute-calls", SPILL_COMPUTE_CALLS, UINT64, AVERAGE),
+ X("dma-calls", DMA_CALLS, UINT64, AVERAGE),
+ X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE),
+ X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE),
+ X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE),
+ X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE),
+ X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE),
+ X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE),
+ X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE),
+ X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE),
+ X("num-ctx-flushes", NUM_CTX_FLUSHES, UINT64, AVERAGE),
+ X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE),
+ X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE),
+ X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE),
+ X("GTT-usage", GTT_USAGE, BYTES, AVERAGE),
+ X("back-buffer-ps-draw-ratio", BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE),
+
+ /* GPIN queries are for the benefit of old versions of GPUPerfStudio,
+ * which use it as a fallback path to detect the GPU type.
+ *
+ * Note: The names of these queries are significant for GPUPerfStudio
+ * (and possibly their order as well). */
+ XG(GPIN, "GPIN_000", GPIN_ASIC_ID, UINT, AVERAGE),
+ XG(GPIN, "GPIN_001", GPIN_NUM_SIMD, UINT, AVERAGE),
+ XG(GPIN, "GPIN_002", GPIN_NUM_RB, UINT, AVERAGE),
+ XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE),
+ XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE),
+
+ /* The following queries must be at the end of the list because their
+ * availability is adjusted dynamically based on the DRM version. */
+ X("GPU-load", GPU_LOAD, UINT64, AVERAGE),
+ X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE),
+ X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE),
+ X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE),
+};
+
+#undef X
+#undef XG
+#undef XFULL
+
+static unsigned r600_get_num_queries(struct r600_common_screen *rscreen)
+{
+ if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42)
+ return ARRAY_SIZE(r600_driver_query_list);
+ else if (rscreen->info.drm_major == 3)
+ return ARRAY_SIZE(r600_driver_query_list) - 3;
+ else
+ return ARRAY_SIZE(r600_driver_query_list) - 4;
+}
+
+static int r600_get_driver_query_info(struct pipe_screen *screen,
+ unsigned index,
+ struct pipe_driver_query_info *info)
+{
+ struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
+ unsigned num_queries = r600_get_num_queries(rscreen);
+
+ if (!info) {
+ unsigned num_perfcounters =
+ r600_get_perfcounter_info(rscreen, 0, NULL);
+
+ return num_queries + num_perfcounters;
+ }
+
+ if (index >= num_queries)
+ return r600_get_perfcounter_info(rscreen, index - num_queries, info);
+
+ *info = r600_driver_query_list[index];
+
+ switch (info->query_type) {
+ case R600_QUERY_REQUESTED_VRAM:
+ case R600_QUERY_VRAM_USAGE:
+ case R600_QUERY_MAPPED_VRAM:
+ info->max_value.u64 = rscreen->info.vram_size;
+ break;
+ case R600_QUERY_REQUESTED_GTT:
+ case R600_QUERY_GTT_USAGE:
+ case R600_QUERY_MAPPED_GTT:
+ info->max_value.u64 = rscreen->info.gart_size;
+ break;
+ case R600_QUERY_GPU_TEMPERATURE:
+ info->max_value.u64 = 125;
+ break;
+ }
+
+ if (info->group_id != ~(unsigned)0 && rscreen->perfcounters)
+ info->group_id += rscreen->perfcounters->num_groups;
+
+ return 1;
+}
+
+/* Note: Unfortunately, GPUPerfStudio hardcodes the order of hardware
+ * performance counter groups, so be careful when changing this and related
+ * functions.
+ */
+static int r600_get_driver_query_group_info(struct pipe_screen *screen,
+ unsigned index,
+ struct pipe_driver_query_group_info *info)
+{
+ struct r600_common_screen *rscreen = (struct r600_common_screen *)screen;
+ unsigned num_pc_groups = 0;
+
+ if (rscreen->perfcounters)
+ num_pc_groups = rscreen->perfcounters->num_groups;
+
+ if (!info)
+ return num_pc_groups + R600_NUM_SW_QUERY_GROUPS;
+
+ if (index < num_pc_groups)
+ return r600_get_perfcounter_group_info(rscreen, index, info);
+
+ index -= num_pc_groups;
+ if (index >= R600_NUM_SW_QUERY_GROUPS)
+ return 0;
+
+ info->name = "GPIN";
+ info->max_active_queries = 5;
+ info->num_queries = 5;
+ return 1;
+}
+
void r600_query_init(struct r600_common_context *rctx)
{
rctx->b.create_query = r600_create_query;
+ rctx->b.create_batch_query = r600_create_batch_query;
rctx->b.destroy_query = r600_destroy_query;
rctx->b.begin_query = r600_begin_query;
rctx->b.end_query = r600_end_query;
rctx->b.get_query_result = r600_get_query_result;
+ rctx->b.get_query_result_resource = r600_get_query_result_resource;
+ rctx->render_cond_atom.emit = r600_emit_query_predication;
- if (((struct r600_common_screen*)rctx->b.screen)->info.r600_num_backends > 0)
+ if (((struct r600_common_screen*)rctx->b.screen)->info.num_render_backends > 0)
rctx->b.render_condition = r600_render_condition;
- LIST_INITHEAD(&rctx->active_nontimer_queries);
- LIST_INITHEAD(&rctx->active_timer_queries);
+ LIST_INITHEAD(&rctx->active_queries);
+}
+
+void r600_init_screen_query_functions(struct r600_common_screen *rscreen)
+{
+ rscreen->b.get_driver_query_info = r600_get_driver_query_info;
+ rscreen->b.get_driver_query_group_info = r600_get_driver_query_group_info;
}
diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_query.h b/lib/mesa/src/gallium/drivers/radeon/r600_query.h
index 8b2c4e3fe..14c433d91 100644
--- a/lib/mesa/src/gallium/drivers/radeon/r600_query.h
+++ b/lib/mesa/src/gallium/drivers/radeon/r600_query.h
@@ -29,10 +29,12 @@
#define R600_QUERY_H
#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
#include "util/list.h"
struct pipe_context;
struct pipe_query;
+struct pipe_resource;
struct r600_common_context;
struct r600_common_screen;
@@ -40,26 +42,40 @@ struct r600_query;
struct r600_query_hw;
struct r600_resource;
-#define R600_QUERY_DRAW_CALLS (PIPE_QUERY_DRIVER_SPECIFIC + 0)
-#define R600_QUERY_REQUESTED_VRAM (PIPE_QUERY_DRIVER_SPECIFIC + 1)
-#define R600_QUERY_REQUESTED_GTT (PIPE_QUERY_DRIVER_SPECIFIC + 2)
-#define R600_QUERY_BUFFER_WAIT_TIME (PIPE_QUERY_DRIVER_SPECIFIC + 3)
-#define R600_QUERY_NUM_CS_FLUSHES (PIPE_QUERY_DRIVER_SPECIFIC + 4)
-#define R600_QUERY_NUM_BYTES_MOVED (PIPE_QUERY_DRIVER_SPECIFIC + 5)
-#define R600_QUERY_VRAM_USAGE (PIPE_QUERY_DRIVER_SPECIFIC + 6)
-#define R600_QUERY_GTT_USAGE (PIPE_QUERY_DRIVER_SPECIFIC + 7)
-#define R600_QUERY_GPU_TEMPERATURE (PIPE_QUERY_DRIVER_SPECIFIC + 8)
-#define R600_QUERY_CURRENT_GPU_SCLK (PIPE_QUERY_DRIVER_SPECIFIC + 9)
-#define R600_QUERY_CURRENT_GPU_MCLK (PIPE_QUERY_DRIVER_SPECIFIC + 10)
-#define R600_QUERY_GPU_LOAD (PIPE_QUERY_DRIVER_SPECIFIC + 11)
-#define R600_QUERY_NUM_COMPILATIONS (PIPE_QUERY_DRIVER_SPECIFIC + 12)
-#define R600_QUERY_NUM_SHADERS_CREATED (PIPE_QUERY_DRIVER_SPECIFIC + 13)
-#define R600_QUERY_GPIN_ASIC_ID (PIPE_QUERY_DRIVER_SPECIFIC + 14)
-#define R600_QUERY_GPIN_NUM_SIMD (PIPE_QUERY_DRIVER_SPECIFIC + 15)
-#define R600_QUERY_GPIN_NUM_RB (PIPE_QUERY_DRIVER_SPECIFIC + 16)
-#define R600_QUERY_GPIN_NUM_SPI (PIPE_QUERY_DRIVER_SPECIFIC + 17)
-#define R600_QUERY_GPIN_NUM_SE (PIPE_QUERY_DRIVER_SPECIFIC + 18)
-#define R600_QUERY_FIRST_PERFCOUNTER (PIPE_QUERY_DRIVER_SPECIFIC + 100)
+enum {
+ R600_QUERY_DRAW_CALLS = PIPE_QUERY_DRIVER_SPECIFIC,
+ R600_QUERY_SPILL_DRAW_CALLS,
+ R600_QUERY_COMPUTE_CALLS,
+ R600_QUERY_SPILL_COMPUTE_CALLS,
+ R600_QUERY_DMA_CALLS,
+ R600_QUERY_NUM_VS_FLUSHES,
+ R600_QUERY_NUM_PS_FLUSHES,
+ R600_QUERY_NUM_CS_FLUSHES,
+ R600_QUERY_REQUESTED_VRAM,
+ R600_QUERY_REQUESTED_GTT,
+ R600_QUERY_MAPPED_VRAM,
+ R600_QUERY_MAPPED_GTT,
+ R600_QUERY_BUFFER_WAIT_TIME,
+ R600_QUERY_NUM_CTX_FLUSHES,
+ R600_QUERY_NUM_BYTES_MOVED,
+ R600_QUERY_NUM_EVICTIONS,
+ R600_QUERY_VRAM_USAGE,
+ R600_QUERY_GTT_USAGE,
+ R600_QUERY_GPU_TEMPERATURE,
+ R600_QUERY_CURRENT_GPU_SCLK,
+ R600_QUERY_CURRENT_GPU_MCLK,
+ R600_QUERY_GPU_LOAD,
+ R600_QUERY_NUM_COMPILATIONS,
+ R600_QUERY_NUM_SHADERS_CREATED,
+ R600_QUERY_BACK_BUFFER_PS_DRAW_RATIO,
+ R600_QUERY_GPIN_ASIC_ID,
+ R600_QUERY_GPIN_NUM_SIMD,
+ R600_QUERY_GPIN_NUM_RB,
+ R600_QUERY_GPIN_NUM_SPI,
+ R600_QUERY_GPIN_NUM_SE,
+
+ R600_QUERY_FIRST_PERFCOUNTER = PIPE_QUERY_DRIVER_SPECIFIC + 100,
+};
enum {
R600_QUERY_GROUP_GPIN = 0,
@@ -68,11 +84,17 @@ enum {
struct r600_query_ops {
void (*destroy)(struct r600_common_context *, struct r600_query *);
- boolean (*begin)(struct r600_common_context *, struct r600_query *);
- void (*end)(struct r600_common_context *, struct r600_query *);
- boolean (*get_result)(struct r600_common_context *,
- struct r600_query *, boolean wait,
- union pipe_query_result *result);
+ bool (*begin)(struct r600_common_context *, struct r600_query *);
+ bool (*end)(struct r600_common_context *, struct r600_query *);
+ bool (*get_result)(struct r600_common_context *,
+ struct r600_query *, bool wait,
+ union pipe_query_result *result);
+ void (*get_result_resource)(struct r600_common_context *,
+ struct r600_query *, bool wait,
+ enum pipe_query_value_type result_type,
+ int index,
+ struct pipe_resource *resource,
+ unsigned offset);
};
struct r600_query {
@@ -84,12 +106,13 @@ struct r600_query {
enum {
R600_QUERY_HW_FLAG_NO_START = (1 << 0),
- R600_QUERY_HW_FLAG_TIMER = (1 << 1),
- R600_QUERY_HW_FLAG_PREDICATE = (1 << 2),
+ /* gap */
+ /* whether begin_query doesn't clear the result */
+ R600_QUERY_HW_FLAG_BEGIN_RESUMES = (1 << 2),
};
struct r600_query_hw_ops {
- void (*prepare_buffer)(struct r600_common_context *,
+ bool (*prepare_buffer)(struct r600_common_context *,
struct r600_query_hw *,
struct r600_resource *);
void (*emit_start)(struct r600_common_context *,
@@ -134,18 +157,18 @@ struct r600_query_hw {
unsigned stream;
};
-boolean r600_query_hw_init(struct r600_common_context *rctx,
- struct r600_query_hw *query);
+bool r600_query_hw_init(struct r600_common_context *rctx,
+ struct r600_query_hw *query);
void r600_query_hw_destroy(struct r600_common_context *rctx,
struct r600_query *rquery);
-boolean r600_query_hw_begin(struct r600_common_context *rctx,
- struct r600_query *rquery);
-void r600_query_hw_end(struct r600_common_context *rctx,
+bool r600_query_hw_begin(struct r600_common_context *rctx,
+ struct r600_query *rquery);
+bool r600_query_hw_end(struct r600_common_context *rctx,
struct r600_query *rquery);
-boolean r600_query_hw_get_result(struct r600_common_context *rctx,
- struct r600_query *rquery,
- boolean wait,
- union pipe_query_result *result);
+bool r600_query_hw_get_result(struct r600_common_context *rctx,
+ struct r600_query *rquery,
+ bool wait,
+ union pipe_query_result *result);
/* Performance counters */
enum {
@@ -227,8 +250,8 @@ struct r600_perfcounters {
void (*cleanup)(struct r600_common_screen *);
- boolean separate_se;
- boolean separate_instance;
+ bool separate_se;
+ bool separate_instance;
};
struct pipe_query *r600_create_batch_query(struct pipe_context *ctx,
@@ -242,12 +265,20 @@ int r600_get_perfcounter_group_info(struct r600_common_screen *,
unsigned index,
struct pipe_driver_query_group_info *info);
-boolean r600_perfcounters_init(struct r600_perfcounters *, unsigned num_blocks);
+bool r600_perfcounters_init(struct r600_perfcounters *, unsigned num_blocks);
void r600_perfcounters_add_block(struct r600_common_screen *,
struct r600_perfcounters *,
const char *name, unsigned flags,
unsigned counters, unsigned selectors,
unsigned instances, void *data);
void r600_perfcounters_do_destroy(struct r600_perfcounters *);
+void r600_query_hw_reset_buffers(struct r600_common_context *rctx,
+ struct r600_query_hw *query);
+
+struct r600_qbo_state {
+ void *saved_compute;
+ struct pipe_constant_buffer saved_const0;
+ struct pipe_shader_buffer saved_ssbo[3];
+};
#endif /* R600_QUERY_H */
diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_streamout.c b/lib/mesa/src/gallium/drivers/radeon/r600_streamout.c
index 0853f636a..b5296aa56 100644
--- a/lib/mesa/src/gallium/drivers/radeon/r600_streamout.c
+++ b/lib/mesa/src/gallium/drivers/radeon/r600_streamout.c
@@ -46,7 +46,7 @@ r600_create_so_target(struct pipe_context *ctx,
return NULL;
}
- u_suballocator_alloc(rctx->allocator_so_filled_size, 4,
+ u_suballocator_alloc(rctx->allocator_zeroed_memory, 4, 4,
&t->buf_filled_size_offset,
(struct pipe_resource**)&t->buf_filled_size);
if (!t->buf_filled_size) {
@@ -70,7 +70,7 @@ static void r600_so_target_destroy(struct pipe_context *ctx,
{
struct r600_so_target *t = (struct r600_so_target*)target;
pipe_resource_reference(&t->b.buffer, NULL);
- pipe_resource_reference((struct pipe_resource**)&t->buf_filled_size, NULL);
+ r600_resource_reference(&t->buf_filled_size, NULL);
FREE(t);
}
@@ -116,7 +116,7 @@ void r600_set_streamout_targets(struct pipe_context *ctx,
{
struct r600_common_context *rctx = (struct r600_common_context *)ctx;
unsigned i;
- unsigned append_bitmask = 0;
+ unsigned enabled_mask = 0, append_bitmask = 0;
/* Stop streamout. */
if (rctx->streamout.num_targets && rctx->streamout.begin_emitted) {
@@ -126,18 +126,19 @@ void r600_set_streamout_targets(struct pipe_context *ctx,
/* Set the new targets. */
for (i = 0; i < num_targets; i++) {
pipe_so_target_reference((struct pipe_stream_output_target**)&rctx->streamout.targets[i], targets[i]);
+ if (!targets[i])
+ continue;
+
r600_context_add_resource_size(ctx, targets[i]->buffer);
+ enabled_mask |= 1 << i;
if (offsets[i] == ((unsigned)-1))
- append_bitmask |= 1 << i;
+ append_bitmask |= 1 << i;
}
for (; i < rctx->streamout.num_targets; i++) {
pipe_so_target_reference((struct pipe_stream_output_target**)&rctx->streamout.targets[i], NULL);
}
- rctx->streamout.enabled_mask = (num_targets >= 1 && targets[0] ? 1 : 0) |
- (num_targets >= 2 && targets[1] ? 2 : 0) |
- (num_targets >= 3 && targets[2] ? 4 : 0) |
- (num_targets >= 4 && targets[3] ? 8 : 0);
+ rctx->streamout.enabled_mask = enabled_mask;
rctx->streamout.num_targets = num_targets;
rctx->streamout.append_bitmask = append_bitmask;
@@ -152,7 +153,7 @@ void r600_set_streamout_targets(struct pipe_context *ctx,
static void r600_flush_vgt_streamout(struct r600_common_context *rctx)
{
- struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
+ struct radeon_winsys_cs *cs = rctx->gfx.cs;
unsigned reg_strmout_cntl;
/* The register is at different places on different ASICs. */
@@ -165,9 +166,9 @@ static void r600_flush_vgt_streamout(struct r600_common_context *rctx)
}
if (rctx->chip_class >= CIK) {
- cik_write_uconfig_reg(cs, reg_strmout_cntl, 0);
+ radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
} else {
- r600_write_config_reg(cs, reg_strmout_cntl, 0);
+ radeon_set_config_reg(cs, reg_strmout_cntl, 0);
}
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
@@ -184,7 +185,7 @@ static void r600_flush_vgt_streamout(struct r600_common_context *rctx)
static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r600_atom *atom)
{
- struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
+ struct radeon_winsys_cs *cs = rctx->gfx.cs;
struct r600_so_target **t = rctx->streamout.targets;
unsigned *stride_in_dw = rctx->streamout.stride_in_dw;
unsigned i, update_flags = 0;
@@ -201,7 +202,7 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r
/* SI binds streamout buffers as shader resources.
* VGT only counts primitives and tells the shader
* through SGPRs what to do. */
- r600_write_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2);
+ radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2);
radeon_emit(cs, (t[i]->b.buffer_offset +
t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */
radeon_emit(cs, stride_in_dw[i]); /* VTX_STRIDE (in DW) */
@@ -210,14 +211,14 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r
update_flags |= SURFACE_BASE_UPDATE_STRMOUT(i);
- r600_write_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 3);
+ radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 3);
radeon_emit(cs, (t[i]->b.buffer_offset +
t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */
radeon_emit(cs, stride_in_dw[i]); /* VTX_STRIDE (in DW) */
radeon_emit(cs, va >> 8); /* BUFFER_BASE */
- r600_emit_reloc(rctx, &rctx->rings.gfx, r600_resource(t[i]->b.buffer),
- RADEON_USAGE_WRITE, RADEON_PRIO_SHADER_RESOURCE_RW);
+ r600_emit_reloc(rctx, &rctx->gfx, r600_resource(t[i]->b.buffer),
+ RADEON_USAGE_WRITE, RADEON_PRIO_SHADER_RW_BUFFER);
/* R7xx requires this packet after updating BUFFER_BASE.
* Without this, R7xx locks up. */
@@ -226,8 +227,8 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r
radeon_emit(cs, i);
radeon_emit(cs, va >> 8);
- r600_emit_reloc(rctx, &rctx->rings.gfx, r600_resource(t[i]->b.buffer),
- RADEON_USAGE_WRITE, RADEON_PRIO_SHADER_RESOURCE_RW);
+ r600_emit_reloc(rctx, &rctx->gfx, r600_resource(t[i]->b.buffer),
+ RADEON_USAGE_WRITE, RADEON_PRIO_SHADER_RW_BUFFER);
}
}
@@ -244,8 +245,8 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r
radeon_emit(cs, va); /* src address lo */
radeon_emit(cs, va >> 32); /* src address hi */
- r600_emit_reloc(rctx, &rctx->rings.gfx, t[i]->buf_filled_size,
- RADEON_USAGE_READ, RADEON_PRIO_MIN);
+ r600_emit_reloc(rctx, &rctx->gfx, t[i]->buf_filled_size,
+ RADEON_USAGE_READ, RADEON_PRIO_SO_FILLED_SIZE);
} else {
/* Start from the beginning. */
radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
@@ -267,7 +268,7 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r
void r600_emit_streamout_end(struct r600_common_context *rctx)
{
- struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
+ struct radeon_winsys_cs *cs = rctx->gfx.cs;
struct r600_so_target **t = rctx->streamout.targets;
unsigned i;
uint64_t va;
@@ -288,14 +289,14 @@ void r600_emit_streamout_end(struct r600_common_context *rctx)
radeon_emit(cs, 0); /* unused */
radeon_emit(cs, 0); /* unused */
- r600_emit_reloc(rctx, &rctx->rings.gfx, t[i]->buf_filled_size,
- RADEON_USAGE_WRITE, RADEON_PRIO_MIN);
+ r600_emit_reloc(rctx, &rctx->gfx, t[i]->buf_filled_size,
+ RADEON_USAGE_WRITE, RADEON_PRIO_SO_FILLED_SIZE);
/* Zero the buffer size. The counters (primitives generated,
* primitives emitted) may be enabled even if there is not
* buffer bound. This ensures that the primitives-emitted query
* won't increment. */
- r600_write_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0);
+ radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0);
t[i]->buf_filled_size_valid = true;
}
@@ -311,12 +312,6 @@ void r600_emit_streamout_end(struct r600_common_context *rctx)
* are no buffers bound.
*/
-static bool r600_get_strmout_en(struct r600_common_context *rctx)
-{
- return rctx->streamout.streamout_enabled ||
- rctx->streamout.prims_gen_query_enabled;
-}
-
static void r600_emit_streamout_enable(struct r600_common_context *rctx,
struct r600_atom *atom)
{
@@ -336,8 +331,8 @@ static void r600_emit_streamout_enable(struct r600_common_context *rctx,
S_028B94_STREAMOUT_2_EN(r600_get_strmout_en(rctx)) |
S_028B94_STREAMOUT_3_EN(r600_get_strmout_en(rctx));
}
- r600_write_context_reg(rctx->rings.gfx.cs, strmout_buffer_reg, strmout_buffer_val);
- r600_write_context_reg(rctx->rings.gfx.cs, strmout_config_reg, strmout_config_val);
+ radeon_set_context_reg(rctx->gfx.cs, strmout_buffer_reg, strmout_buffer_val);
+ radeon_set_context_reg(rctx->gfx.cs, strmout_config_reg, strmout_config_val);
}
static void r600_set_streamout_enable(struct r600_common_context *rctx, bool enable)
diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_texture.c b/lib/mesa/src/gallium/drivers/radeon/r600_texture.c
index e9bd4a21f..27035c0fa 100644
--- a/lib/mesa/src/gallium/drivers/radeon/r600_texture.c
+++ b/lib/mesa/src/gallium/drivers/radeon/r600_texture.c
@@ -26,12 +26,82 @@
*/
#include "r600_pipe_common.h"
#include "r600_cs.h"
+#include "r600_query.h"
#include "util/u_format.h"
#include "util/u_memory.h"
#include "util/u_pack_color.h"
+#include "util/u_surface.h"
+#include "os/os_time.h"
#include <errno.h>
#include <inttypes.h>
+static void r600_texture_discard_cmask(struct r600_common_screen *rscreen,
+ struct r600_texture *rtex);
+static unsigned r600_choose_tiling(struct r600_common_screen *rscreen,
+ const struct pipe_resource *templ);
+
+
+bool r600_prepare_for_dma_blit(struct r600_common_context *rctx,
+ struct r600_texture *rdst,
+ unsigned dst_level, unsigned dstx,
+ unsigned dsty, unsigned dstz,
+ struct r600_texture *rsrc,
+ unsigned src_level,
+ const struct pipe_box *src_box)
+{
+ if (!rctx->dma.cs)
+ return false;
+
+ if (util_format_get_blocksizebits(rdst->resource.b.b.format) !=
+ util_format_get_blocksizebits(rsrc->resource.b.b.format))
+ return false;
+
+ /* MSAA: Blits don't exist in the real world. */
+ if (rsrc->resource.b.b.nr_samples > 1 ||
+ rdst->resource.b.b.nr_samples > 1)
+ return false;
+
+ /* Depth-stencil surfaces:
+ * When dst is linear, the DB->CB copy preserves HTILE.
+ * When dst is tiled, the 3D path must be used to update HTILE.
+ */
+ if (rsrc->is_depth || rdst->is_depth)
+ return false;
+
+ /* DCC as:
+ * src: Use the 3D path. DCC decompression is expensive.
+ * dst: Use the 3D path to compress the pixels with DCC.
+ */
+ if ((rsrc->dcc_offset && rsrc->surface.level[src_level].dcc_enabled) ||
+ (rdst->dcc_offset && rdst->surface.level[dst_level].dcc_enabled))
+ return false;
+
+ /* CMASK as:
+ * src: Both texture and SDMA paths need decompression. Use SDMA.
+ * dst: If overwriting the whole texture, discard CMASK and use
+ * SDMA. Otherwise, use the 3D path.
+ */
+ if (rdst->cmask.size && rdst->dirty_level_mask & (1 << dst_level)) {
+ /* The CMASK clear is only enabled for the first level. */
+ assert(dst_level == 0);
+ if (!util_texrange_covers_whole_level(&rdst->resource.b.b, dst_level,
+ dstx, dsty, dstz, src_box->width,
+ src_box->height, src_box->depth))
+ return false;
+
+ r600_texture_discard_cmask(rctx->screen, rdst);
+ }
+
+ /* All requirements are met. Prepare textures for SDMA. */
+ if (rsrc->cmask.size && rsrc->dirty_level_mask & (1 << src_level))
+ rctx->b.flush_resource(&rctx->b, &rsrc->resource.b.b);
+
+ assert(!(rsrc->dirty_level_mask & (1 << src_level)));
+ assert(!(rdst->dirty_level_mask & (1 << dst_level)));
+
+ return true;
+}
+
/* Same as resource_copy_region, except that both upsampling and downsampling are allowed. */
static void r600_copy_region_with_blit(struct pipe_context *pipe,
struct pipe_resource *dst,
@@ -122,7 +192,8 @@ static int r600_init_surface(struct r600_common_screen *rscreen,
struct radeon_surf *surface,
const struct pipe_resource *ptex,
unsigned array_mode,
- bool is_flushed_depth)
+ bool is_flushed_depth,
+ bool tc_compatible_htile)
{
const struct util_format_description *desc =
util_format_description(ptex->format);
@@ -169,8 +240,9 @@ static int r600_init_surface(struct r600_common_screen *rscreen,
surface->flags |= RADEON_SURF_SET(RADEON_SURF_TYPE_1D_ARRAY, TYPE);
surface->array_size = ptex->array_size;
break;
- case PIPE_TEXTURE_2D_ARRAY:
case PIPE_TEXTURE_CUBE_ARRAY: /* cube array layout like 2d array */
+ assert(ptex->array_size % 6 == 0);
+ case PIPE_TEXTURE_2D_ARRAY:
surface->flags |= RADEON_SURF_SET(RADEON_SURF_TYPE_2D_ARRAY, TYPE);
surface->array_size = ptex->array_size;
break;
@@ -181,29 +253,55 @@ static int r600_init_surface(struct r600_common_screen *rscreen,
default:
return -EINVAL;
}
- if (ptex->bind & PIPE_BIND_SCANOUT) {
- surface->flags |= RADEON_SURF_SCANOUT;
- }
if (!is_flushed_depth && is_depth) {
surface->flags |= RADEON_SURF_ZBUFFER;
+ if (tc_compatible_htile &&
+ array_mode == RADEON_SURF_MODE_2D) {
+ /* TC-compatible HTILE only supports Z32_FLOAT.
+ * Promote Z16 to Z32. DB->CB copies will convert
+ * the format for transfers.
+ */
+ surface->bpe = 4;
+ surface->flags |= RADEON_SURF_TC_COMPATIBLE_HTILE;
+ }
+
if (is_stencil) {
surface->flags |= RADEON_SURF_SBUFFER |
RADEON_SURF_HAS_SBUFFER_MIPTREE;
}
}
+
if (rscreen->chip_class >= SI) {
surface->flags |= RADEON_SURF_HAS_TILE_MODE_INDEX;
}
+
+ if (rscreen->chip_class >= VI &&
+ (ptex->flags & R600_RESOURCE_FLAG_DISABLE_DCC ||
+ ptex->format == PIPE_FORMAT_R9G9B9E5_FLOAT))
+ surface->flags |= RADEON_SURF_DISABLE_DCC;
+
+ if (ptex->bind & PIPE_BIND_SCANOUT) {
+ /* This should catch bugs in gallium users setting incorrect flags. */
+ assert(surface->nsamples == 1 &&
+ surface->array_size == 1 &&
+ surface->npix_z == 1 &&
+ surface->last_level == 0 &&
+ !(surface->flags & RADEON_SURF_Z_OR_SBUFFER));
+
+ surface->flags |= RADEON_SURF_SCANOUT;
+ }
return 0;
}
static int r600_setup_surface(struct pipe_screen *screen,
struct r600_texture *rtex,
- unsigned pitch_in_bytes_override)
+ unsigned pitch_in_bytes_override,
+ unsigned offset)
{
struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
+ unsigned i;
int r;
r = rscreen->ws->surface_init(rscreen->ws, &rtex->surface);
@@ -220,39 +318,292 @@ static int r600_setup_surface(struct pipe_screen *screen,
rtex->surface.level[0].nblk_x = pitch_in_bytes_override / rtex->surface.bpe;
rtex->surface.level[0].pitch_bytes = pitch_in_bytes_override;
rtex->surface.level[0].slice_size = pitch_in_bytes_override * rtex->surface.level[0].nblk_y;
- if (rtex->surface.flags & RADEON_SURF_SBUFFER) {
- rtex->surface.stencil_offset =
- rtex->surface.stencil_level[0].offset = rtex->surface.level[0].slice_size;
- }
+ }
+
+ if (offset) {
+ for (i = 0; i < ARRAY_SIZE(rtex->surface.level); ++i)
+ rtex->surface.level[i].offset += offset;
}
return 0;
}
-static boolean r600_texture_get_handle(struct pipe_screen* screen,
- struct pipe_resource *ptex,
- struct winsys_handle *whandle)
+static void r600_texture_init_metadata(struct r600_texture *rtex,
+ struct radeon_bo_metadata *metadata)
{
- struct r600_texture *rtex = (struct r600_texture*)ptex;
- struct r600_resource *resource = &rtex->resource;
struct radeon_surf *surface = &rtex->surface;
+
+ memset(metadata, 0, sizeof(*metadata));
+ metadata->microtile = surface->level[0].mode >= RADEON_SURF_MODE_1D ?
+ RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR;
+ metadata->macrotile = surface->level[0].mode >= RADEON_SURF_MODE_2D ?
+ RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR;
+ metadata->pipe_config = surface->pipe_config;
+ metadata->bankw = surface->bankw;
+ metadata->bankh = surface->bankh;
+ metadata->tile_split = surface->tile_split;
+ metadata->mtilea = surface->mtilea;
+ metadata->num_banks = surface->num_banks;
+ metadata->stride = surface->level[0].pitch_bytes;
+ metadata->scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0;
+}
+
+static void r600_dirty_all_framebuffer_states(struct r600_common_screen *rscreen)
+{
+ p_atomic_inc(&rscreen->dirty_fb_counter);
+}
+
+static void r600_eliminate_fast_color_clear(struct r600_common_context *rctx,
+ struct r600_texture *rtex)
+{
+ struct r600_common_screen *rscreen = rctx->screen;
+ struct pipe_context *ctx = &rctx->b;
+
+ if (ctx == rscreen->aux_context)
+ pipe_mutex_lock(rscreen->aux_context_lock);
+
+ ctx->flush_resource(ctx, &rtex->resource.b.b);
+ ctx->flush(ctx, NULL, 0);
+
+ if (ctx == rscreen->aux_context)
+ pipe_mutex_unlock(rscreen->aux_context_lock);
+}
+
+static void r600_texture_discard_cmask(struct r600_common_screen *rscreen,
+ struct r600_texture *rtex)
+{
+ if (!rtex->cmask.size)
+ return;
+
+ assert(rtex->resource.b.b.nr_samples <= 1);
+
+ /* Disable CMASK. */
+ memset(&rtex->cmask, 0, sizeof(rtex->cmask));
+ rtex->cmask.base_address_reg = rtex->resource.gpu_address >> 8;
+ rtex->dirty_level_mask = 0;
+
+ if (rscreen->chip_class >= SI)
+ rtex->cb_color_info &= ~SI_S_028C70_FAST_CLEAR(1);
+ else
+ rtex->cb_color_info &= ~EG_S_028C70_FAST_CLEAR(1);
+
+ if (rtex->cmask_buffer != &rtex->resource)
+ r600_resource_reference(&rtex->cmask_buffer, NULL);
+
+ /* Notify all contexts about the change. */
+ r600_dirty_all_framebuffer_states(rscreen);
+ p_atomic_inc(&rscreen->compressed_colortex_counter);
+}
+
+static bool r600_can_disable_dcc(struct r600_texture *rtex)
+{
+ /* We can't disable DCC if it can be written by another process. */
+ return rtex->dcc_offset &&
+ (!rtex->resource.is_shared ||
+ !(rtex->resource.external_usage & PIPE_HANDLE_USAGE_WRITE));
+}
+
+static bool r600_texture_discard_dcc(struct r600_common_screen *rscreen,
+ struct r600_texture *rtex)
+{
+ if (!r600_can_disable_dcc(rtex))
+ return false;
+
+ assert(rtex->dcc_separate_buffer == NULL);
+
+ /* Disable DCC. */
+ rtex->dcc_offset = 0;
+
+ /* Notify all contexts about the change. */
+ r600_dirty_all_framebuffer_states(rscreen);
+ return true;
+}
+
+/**
+ * Disable DCC for the texture. (first decompress, then discard metadata).
+ *
+ * There is an unresolved multi-context synchronization issue between
+ * screen::aux_context and the current context. If applications do this with
+ * multiple contexts, it's already undefined behavior for them and we don't
+ * have to worry about that. The scenario is:
+ *
+ * If context 1 disables DCC and context 2 has queued commands that write
+ * to the texture via CB with DCC enabled, and the order of operations is
+ * as follows:
+ * context 2 queues draw calls rendering to the texture, but doesn't flush
+ * context 1 disables DCC and flushes
+ * context 1 & 2 reset descriptors and FB state
+ * context 2 flushes (new compressed tiles written by the draw calls)
+ * context 1 & 2 read garbage, because DCC is disabled, yet there are
+ * compressed tiles
+ *
+ * \param rctx the current context if you have one, or rscreen->aux_context
+ * if you don't.
+ */
+bool r600_texture_disable_dcc(struct r600_common_context *rctx,
+ struct r600_texture *rtex)
+{
+ struct r600_common_screen *rscreen = rctx->screen;
+
+ if (!r600_can_disable_dcc(rtex))
+ return false;
+
+ if (&rctx->b == rscreen->aux_context)
+ pipe_mutex_lock(rscreen->aux_context_lock);
+
+ /* Decompress DCC. */
+ rctx->decompress_dcc(&rctx->b, rtex);
+ rctx->b.flush(&rctx->b, NULL, 0);
+
+ if (&rctx->b == rscreen->aux_context)
+ pipe_mutex_unlock(rscreen->aux_context_lock);
+
+ return r600_texture_discard_dcc(rscreen, rtex);
+}
+
+static void r600_degrade_tile_mode_to_linear(struct r600_common_context *rctx,
+ struct r600_texture *rtex,
+ bool invalidate_storage)
+{
+ struct pipe_screen *screen = rctx->b.screen;
+ struct r600_texture *new_tex;
+ struct pipe_resource templ = rtex->resource.b.b;
+ unsigned i;
+
+ templ.bind |= PIPE_BIND_LINEAR;
+
+ /* r600g doesn't react to dirty_tex_descriptor_counter */
+ if (rctx->chip_class < SI)
+ return;
+
+ if (rtex->resource.is_shared ||
+ rtex->surface.level[0].mode == RADEON_SURF_MODE_LINEAR_ALIGNED)
+ return;
+
+ /* This fails with MSAA, depth, and compressed textures. */
+ if (r600_choose_tiling(rctx->screen, &templ) !=
+ RADEON_SURF_MODE_LINEAR_ALIGNED)
+ return;
+
+ new_tex = (struct r600_texture*)screen->resource_create(screen, &templ);
+ if (!new_tex)
+ return;
+
+ /* Copy the pixels to the new texture. */
+ if (!invalidate_storage) {
+ for (i = 0; i <= templ.last_level; i++) {
+ struct pipe_box box;
+
+ u_box_3d(0, 0, 0,
+ u_minify(templ.width0, i), u_minify(templ.height0, i),
+ util_max_layer(&templ, i) + 1, &box);
+
+ rctx->dma_copy(&rctx->b, &new_tex->resource.b.b, i, 0, 0, 0,
+ &rtex->resource.b.b, i, &box);
+ }
+ }
+
+ r600_texture_discard_cmask(rctx->screen, rtex);
+ r600_texture_discard_dcc(rctx->screen, rtex);
+
+ /* Replace the structure fields of rtex. */
+ rtex->resource.b.b.bind = templ.bind;
+ pb_reference(&rtex->resource.buf, new_tex->resource.buf);
+ rtex->resource.gpu_address = new_tex->resource.gpu_address;
+ rtex->resource.vram_usage = new_tex->resource.vram_usage;
+ rtex->resource.gart_usage = new_tex->resource.gart_usage;
+ rtex->resource.bo_size = new_tex->resource.bo_size;
+ rtex->resource.bo_alignment = new_tex->resource.bo_alignment;
+ rtex->resource.domains = new_tex->resource.domains;
+ rtex->resource.flags = new_tex->resource.flags;
+ rtex->size = new_tex->size;
+ rtex->surface = new_tex->surface;
+ rtex->non_disp_tiling = new_tex->non_disp_tiling;
+ rtex->cb_color_info = new_tex->cb_color_info;
+ rtex->cmask = new_tex->cmask; /* needed even without CMASK */
+
+ assert(!rtex->htile_buffer);
+ assert(!rtex->cmask.size);
+ assert(!rtex->fmask.size);
+ assert(!rtex->dcc_offset);
+ assert(!rtex->is_depth);
+
+ r600_texture_reference(&new_tex, NULL);
+
+ r600_dirty_all_framebuffer_states(rctx->screen);
+ p_atomic_inc(&rctx->screen->dirty_tex_descriptor_counter);
+}
+
+static boolean r600_texture_get_handle(struct pipe_screen* screen,
+ struct pipe_context *ctx,
+ struct pipe_resource *resource,
+ struct winsys_handle *whandle,
+ unsigned usage)
+{
struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
+ struct r600_common_context *rctx = (struct r600_common_context*)
+ (ctx ? ctx : rscreen->aux_context);
+ struct r600_resource *res = (struct r600_resource*)resource;
+ struct r600_texture *rtex = (struct r600_texture*)resource;
+ struct radeon_bo_metadata metadata;
+ bool update_metadata = false;
+
+ /* This is not supported now, but it might be required for OpenCL
+ * interop in the future.
+ */
+ if (resource->target != PIPE_BUFFER &&
+ (resource->nr_samples > 1 || rtex->is_depth))
+ return false;
+
+ if (resource->target != PIPE_BUFFER) {
+ /* Since shader image stores don't support DCC on VI,
+ * disable it for external clients that want write
+ * access.
+ */
+ if (usage & PIPE_HANDLE_USAGE_WRITE && rtex->dcc_offset) {
+ if (r600_texture_disable_dcc(rctx, rtex))
+ update_metadata = true;
+ }
- rscreen->ws->buffer_set_tiling(resource->buf,
- NULL,
- surface->level[0].mode >= RADEON_SURF_MODE_1D ?
- RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR,
- surface->level[0].mode >= RADEON_SURF_MODE_2D ?
- RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR,
- surface->pipe_config,
- surface->bankw, surface->bankh,
- surface->tile_split,
- surface->stencil_tile_split,
- surface->mtilea, surface->num_banks,
- surface->level[0].pitch_bytes,
- (surface->flags & RADEON_SURF_SCANOUT) != 0);
-
- return rscreen->ws->buffer_get_handle(resource->buf,
- surface->level[0].pitch_bytes, whandle);
+ if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) &&
+ (rtex->cmask.size || rtex->dcc_offset)) {
+ /* Eliminate fast clear (both CMASK and DCC) */
+ r600_eliminate_fast_color_clear(rctx, rtex);
+
+ /* Disable CMASK if flush_resource isn't going
+ * to be called.
+ */
+ if (rtex->cmask.size)
+ r600_texture_discard_cmask(rscreen, rtex);
+ }
+
+ /* Set metadata. */
+ if (!res->is_shared || update_metadata) {
+ r600_texture_init_metadata(rtex, &metadata);
+ if (rscreen->query_opaque_metadata)
+ rscreen->query_opaque_metadata(rscreen, rtex,
+ &metadata);
+
+ rscreen->ws->buffer_set_metadata(res->buf, &metadata);
+ }
+ }
+
+ if (res->is_shared) {
+ /* USAGE_EXPLICIT_FLUSH must be cleared if at least one user
+ * doesn't set it.
+ */
+ res->external_usage |= usage & ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH;
+ if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH))
+ res->external_usage &= ~PIPE_HANDLE_USAGE_EXPLICIT_FLUSH;
+ } else {
+ res->is_shared = true;
+ res->external_usage = usage;
+ }
+
+ return rscreen->ws->buffer_get_handle(res->buf,
+ rtex->surface.level[0].pitch_bytes,
+ rtex->surface.level[0].offset,
+ rtex->surface.level[0].slice_size,
+ whandle);
}
static void r600_texture_destroy(struct pipe_screen *screen,
@@ -261,14 +612,15 @@ static void r600_texture_destroy(struct pipe_screen *screen,
struct r600_texture *rtex = (struct r600_texture*)ptex;
struct r600_resource *resource = &rtex->resource;
- if (rtex->flushed_depth_texture)
- pipe_resource_reference((struct pipe_resource **)&rtex->flushed_depth_texture, NULL);
+ r600_texture_reference(&rtex->flushed_depth_texture, NULL);
- pipe_resource_reference((struct pipe_resource**)&rtex->htile_buffer, NULL);
+ r600_resource_reference(&rtex->htile_buffer, NULL);
if (rtex->cmask_buffer != &rtex->resource) {
- pipe_resource_reference((struct pipe_resource**)&rtex->cmask_buffer, NULL);
+ r600_resource_reference(&rtex->cmask_buffer, NULL);
}
pb_reference(&resource->buf, NULL);
+ r600_resource_reference(&rtex->dcc_separate_buffer, NULL);
+ r600_resource_reference(&rtex->last_dcc_separate_buffer, NULL);
FREE(rtex);
}
@@ -335,7 +687,7 @@ void r600_texture_get_fmask_info(struct r600_common_screen *rscreen,
out->slice_tile_max -= 1;
out->tile_mode_index = fmask.tiling_index[0];
- out->pitch = fmask.level[0].nblk_x;
+ out->pitch_in_pixels = fmask.level[0].nblk_x;
out->bank_height = fmask.bankh;
out->alignment = MAX2(256, fmask.bo_alignment);
out->size = fmask.bo_size;
@@ -347,7 +699,7 @@ static void r600_texture_allocate_fmask(struct r600_common_screen *rscreen,
r600_texture_get_fmask_info(rscreen, rtex,
rtex->resource.b.b.nr_samples, &rtex->fmask);
- rtex->fmask.offset = align(rtex->size, rtex->fmask.alignment);
+ rtex->fmask.offset = align64(rtex->size, rtex->fmask.alignment);
rtex->size = rtex->fmask.offset + rtex->fmask.size;
}
@@ -360,8 +712,8 @@ void r600_texture_get_cmask_info(struct r600_common_screen *rscreen,
unsigned cmask_tile_elements = cmask_tile_width * cmask_tile_height;
unsigned element_bits = 4;
unsigned cmask_cache_bits = 1024;
- unsigned num_pipes = rscreen->tiling_info.num_channels;
- unsigned pipe_interleave_bytes = rscreen->tiling_info.group_bytes;
+ unsigned num_pipes = rscreen->info.num_tile_pipes;
+ unsigned pipe_interleave_bytes = rscreen->info.pipe_interleave_bytes;
unsigned elements_per_macro_tile = (cmask_cache_bits / element_bits) * num_pipes;
unsigned pixels_per_macro_tile = elements_per_macro_tile * cmask_tile_elements;
@@ -379,6 +731,10 @@ void r600_texture_get_cmask_info(struct r600_common_screen *rscreen,
assert(macro_tile_width % 128 == 0);
assert(macro_tile_height % 128 == 0);
+ out->pitch = pitch_elements;
+ out->height = height;
+ out->xalign = macro_tile_width;
+ out->yalign = macro_tile_height;
out->slice_tile_max = ((pitch_elements * height) / (128*128)) - 1;
out->alignment = MAX2(256, base_align);
out->size = (util_max_layer(&rtex->resource.b.b, 0) + 1) *
@@ -389,8 +745,8 @@ static void si_texture_get_cmask_info(struct r600_common_screen *rscreen,
struct r600_texture *rtex,
struct r600_cmask_info *out)
{
- unsigned pipe_interleave_bytes = rscreen->tiling_info.group_bytes;
- unsigned num_pipes = rscreen->tiling_info.num_channels;
+ unsigned pipe_interleave_bytes = rscreen->info.pipe_interleave_bytes;
+ unsigned num_pipes = rscreen->info.num_tile_pipes;
unsigned cl_width, cl_height;
switch (num_pipes) {
@@ -424,6 +780,10 @@ static void si_texture_get_cmask_info(struct r600_common_screen *rscreen,
/* Each element of CMASK is a nibble. */
unsigned slice_bytes = slice_elements / 2;
+ out->pitch = width;
+ out->height = height;
+ out->xalign = cl_width * 8;
+ out->yalign = cl_height * 8;
out->slice_tile_max = (width * height) / (128*128);
if (out->slice_tile_max)
out->slice_tile_max -= 1;
@@ -442,7 +802,7 @@ static void r600_texture_allocate_cmask(struct r600_common_screen *rscreen,
r600_texture_get_cmask_info(rscreen, rtex, &rtex->cmask);
}
- rtex->cmask.offset = align(rtex->size, rtex->cmask.alignment);
+ rtex->cmask.offset = align64(rtex->size, rtex->cmask.alignment);
rtex->size = rtex->cmask.offset + rtex->cmask.size;
if (rscreen->chip_class >= SI)
@@ -466,8 +826,9 @@ static void r600_texture_alloc_cmask_separate(struct r600_common_screen *rscreen
}
rtex->cmask_buffer = (struct r600_resource *)
- pipe_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM,
- PIPE_USAGE_DEFAULT, rtex->cmask.size);
+ r600_aligned_buffer_create(&rscreen->b, 0, PIPE_USAGE_DEFAULT,
+ rtex->cmask.size,
+ rtex->cmask.alignment);
if (rtex->cmask_buffer == NULL) {
rtex->cmask.size = 0;
return;
@@ -480,6 +841,8 @@ static void r600_texture_alloc_cmask_separate(struct r600_common_screen *rscreen
rtex->cb_color_info |= SI_S_028C70_FAST_CLEAR(1);
else
rtex->cb_color_info |= EG_S_028C70_FAST_CLEAR(1);
+
+ p_atomic_inc(&rscreen->compressed_colortex_counter);
}
static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen,
@@ -487,7 +850,7 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen,
{
unsigned cl_width, cl_height, width, height;
unsigned slice_elements, slice_bytes, pipe_interleave_bytes, base_align;
- unsigned num_pipes = rscreen->tiling_info.num_channels;
+ unsigned num_pipes = rscreen->info.num_tile_pipes;
if (rscreen->chip_class <= EVERGREEN &&
rscreen->info.drm_major == 2 && rscreen->info.drm_minor < 26)
@@ -505,6 +868,16 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen,
rscreen->info.drm_major == 2 && rscreen->info.drm_minor < 38)
return 0;
+ /* Overalign HTILE on P2 configs to work around GPU hangs in
+ * piglit/depthstencil-render-miplevels 585.
+ *
+ * This has been confirmed to help Kabini & Stoney, where the hangs
+ * are always reproducible. I think I have seen the test hang
+ * on Carrizo too, though it was very rare there.
+ */
+ if (rscreen->chip_class >= CIK && num_pipes < 4)
+ num_pipes = 4;
+
switch (num_pipes) {
case 1:
cl_width = 32;
@@ -537,9 +910,15 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen,
slice_elements = (width * height) / (8 * 8);
slice_bytes = slice_elements * 4;
- pipe_interleave_bytes = rscreen->tiling_info.group_bytes;
+ pipe_interleave_bytes = rscreen->info.pipe_interleave_bytes;
base_align = num_pipes * pipe_interleave_bytes;
+ rtex->htile.pitch = width;
+ rtex->htile.height = height;
+ rtex->htile.xalign = cl_width * 8;
+ rtex->htile.yalign = cl_height * 8;
+ rtex->htile.alignment = base_align;
+
return (util_max_layer(&rtex->resource.b.b, 0) + 1) *
align(slice_bytes, base_align);
}
@@ -547,21 +926,126 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen,
static void r600_texture_allocate_htile(struct r600_common_screen *rscreen,
struct r600_texture *rtex)
{
- unsigned htile_size = r600_texture_get_htile_size(rscreen, rtex);
+ uint64_t htile_size, alignment;
+ uint32_t clear_value;
+
+ if (rtex->tc_compatible_htile) {
+ htile_size = rtex->surface.htile_size;
+ alignment = rtex->surface.htile_alignment;
+ clear_value = 0x0000030F;
+ } else {
+ htile_size = r600_texture_get_htile_size(rscreen, rtex);
+ alignment = rtex->htile.alignment;
+ clear_value = 0;
+ }
if (!htile_size)
return;
rtex->htile_buffer = (struct r600_resource*)
- pipe_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM,
- PIPE_USAGE_DEFAULT, htile_size);
+ r600_aligned_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM,
+ PIPE_USAGE_DEFAULT,
+ htile_size, alignment);
if (rtex->htile_buffer == NULL) {
/* this is not a fatal error as we can still keep rendering
* without htile buffer */
R600_ERR("Failed to create buffer object for htile buffer.\n");
} else {
- r600_screen_clear_buffer(rscreen, &rtex->htile_buffer->b.b, 0,
- htile_size, 0, true);
+ r600_screen_clear_buffer(rscreen, &rtex->htile_buffer->b.b,
+ 0, htile_size, clear_value,
+ R600_COHERENCY_NONE);
+ }
+}
+
+void r600_print_texture_info(struct r600_texture *rtex, FILE *f)
+{
+ int i;
+
+ fprintf(f, " Info: npix_x=%u, npix_y=%u, npix_z=%u, blk_w=%u, "
+ "blk_h=%u, blk_d=%u, array_size=%u, last_level=%u, "
+ "bpe=%u, nsamples=%u, flags=0x%x, %s\n",
+ rtex->surface.npix_x, rtex->surface.npix_y,
+ rtex->surface.npix_z, rtex->surface.blk_w,
+ rtex->surface.blk_h, rtex->surface.blk_d,
+ rtex->surface.array_size, rtex->surface.last_level,
+ rtex->surface.bpe, rtex->surface.nsamples,
+ rtex->surface.flags, util_format_short_name(rtex->resource.b.b.format));
+
+ fprintf(f, " Layout: size=%"PRIu64", alignment=%"PRIu64", bankw=%u, "
+ "bankh=%u, nbanks=%u, mtilea=%u, tilesplit=%u, pipeconfig=%u, scanout=%u\n",
+ rtex->surface.bo_size, rtex->surface.bo_alignment, rtex->surface.bankw,
+ rtex->surface.bankh, rtex->surface.num_banks, rtex->surface.mtilea,
+ rtex->surface.tile_split, rtex->surface.pipe_config,
+ (rtex->surface.flags & RADEON_SURF_SCANOUT) != 0);
+
+ if (rtex->fmask.size)
+ fprintf(f, " FMask: offset=%"PRIu64", size=%"PRIu64", alignment=%u, pitch_in_pixels=%u, "
+ "bankh=%u, slice_tile_max=%u, tile_mode_index=%u\n",
+ rtex->fmask.offset, rtex->fmask.size, rtex->fmask.alignment,
+ rtex->fmask.pitch_in_pixels, rtex->fmask.bank_height,
+ rtex->fmask.slice_tile_max, rtex->fmask.tile_mode_index);
+
+ if (rtex->cmask.size)
+ fprintf(f, " CMask: offset=%"PRIu64", size=%"PRIu64", alignment=%u, pitch=%u, "
+ "height=%u, xalign=%u, yalign=%u, slice_tile_max=%u\n",
+ rtex->cmask.offset, rtex->cmask.size, rtex->cmask.alignment,
+ rtex->cmask.pitch, rtex->cmask.height, rtex->cmask.xalign,
+ rtex->cmask.yalign, rtex->cmask.slice_tile_max);
+
+ if (rtex->htile_buffer)
+ fprintf(f, " HTile: size=%u, alignment=%u, pitch=%u, height=%u, "
+ "xalign=%u, yalign=%u, TC_compatible = %u\n",
+ rtex->htile_buffer->b.b.width0,
+ rtex->htile_buffer->buf->alignment, rtex->htile.pitch,
+ rtex->htile.height, rtex->htile.xalign, rtex->htile.yalign,
+ rtex->tc_compatible_htile);
+
+ if (rtex->dcc_offset) {
+ fprintf(f, " DCC: offset=%"PRIu64", size=%"PRIu64", alignment=%"PRIu64"\n",
+ rtex->dcc_offset, rtex->surface.dcc_size,
+ rtex->surface.dcc_alignment);
+ for (i = 0; i <= rtex->surface.last_level; i++)
+ fprintf(f, " DCCLevel[%i]: enabled=%u, offset=%"PRIu64", "
+ "fast_clear_size=%"PRIu64"\n",
+ i, rtex->surface.level[i].dcc_enabled,
+ rtex->surface.level[i].dcc_offset,
+ rtex->surface.level[i].dcc_fast_clear_size);
+ }
+
+ for (i = 0; i <= rtex->surface.last_level; i++)
+ fprintf(f, " Level[%i]: offset=%"PRIu64", slice_size=%"PRIu64", "
+ "npix_x=%u, npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, "
+ "nblk_z=%u, pitch_bytes=%u, mode=%u\n",
+ i, rtex->surface.level[i].offset,
+ rtex->surface.level[i].slice_size,
+ u_minify(rtex->resource.b.b.width0, i),
+ u_minify(rtex->resource.b.b.height0, i),
+ u_minify(rtex->resource.b.b.depth0, i),
+ rtex->surface.level[i].nblk_x,
+ rtex->surface.level[i].nblk_y,
+ rtex->surface.level[i].nblk_z,
+ rtex->surface.level[i].pitch_bytes,
+ rtex->surface.level[i].mode);
+
+ if (rtex->surface.flags & RADEON_SURF_SBUFFER) {
+ fprintf(f, " StencilLayout: tilesplit=%u\n",
+ rtex->surface.stencil_tile_split);
+ for (i = 0; i <= rtex->surface.last_level; i++) {
+ fprintf(f, " StencilLevel[%i]: offset=%"PRIu64", "
+ "slice_size=%"PRIu64", npix_x=%u, "
+ "npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, "
+ "nblk_z=%u, pitch_bytes=%u, mode=%u\n",
+ i, rtex->surface.stencil_level[i].offset,
+ rtex->surface.stencil_level[i].slice_size,
+ u_minify(rtex->resource.b.b.width0, i),
+ u_minify(rtex->resource.b.b.height0, i),
+ u_minify(rtex->resource.b.b.depth0, i),
+ rtex->surface.stencil_level[i].nblk_x,
+ rtex->surface.stencil_level[i].nblk_y,
+ rtex->surface.stencil_level[i].nblk_z,
+ rtex->surface.stencil_level[i].pitch_bytes,
+ rtex->surface.stencil_level[i].mode);
+ }
}
}
@@ -570,6 +1054,7 @@ static struct r600_texture *
r600_texture_create_object(struct pipe_screen *screen,
const struct pipe_resource *base,
unsigned pitch_in_bytes_override,
+ unsigned offset,
struct pb_buffer *buf,
struct radeon_surf *surface)
{
@@ -578,36 +1063,67 @@ r600_texture_create_object(struct pipe_screen *screen,
struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
rtex = CALLOC_STRUCT(r600_texture);
- if (rtex == NULL)
+ if (!rtex)
return NULL;
resource = &rtex->resource;
resource->b.b = *base;
+ resource->b.b.next = NULL;
resource->b.vtbl = &r600_texture_vtbl;
pipe_reference_init(&resource->b.b.reference, 1);
resource->b.b.screen = screen;
- rtex->pitch_override = pitch_in_bytes_override;
/* don't include stencil-only formats which we don't support for rendering */
rtex->is_depth = util_format_has_depth(util_format_description(rtex->resource.b.b.format));
rtex->surface = *surface;
- if (r600_setup_surface(screen, rtex, pitch_in_bytes_override)) {
+ if (r600_setup_surface(screen, rtex, pitch_in_bytes_override, offset)) {
FREE(rtex);
return NULL;
}
+ rtex->tc_compatible_htile = rtex->surface.htile_size != 0;
+ assert(!!(rtex->surface.flags & RADEON_SURF_TC_COMPATIBLE_HTILE) ==
+ rtex->tc_compatible_htile);
+
+ /* TC-compatible HTILE only supports Z32_FLOAT. */
+ if (rtex->tc_compatible_htile)
+ rtex->db_render_format = PIPE_FORMAT_Z32_FLOAT;
+ else
+ rtex->db_render_format = base->format;
+
/* Tiled depth textures utilize the non-displayable tile order.
* This must be done after r600_setup_surface.
* Applies to R600-Cayman. */
rtex->non_disp_tiling = rtex->is_depth && rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D;
+ /* Applies to GCN. */
+ rtex->last_msaa_resolve_target_micro_mode = rtex->surface.micro_tile_mode;
+
+ /* Disable separate DCC at the beginning. DRI2 doesn't reuse buffers
+ * between frames, so the only thing that can enable separate DCC
+ * with DRI2 is multiple slow clears within a frame.
+ */
+ rtex->ps_draw_ratio = 0;
if (rtex->is_depth) {
+ if (base->flags & (R600_RESOURCE_FLAG_TRANSFER |
+ R600_RESOURCE_FLAG_FLUSHED_DEPTH) ||
+ rscreen->chip_class >= EVERGREEN) {
+ rtex->can_sample_z = !rtex->surface.depth_adjusted;
+ rtex->can_sample_s = !rtex->surface.stencil_adjusted;
+ } else {
+ if (rtex->resource.b.b.nr_samples <= 1 &&
+ (rtex->resource.b.b.format == PIPE_FORMAT_Z16_UNORM ||
+ rtex->resource.b.b.format == PIPE_FORMAT_Z32_FLOAT))
+ rtex->can_sample_z = true;
+ }
+
if (!(base->flags & (R600_RESOURCE_FLAG_TRANSFER |
- R600_RESOURCE_FLAG_FLUSHED_DEPTH)) &&
- !(rscreen->debug_flags & DBG_NO_HYPERZ)) {
+ R600_RESOURCE_FLAG_FLUSHED_DEPTH))) {
+ rtex->db_compatible = true;
- r600_texture_allocate_htile(rscreen, rtex);
+ if (!(rscreen->debug_flags & DBG_NO_HYPERZ))
+ r600_texture_allocate_htile(rscreen, rtex);
}
} else {
if (base->nr_samples > 1) {
@@ -621,27 +1137,56 @@ r600_texture_create_object(struct pipe_screen *screen,
return NULL;
}
}
+
+ /* Shared textures must always set up DCC here.
+ * If it's not present, it will be disabled by
+ * apply_opaque_metadata later.
+ */
+ if (rtex->surface.dcc_size &&
+ (buf || !(rscreen->debug_flags & DBG_NO_DCC)) &&
+ !(rtex->surface.flags & RADEON_SURF_SCANOUT)) {
+ /* Reserve space for the DCC buffer. */
+ rtex->dcc_offset = align64(rtex->size, rtex->surface.dcc_alignment);
+ rtex->size = rtex->dcc_offset + rtex->surface.dcc_size;
+ }
}
/* Now create the backing buffer. */
if (!buf) {
- if (!r600_init_resource(rscreen, resource, rtex->size,
- rtex->surface.bo_alignment, TRUE)) {
+ r600_init_resource_fields(rscreen, resource, rtex->size,
+ rtex->surface.bo_alignment);
+
+ resource->flags |= RADEON_FLAG_HANDLE;
+
+ if (!r600_alloc_resource(rscreen, resource)) {
FREE(rtex);
return NULL;
}
} else {
resource->buf = buf;
- resource->cs_buf = rscreen->ws->buffer_get_cs_handle(buf);
- resource->gpu_address = rscreen->ws->buffer_get_virtual_address(resource->cs_buf);
- resource->domains = rscreen->ws->buffer_get_initial_domain(resource->cs_buf);
+ resource->gpu_address = rscreen->ws->buffer_get_virtual_address(resource->buf);
+ resource->bo_size = buf->size;
+ resource->bo_alignment = buf->alignment;
+ resource->domains = rscreen->ws->buffer_get_initial_domain(resource->buf);
+ if (resource->domains & RADEON_DOMAIN_VRAM)
+ resource->vram_usage = buf->size;
+ else if (resource->domains & RADEON_DOMAIN_GTT)
+ resource->gart_usage = buf->size;
}
if (rtex->cmask.size) {
/* Initialize the cmask to 0xCC (= compressed state). */
r600_screen_clear_buffer(rscreen, &rtex->cmask_buffer->b.b,
rtex->cmask.offset, rtex->cmask.size,
- 0xCCCCCCCC, true);
+ 0xCCCCCCCC, R600_COHERENCY_NONE);
+ }
+
+ /* Initialize DCC only if the texture is not being imported. */
+ if (!buf && rtex->dcc_offset) {
+ r600_screen_clear_buffer(rscreen, &rtex->resource.b.b,
+ rtex->dcc_offset,
+ rtex->surface.dcc_size,
+ 0xFFFFFFFF, R600_COHERENCY_NONE);
}
/* Initialize the CMASK base register value. */
@@ -656,50 +1201,12 @@ r600_texture_create_object(struct pipe_screen *screen,
base->nr_samples ? base->nr_samples : 1, util_format_short_name(base->format));
}
- if (rscreen->debug_flags & DBG_TEX ||
- (rtex->resource.b.b.last_level > 0 && rscreen->debug_flags & DBG_TEXMIP)) {
- printf("Texture: npix_x=%u, npix_y=%u, npix_z=%u, blk_w=%u, "
- "blk_h=%u, blk_d=%u, array_size=%u, last_level=%u, "
- "bpe=%u, nsamples=%u, flags=0x%x, %s\n",
- rtex->surface.npix_x, rtex->surface.npix_y,
- rtex->surface.npix_z, rtex->surface.blk_w,
- rtex->surface.blk_h, rtex->surface.blk_d,
- rtex->surface.array_size, rtex->surface.last_level,
- rtex->surface.bpe, rtex->surface.nsamples,
- rtex->surface.flags, util_format_short_name(base->format));
- for (int i = 0; i <= rtex->surface.last_level; i++) {
- printf(" L %i: offset=%"PRIu64", slice_size=%"PRIu64", npix_x=%u, "
- "npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, "
- "nblk_z=%u, pitch_bytes=%u, mode=%u\n",
- i, rtex->surface.level[i].offset,
- rtex->surface.level[i].slice_size,
- u_minify(rtex->resource.b.b.width0, i),
- u_minify(rtex->resource.b.b.height0, i),
- u_minify(rtex->resource.b.b.depth0, i),
- rtex->surface.level[i].nblk_x,
- rtex->surface.level[i].nblk_y,
- rtex->surface.level[i].nblk_z,
- rtex->surface.level[i].pitch_bytes,
- rtex->surface.level[i].mode);
- }
- if (rtex->surface.flags & RADEON_SURF_SBUFFER) {
- for (int i = 0; i <= rtex->surface.last_level; i++) {
- printf(" S %i: offset=%"PRIu64", slice_size=%"PRIu64", npix_x=%u, "
- "npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, "
- "nblk_z=%u, pitch_bytes=%u, mode=%u\n",
- i, rtex->surface.stencil_level[i].offset,
- rtex->surface.stencil_level[i].slice_size,
- u_minify(rtex->resource.b.b.width0, i),
- u_minify(rtex->resource.b.b.height0, i),
- u_minify(rtex->resource.b.b.depth0, i),
- rtex->surface.stencil_level[i].nblk_x,
- rtex->surface.stencil_level[i].nblk_y,
- rtex->surface.stencil_level[i].nblk_z,
- rtex->surface.stencil_level[i].pitch_bytes,
- rtex->surface.stencil_level[i].mode);
- }
- }
+ if (rscreen->debug_flags & DBG_TEX) {
+ puts("Texture:");
+ r600_print_texture_info(rtex, stdout);
+ fflush(stdout);
}
+
return rtex;
}
@@ -725,13 +1232,12 @@ static unsigned r600_choose_tiling(struct r600_common_screen *rscreen,
force_tiling = true;
/* Handle common candidates for the linear mode.
- * Compressed textures must always be tiled. */
- if (!force_tiling && !util_format_is_compressed(templ->format)) {
- /* Not everything can be linear, so we cannot enforce it
- * for all textures. */
- if ((rscreen->debug_flags & DBG_NO_TILING) &&
- (!util_format_is_depth_or_stencil(templ->format) ||
- !(templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH)))
+ * Compressed textures and DB surfaces must always be tiled.
+ */
+ if (!force_tiling && !util_format_is_compressed(templ->format) &&
+ (!util_format_is_depth_or_stencil(templ->format) ||
+ templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH)) {
+ if (rscreen->debug_flags & DBG_NO_TILING)
return RADEON_SURF_MODE_LINEAR_ALIGNED;
/* Tiling doesn't work with the 422 (SUBSAMPLED) formats on R600+. */
@@ -773,11 +1279,20 @@ struct pipe_resource *r600_texture_create(struct pipe_screen *screen,
{
struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
struct radeon_surf surface = {0};
+ bool is_flushed_depth = templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH;
+ bool tc_compatible_htile =
+ rscreen->chip_class >= VI &&
+ (templ->flags & PIPE_RESOURCE_FLAG_TEXTURING_MORE_LIKELY) &&
+ !(rscreen->debug_flags & DBG_NO_HYPERZ) &&
+ !is_flushed_depth &&
+ templ->nr_samples <= 1 && /* TC-compat HTILE is less efficient with MSAA */
+ util_format_is_depth_or_stencil(templ->format);
+
int r;
r = r600_init_surface(rscreen, &surface, templ,
r600_choose_tiling(rscreen, templ),
- templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH);
+ is_flushed_depth, tc_compatible_htile);
if (r) {
return NULL;
}
@@ -785,55 +1300,70 @@ struct pipe_resource *r600_texture_create(struct pipe_screen *screen,
if (r) {
return NULL;
}
- return (struct pipe_resource *)r600_texture_create_object(screen, templ,
+ return (struct pipe_resource *)r600_texture_create_object(screen, templ, 0,
0, NULL, &surface);
}
static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen,
const struct pipe_resource *templ,
- struct winsys_handle *whandle)
+ struct winsys_handle *whandle,
+ unsigned usage)
{
struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
struct pb_buffer *buf = NULL;
- unsigned stride = 0;
+ unsigned stride = 0, offset = 0;
unsigned array_mode;
- enum radeon_bo_layout micro, macro;
struct radeon_surf surface;
- bool scanout;
int r;
+ struct radeon_bo_metadata metadata = {};
+ struct r600_texture *rtex;
/* Support only 2D textures without mipmaps */
if ((templ->target != PIPE_TEXTURE_2D && templ->target != PIPE_TEXTURE_RECT) ||
templ->depth0 != 1 || templ->last_level != 0)
return NULL;
- buf = rscreen->ws->buffer_from_handle(rscreen->ws, whandle, &stride);
+ buf = rscreen->ws->buffer_from_handle(rscreen->ws, whandle, &stride, &offset);
if (!buf)
return NULL;
- rscreen->ws->buffer_get_tiling(buf, &micro, &macro,
- &surface.bankw, &surface.bankh,
- &surface.tile_split,
- &surface.stencil_tile_split,
- &surface.mtilea, &scanout);
+ rscreen->ws->buffer_get_metadata(buf, &metadata);
+
+ surface.pipe_config = metadata.pipe_config;
+ surface.bankw = metadata.bankw;
+ surface.bankh = metadata.bankh;
+ surface.tile_split = metadata.tile_split;
+ surface.mtilea = metadata.mtilea;
+ surface.num_banks = metadata.num_banks;
- if (macro == RADEON_LAYOUT_TILED)
+ if (metadata.macrotile == RADEON_LAYOUT_TILED)
array_mode = RADEON_SURF_MODE_2D;
- else if (micro == RADEON_LAYOUT_TILED)
+ else if (metadata.microtile == RADEON_LAYOUT_TILED)
array_mode = RADEON_SURF_MODE_1D;
else
array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
- r = r600_init_surface(rscreen, &surface, templ, array_mode, false);
+ r = r600_init_surface(rscreen, &surface, templ, array_mode,
+ false, false);
if (r) {
return NULL;
}
- if (scanout)
+ if (metadata.scanout)
surface.flags |= RADEON_SURF_SCANOUT;
- return (struct pipe_resource *)r600_texture_create_object(screen, templ,
- stride, buf, &surface);
+ rtex = r600_texture_create_object(screen, templ, stride,
+ offset, buf, &surface);
+ if (!rtex)
+ return NULL;
+
+ rtex->resource.is_shared = true;
+ rtex->resource.external_usage = usage;
+
+ if (rscreen->apply_opaque_metadata)
+ rscreen->apply_opaque_metadata(rscreen, rtex, &metadata);
+
+ return &rtex->resource.b.b;
}
bool r600_init_flushed_depth_texture(struct pipe_context *ctx,
@@ -844,12 +1374,44 @@ bool r600_init_flushed_depth_texture(struct pipe_context *ctx,
struct pipe_resource resource;
struct r600_texture **flushed_depth_texture = staging ?
staging : &rtex->flushed_depth_texture;
+ enum pipe_format pipe_format = texture->format;
+
+ if (!staging) {
+ if (rtex->flushed_depth_texture)
+ return true; /* it's ready */
+
+ if (!rtex->can_sample_z && rtex->can_sample_s) {
+ switch (pipe_format) {
+ case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+ /* Save memory by not allocating the S plane. */
+ pipe_format = PIPE_FORMAT_Z32_FLOAT;
+ break;
+ case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+ case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+ /* Save memory bandwidth by not copying the
+ * stencil part during flush.
+ *
+ * This potentially increases memory bandwidth
+ * if an application uses both Z and S texturing
+ * simultaneously (a flushed Z24S8 texture
+ * would be stored compactly), but how often
+ * does that really happen?
+ */
+ pipe_format = PIPE_FORMAT_Z24X8_UNORM;
+ break;
+ default:;
+ }
+ } else if (!rtex->can_sample_s && rtex->can_sample_z) {
+ assert(util_format_has_stencil(util_format_description(pipe_format)));
- if (!staging && rtex->flushed_depth_texture)
- return true; /* it's ready */
+ /* DB->CB copies to an 8bpp surface don't work. */
+ pipe_format = PIPE_FORMAT_X24S8_UINT;
+ }
+ }
+ memset(&resource, 0, sizeof(resource));
resource.target = texture->target;
- resource.format = texture->format;
+ resource.format = pipe_format;
resource.width0 = texture->width0;
resource.height0 = texture->height0;
resource.depth0 = texture->depth0;
@@ -869,7 +1431,6 @@ bool r600_init_flushed_depth_texture(struct pipe_context *ctx,
return false;
}
- (*flushed_depth_texture)->is_flushing_texture = TRUE;
(*flushed_depth_texture)->non_disp_tiling = false;
return true;
}
@@ -894,24 +1455,52 @@ static void r600_init_temp_resource_from_box(struct pipe_resource *res,
res->flags = flags;
/* We must set the correct texture target and dimensions for a 3D box. */
- if (box->depth > 1 && util_max_layer(orig, level) > 0)
- res->target = orig->target;
- else
- res->target = PIPE_TEXTURE_2D;
-
- switch (res->target) {
- case PIPE_TEXTURE_1D_ARRAY:
- case PIPE_TEXTURE_2D_ARRAY:
- case PIPE_TEXTURE_CUBE_ARRAY:
+ if (box->depth > 1 && util_max_layer(orig, level) > 0) {
+ res->target = PIPE_TEXTURE_2D_ARRAY;
res->array_size = box->depth;
- break;
- case PIPE_TEXTURE_3D:
- res->depth0 = box->depth;
- break;
- default:;
+ } else {
+ res->target = PIPE_TEXTURE_2D;
}
}
+static bool r600_can_invalidate_texture(struct r600_common_screen *rscreen,
+ struct r600_texture *rtex,
+ unsigned transfer_usage,
+ const struct pipe_box *box)
+{
+ /* Tell whether a transfer may replace the texture's storage outright.
+ *
+ * Every condition below must hold: chip class SI or newer, resource not
+ * shared, transfer_usage without PIPE_TRANSFER_READ, a single mip level
+ * (last_level == 0), and a box for which
+ * util_texrange_covers_whole_level() succeeds on level 0.
+ *
+ * r600g doesn't react to dirty_tex_descriptor_counter
+ * (presumably the reason for the SI requirement - confirm).
+ */
+ return rscreen->chip_class >= SI &&
+ !rtex->resource.is_shared &&
+ !(transfer_usage & PIPE_TRANSFER_READ) &&
+ rtex->resource.b.b.last_level == 0 &&
+ util_texrange_covers_whole_level(&rtex->resource.b.b, 0,
+ box->x, box->y, box->z,
+ box->width, box->height,
+ box->depth);
+}
+/* Replace the texture's buffer with a freshly allocated one, keeping the
+ * same pipe_resource.
+ *
+ * Expects a non-depth texture whose level 0 is LINEAR_ALIGNED (asserted).
+ * After the reallocation the CMASK base register value is recomputed from
+ * the resource's GPU address, r600_dirty_all_framebuffer_states() is
+ * called, the screen's dirty_tex_descriptor_counter is incremented, and
+ * rtex->size is added to rctx->num_alloc_tex_transfer_bytes.
+ */
+static void r600_texture_invalidate_storage(struct r600_common_context *rctx,
+ struct r600_texture *rtex)
+{
+ struct r600_common_screen *rscreen = rctx->screen;
+
+ /* There is no point in discarding depth and tiled buffers. */
+ assert(!rtex->is_depth);
+ assert(rtex->surface.level[0].mode == RADEON_SURF_MODE_LINEAR_ALIGNED);
+
+ /* Reallocate the buffer in the same pipe_resource.
+ * NOTE(review): the result of r600_alloc_resource() is not checked
+ * here - confirm that a failed reallocation is acceptable.
+ */
+ r600_alloc_resource(rscreen, &rtex->resource);
+
+ /* Initialize the CMASK base address (needed even without CMASK). */
+ rtex->cmask.base_address_reg =
+ (rtex->resource.gpu_address + rtex->cmask.offset) >> 8;
+
+ r600_dirty_all_framebuffer_states(rscreen);
+ p_atomic_inc(&rscreen->dirty_tex_descriptor_counter);
+
+ rctx->num_alloc_tex_transfer_bytes += rtex->size;
+}
+
static void *r600_texture_transfer_map(struct pipe_context *ctx,
struct pipe_resource *texture,
unsigned level,
@@ -922,41 +1511,61 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx,
struct r600_common_context *rctx = (struct r600_common_context*)ctx;
struct r600_texture *rtex = (struct r600_texture*)texture;
struct r600_transfer *trans;
- boolean use_staging_texture = FALSE;
struct r600_resource *buf;
unsigned offset = 0;
char *map;
+ bool use_staging_texture = false;
- /* We cannot map a tiled texture directly because the data is
- * in a different order, therefore we do detiling using a blit.
- *
- * Also, use a temporary in GTT memory for read transfers, as
- * the CPU is much happier reading out of cached system memory
- * than uncached VRAM.
- */
- if (rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D) {
- use_staging_texture = TRUE;
- } else if ((usage & PIPE_TRANSFER_READ) && !(usage & PIPE_TRANSFER_MAP_DIRECTLY) &&
- (rtex->resource.domains == RADEON_DOMAIN_VRAM)) {
- /* Untiled buffers in VRAM, which is slow for CPU reads */
- use_staging_texture = TRUE;
- } else if (!(usage & PIPE_TRANSFER_READ) &&
- (r600_rings_is_buffer_referenced(rctx, rtex->resource.cs_buf, RADEON_USAGE_READWRITE) ||
- !rctx->ws->buffer_wait(rtex->resource.buf, 0, RADEON_USAGE_READWRITE))) {
- /* Use a staging texture for uploads if the underlying BO is busy. */
- use_staging_texture = TRUE;
- }
+ assert(!(texture->flags & R600_RESOURCE_FLAG_TRANSFER));
- if (texture->flags & R600_RESOURCE_FLAG_TRANSFER) {
- use_staging_texture = FALSE;
- }
+ /* Depth textures use staging unconditionally. */
+ if (!rtex->is_depth) {
+ /* Degrade the tile mode if we get too many transfers on APUs.
+ * On dGPUs, the staging texture is always faster.
+ * Only count uploads that are at least 4x4 pixels large.
+ */
+ if (!rctx->screen->info.has_dedicated_vram &&
+ level == 0 &&
+ box->width >= 4 && box->height >= 4 &&
+ p_atomic_inc_return(&rtex->num_level0_transfers) == 10) {
+ bool can_invalidate =
+ r600_can_invalidate_texture(rctx->screen, rtex,
+ usage, box);
+
+ r600_degrade_tile_mode_to_linear(rctx, rtex,
+ can_invalidate);
+ }
- if (use_staging_texture && (usage & PIPE_TRANSFER_MAP_DIRECTLY)) {
- return NULL;
+ /* Tiled textures need to be converted into a linear texture for CPU
+ * access. The staging texture is always linear and is placed in GART.
+ *
+ * Reading from VRAM is slow, always use the staging texture in
+ * this case.
+ *
+ * Use the staging texture for uploads if the underlying BO
+ * is busy.
+ */
+ if (rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D)
+ use_staging_texture = true;
+ else if (usage & PIPE_TRANSFER_READ)
+ use_staging_texture = (rtex->resource.domains &
+ RADEON_DOMAIN_VRAM) != 0;
+ /* Write & linear only: */
+ else if (r600_rings_is_buffer_referenced(rctx, rtex->resource.buf,
+ RADEON_USAGE_READWRITE) ||
+ !rctx->ws->buffer_wait(rtex->resource.buf, 0,
+ RADEON_USAGE_READWRITE)) {
+ /* It's busy. */
+ if (r600_can_invalidate_texture(rctx->screen, rtex,
+ usage, box))
+ r600_texture_invalidate_storage(rctx, rtex);
+ else
+ use_staging_texture = true;
+ }
}
trans = CALLOC_STRUCT(r600_transfer);
- if (trans == NULL)
+ if (!trans)
return NULL;
trans->transfer.resource = texture;
trans->transfer.level = level;
@@ -998,7 +1607,7 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx,
r600_copy_region_with_blit(ctx, temp, 0, 0, 0, 0, texture, level, box);
rctx->blit_decompress_depth(ctx, (struct r600_texture*)temp, staging_depth,
0, 0, 0, box->depth, 0, 0);
- pipe_resource_reference((struct pipe_resource**)&temp, NULL);
+ pipe_resource_reference(&temp, NULL);
}
}
else {
@@ -1021,6 +1630,7 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx,
trans->transfer.stride = staging_depth->surface.level[level].pitch_bytes;
trans->transfer.layer_stride = staging_depth->surface.level[level].slice_size;
trans->staging = (struct r600_resource*)staging_depth;
+ buf = trans->staging;
} else if (use_staging_texture) {
struct pipe_resource resource;
struct r600_texture *staging;
@@ -1032,7 +1642,7 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx,
/* Create the temporary texture. */
staging = (struct r600_texture*)ctx->screen->resource_create(ctx->screen, &resource);
- if (staging == NULL) {
+ if (!staging) {
R600_ERR("failed to create temporary texture to hold untiled copy\n");
FREE(trans);
return NULL;
@@ -1040,26 +1650,23 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx,
trans->staging = &staging->resource;
trans->transfer.stride = staging->surface.level[0].pitch_bytes;
trans->transfer.layer_stride = staging->surface.level[0].slice_size;
- if (usage & PIPE_TRANSFER_READ) {
+
+ if (usage & PIPE_TRANSFER_READ)
r600_copy_to_staging_texture(ctx, trans);
- }
+ else
+ usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
+
+ buf = trans->staging;
} else {
/* the resource is mapped directly */
trans->transfer.stride = rtex->surface.level[level].pitch_bytes;
trans->transfer.layer_stride = rtex->surface.level[level].slice_size;
offset = r600_texture_get_offset(rtex, level, box);
- }
-
- if (trans->staging) {
- buf = trans->staging;
- if (!rtex->is_depth && !(usage & PIPE_TRANSFER_READ))
- usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
- } else {
buf = &rtex->resource;
}
if (!(map = r600_buffer_map_sync_with_rings(rctx, buf, usage))) {
- pipe_resource_reference((struct pipe_resource**)&trans->staging, NULL);
+ r600_resource_reference(&trans->staging, NULL);
FREE(trans);
return NULL;
}
@@ -1071,6 +1678,7 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx,
static void r600_texture_transfer_unmap(struct pipe_context *ctx,
struct pipe_transfer* transfer)
{
+ struct r600_common_context *rctx = (struct r600_common_context*)ctx;
struct r600_transfer *rtransfer = (struct r600_transfer*)transfer;
struct pipe_resource *texture = transfer->resource;
struct r600_texture *rtex = (struct r600_texture*)texture;
@@ -1086,8 +1694,28 @@ static void r600_texture_transfer_unmap(struct pipe_context *ctx,
}
}
- if (rtransfer->staging)
- pipe_resource_reference((struct pipe_resource**)&rtransfer->staging, NULL);
+ if (rtransfer->staging) {
+ rctx->num_alloc_tex_transfer_bytes += rtransfer->staging->buf->size;
+ r600_resource_reference(&rtransfer->staging, NULL);
+ }
+
+ /* Heuristic for {upload, draw, upload, draw, ..}:
+ *
+ * Flush the gfx IB if we've allocated too much texture storage.
+ *
+ * The idea is that we don't want to build IBs that use too much
+ * memory and put pressure on the kernel memory manager and we also
+ * want to make temporary and invalidated buffers go idle ASAP to
+ * decrease the total memory usage or make them reusable. The memory
+ * usage will be slightly higher than given here because of the buffer
+ * cache in the winsys.
+ *
+ * The result is that the kernel memory manager is never a bottleneck.
+ */
+ if (rctx->num_alloc_tex_transfer_bytes > rctx->screen->info.gart_size / 4) {
+ rctx->gfx.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
+ rctx->num_alloc_tex_transfer_bytes = 0;
+ }
FREE(transfer);
}
@@ -1097,19 +1725,118 @@ static const struct u_resource_vtbl r600_texture_vtbl =
NULL, /* get_handle */
r600_texture_destroy, /* resource_destroy */
r600_texture_transfer_map, /* transfer_map */
- NULL, /* transfer_flush_region */
+ u_default_transfer_flush_region, /* transfer_flush_region */
r600_texture_transfer_unmap, /* transfer_unmap */
- NULL /* transfer_inline_write */
};
+/* DCC channel type categories within which formats can be reinterpreted
+ * while keeping the same DCC encoding. The swizzle must also match.
+ *
+ * vi_get_dcc_channel_type() picks the category from the bit size and type
+ * of a format's first non-void channel. */
+enum dcc_channel_type {
+ dcc_channel_float32, /* 32-bit, FLOAT */
+ dcc_channel_uint32, /* 32-bit, UNSIGNED */
+ dcc_channel_sint32, /* 32-bit, any other type */
+ dcc_channel_float16, /* 16-bit, FLOAT */
+ dcc_channel_uint16, /* 16-bit, UNSIGNED */
+ dcc_channel_sint16, /* 16-bit, any other type */
+ dcc_channel_uint_10_10_10_2, /* 10-bit first channel, type not checked */
+ dcc_channel_uint8, /* 8-bit, UNSIGNED */
+ dcc_channel_sint8, /* 8-bit, any other type */
+ dcc_channel_incompatible, /* no non-void channel, or any other size */
+};
+
+/* Return the type of DCC encoding.
+ *
+ * Only the first non-void channel of desc is examined; its bit size and
+ * type select the category. dcc_channel_incompatible is returned when
+ * every channel is void or when the size is not 32, 16, 10 or 8.
+ */
+static enum dcc_channel_type
+vi_get_dcc_channel_type(const struct util_format_description *desc)
+{
+ int i;
+
+ /* Find the first non-void channel. */
+ for (i = 0; i < desc->nr_channels; i++)
+ if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID)
+ break;
+ if (i == desc->nr_channels)
+ return dcc_channel_incompatible;
+
+ switch (desc->channel[i].size) {
+ case 32:
+ if (desc->channel[i].type == UTIL_FORMAT_TYPE_FLOAT)
+ return dcc_channel_float32;
+ if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED)
+ return dcc_channel_uint32;
+ return dcc_channel_sint32; /* neither FLOAT nor UNSIGNED */
+ case 16:
+ if (desc->channel[i].type == UTIL_FORMAT_TYPE_FLOAT)
+ return dcc_channel_float16;
+ if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED)
+ return dcc_channel_uint16;
+ return dcc_channel_sint16; /* neither FLOAT nor UNSIGNED */
+ case 10:
+ return dcc_channel_uint_10_10_10_2; /* channel type is not checked */
+ case 8:
+ if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED)
+ return dcc_channel_uint8;
+ return dcc_channel_sint8; /* anything that is not UNSIGNED */
+ default:
+ return dcc_channel_incompatible;
+ }
+}
+
+/* Return if it's allowed to reinterpret one format as another with DCC enabled.
+ *
+ * Identical formats are always compatible. Otherwise both formats must have
+ * the same number of channels, matching swizzles, and the same DCC channel
+ * type as computed by vi_get_dcc_channel_type(), which must not be
+ * dcc_channel_incompatible.
+ */
+bool vi_dcc_formats_compatible(enum pipe_format format1,
+ enum pipe_format format2)
+{
+ const struct util_format_description *desc1, *desc2;
+ enum dcc_channel_type type1, type2;
+ int i;
+
+ if (format1 == format2)
+ return true;
+
+ desc1 = util_format_description(format1);
+ desc2 = util_format_description(format2);
+
+ if (desc1->nr_channels != desc2->nr_channels)
+ return false;
+
+ /* Swizzles must be the same. An entry is only compared when it is
+ * <= PIPE_SWIZZLE_W in both formats; other entries are ignored. */
+ for (i = 0; i < desc1->nr_channels; i++)
+ if (desc1->swizzle[i] <= PIPE_SWIZZLE_W &&
+ desc2->swizzle[i] <= PIPE_SWIZZLE_W &&
+ desc1->swizzle[i] != desc2->swizzle[i])
+ return false;
+
+ type1 = vi_get_dcc_channel_type(desc1);
+ type2 = vi_get_dcc_channel_type(desc2);
+
+ return type1 != dcc_channel_incompatible &&
+ type2 != dcc_channel_incompatible &&
+ type1 == type2;
+}
+/* Disable DCC on tex (or decompress it) when view_format cannot share the
+ * texture's DCC encoding.
+ *
+ * Nothing happens unless the texture has a DCC offset, DCC is enabled for
+ * the given level, and vi_dcc_formats_compatible() rejects the pair
+ * (tex->format, view_format). In that case r600_texture_disable_dcc() is
+ * tried first; if it returns false, rctx->decompress_dcc() is called.
+ */
+void vi_dcc_disable_if_incompatible_format(struct r600_common_context *rctx,
+ struct pipe_resource *tex,
+ unsigned level,
+ enum pipe_format view_format)
+{
+ struct r600_texture *rtex = (struct r600_texture *)tex;
+
+ if (rtex->dcc_offset &&
+ rtex->surface.level[level].dcc_enabled &&
+ !vi_dcc_formats_compatible(tex->format, view_format))
+ if (!r600_texture_disable_dcc(rctx, (struct r600_texture*)tex)) /* same object as rtex */
+ rctx->decompress_dcc(&rctx->b, rtex);
+}
+
struct pipe_surface *r600_create_surface_custom(struct pipe_context *pipe,
struct pipe_resource *texture,
const struct pipe_surface *templ,
unsigned width, unsigned height)
{
+ struct r600_common_context *rctx = (struct r600_common_context*)pipe;
+ struct r600_texture *rtex = (struct r600_texture*)texture;
struct r600_surface *surface = CALLOC_STRUCT(r600_surface);
- if (surface == NULL)
+ if (!surface)
return NULL;
assert(templ->u.tex.first_layer <= util_max_layer(texture, templ->u.tex.level));
@@ -1122,6 +1849,13 @@ struct pipe_surface *r600_create_surface_custom(struct pipe_context *pipe,
surface->base.width = width;
surface->base.height = height;
surface->base.u = templ->u;
+ surface->level_info = &rtex->surface.level[templ->u.tex.level];
+
+ if (texture->target != PIPE_BUFFER)
+ vi_dcc_disable_if_incompatible_format(rctx, texture,
+ templ->u.tex.level,
+ templ->format);
+
return &surface->base;
}
@@ -1130,27 +1864,112 @@ static struct pipe_surface *r600_create_surface(struct pipe_context *pipe,
const struct pipe_surface *templ)
{
unsigned level = templ->u.tex.level;
+ unsigned width = u_minify(tex->width0, level);
+ unsigned height = u_minify(tex->height0, level);
+
+ if (tex->target != PIPE_BUFFER && templ->format != tex->format) {
+ const struct util_format_description *tex_desc
+ = util_format_description(tex->format);
+ const struct util_format_description *templ_desc
+ = util_format_description(templ->format);
+
+ assert(tex_desc->block.bits == templ_desc->block.bits);
+
+ /* Adjust size of surface if and only if the block width or
+ * height is changed. */
+ if (tex_desc->block.width != templ_desc->block.width ||
+ tex_desc->block.height != templ_desc->block.height) {
+ unsigned nblks_x = util_format_get_nblocksx(tex->format, width);
+ unsigned nblks_y = util_format_get_nblocksy(tex->format, height);
+
+ width = nblks_x * templ_desc->block.width;
+ height = nblks_y * templ_desc->block.height;
+ }
+ }
- return r600_create_surface_custom(pipe, tex, templ,
- u_minify(tex->width0, level),
- u_minify(tex->height0, level));
+ return r600_create_surface_custom(pipe, tex, templ, width, height);
}
static void r600_surface_destroy(struct pipe_context *pipe,
struct pipe_surface *surface)
{
struct r600_surface *surf = (struct r600_surface*)surface;
- pipe_resource_reference((struct pipe_resource**)&surf->cb_buffer_fmask, NULL);
- pipe_resource_reference((struct pipe_resource**)&surf->cb_buffer_cmask, NULL);
+ r600_resource_reference(&surf->cb_buffer_fmask, NULL);
+ r600_resource_reference(&surf->cb_buffer_cmask, NULL);
pipe_resource_reference(&surface->texture, NULL);
FREE(surface);
}
-unsigned r600_translate_colorswap(enum pipe_format format)
+static void r600_clear_texture(struct pipe_context *pipe,
+ struct pipe_resource *tex,
+ unsigned level,
+ const struct pipe_box *box,
+ const void *data)
+{ /* Clear the region box of mip level `level` of tex to the value read
+ * from data. A temporary surface in the texture's own format, covering
+ * layers box->z .. box->z + box->depth - 1, is created and released
+ * again; if it cannot be created the function returns without clearing.
+ * Depth textures go through clear_depth_stencil(), everything else
+ * through clear_render_target() or util_clear_render_target().
+ */
+ struct pipe_screen *screen = pipe->screen;
+ struct r600_texture *rtex = (struct r600_texture*)tex;
+ struct pipe_surface tmpl = {{0}};
+ struct pipe_surface *sf;
+ const struct util_format_description *desc =
+ util_format_description(tex->format);
+
+ tmpl.format = tex->format;
+ tmpl.u.tex.first_layer = box->z;
+ tmpl.u.tex.last_layer = box->z + box->depth - 1;
+ tmpl.u.tex.level = level;
+ sf = pipe->create_surface(pipe, tex, &tmpl);
+ if (!sf)
+ return;
+
+ if (rtex->is_depth) {
+ unsigned clear;
+ float depth;
+ uint8_t stencil = 0;
+
+ /* Depth is always present.
+ * NOTE(review): the unpack calls here and below are passed 1 and 1
+ * as their last two arguments, so data presumably holds a single
+ * texel in tex->format - confirm.
+ */
+ clear = PIPE_CLEAR_DEPTH;
+ desc->unpack_z_float(&depth, 0, data, 0, 1, 1);
+
+ if (rtex->surface.flags & RADEON_SURF_SBUFFER) { /* stencil is cleared only when the surface has this flag */
+ clear |= PIPE_CLEAR_STENCIL;
+ desc->unpack_s_8uint(&stencil, 0, data, 0, 1, 1);
+ }
+
+ pipe->clear_depth_stencil(pipe, sf, clear, depth, stencil,
+ box->x, box->y,
+ box->width, box->height, false);
+ } else {
+ union pipe_color_union color;
+
+ /* pipe_color_union requires the full vec4 representation.
+ * The unpack variant follows the format: pure uint, pure sint,
+ * or float for everything else. */
+ if (util_format_is_pure_uint(tex->format))
+ desc->unpack_rgba_uint(color.ui, 0, data, 0, 1, 1);
+ else if (util_format_is_pure_sint(tex->format))
+ desc->unpack_rgba_sint(color.i, 0, data, 0, 1, 1);
+ else
+ desc->unpack_rgba_float(color.f, 0, data, 0, 1, 1);
+
+ if (screen->is_format_supported(screen, tex->format,
+ tex->target, 0,
+ PIPE_BIND_RENDER_TARGET)) {
+ pipe->clear_render_target(pipe, sf, &color,
+ box->x, box->y,
+ box->width, box->height, false);
+ } else {
+ /* Software fallback - just for R9G9B9E5_FLOAT */
+ util_clear_render_target(pipe, sf, &color,
+ box->x, box->y,
+ box->width, box->height);
+ }
+ }
+ pipe_surface_reference(&sf, NULL); /* drop the temporary surface */
+}
+
+unsigned r600_translate_colorswap(enum pipe_format format, bool do_endian_swap)
{
const struct util_format_description *desc = util_format_description(format);
-#define HAS_SWIZZLE(chan,swz) (desc->swizzle[chan] == UTIL_FORMAT_SWIZZLE_##swz)
+#define HAS_SWIZZLE(chan,swz) (desc->swizzle[chan] == PIPE_SWIZZLE_##swz)
if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */
return V_0280A0_SWAP_STD;
@@ -1173,7 +1992,8 @@ unsigned r600_translate_colorswap(enum pipe_format format)
else if ((HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(1,X)) ||
(HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(1,NONE)) ||
(HAS_SWIZZLE(0,NONE) && HAS_SWIZZLE(1,X)))
- return V_0280A0_SWAP_STD_REV; /* YX__ */
+ /* YX__ */
+ return (do_endian_swap ? V_0280A0_SWAP_STD : V_0280A0_SWAP_STD_REV);
else if (HAS_SWIZZLE(0,X) && HAS_SWIZZLE(3,Y))
return V_0280A0_SWAP_ALT; /* X__Y */
else if (HAS_SWIZZLE(0,Y) && HAS_SWIZZLE(3,X))
@@ -1181,25 +2001,269 @@ unsigned r600_translate_colorswap(enum pipe_format format)
break;
case 3:
if (HAS_SWIZZLE(0,X))
- return V_0280A0_SWAP_STD; /* XYZ */
+ return (do_endian_swap ? V_0280A0_SWAP_STD_REV : V_0280A0_SWAP_STD);
else if (HAS_SWIZZLE(0,Z))
return V_0280A0_SWAP_STD_REV; /* ZYX */
break;
case 4:
/* check the middle channels, the 1st and 4th channel can be NONE */
- if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,Z))
+ if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,Z)) {
return V_0280A0_SWAP_STD; /* XYZW */
- else if (HAS_SWIZZLE(1,Z) && HAS_SWIZZLE(2,Y))
+ } else if (HAS_SWIZZLE(1,Z) && HAS_SWIZZLE(2,Y)) {
return V_0280A0_SWAP_STD_REV; /* WZYX */
- else if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,X))
+ } else if (HAS_SWIZZLE(1,Y) && HAS_SWIZZLE(2,X)) {
return V_0280A0_SWAP_ALT; /* ZYXW */
- else if (HAS_SWIZZLE(1,X) && HAS_SWIZZLE(2,Y))
- return V_0280A0_SWAP_ALT_REV; /* WXYZ */
+ } else if (HAS_SWIZZLE(1,Z) && HAS_SWIZZLE(2,W)) {
+ /* YZWX */
+ if (desc->is_array)
+ return V_0280A0_SWAP_ALT_REV;
+ else
+ return (do_endian_swap ? V_0280A0_SWAP_ALT : V_0280A0_SWAP_ALT_REV);
+ }
break;
}
return ~0U;
}
+/* PIPELINE_STAT-BASED DCC ENABLEMENT FOR DISPLAYABLE SURFACES */
+
+static void vi_dcc_clean_up_context_slot(struct r600_common_context *rctx,
+ int slot)
+{
+ int i;
+
+ if (rctx->dcc_stats[slot].query_active)
+ vi_separate_dcc_stop_query(&rctx->b,
+ rctx->dcc_stats[slot].tex);
+
+ for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats[slot].ps_stats); i++)
+ if (rctx->dcc_stats[slot].ps_stats[i]) {
+ rctx->b.destroy_query(&rctx->b,
+ rctx->dcc_stats[slot].ps_stats[i]);
+ rctx->dcc_stats[slot].ps_stats[i] = NULL;
+ }
+
+ r600_texture_reference(&rctx->dcc_stats[slot].tex, NULL);
+}
+
+/**
+ * Return the per-context slot where DCC statistics queries for the texture live.
+ */
+static unsigned vi_get_context_dcc_stats_index(struct r600_common_context *rctx,
+ struct r600_texture *tex)
+{
+ int i, empty_slot = -1;
+
+ /* Remove zombie textures (textures kept alive by this array only). */
+ for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats); i++)
+ if (rctx->dcc_stats[i].tex &&
+ rctx->dcc_stats[i].tex->resource.b.b.reference.count == 1)
+ vi_dcc_clean_up_context_slot(rctx, i);
+
+ /* Find the texture. */
+ for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats); i++) {
+ /* Return if found. */
+ if (rctx->dcc_stats[i].tex == tex) {
+ rctx->dcc_stats[i].last_use_timestamp = os_time_get();
+ return i;
+ }
+
+ /* Record the first seen empty slot. */
+ if (empty_slot == -1 && !rctx->dcc_stats[i].tex)
+ empty_slot = i;
+ }
+
+ /* Not found. Remove the oldest member to make space in the array. */
+ if (empty_slot == -1) {
+ int oldest_slot = 0;
+
+ /* Find the oldest slot. */
+ for (i = 1; i < ARRAY_SIZE(rctx->dcc_stats); i++)
+ if (rctx->dcc_stats[oldest_slot].last_use_timestamp >
+ rctx->dcc_stats[i].last_use_timestamp)
+ oldest_slot = i;
+
+ /* Clean up the oldest slot. */
+ vi_dcc_clean_up_context_slot(rctx, oldest_slot);
+ empty_slot = oldest_slot;
+ }
+
+ /* Add the texture to the new slot. */
+ r600_texture_reference(&rctx->dcc_stats[empty_slot].tex, tex);
+ rctx->dcc_stats[empty_slot].last_use_timestamp = os_time_get();
+ return empty_slot;
+}
+
+static struct pipe_query *
+vi_create_resuming_pipestats_query(struct pipe_context *ctx)
+{
+ struct r600_query_hw *query = (struct r600_query_hw*)
+ ctx->create_query(ctx, PIPE_QUERY_PIPELINE_STATISTICS, 0);
+
+ query->flags |= R600_QUERY_HW_FLAG_BEGIN_RESUMES;
+ return (struct pipe_query*)query;
+}
+
+/**
+ * Called when binding a color buffer.
+ */
+void vi_separate_dcc_start_query(struct pipe_context *ctx,
+ struct r600_texture *tex)
+{
+ struct r600_common_context *rctx = (struct r600_common_context*)ctx;
+ unsigned i = vi_get_context_dcc_stats_index(rctx, tex);
+
+ assert(!rctx->dcc_stats[i].query_active);
+
+ if (!rctx->dcc_stats[i].ps_stats[0])
+ rctx->dcc_stats[i].ps_stats[0] = vi_create_resuming_pipestats_query(ctx);
+
+ /* begin or resume the query */
+ ctx->begin_query(ctx, rctx->dcc_stats[i].ps_stats[0]);
+ rctx->dcc_stats[i].query_active = true;
+}
+
+/**
+ * Called when unbinding a color buffer.
+ */
+void vi_separate_dcc_stop_query(struct pipe_context *ctx,
+ struct r600_texture *tex)
+{
+ struct r600_common_context *rctx = (struct r600_common_context*)ctx;
+ unsigned i = vi_get_context_dcc_stats_index(rctx, tex);
+
+ assert(rctx->dcc_stats[i].query_active);
+ assert(rctx->dcc_stats[i].ps_stats[0]);
+
+ /* pause or end the query */
+ ctx->end_query(ctx, rctx->dcc_stats[i].ps_stats[0]);
+ rctx->dcc_stats[i].query_active = false;
+}
+
+static bool vi_should_enable_separate_dcc(struct r600_texture *tex)
+{
+ /* The minimum number of fullscreen draws per frame that is required
+ * to enable DCC. */
+ return tex->ps_draw_ratio + tex->num_slow_clears >= 5;
+}
+
+/* Called by fast clear. */
+static void vi_separate_dcc_try_enable(struct r600_common_context *rctx,
+ struct r600_texture *tex)
+{
+ /* The intent is to use this with shared displayable back buffers,
+ * but it's not strictly limited only to them.
+ */
+ if (!tex->resource.is_shared ||
+ !(tex->resource.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) ||
+ tex->resource.b.b.target != PIPE_TEXTURE_2D ||
+ tex->surface.last_level > 0 ||
+ !tex->surface.dcc_size)
+ return;
+
+ if (tex->dcc_offset)
+ return; /* already enabled */
+
+ /* Enable the DCC stat gathering. */
+ if (!tex->dcc_gather_statistics) {
+ tex->dcc_gather_statistics = true;
+ vi_separate_dcc_start_query(&rctx->b, tex);
+ }
+
+ if (!vi_should_enable_separate_dcc(tex))
+ return; /* stats show that DCC decompression is too expensive */
+
+ assert(tex->surface.level[0].dcc_enabled);
+ assert(!tex->dcc_separate_buffer);
+
+ r600_texture_discard_cmask(rctx->screen, tex);
+
+ /* Get a DCC buffer. */
+ if (tex->last_dcc_separate_buffer) {
+ assert(tex->dcc_gather_statistics);
+ assert(!tex->dcc_separate_buffer);
+ tex->dcc_separate_buffer = tex->last_dcc_separate_buffer;
+ tex->last_dcc_separate_buffer = NULL;
+ } else {
+ tex->dcc_separate_buffer = (struct r600_resource*)
+ r600_aligned_buffer_create(rctx->b.screen, 0,
+ PIPE_USAGE_DEFAULT,
+ tex->surface.dcc_size,
+ tex->surface.dcc_alignment);
+ if (!tex->dcc_separate_buffer)
+ return;
+ }
+
+ /* dcc_offset is the absolute GPUVM address. */
+ tex->dcc_offset = tex->dcc_separate_buffer->gpu_address;
+
+ /* no need to flag anything since this is called by fast clear that
+ * flags framebuffer state
+ */
+}
+
+/**
+ * Called by pipe_context::flush_resource, the place where DCC decompression
+ * takes place.
+ */
+void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx,
+ struct r600_texture *tex)
+{
+ struct r600_common_context *rctx = (struct r600_common_context*)ctx;
+ struct pipe_query *tmp;
+ unsigned i = vi_get_context_dcc_stats_index(rctx, tex);
+ bool query_active = rctx->dcc_stats[i].query_active;
+ bool disable = false;
+
+ if (rctx->dcc_stats[i].ps_stats[2]) {
+ union pipe_query_result result;
+
+ /* Read the results. */
+ ctx->get_query_result(ctx, rctx->dcc_stats[i].ps_stats[2],
+ true, &result);
+ r600_query_hw_reset_buffers(rctx,
+ (struct r600_query_hw*)
+ rctx->dcc_stats[i].ps_stats[2]);
+
+ /* Compute the approximate number of fullscreen draws. */
+ tex->ps_draw_ratio =
+ result.pipeline_statistics.ps_invocations /
+ (tex->resource.b.b.width0 * tex->resource.b.b.height0);
+ rctx->last_tex_ps_draw_ratio = tex->ps_draw_ratio;
+
+ disable = tex->dcc_separate_buffer &&
+ !vi_should_enable_separate_dcc(tex);
+ }
+
+ tex->num_slow_clears = 0;
+
+ /* stop the statistics query for ps_stats[0] */
+ if (query_active)
+ vi_separate_dcc_stop_query(ctx, tex);
+
+ /* Move the queries in the queue by one. */
+ tmp = rctx->dcc_stats[i].ps_stats[2];
+ rctx->dcc_stats[i].ps_stats[2] = rctx->dcc_stats[i].ps_stats[1];
+ rctx->dcc_stats[i].ps_stats[1] = rctx->dcc_stats[i].ps_stats[0];
+ rctx->dcc_stats[i].ps_stats[0] = tmp;
+
+ /* create and start a new query as ps_stats[0] */
+ if (query_active)
+ vi_separate_dcc_start_query(ctx, tex);
+
+ if (disable) {
+ assert(!tex->last_dcc_separate_buffer);
+ tex->last_dcc_separate_buffer = tex->dcc_separate_buffer;
+ tex->dcc_separate_buffer = NULL;
+ tex->dcc_offset = 0;
+ /* no need to flag anything since this is called after
+ * decompression that re-sets framebuffer state
+ */
+ }
+}
+
+/* FAST COLOR CLEAR */
+
static void evergreen_set_clear_color(struct r600_texture *rtex,
enum pipe_format surface_format,
const union pipe_color_union *color)
@@ -1208,7 +2272,16 @@ static void evergreen_set_clear_color(struct r600_texture *rtex,
memset(&uc, 0, sizeof(uc));
- if (util_format_is_pure_uint(surface_format)) {
+ if (util_format_get_blocksizebits(surface_format) == 128) {
+ /* DCC fast clear only:
+ * CLEAR_WORD0 = R = G = B
+ * CLEAR_WORD1 = A
+ */
+ assert(color->ui[0] == color->ui[1] &&
+ color->ui[0] == color->ui[2]);
+ uc.ui[0] = color->ui[0];
+ uc.ui[1] = color->ui[3];
+ } else if (util_format_is_pure_uint(surface_format)) {
util_format_write_4ui(surface_format, color->ui, 0, &uc, 0, 0, 0, 1, 1);
} else if (util_format_is_pure_sint(surface_format)) {
util_format_write_4i(surface_format, color->i, 0, &uc, 0, 0, 0, 1, 1);
@@ -1219,15 +2292,210 @@ static void evergreen_set_clear_color(struct r600_texture *rtex,
memcpy(rtex->color_clear_value, &uc, 2 * sizeof(uint32_t));
}
+static bool vi_get_fast_clear_parameters(enum pipe_format surface_format,
+ const union pipe_color_union *color,
+ uint32_t* reset_value,
+ bool* clear_words_needed)
+{
+ bool values[4] = {};
+ int i;
+ bool main_value = false;
+ bool extra_value = false;
+ int extra_channel;
+ const struct util_format_description *desc = util_format_description(surface_format);
+
+ if (desc->block.bits == 128 &&
+ (color->ui[0] != color->ui[1] ||
+ color->ui[0] != color->ui[2]))
+ return false;
+
+ *clear_words_needed = true;
+ *reset_value = 0x20202020U;
+
+ /* If we want to clear without needing a fast clear eliminate step, we
+ * can set each channel to 0 or 1 (or 0/max for integer formats). We
+ * have two sets of flags, one for the last or first channel (extra) and
+ * one for the other channels (main).
+ */
+
+ if (surface_format == PIPE_FORMAT_R11G11B10_FLOAT ||
+ surface_format == PIPE_FORMAT_B5G6R5_UNORM ||
+ surface_format == PIPE_FORMAT_B5G6R5_SRGB) {
+ extra_channel = -1;
+ } else if (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN) {
+ if(r600_translate_colorswap(surface_format, false) <= 1)
+ extra_channel = desc->nr_channels - 1;
+ else
+ extra_channel = 0;
+ } else
+ return true;
+
+ for (i = 0; i < 4; ++i) {
+ int index = desc->swizzle[i] - PIPE_SWIZZLE_X;
+
+ if (desc->swizzle[i] < PIPE_SWIZZLE_X ||
+ desc->swizzle[i] > PIPE_SWIZZLE_W)
+ continue;
+
+ if (desc->channel[i].pure_integer &&
+ desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
+ /* Use the maximum value for clamping the clear color. */
+ int max = u_bit_consecutive(0, desc->channel[i].size - 1);
+
+ values[i] = color->i[i] != 0;
+ if (color->i[i] != 0 && MIN2(color->i[i], max) != max)
+ return true;
+ } else if (desc->channel[i].pure_integer &&
+ desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) {
+ /* Use the maximum value for clamping the clear color. */
+ unsigned max = u_bit_consecutive(0, desc->channel[i].size);
+
+ values[i] = color->ui[i] != 0U;
+ if (color->ui[i] != 0U && MIN2(color->ui[i], max) != max)
+ return true;
+ } else {
+ values[i] = color->f[i] != 0.0F;
+ if (color->f[i] != 0.0F && color->f[i] != 1.0F)
+ return true;
+ }
+
+ if (index == extra_channel)
+ extra_value = values[i];
+ else
+ main_value = values[i];
+ }
+
+ for (int i = 0; i < 4; ++i)
+ if (values[i] != main_value &&
+ desc->swizzle[i] - PIPE_SWIZZLE_X != extra_channel &&
+ desc->swizzle[i] >= PIPE_SWIZZLE_X &&
+ desc->swizzle[i] <= PIPE_SWIZZLE_W)
+ return true;
+
+ *clear_words_needed = false;
+ if (main_value)
+ *reset_value |= 0x80808080U;
+
+ if (extra_value)
+ *reset_value |= 0x40404040U;
+ return true;
+}
+
+void vi_dcc_clear_level(struct r600_common_context *rctx,
+ struct r600_texture *rtex,
+ unsigned level, unsigned clear_value)
+{
+ struct pipe_resource *dcc_buffer;
+ uint64_t dcc_offset;
+
+ assert(rtex->dcc_offset && rtex->surface.level[level].dcc_enabled);
+
+ if (rtex->dcc_separate_buffer) {
+ dcc_buffer = &rtex->dcc_separate_buffer->b.b;
+ dcc_offset = 0;
+ } else {
+ dcc_buffer = &rtex->resource.b.b;
+ dcc_offset = rtex->dcc_offset;
+ }
+
+ dcc_offset += rtex->surface.level[level].dcc_offset;
+
+ rctx->clear_buffer(&rctx->b, dcc_buffer, dcc_offset,
+ rtex->surface.level[level].dcc_fast_clear_size,
+ clear_value, R600_COHERENCY_CB_META);
+}
+
+/* Set the same micro tile mode as the destination of the last MSAA resolve.
+ * This allows hitting the MSAA resolve fast path, which requires that both
+ * src and dst micro tile modes match.
+ */
+static void si_set_optimal_micro_tile_mode(struct r600_common_screen *rscreen,
+ struct r600_texture *rtex)
+{
+ if (rtex->resource.is_shared ||
+ rtex->surface.nsamples <= 1 ||
+ rtex->surface.micro_tile_mode == rtex->last_msaa_resolve_target_micro_mode)
+ return;
+
+ assert(rtex->surface.level[0].mode == RADEON_SURF_MODE_2D);
+ assert(rtex->surface.last_level == 0);
+
+ /* These magic numbers were copied from addrlib. It doesn't use any
+ * definitions for them either. They are all 2D_TILED_THIN1 modes with
+ * different bpp and micro tile mode.
+ */
+ if (rscreen->chip_class >= CIK) {
+ switch (rtex->last_msaa_resolve_target_micro_mode) {
+ case 0: /* displayable */
+ rtex->surface.tiling_index[0] = 10;
+ break;
+ case 1: /* thin */
+ rtex->surface.tiling_index[0] = 14;
+ break;
+ case 3: /* rotated */
+ rtex->surface.tiling_index[0] = 28;
+ break;
+ default: /* depth, thick */
+ assert(!"unexpected micro mode");
+ return;
+ }
+ } else { /* SI */
+ switch (rtex->last_msaa_resolve_target_micro_mode) {
+ case 0: /* displayable */
+ switch (rtex->surface.bpe) {
+ case 1:
+ rtex->surface.tiling_index[0] = 10;
+ break;
+ case 2:
+ rtex->surface.tiling_index[0] = 11;
+ break;
+ default: /* 4, 8 */
+ rtex->surface.tiling_index[0] = 12;
+ break;
+ }
+ break;
+ case 1: /* thin */
+ switch (rtex->surface.bpe) {
+ case 1:
+ rtex->surface.tiling_index[0] = 14;
+ break;
+ case 2:
+ rtex->surface.tiling_index[0] = 15;
+ break;
+ case 4:
+ rtex->surface.tiling_index[0] = 16;
+ break;
+ default: /* 8, 16 */
+ rtex->surface.tiling_index[0] = 17;
+ break;
+ }
+ break;
+ default: /* depth, thick */
+ assert(!"unexpected micro mode");
+ return;
+ }
+ }
+
+ rtex->surface.micro_tile_mode = rtex->last_msaa_resolve_target_micro_mode;
+
+ p_atomic_inc(&rscreen->dirty_fb_counter);
+ p_atomic_inc(&rscreen->dirty_tex_descriptor_counter);
+}
+
void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
struct pipe_framebuffer_state *fb,
struct r600_atom *fb_state,
- unsigned *buffers,
+ unsigned *buffers, unsigned *dirty_cbufs,
const union pipe_color_union *color)
{
int i;
- if (rctx->current_render_cond)
+ /* This function is broken on big-endian (BE) hosts, so just disable this path for now */
+#ifdef PIPE_ARCH_BIG_ENDIAN
+ return;
+#endif
+
+ if (rctx->render_cond)
return;
for (i = 0; i < fb->nr_cbufs; i++) {
@@ -1243,11 +2511,6 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
tex = (struct r600_texture *)fb->cbufs[i]->texture;
- /* 128-bit formats are unusupported */
- if (util_format_get_blocksizebits(fb->cbufs[i]->format) > 64) {
- continue;
- }
-
/* the clear is allowed if all layers are bound */
if (fb->cbufs[i]->u.tex.first_layer != 0 ||
fb->cbufs[i]->u.tex.last_layer != util_max_layer(&tex->resource.b.b, 0)) {
@@ -1264,6 +2527,14 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
continue;
}
+ /* shared textures can't use fast clear without an explicit flush,
+ * because there is no way to communicate the clear color among
+ * all clients
+ */
+ if (tex->resource.is_shared &&
+ !(tex->resource.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH))
+ continue;
+
/* fast color clear with 1D tiling doesn't work on old kernels and CIK */
if (tex->surface.level[0].mode == RADEON_SURF_MODE_1D &&
rctx->chip_class >= CIK &&
@@ -1272,18 +2543,72 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
continue;
}
- /* ensure CMASK is enabled */
- r600_texture_alloc_cmask_separate(rctx->screen, tex);
- if (tex->cmask.size == 0) {
- continue;
+ /* Fast clear is the most appropriate place to enable DCC for
+ * displayable surfaces.
+ */
+ if (rctx->chip_class >= VI &&
+ !(rctx->screen->debug_flags & DBG_NO_DCC_FB)) {
+ vi_separate_dcc_try_enable(rctx, tex);
+
+ /* Stoney can't do a CMASK-based clear, so all clears are
+ * considered to be hypothetically slow clears, which
+ * is weighed when determining to enable separate DCC.
+ */
+ if (tex->dcc_gather_statistics &&
+ rctx->family == CHIP_STONEY)
+ tex->num_slow_clears++;
+ }
+
+ /* Try to clear DCC first, otherwise try CMASK. */
+ if (tex->dcc_offset && tex->surface.level[0].dcc_enabled) {
+ uint32_t reset_value;
+ bool clear_words_needed;
+
+ if (rctx->screen->debug_flags & DBG_NO_DCC_CLEAR)
+ continue;
+
+ if (!vi_get_fast_clear_parameters(fb->cbufs[i]->format,
+ color, &reset_value,
+ &clear_words_needed))
+ continue;
+
+ vi_dcc_clear_level(rctx, tex, 0, reset_value);
+
+ if (clear_words_needed)
+ tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level;
+ tex->separate_dcc_dirty = true;
+ } else {
+ /* 128-bit formats are unsupported */
+ if (util_format_get_blocksizebits(fb->cbufs[i]->format) > 64) {
+ continue;
+ }
+
+ /* Stoney/RB+ doesn't work with CMASK fast clear. */
+ if (rctx->family == CHIP_STONEY)
+ continue;
+
+ /* ensure CMASK is enabled */
+ r600_texture_alloc_cmask_separate(rctx->screen, tex);
+ if (tex->cmask.size == 0) {
+ continue;
+ }
+
+ /* Do the fast clear. */
+ rctx->clear_buffer(&rctx->b, &tex->cmask_buffer->b.b,
+ tex->cmask.offset, tex->cmask.size, 0,
+ R600_COHERENCY_CB_META);
+
+ tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level;
}
- /* Do the fast clear. */
+ /* We can change the micro tile mode before a full clear. */
+ if (rctx->screen->chip_class >= SI)
+ si_set_optimal_micro_tile_mode(rctx->screen, tex);
+
evergreen_set_clear_color(tex, fb->cbufs[i]->format, color);
- rctx->clear_buffer(&rctx->b, &tex->cmask_buffer->b.b,
- tex->cmask.offset, tex->cmask.size, 0, true);
- tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level;
+ if (dirty_cbufs)
+ *dirty_cbufs |= 1 << i;
rctx->set_atom_dirty(rctx, fb_state, true);
*buffers &= ~clear_bit;
}
@@ -1299,4 +2624,5 @@ void r600_init_context_texture_functions(struct r600_common_context *rctx)
{
rctx->b.create_surface = r600_create_surface;
rctx->b.surface_destroy = r600_surface_destroy;
+ rctx->b.clear_texture = r600_clear_texture;
}
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_elf_util.c b/lib/mesa/src/gallium/drivers/radeon/radeon_elf_util.c
new file mode 100644
index 000000000..8aaa85d02
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_elf_util.c
@@ -0,0 +1,197 @@
+/*
+ * Copyright 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors: Tom Stellard <thomas.stellard@amd.com>
+ *
+ */
+
+#include "radeon_elf_util.h"
+#include "r600_pipe_common.h"
+
+#include "util/u_memory.h"
+
+#include <gelf.h>
+#include <libelf.h>
+#include <stdio.h>
+
+static void parse_symbol_table(Elf_Data *symbol_table_data,
+ const GElf_Shdr *symbol_table_header,
+ struct radeon_shader_binary *binary)
+{
+ GElf_Sym symbol;
+ unsigned i = 0;
+ unsigned symbol_count =
+ symbol_table_header->sh_size / symbol_table_header->sh_entsize;
+
+ /* We are over-allocating this list, because symbol_count gives the
+ * total number of symbols, and we will only be filling the list
+ * with offsets of global symbols. The memory savings from
+ * allocating the correct size of this list will be small, and
+ * I don't think it is worth the cost of pre-computing the number
+ * of global symbols.
+ */
+ binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t));
+
+ while (gelf_getsym(symbol_table_data, i++, &symbol)) {
+ unsigned i;
+ if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL ||
+ symbol.st_shndx == 0 /* Undefined symbol */) {
+ continue;
+ }
+
+ binary->global_symbol_offsets[binary->global_symbol_count] =
+ symbol.st_value;
+
+ /* Keep the list sorted: swap the new offset backwards into place
+ * (an insertion-sort step). This list will usually be small. */
+ for (i = binary->global_symbol_count; i > 0; --i) {
+ uint64_t lhs = binary->global_symbol_offsets[i - 1];
+ uint64_t rhs = binary->global_symbol_offsets[i];
+ if (lhs < rhs) {
+ break;
+ }
+ binary->global_symbol_offsets[i] = lhs;
+ binary->global_symbol_offsets[i - 1] = rhs;
+ }
+ ++binary->global_symbol_count;
+ }
+}
+
+static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
+ unsigned symbol_sh_link,
+ struct radeon_shader_binary *binary)
+{
+ unsigned i;
+
+ if (!relocs || !symbols || !binary->reloc_count) {
+ return;
+ }
+ binary->relocs = CALLOC(binary->reloc_count,
+ sizeof(struct radeon_shader_reloc));
+ for (i = 0; i < binary->reloc_count; i++) {
+ GElf_Sym symbol;
+ GElf_Rel rel;
+ char *symbol_name;
+ struct radeon_shader_reloc *reloc = &binary->relocs[i];
+
+ gelf_getrel(relocs, i, &rel);
+ gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol);
+ symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name);
+
+ reloc->offset = rel.r_offset;
+ strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1);
+ reloc->name[sizeof(reloc->name)-1] = 0;
+ }
+}
+
+void radeon_elf_read(const char *elf_data, unsigned elf_size,
+ struct radeon_shader_binary *binary)
+{
+ char *elf_buffer;
+ Elf *elf;
+ Elf_Scn *section = NULL;
+ Elf_Data *symbols = NULL, *relocs = NULL;
+ size_t section_str_index;
+ unsigned symbol_sh_link = 0;
+
+ /* One of the libelf implementations
+ * (http://www.mr511.de/software/english.htm) requires calling
+ * elf_version() before elf_memory().
+ */
+ elf_version(EV_CURRENT);
+ elf_buffer = MALLOC(elf_size);
+ memcpy(elf_buffer, elf_data, elf_size);
+
+ elf = elf_memory(elf_buffer, elf_size);
+
+ elf_getshdrstrndx(elf, &section_str_index);
+
+ while ((section = elf_nextscn(elf, section))) {
+ const char *name;
+ Elf_Data *section_data = NULL;
+ GElf_Shdr section_header;
+ if (gelf_getshdr(section, &section_header) != &section_header) {
+ fprintf(stderr, "Failed to read ELF section header\n");
+ return;
+ }
+ name = elf_strptr(elf, section_str_index, section_header.sh_name);
+ if (!strcmp(name, ".text")) {
+ section_data = elf_getdata(section, section_data);
+ binary->code_size = section_data->d_size;
+ binary->code = MALLOC(binary->code_size * sizeof(unsigned char));
+ memcpy(binary->code, section_data->d_buf, binary->code_size);
+ } else if (!strcmp(name, ".AMDGPU.config")) {
+ section_data = elf_getdata(section, section_data);
+ binary->config_size = section_data->d_size;
+ binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
+ memcpy(binary->config, section_data->d_buf, binary->config_size);
+ } else if (!strcmp(name, ".AMDGPU.disasm")) {
+ /* Always read disassembly if it's available. */
+ section_data = elf_getdata(section, section_data);
+ binary->disasm_string = strndup(section_data->d_buf,
+ section_data->d_size);
+ } else if (!strncmp(name, ".rodata", 7)) {
+ section_data = elf_getdata(section, section_data);
+ binary->rodata_size = section_data->d_size;
+ binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char));
+ memcpy(binary->rodata, section_data->d_buf, binary->rodata_size);
+ } else if (!strncmp(name, ".symtab", 7)) {
+ symbols = elf_getdata(section, section_data);
+ symbol_sh_link = section_header.sh_link;
+ parse_symbol_table(symbols, &section_header, binary);
+ } else if (!strcmp(name, ".rel.text")) {
+ relocs = elf_getdata(section, section_data);
+ binary->reloc_count = section_header.sh_size /
+ section_header.sh_entsize;
+ }
+ }
+
+ parse_relocs(elf, relocs, symbols, symbol_sh_link, binary);
+
+ if (elf){
+ elf_end(elf);
+ }
+ FREE(elf_buffer);
+
+ /* Cache the config size per symbol */
+ if (binary->global_symbol_count) {
+ binary->config_size_per_symbol =
+ binary->config_size / binary->global_symbol_count;
+ } else {
+ binary->global_symbol_count = 1;
+ binary->config_size_per_symbol = binary->config_size;
+ }
+}
+
+const unsigned char *radeon_shader_binary_config_start(
+ const struct radeon_shader_binary *binary,
+ uint64_t symbol_offset)
+{
+ unsigned i;
+ for (i = 0; i < binary->global_symbol_count; ++i) {
+ if (binary->global_symbol_offsets[i] == symbol_offset) {
+ unsigned offset = i * binary->config_size_per_symbol;
+ return binary->config + offset;
+ }
+ }
+ return binary->config;
+}
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_elf_util.h b/lib/mesa/src/gallium/drivers/radeon/radeon_elf_util.h
new file mode 100644
index 000000000..c2af9e0df
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_elf_util.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors: Tom Stellard <thomas.stellard@amd.com>
+ *
+ */
+
+#ifndef RADEON_ELF_UTIL_H
+#define RADEON_ELF_UTIL_H
+
+#include <stdint.h>
+
+struct radeon_shader_binary;
+struct radeon_shader_reloc;
+
+/*
+ * Parse the elf binary stored in \p elf_data and create a
+ * radeon_shader_binary object.
+ */
+void radeon_elf_read(const char *elf_data, unsigned elf_size,
+ struct radeon_shader_binary *binary);
+
+/**
+ * @returns A pointer to the start of the configuration information for
+ * the function starting at \p symbol_offset of the binary.
+ */
+const unsigned char *radeon_shader_binary_config_start(
+ const struct radeon_shader_binary *binary,
+ uint64_t symbol_offset);
+
+#endif /* RADEON_ELF_UTIL_H */
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c b/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c
index 55c216aa5..fb1491a28 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c
@@ -57,7 +57,9 @@
#define FB_BUFFER_OFFSET 0x1000
#define FB_BUFFER_SIZE 2048
+#define FB_BUFFER_SIZE_TONGA (2048 * 64)
#define IT_SCALING_TABLE_SIZE 992
+#define UVD_SESSION_CONTEXT_SIZE (128 * 1024)
/* UVD decoder representation */
struct ruvd_decoder {
@@ -78,6 +80,7 @@ struct ruvd_decoder {
struct rvid_buffer msg_fb_it_buffers[NUM_BUFFERS];
struct ruvd_msg *msg;
uint32_t *fb;
+ unsigned fb_size;
uint8_t *it;
struct rvid_buffer bs_buffers[NUM_BUFFERS];
@@ -87,38 +90,40 @@ struct ruvd_decoder {
struct rvid_buffer dpb;
bool use_legacy;
struct rvid_buffer ctx;
+ struct rvid_buffer sessionctx;
};
/* flush IB to the hardware */
-static void flush(struct ruvd_decoder *dec)
+static int flush(struct ruvd_decoder *dec, unsigned flags)
{
- dec->ws->cs_flush(dec->cs, RADEON_FLUSH_ASYNC, NULL, 0);
+ return dec->ws->cs_flush(dec->cs, flags, NULL);
}
/* add a new set register command to the IB */
static void set_reg(struct ruvd_decoder *dec, unsigned reg, uint32_t val)
{
- uint32_t *pm4 = dec->cs->buf;
- pm4[dec->cs->cdw++] = RUVD_PKT0(reg >> 2, 0);
- pm4[dec->cs->cdw++] = val;
+ radeon_emit(dec->cs, RUVD_PKT0(reg >> 2, 0));
+ radeon_emit(dec->cs, val);
}
/* send a command to the VCPU through the GPCOM registers */
static void send_cmd(struct ruvd_decoder *dec, unsigned cmd,
- struct radeon_winsys_cs_handle* cs_buf, uint32_t off,
+ struct pb_buffer* buf, uint32_t off,
enum radeon_bo_usage usage, enum radeon_bo_domain domain)
{
int reloc_idx;
- reloc_idx = dec->ws->cs_add_reloc(dec->cs, cs_buf, usage, domain,
- RADEON_PRIO_MIN);
+ reloc_idx = dec->ws->cs_add_buffer(dec->cs, buf, usage | RADEON_USAGE_SYNCHRONIZED,
+ domain,
+ RADEON_PRIO_UVD);
if (!dec->use_legacy) {
uint64_t addr;
- addr = dec->ws->buffer_get_virtual_address(cs_buf);
+ addr = dec->ws->buffer_get_virtual_address(buf);
addr = addr + off;
set_reg(dec, RUVD_GPCOM_VCPU_DATA0, addr);
set_reg(dec, RUVD_GPCOM_VCPU_DATA1, addr >> 32);
} else {
+ off += dec->ws->buffer_get_reloc_offset(buf);
set_reg(dec, RUVD_GPCOM_VCPU_DATA0, off);
set_reg(dec, RUVD_GPCOM_VCPU_DATA1, reloc_idx * 4);
}
@@ -142,13 +147,13 @@ static void map_msg_fb_it_buf(struct ruvd_decoder *dec)
buf = &dec->msg_fb_it_buffers[dec->cur_buffer];
/* and map it for CPU access */
- ptr = dec->ws->buffer_map(buf->res->cs_buf, dec->cs, PIPE_TRANSFER_WRITE);
+ ptr = dec->ws->buffer_map(buf->res->buf, dec->cs, PIPE_TRANSFER_WRITE);
/* calc buffer offsets */
dec->msg = (struct ruvd_msg *)ptr;
dec->fb = (uint32_t *)(ptr + FB_BUFFER_OFFSET);
if (have_it(dec))
- dec->it = (uint8_t *)(ptr + FB_BUFFER_OFFSET + FB_BUFFER_SIZE);
+ dec->it = (uint8_t *)(ptr + FB_BUFFER_OFFSET + dec->fb_size);
}
/* unmap and send a message command to the VCPU */
@@ -164,13 +169,19 @@ static void send_msg_buf(struct ruvd_decoder *dec)
buf = &dec->msg_fb_it_buffers[dec->cur_buffer];
/* unmap the buffer */
- dec->ws->buffer_unmap(buf->res->cs_buf);
+ dec->ws->buffer_unmap(buf->res->buf);
dec->msg = NULL;
dec->fb = NULL;
dec->it = NULL;
+
+ if (dec->sessionctx.res)
+ send_cmd(dec, RUVD_CMD_SESSION_CONTEXT_BUFFER,
+ dec->sessionctx.res->buf, 0, RADEON_USAGE_READWRITE,
+ RADEON_DOMAIN_VRAM);
+
/* and send it to the hardware */
- send_cmd(dec, RUVD_CMD_MSG_BUFFER, buf->res->cs_buf, 0,
+ send_cmd(dec, RUVD_CMD_MSG_BUFFER, buf->res->buf, 0,
RADEON_USAGE_READ, RADEON_DOMAIN_GTT);
}
@@ -207,7 +218,61 @@ static uint32_t profile2stream_type(struct ruvd_decoder *dec, unsigned family)
}
}
-static unsigned calc_ctx_size(struct ruvd_decoder *dec)
+static unsigned calc_ctx_size_h264_perf(struct ruvd_decoder *dec)
+{
+ unsigned width_in_mb, height_in_mb, ctx_size;
+ unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH);
+ unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT);
+
+ unsigned max_references = dec->base.max_references + 1;
+
+ // picture width & height in 16 pixel units
+ width_in_mb = width / VL_MACROBLOCK_WIDTH;
+ height_in_mb = align(height / VL_MACROBLOCK_HEIGHT, 2);
+
+ if (!dec->use_legacy) {
+ unsigned fs_in_mb = width_in_mb * height_in_mb;
+ unsigned num_dpb_buffer;
+ switch(dec->base.level) {
+ case 30:
+ num_dpb_buffer = 8100 / fs_in_mb;
+ break;
+ case 31:
+ num_dpb_buffer = 18000 / fs_in_mb;
+ break;
+ case 32:
+ num_dpb_buffer = 20480 / fs_in_mb;
+ break;
+ case 41:
+ num_dpb_buffer = 32768 / fs_in_mb;
+ break;
+ case 42:
+ num_dpb_buffer = 34816 / fs_in_mb;
+ break;
+ case 50:
+ num_dpb_buffer = 110400 / fs_in_mb;
+ break;
+ case 51:
+ num_dpb_buffer = 184320 / fs_in_mb;
+ break;
+ default:
+ num_dpb_buffer = 184320 / fs_in_mb;
+ break;
+ }
+ num_dpb_buffer++;
+ max_references = MAX2(MIN2(NUM_H264_REFS, num_dpb_buffer), max_references);
+ ctx_size = max_references * align(width_in_mb * height_in_mb * 192, 256);
+ } else {
+ // the firmware seems to always assume a minimum of ref frames
+ max_references = MAX2(NUM_H264_REFS, max_references);
+ // macroblock context buffer
+ ctx_size = align(width_in_mb * height_in_mb * max_references * 192, 256);
+ }
+
+ return ctx_size;
+}
+
+static unsigned calc_ctx_size_h265_main(struct ruvd_decoder *dec)
{
unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH);
unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT);
@@ -224,6 +289,39 @@ static unsigned calc_ctx_size(struct ruvd_decoder *dec)
return ((width + 255) / 16) * ((height + 255) / 16) * 16 * max_references + 52 * 1024;
}
+static unsigned calc_ctx_size_h265_main10(struct ruvd_decoder *dec, struct pipe_h265_picture_desc *pic)
+{
+ unsigned block_size, log2_ctb_size, width_in_ctb, height_in_ctb, num_16x16_block_per_ctb;
+ unsigned context_buffer_size_per_ctb_row, cm_buffer_size, max_mb_address, db_left_tile_pxl_size;
+ unsigned db_left_tile_ctx_size = 4096 / 16 * (32 + 16 * 4);
+
+ unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH);
+ unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT);
+ unsigned coeff_10bit = (pic->pps->sps->bit_depth_luma_minus8 || pic->pps->sps->bit_depth_chroma_minus8) ? 2 : 1;
+
+ unsigned max_references = dec->base.max_references + 1;
+
+ if (dec->base.width * dec->base.height >= 4096*2000)
+ max_references = MAX2(max_references, 8);
+ else
+ max_references = MAX2(max_references, 17);
+
+ block_size = (1 << (pic->pps->sps->log2_min_luma_coding_block_size_minus3 + 3));
+ log2_ctb_size = block_size + pic->pps->sps->log2_diff_max_min_luma_coding_block_size;
+
+ width_in_ctb = (width + ((1 << log2_ctb_size) - 1)) >> log2_ctb_size;
+ height_in_ctb = (height + ((1 << log2_ctb_size) - 1)) >> log2_ctb_size;
+
+ num_16x16_block_per_ctb = ((1 << log2_ctb_size) >> 4) * ((1 << log2_ctb_size) >> 4);
+ context_buffer_size_per_ctb_row = align(width_in_ctb * num_16x16_block_per_ctb * 16, 256);
+ max_mb_address = (unsigned) ceil(height * 8 / 2048.0);
+
+ cm_buffer_size = max_references * context_buffer_size_per_ctb_row * height_in_ctb;
+ db_left_tile_pxl_size = coeff_10bit * (max_mb_address * 2 * 2048 + 1024);
+
+ return cm_buffer_size + db_left_tile_ctx_size + db_left_tile_pxl_size;
+}
+
/* calculate size of reference picture buffer */
static unsigned calc_dpb_size(struct ruvd_decoder *dec)
{
@@ -282,17 +380,23 @@ static unsigned calc_dpb_size(struct ruvd_decoder *dec)
num_dpb_buffer++;
max_references = MAX2(MIN2(NUM_H264_REFS, num_dpb_buffer), max_references);
dpb_size = image_size * max_references;
- dpb_size += max_references * align(width_in_mb * height_in_mb * 192, alignment);
- dpb_size += align(width_in_mb * height_in_mb * 32, alignment);
+ if ((dec->stream_type != RUVD_CODEC_H264_PERF) ||
+ (((struct r600_common_screen*)dec->screen)->family < CHIP_POLARIS10)) {
+ dpb_size += max_references * align(width_in_mb * height_in_mb * 192, alignment);
+ dpb_size += align(width_in_mb * height_in_mb * 32, alignment);
+ }
} else {
// the firmware seems to allways assume a minimum of ref frames
max_references = MAX2(NUM_H264_REFS, max_references);
// reference picture buffer
dpb_size = image_size * max_references;
- // macroblock context buffer
- dpb_size += width_in_mb * height_in_mb * max_references * 192;
- // IT surface buffer
- dpb_size += width_in_mb * height_in_mb * 32;
+ if ((dec->stream_type != RUVD_CODEC_H264_PERF) ||
+ (((struct r600_common_screen*)dec->screen)->family < CHIP_POLARIS10)) {
+ // macroblock context buffer
+ dpb_size += width_in_mb * height_in_mb * max_references * 192;
+ // IT surface buffer
+ dpb_size += width_in_mb * height_in_mb * 32;
+ }
}
break;
}
@@ -305,7 +409,10 @@ static unsigned calc_dpb_size(struct ruvd_decoder *dec)
width = align (width, 16);
height = align (height, 16);
- dpb_size = align((width * height * 3) / 2, 256) * max_references;
+ if (dec->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10)
+ dpb_size = align((width * height * 9) / 4, 256) * max_references;
+ else
+ dpb_size = align((width * height * 3) / 2, 256) * max_references;
break;
case PIPE_VIDEO_FORMAT_VC1:
@@ -402,6 +509,9 @@ static struct ruvd_h264 get_h264_msg(struct ruvd_decoder *dec, struct pipe_h264_
result.log2_max_pic_order_cnt_lsb_minus4 = pic->pps->sps->log2_max_pic_order_cnt_lsb_minus4;
switch (dec->base.chroma_format) {
+ case PIPE_VIDEO_CHROMA_FORMAT_NONE:
+ /* TODO: assert? */
+ break;
case PIPE_VIDEO_CHROMA_FORMAT_400:
result.chroma_format = 0;
break;
@@ -478,6 +588,8 @@ static struct ruvd_h265 get_h265_msg(struct ruvd_decoder *dec, struct pipe_video
result.sps_info_flags |= pic->pps->sps->separate_colour_plane_flag << 8;
if (((struct r600_common_screen*)dec->screen)->family == CHIP_CARRIZO)
result.sps_info_flags |= 1 << 9;
+ if (pic->UseRefPicList == true)
+ result.sps_info_flags |= 1 << 10;
result.chroma_format = pic->pps->sps->chroma_format_idc;
result.bit_depth_luma_minus8 = pic->pps->sps->bit_depth_luma_minus8;
@@ -586,6 +698,20 @@ static struct ruvd_h265 get_h265_msg(struct ruvd_decoder *dec, struct pipe_video
memcpy(dec->it + 480, pic->pps->sps->ScalingList16x16, 6 * 64);
memcpy(dec->it + 864, pic->pps->sps->ScalingList32x32, 2 * 64);
+ for (i = 0 ; i < 2 ; i++) {
+ for (int j = 0 ; j < 15 ; j++)
+ result.direct_reflist[i][j] = pic->RefPicList[i][j];
+ }
+
+ if ((pic->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) &&
+ (target->buffer_format == PIPE_FORMAT_NV12)) {
+ result.p010_mode = 0;
+ result.luma_10to8 = 5;
+ result.chroma_10to8 = 5;
+ result.sclr_luma10to8 = 4;
+ result.sclr_chroma10to8 = 4;
+ }
+
/* TODO
result.highestTid;
result.isNonRef;
@@ -811,7 +937,7 @@ static void ruvd_destroy(struct pipe_video_codec *decoder)
dec->msg->stream_handle = dec->stream_handle;
send_msg_buf(dec);
- flush(dec);
+ flush(dec, 0);
dec->ws->cs_destroy(dec->cs);
@@ -821,8 +947,8 @@ static void ruvd_destroy(struct pipe_video_codec *decoder)
}
rvid_destroy_buffer(&dec->dpb);
- if (u_reduce_video_profile(dec->base.profile) == PIPE_VIDEO_FORMAT_HEVC)
- rvid_destroy_buffer(&dec->ctx);
+ rvid_destroy_buffer(&dec->ctx);
+ rvid_destroy_buffer(&dec->sessionctx);
FREE(dec);
}
@@ -845,7 +971,7 @@ static void ruvd_begin_frame(struct pipe_video_codec *decoder,
dec->bs_size = 0;
dec->bs_ptr = dec->ws->buffer_map(
- dec->bs_buffers[dec->cur_buffer].res->cs_buf,
+ dec->bs_buffers[dec->cur_buffer].res->buf,
dec->cs, PIPE_TRANSFER_WRITE);
}
@@ -885,13 +1011,13 @@ static void ruvd_decode_bitstream(struct pipe_video_codec *decoder,
unsigned new_size = dec->bs_size + sizes[i];
if (new_size > buf->res->buf->size) {
- dec->ws->buffer_unmap(buf->res->cs_buf);
+ dec->ws->buffer_unmap(buf->res->buf);
if (!rvid_resize_buffer(dec->screen, dec->cs, buf, new_size)) {
RVID_ERR("Can't resize bitstream buffer!");
return;
}
- dec->bs_ptr = dec->ws->buffer_map(buf->res->cs_buf, dec->cs,
+ dec->bs_ptr = dec->ws->buffer_map(buf->res->buf, dec->cs,
PIPE_TRANSFER_WRITE);
if (!dec->bs_ptr)
return;
@@ -913,7 +1039,7 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
struct pipe_picture_desc *picture)
{
struct ruvd_decoder *dec = (struct ruvd_decoder*)decoder;
- struct radeon_winsys_cs_handle *dt;
+ struct pb_buffer *dt;
struct rvid_buffer *msg_fb_it_buf, *bs_buf;
unsigned bs_size;
@@ -927,7 +1053,7 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
bs_size = align(dec->bs_size, 128);
memset(dec->bs_ptr, 0, bs_size - dec->bs_size);
- dec->ws->buffer_unmap(bs_buf->res->cs_buf);
+ dec->ws->buffer_unmap(bs_buf->res->buf);
map_msg_fb_it_buf(dec);
dec->msg->size = sizeof(*dec->msg);
@@ -948,9 +1074,15 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
dec->msg->body.decode.dpb_size = dec->dpb.res->buf->size;
dec->msg->body.decode.bsd_size = bs_size;
- dec->msg->body.decode.db_pitch = dec->base.width;
+ dec->msg->body.decode.db_pitch = align(dec->base.width, 16);
+
+ if (dec->stream_type == RUVD_CODEC_H264_PERF &&
+ ((struct r600_common_screen*)dec->screen)->family >= CHIP_POLARIS10)
+ dec->msg->body.decode.dpb_reserved = dec->ctx.res->buf->size;
dt = dec->set_dtb(dec->msg, (struct vl_video_buffer *)target);
+ if (((struct r600_common_screen*)dec->screen)->family >= CHIP_STONEY)
+ dec->msg->body.decode.dt_wa_chroma_top_offset = dec->msg->body.decode.dt_pitch / 2;
switch (u_reduce_video_profile(picture->profile)) {
case PIPE_VIDEO_FORMAT_MPEG4_AVC:
@@ -959,6 +1091,20 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
case PIPE_VIDEO_FORMAT_HEVC:
dec->msg->body.decode.codec.h265 = get_h265_msg(dec, target, (struct pipe_h265_picture_desc*)picture);
+ if (dec->ctx.res == NULL) {
+ unsigned ctx_size;
+ if (dec->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10)
+ ctx_size = calc_ctx_size_h265_main10(dec, (struct pipe_h265_picture_desc*)picture);
+ else
+ ctx_size = calc_ctx_size_h265_main(dec);
+ if (!rvid_create_buffer(dec->screen, &dec->ctx, ctx_size, PIPE_USAGE_DEFAULT)) {
+ RVID_ERR("Can't allocated context buffer.\n");
+ }
+ rvid_clear_buffer(decoder->context, &dec->ctx);
+ }
+
+ if (dec->ctx.res)
+ dec->msg->body.decode.dpb_reserved = dec->ctx.res->buf->size;
break;
case PIPE_VIDEO_FORMAT_VC1:
@@ -982,28 +1128,27 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
dec->msg->body.decode.extension_support = 0x1;
/* set at least the feedback buffer size */
- dec->fb[0] = FB_BUFFER_SIZE;
+ dec->fb[0] = dec->fb_size;
send_msg_buf(dec);
- send_cmd(dec, RUVD_CMD_DPB_BUFFER, dec->dpb.res->cs_buf, 0,
+ send_cmd(dec, RUVD_CMD_DPB_BUFFER, dec->dpb.res->buf, 0,
RADEON_USAGE_READWRITE, RADEON_DOMAIN_VRAM);
- if (u_reduce_video_profile(picture->profile) == PIPE_VIDEO_FORMAT_HEVC) {
- send_cmd(dec, RUVD_CMD_CONTEXT_BUFFER, dec->ctx.res->cs_buf, 0,
+ if (dec->ctx.res)
+ send_cmd(dec, RUVD_CMD_CONTEXT_BUFFER, dec->ctx.res->buf, 0,
RADEON_USAGE_READWRITE, RADEON_DOMAIN_VRAM);
- }
- send_cmd(dec, RUVD_CMD_BITSTREAM_BUFFER, bs_buf->res->cs_buf,
+ send_cmd(dec, RUVD_CMD_BITSTREAM_BUFFER, bs_buf->res->buf,
0, RADEON_USAGE_READ, RADEON_DOMAIN_GTT);
send_cmd(dec, RUVD_CMD_DECODING_TARGET_BUFFER, dt, 0,
RADEON_USAGE_WRITE, RADEON_DOMAIN_VRAM);
- send_cmd(dec, RUVD_CMD_FEEDBACK_BUFFER, msg_fb_it_buf->res->cs_buf,
+ send_cmd(dec, RUVD_CMD_FEEDBACK_BUFFER, msg_fb_it_buf->res->buf,
FB_BUFFER_OFFSET, RADEON_USAGE_WRITE, RADEON_DOMAIN_GTT);
if (have_it(dec))
- send_cmd(dec, RUVD_CMD_ITSCALING_TABLE_BUFFER, msg_fb_it_buf->res->cs_buf,
- FB_BUFFER_OFFSET + FB_BUFFER_SIZE, RADEON_USAGE_READ, RADEON_DOMAIN_GTT);
+ send_cmd(dec, RUVD_CMD_ITSCALING_TABLE_BUFFER, msg_fb_it_buf->res->buf,
+ FB_BUFFER_OFFSET + dec->fb_size, RADEON_USAGE_READ, RADEON_DOMAIN_GTT);
set_reg(dec, RUVD_ENGINE_CNTL, 1);
- flush(dec);
+ flush(dec, RADEON_FLUSH_ASYNC);
next_buffer(dec);
}
@@ -1028,7 +1173,7 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context,
unsigned bs_buf_size;
struct radeon_info info;
struct ruvd_decoder *dec;
- int i;
+ int r, i;
ws->query_info(ws, &info);
@@ -1039,6 +1184,9 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context,
/* fall through */
case PIPE_VIDEO_FORMAT_MPEG4:
+ width = align(width, VL_MACROBLOCK_WIDTH);
+ height = align(height, VL_MACROBLOCK_HEIGHT);
+ break;
case PIPE_VIDEO_FORMAT_MPEG4_AVC:
width = align(width, VL_MACROBLOCK_WIDTH);
height = align(height, VL_MACROBLOCK_HEIGHT);
@@ -1055,7 +1203,7 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context,
return NULL;
if (info.drm_major < 3)
- dec->use_legacy = TRUE;
+ dec->use_legacy = true;
dec->base = *templ;
dec->base.context = context;
@@ -1074,15 +1222,17 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context,
dec->stream_handle = rvid_alloc_stream_handle();
dec->screen = context->screen;
dec->ws = ws;
- dec->cs = ws->cs_create(rctx->ctx, RING_UVD, NULL, NULL, NULL);
+ dec->cs = ws->cs_create(rctx->ctx, RING_UVD, NULL, NULL);
if (!dec->cs) {
RVID_ERR("Can't get command submission context.\n");
goto error;
}
- bs_buf_size = width * height * 512 / (16 * 16);
+ dec->fb_size = (info.family == CHIP_TONGA) ? FB_BUFFER_SIZE_TONGA :
+ FB_BUFFER_SIZE;
+ bs_buf_size = width * height * (512 / (16 * 16));
for (i = 0; i < NUM_BUFFERS; ++i) {
- unsigned msg_fb_it_size = FB_BUFFER_OFFSET + FB_BUFFER_SIZE;
+ unsigned msg_fb_it_size = FB_BUFFER_OFFSET + dec->fb_size;
STATIC_ASSERT(sizeof(struct ruvd_msg) <= FB_BUFFER_OFFSET);
if (have_it(dec))
msg_fb_it_size += IT_SCALING_TABLE_SIZE;
@@ -1111,8 +1261,8 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context,
rvid_clear_buffer(context, &dec->dpb);
- if (u_reduce_video_profile(dec->base.profile) == PIPE_VIDEO_FORMAT_HEVC) {
- unsigned ctx_size = calc_ctx_size(dec);
+ if (dec->stream_type == RUVD_CODEC_H264_PERF && info.family >= CHIP_POLARIS10) {
+ unsigned ctx_size = calc_ctx_size_h264_perf(dec);
if (!rvid_create_buffer(dec->screen, &dec->ctx, ctx_size, PIPE_USAGE_DEFAULT)) {
RVID_ERR("Can't allocated context buffer.\n");
goto error;
@@ -1120,6 +1270,16 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context,
rvid_clear_buffer(context, &dec->ctx);
}
+ if (info.family >= CHIP_POLARIS10 && info.drm_minor >= 3) {
+ if (!rvid_create_buffer(dec->screen, &dec->sessionctx,
+ UVD_SESSION_CONTEXT_SIZE,
+ PIPE_USAGE_DEFAULT)) {
+ RVID_ERR("Can't allocated session ctx.\n");
+ goto error;
+ }
+ rvid_clear_buffer(context, &dec->sessionctx);
+ }
+
map_msg_fb_it_buf(dec);
dec->msg->size = sizeof(*dec->msg);
dec->msg->msg_type = RUVD_MSG_CREATE;
@@ -1129,7 +1289,10 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context,
dec->msg->body.create.height_in_samples = dec->base.height;
dec->msg->body.create.dpb_size = dpb_size;
send_msg_buf(dec);
- flush(dec);
+ r = flush(dec, 0);
+ if (r)
+ goto error;
+
next_buffer(dec);
return &dec->base;
@@ -1143,8 +1306,8 @@ error:
}
rvid_destroy_buffer(&dec->dpb);
- if (u_reduce_video_profile(dec->base.profile) == PIPE_VIDEO_FORMAT_HEVC)
- rvid_destroy_buffer(&dec->ctx);
+ rvid_destroy_buffer(&dec->ctx);
+ rvid_destroy_buffer(&dec->sessionctx);
FREE(dec);
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h b/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h
index 452fbd608..e3f8504d8 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h
@@ -38,13 +38,13 @@
#include "vl/vl_video_buffer.h"
/* UVD uses PM4 packet type 0 and 2 */
-#define RUVD_PKT_TYPE_S(x) (((x) & 0x3) << 30)
+#define RUVD_PKT_TYPE_S(x) (((unsigned)(x) & 0x3) << 30)
#define RUVD_PKT_TYPE_G(x) (((x) >> 30) & 0x3)
#define RUVD_PKT_TYPE_C 0x3FFFFFFF
-#define RUVD_PKT_COUNT_S(x) (((x) & 0x3FFF) << 16)
+#define RUVD_PKT_COUNT_S(x) (((unsigned)(x) & 0x3FFF) << 16)
#define RUVD_PKT_COUNT_G(x) (((x) >> 16) & 0x3FFF)
#define RUVD_PKT_COUNT_C 0xC000FFFF
-#define RUVD_PKT0_BASE_INDEX_S(x) (((x) & 0xFFFF) << 0)
+#define RUVD_PKT0_BASE_INDEX_S(x) (((unsigned)(x) & 0xFFFF) << 0)
#define RUVD_PKT0_BASE_INDEX_G(x) (((x) >> 0) & 0xFFFF)
#define RUVD_PKT0_BASE_INDEX_C 0xFFFF0000
#define RUVD_PKT0(index, count) (RUVD_PKT_TYPE_S(0) | RUVD_PKT0_BASE_INDEX_S(index) | RUVD_PKT_COUNT_S(count))
@@ -61,6 +61,7 @@
#define RUVD_CMD_DPB_BUFFER 0x00000001
#define RUVD_CMD_DECODING_TARGET_BUFFER 0x00000002
#define RUVD_CMD_FEEDBACK_BUFFER 0x00000003
+#define RUVD_CMD_SESSION_CONTEXT_BUFFER 0x00000005
#define RUVD_CMD_BITSTREAM_BUFFER 0x00000100
#define RUVD_CMD_ITSCALING_TABLE_BUFFER 0x00000204
#define RUVD_CMD_CONTEXT_BUFFER 0x00000206
@@ -233,6 +234,15 @@ struct ruvd_h265 {
uint8_t highestTid;
uint8_t isNonRef;
+
+ uint8_t p010_mode;
+ uint8_t msb_mode;
+ uint8_t luma_10to8;
+ uint8_t chroma_10to8;
+ uint8_t sclr_luma10to8;
+ uint8_t sclr_chroma10to8;
+
+ uint8_t direct_reflist[2][15];
};
struct ruvd_vc1 {
@@ -385,7 +395,10 @@ struct ruvd_msg {
uint32_t dt_chroma_top_offset;
uint32_t dt_chroma_bottom_offset;
uint32_t dt_surf_tile_config;
- uint32_t dt_reserved[3];
+ uint32_t dt_uv_surf_tile_config;
+ // re-use dt_wa_chroma_top_offset as dt_ext_info for UV pitch in stoney
+ uint32_t dt_wa_chroma_top_offset;
+ uint32_t dt_wa_chroma_bottom_offset;
uint32_t reserved[16];
@@ -409,7 +422,7 @@ struct ruvd_msg {
};
/* driver dependent callback */
-typedef struct radeon_winsys_cs_handle* (*ruvd_set_dtb)
+typedef struct pb_buffer* (*ruvd_set_dtb)
(struct ruvd_msg* msg, struct vl_video_buffer *vb);
/* create an UVD decode */
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_vce.c b/lib/mesa/src/gallium/drivers/radeon/radeon_vce.c
index 7eab974a3..ef93e46c1 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_vce.c
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_vce.c
@@ -49,13 +49,16 @@
#define FW_50_1_2 ((50 << 24) | (1 << 16) | (2 << 8))
#define FW_50_10_2 ((50 << 24) | (10 << 16) | (2 << 8))
#define FW_50_17_3 ((50 << 24) | (17 << 16) | (3 << 8))
+#define FW_52_0_3 ((52 << 24) | (0 << 16) | (3 << 8))
+#define FW_52_4_3 ((52 << 24) | (4 << 16) | (3 << 8))
+#define FW_52_8_3 ((52 << 24) | (8 << 16) | (3 << 8))
/**
* flush commands to the hardware
*/
static void flush(struct rvce_encoder *enc)
{
- enc->ws->cs_flush(enc->cs, RADEON_FLUSH_ASYNC, NULL, 0);
+ enc->ws->cs_flush(enc->cs, RADEON_FLUSH_ASYNC, NULL);
enc->task_info_idx = 0;
enc->bs_idx = 0;
}
@@ -63,7 +66,7 @@ static void flush(struct rvce_encoder *enc)
#if 0
static void dump_feedback(struct rvce_encoder *enc, struct rvid_buffer *fb)
{
- uint32_t *ptr = enc->ws->buffer_map(fb->res->cs_buf, enc->cs, PIPE_TRANSFER_READ_WRITE);
+ uint32_t *ptr = enc->ws->buffer_map(fb->res->buf, enc->cs, PIPE_TRANSFER_READ_WRITE);
unsigned i = 0;
fprintf(stderr, "\n");
fprintf(stderr, "encStatus:\t\t\t%08x\n", ptr[i++]);
@@ -82,7 +85,7 @@ static void dump_feedback(struct rvce_encoder *enc, struct rvid_buffer *fb)
fprintf(stderr, "seiPrivatePackageOffset:\t%08x\n", ptr[i++]);
fprintf(stderr, "seiPrivatePackageSize:\t\t%08x\n", ptr[i++]);
fprintf(stderr, "\n");
- enc->ws->buffer_unmap(fb->res->cs_buf);
+ enc->ws->buffer_unmap(fb->res->buf);
}
#endif
@@ -265,6 +268,7 @@ static void rvce_begin_frame(struct pipe_video_codec *encoder,
enc->pic.quant_b_frames != pic->quant_b_frames;
enc->pic = *pic;
+ get_pic_param(enc, pic);
enc->get_buffer(vid_buf->resources[0], &enc->handle, &enc->luma);
enc->get_buffer(vid_buf->resources[1], NULL, &enc->chroma);
@@ -311,7 +315,7 @@ static void rvce_encode_bitstream(struct pipe_video_codec *encoder,
RVID_ERR("Can't create feedback buffer.\n");
return;
}
- if (!enc->cs->cdw)
+ if (!radeon_emitted(enc->cs, 0))
enc->session(enc);
enc->encode(enc);
enc->feedback(enc);
@@ -345,7 +349,7 @@ static void rvce_get_feedback(struct pipe_video_codec *encoder,
struct rvid_buffer *fb = feedback;
if (size) {
- uint32_t *ptr = enc->ws->buffer_map(fb->res->cs_buf, enc->cs, PIPE_TRANSFER_READ_WRITE);
+ uint32_t *ptr = enc->ws->buffer_map(fb->res->buf, enc->cs, PIPE_TRANSFER_READ_WRITE);
if (ptr[1]) {
*size = ptr[4] - ptr[9];
@@ -353,7 +357,7 @@ static void rvce_get_feedback(struct pipe_video_codec *encoder,
*size = 0;
}
- enc->ws->buffer_unmap(fb->res->cs_buf);
+ enc->ws->buffer_unmap(fb->res->buf);
}
//dump_feedback(enc, fb);
rvid_destroy_buffer(fb);
@@ -403,9 +407,12 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
if (rscreen->info.drm_major == 3)
enc->use_vm = true;
- if ((rscreen->info.drm_major > 2) || (rscreen->info.drm_minor >= 42))
+ if ((rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42) ||
+ rscreen->info.drm_major == 3)
enc->use_vui = true;
- if (rscreen->info.family >= CHIP_TONGA)
+ if (rscreen->info.family >= CHIP_TONGA &&
+ rscreen->info.family != CHIP_STONEY &&
+ rscreen->info.family != CHIP_POLARIS11)
enc->dual_pipe = true;
/* TODO enable B frame with dual instance */
if ((rscreen->info.family >= CHIP_TONGA) &&
@@ -426,7 +433,7 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
enc->screen = context->screen;
enc->ws = ws;
- enc->cs = ws->cs_create(rctx->ctx, RING_VCE, rvce_cs_flush, enc, NULL);
+ enc->cs = ws->cs_create(rctx->ctx, RING_VCE, rvce_cs_flush, enc);
if (!enc->cs) {
RVID_ERR("Can't get command submission context.\n");
goto error;
@@ -448,7 +455,7 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
get_buffer(((struct vl_video_buffer *)tmp_buf)->resources[0], NULL, &tmp_surf);
cpb_size = align(tmp_surf->level[0].pitch_bytes, 128);
- cpb_size = cpb_size * align(tmp_surf->npix_y, 16);
+ cpb_size = cpb_size * align(tmp_surf->npix_y, 32);
cpb_size = cpb_size * 3 / 2;
cpb_size = cpb_size * enc->cpb_num;
if (enc->dual_pipe)
@@ -469,6 +476,7 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
switch (rscreen->info.vce_fw_version) {
case FW_40_2_2:
radeon_vce_40_2_2_init(enc);
+ get_pic_param = radeon_vce_40_2_2_get_param;
break;
case FW_50_0_1:
@@ -476,6 +484,14 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
case FW_50_10_2:
case FW_50_17_3:
radeon_vce_50_init(enc);
+ get_pic_param = radeon_vce_50_get_param;
+ break;
+
+ case FW_52_0_3:
+ case FW_52_4_3:
+ case FW_52_8_3:
+ radeon_vce_52_init(enc);
+ get_pic_param = radeon_vce_52_get_param;
break;
default:
@@ -500,23 +516,32 @@ error:
*/
bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen)
{
- return rscreen->info.vce_fw_version == FW_40_2_2 ||
- rscreen->info.vce_fw_version == FW_50_0_1 ||
- rscreen->info.vce_fw_version == FW_50_1_2 ||
- rscreen->info.vce_fw_version == FW_50_10_2 ||
- rscreen->info.vce_fw_version == FW_50_17_3;
+ switch (rscreen->info.vce_fw_version) {
+ case FW_40_2_2:
+ case FW_50_0_1:
+ case FW_50_1_2:
+ case FW_50_10_2:
+ case FW_50_17_3:
+ case FW_52_0_3:
+ case FW_52_4_3:
+ case FW_52_8_3:
+ return true;
+ default:
+ return false;
+ }
}
/**
* Add the buffer as relocation to the current command submission
*/
-void rvce_add_buffer(struct rvce_encoder *enc, struct radeon_winsys_cs_handle *buf,
+void rvce_add_buffer(struct rvce_encoder *enc, struct pb_buffer *buf,
enum radeon_bo_usage usage, enum radeon_bo_domain domain,
signed offset)
{
int reloc_idx;
- reloc_idx = enc->ws->cs_add_reloc(enc->cs, buf, usage, domain, RADEON_PRIO_MIN);
+ reloc_idx = enc->ws->cs_add_buffer(enc->cs, buf, usage | RADEON_USAGE_SYNCHRONIZED,
+ domain, RADEON_PRIO_VCE);
if (enc->use_vm) {
uint64_t addr;
addr = enc->ws->buffer_get_virtual_address(buf);
@@ -524,6 +549,7 @@ void rvce_add_buffer(struct rvce_encoder *enc, struct radeon_winsys_cs_handle *b
RVCE_CS(addr >> 32);
RVCE_CS(addr);
} else {
+ offset += enc->ws->buffer_get_reloc_offset(buf);
RVCE_CS(reloc_idx * 4);
RVCE_CS(offset);
}
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_40_2_2.c b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
index c00565904..fe15ded39 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
@@ -59,11 +59,11 @@ static void task_info(struct rvce_encoder *enc, uint32_t op,
RVCE_BEGIN(0x00000002); // task info
if (op == 0x3) {
if (enc->task_info_idx) {
- uint32_t offs = enc->cs->cdw - enc->task_info_idx + 3;
+ uint32_t offs = enc->cs->current.cdw - enc->task_info_idx + 3;
// Update offsetOfNextTaskInfo
- enc->cs->buf[enc->task_info_idx] = offs;
+ enc->cs->current.buf[enc->task_info_idx] = offs;
}
- enc->task_info_idx = enc->cs->cdw;
+ enc->task_info_idx = enc->cs->current.cdw;
}
RVCE_CS(0xffffffff); // offsetOfNextTaskInfo
RVCE_CS(op); // taskOperation
@@ -77,7 +77,7 @@ static void task_info(struct rvce_encoder *enc, uint32_t op,
static void feedback(struct rvce_encoder *enc)
{
RVCE_BEGIN(0x05000005); // feedback buffer
- RVCE_WRITE(enc->fb->res->cs_buf, enc->fb->res->domains, 0x0); // feedbackRingAddressHi/Lo
+ RVCE_WRITE(enc->fb->res->buf, enc->fb->res->domains, 0x0); // feedbackRingAddressHi/Lo
RVCE_CS(0x00000001); // feedbackRingSize
RVCE_END();
}
@@ -303,7 +303,7 @@ static void encode(struct rvce_encoder *enc)
enc->task_info(enc, 0x00000003, 0, 0, 0);
RVCE_BEGIN(0x05000001); // context buffer
- RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains, 0x0); // encodeContextAddressHi/Lo
+ RVCE_READWRITE(enc->cpb.res->buf, enc->cpb.res->domains, 0x0); // encodeContextAddressHi/Lo
RVCE_END();
RVCE_BEGIN(0x05000004); // video bitstream buffer
@@ -431,6 +431,10 @@ static void destroy(struct rvce_encoder *enc)
RVCE_END();
}
+void radeon_vce_40_2_2_get_param(struct rvce_encoder *enc, struct pipe_h264_enc_picture_desc *pic)
+{
+}
+
void radeon_vce_40_2_2_init(struct rvce_encoder *enc)
{
enc->session = session;
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_50.c b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_50.c
index afdab18c0..262e13ba9 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_50.c
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_50.c
@@ -95,7 +95,7 @@ static void encode(struct rvce_encoder *enc)
enc->task_info(enc, 0x00000003, dep, 0, bs_idx);
RVCE_BEGIN(0x05000001); // context buffer
- RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains, 0); // encodeContextAddressHi/Lo
+ RVCE_READWRITE(enc->cpb.res->buf, enc->cpb.res->domains, 0); // encodeContextAddressHi/Lo
RVCE_END();
bs_offset = -(signed)(bs_idx * enc->bs_size);
@@ -233,6 +233,10 @@ static void encode(struct rvce_encoder *enc)
RVCE_END();
}
+void radeon_vce_50_get_param(struct rvce_encoder *enc, struct pipe_h264_enc_picture_desc *pic)
+{
+}
+
void radeon_vce_50_init(struct rvce_encoder *enc)
{
radeon_vce_40_2_2_init(enc);
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_52.c b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_52.c
index 3894eea31..5db01fe52 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_vce_52.c
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_vce_52.c
@@ -40,27 +40,152 @@
static const unsigned profiles[7] = { 66, 77, 88, 100, 110, 122, 244 };
+static void get_rate_control_param(struct rvce_encoder *enc, struct pipe_h264_enc_picture_desc *pic)
+{
+ enc->enc_pic.rc.rc_method = pic->rate_ctrl.rate_ctrl_method;
+ enc->enc_pic.rc.target_bitrate = pic->rate_ctrl.target_bitrate;
+ enc->enc_pic.rc.peak_bitrate = pic->rate_ctrl.peak_bitrate;
+ enc->enc_pic.rc.quant_i_frames = pic->quant_i_frames;
+ enc->enc_pic.rc.quant_p_frames = pic->quant_p_frames;
+ enc->enc_pic.rc.quant_b_frames = pic->quant_b_frames;
+ enc->enc_pic.rc.gop_size = pic->gop_size;
+ enc->enc_pic.rc.frame_rate_num = pic->rate_ctrl.frame_rate_num;
+ enc->enc_pic.rc.frame_rate_den = pic->rate_ctrl.frame_rate_den;
+ enc->enc_pic.rc.max_qp = 51;
+ enc->enc_pic.rc.vbv_buffer_size = pic->rate_ctrl.vbv_buffer_size;
+ enc->enc_pic.rc.vbv_buf_lv = pic->rate_ctrl.vbv_buf_lv;
+ enc->enc_pic.rc.fill_data_enable = pic->rate_ctrl.fill_data_enable;
+ enc->enc_pic.rc.enforce_hrd = pic->rate_ctrl.enforce_hrd;
+ enc->enc_pic.rc.target_bits_picture = pic->rate_ctrl.target_bits_picture;
+ enc->enc_pic.rc.peak_bits_picture_integer = pic->rate_ctrl.peak_bits_picture_integer;
+ enc->enc_pic.rc.peak_bits_picture_fraction = pic->rate_ctrl.peak_bits_picture_fraction;
+}
+
+static void get_motion_estimation_param(struct rvce_encoder *enc, struct pipe_h264_enc_picture_desc *pic)
+{
+ enc->enc_pic.me.motion_est_quarter_pixel = pic->motion_est.motion_est_quarter_pixel;
+ enc->enc_pic.me.enc_disable_sub_mode = pic->motion_est.enc_disable_sub_mode;
+ enc->enc_pic.me.lsmvert = pic->motion_est.lsmvert;
+ enc->enc_pic.me.enc_en_ime_overw_dis_subm = pic->motion_est.enc_en_ime_overw_dis_subm;
+ enc->enc_pic.me.enc_ime_overw_dis_subm_no = pic->motion_est.enc_ime_overw_dis_subm_no;
+ enc->enc_pic.me.enc_ime2_search_range_x = pic->motion_est.enc_ime2_search_range_x;
+ enc->enc_pic.me.enc_ime2_search_range_y = pic->motion_est.enc_ime2_search_range_y;
+ enc->enc_pic.me.enc_ime_decimation_search = 0x00000001;
+ enc->enc_pic.me.motion_est_half_pixel = 0x00000001;
+ enc->enc_pic.me.enc_search_range_x = 0x00000010;
+ enc->enc_pic.me.enc_search_range_y = 0x00000010;
+ enc->enc_pic.me.enc_search1_range_x = 0x00000010;
+ enc->enc_pic.me.enc_search1_range_y = 0x00000010;
+}
+
+static void get_pic_control_param(struct rvce_encoder *enc, struct pipe_h264_enc_picture_desc *pic)
+{
+ unsigned encNumMBsPerSlice;
+ encNumMBsPerSlice = align(enc->base.width, 16) / 16;
+ encNumMBsPerSlice *= align(enc->base.height, 16) / 16;
+ enc->enc_pic.pc.enc_crop_right_offset = (align(enc->base.width, 16) - enc->base.width) >> 1;
+ enc->enc_pic.pc.enc_crop_bottom_offset = (align(enc->base.height, 16) - enc->base.height) >> 1;
+ enc->enc_pic.pc.enc_num_mbs_per_slice = encNumMBsPerSlice;
+ enc->enc_pic.pc.enc_b_pic_pattern = MAX2(enc->base.max_references, 1) - 1;
+ enc->enc_pic.pc.enc_number_of_reference_frames = MIN2(enc->base.max_references, 2);
+ enc->enc_pic.pc.enc_max_num_ref_frames = enc->base.max_references + 1;
+ enc->enc_pic.pc.enc_num_default_active_ref_l0 = 0x00000001;
+ enc->enc_pic.pc.enc_num_default_active_ref_l1 = 0x00000001;
+ enc->enc_pic.pc.enc_cabac_enable = pic->pic_ctrl.enc_cabac_enable;
+ enc->enc_pic.pc.enc_constraint_set_flags = pic->pic_ctrl.enc_constraint_set_flags;
+ enc->enc_pic.pc.enc_num_default_active_ref_l0 = 0x00000001;
+ enc->enc_pic.pc.enc_num_default_active_ref_l1 = 0x00000001;
+}
+
+static void get_task_info_param(struct rvce_encoder *enc)
+{
+ enc->enc_pic.ti.offset_of_next_task_info = 0xffffffff;
+}
+
+static void get_feedback_buffer_param(struct rvce_encoder *enc)
+{
+ enc->enc_pic.fb.feedback_ring_size = 0x00000001;
+}
+
+static void get_config_ext_param(struct rvce_encoder *enc)
+{
+ enc->enc_pic.ce.enc_enable_perf_logging = 0x00000003;
+}
+
+static void get_vui_param(struct rvce_encoder *enc, struct pipe_h264_enc_picture_desc *pic)
+{
+ enc->enc_pic.enable_vui = pic->enable_vui;
+ enc->enc_pic.vui.video_format = 0x00000005;
+ enc->enc_pic.vui.color_prim = 0x00000002;
+ enc->enc_pic.vui.transfer_char = 0x00000002;
+ enc->enc_pic.vui.matrix_coef = 0x00000002;
+ enc->enc_pic.vui.timing_info_present_flag = 0x00000001;
+ enc->enc_pic.vui.num_units_in_tick = pic->rate_ctrl.frame_rate_den;
+ enc->enc_pic.vui.time_scale = pic->rate_ctrl.frame_rate_num * 2;
+ enc->enc_pic.vui.fixed_frame_rate_flag = 0x00000001;
+ enc->enc_pic.vui.bit_rate_scale = 0x00000004;
+ enc->enc_pic.vui.cpb_size_scale = 0x00000006;
+ enc->enc_pic.vui.initial_cpb_removal_delay_length_minus1 = 0x00000017;
+ enc->enc_pic.vui.cpb_removal_delay_length_minus1 = 0x00000017;
+ enc->enc_pic.vui.dpb_output_delay_length_minus1 = 0x00000017;
+ enc->enc_pic.vui.time_offset_length = 0x00000018;
+ enc->enc_pic.vui.motion_vectors_over_pic_boundaries_flag = 0x00000001;
+ enc->enc_pic.vui.max_bytes_per_pic_denom = 0x00000002;
+ enc->enc_pic.vui.max_bits_per_mb_denom = 0x00000001;
+ enc->enc_pic.vui.log2_max_mv_length_hori = 0x00000010;
+ enc->enc_pic.vui.log2_max_mv_length_vert = 0x00000010;
+ enc->enc_pic.vui.num_reorder_frames = 0x00000003;
+ enc->enc_pic.vui.max_dec_frame_buffering = 0x00000003;
+}
+
+void radeon_vce_52_get_param(struct rvce_encoder *enc, struct pipe_h264_enc_picture_desc *pic)
+{
+ get_rate_control_param(enc, pic);
+ get_motion_estimation_param(enc, pic);
+ get_pic_control_param(enc, pic);
+ get_task_info_param(enc);
+ get_feedback_buffer_param(enc);
+ get_vui_param(enc, pic);
+ get_config_ext_param(enc);
+
+ enc->enc_pic.picture_type = pic->picture_type;
+ enc->enc_pic.frame_num = pic->frame_num;
+ enc->enc_pic.frame_num_cnt = pic->frame_num_cnt;
+ enc->enc_pic.p_remain = pic->p_remain;
+ enc->enc_pic.i_remain = pic->i_remain;
+ enc->enc_pic.gop_cnt = pic->gop_cnt;
+ enc->enc_pic.pic_order_cnt = pic->pic_order_cnt;
+ enc->enc_pic.ref_idx_l0 = pic->ref_idx_l0;
+ enc->enc_pic.ref_idx_l1 = pic->ref_idx_l1;
+ enc->enc_pic.not_referenced = pic->not_referenced;
+ if (enc->dual_inst)
+ enc->enc_pic.addrmode_arraymode_disrdo_distwoinstants = 0x00000201;
+ else
+ enc->enc_pic.addrmode_arraymode_disrdo_distwoinstants = 0x01000201;
+ enc->enc_pic.is_idr = pic->is_idr;
+}
+
static void create(struct rvce_encoder *enc)
{
enc->task_info(enc, 0x00000000, 0, 0, 0);
RVCE_BEGIN(0x01000001); // create cmd
- RVCE_CS(0x00000000); // encUseCircularBuffer
+ RVCE_CS(enc->enc_pic.ec.enc_use_circular_buffer);
RVCE_CS(profiles[enc->base.profile -
PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE]); // encProfile
RVCE_CS(enc->base.level); // encLevel
- RVCE_CS(0x00000000); // encPicStructRestriction
+ RVCE_CS(enc->enc_pic.ec.enc_pic_struct_restriction);
RVCE_CS(enc->base.width); // encImageWidth
RVCE_CS(enc->base.height); // encImageHeight
RVCE_CS(enc->luma->level[0].pitch_bytes); // encRefPicLumaPitch
RVCE_CS(enc->chroma->level[0].pitch_bytes); // encRefPicChromaPitch
RVCE_CS(align(enc->luma->npix_y, 16) / 8); // encRefYHeightInQw
- RVCE_CS(0x00000000); // encRefPic(Addr|Array)Mode, encPicStructRestriction, disableRDO
+ RVCE_CS(enc->enc_pic.addrmode_arraymode_disrdo_distwoinstants);
- RVCE_CS(0x00000000); // encPreEncodeContextBufferOffset
- RVCE_CS(0x00000000); // encPreEncodeInputLumaBufferOffset
- RVCE_CS(0x00000000); // encPreEncodeInputChromaBufferOffs
- RVCE_CS(0x00000000); // encPreEncodeMode|ChromaFlag|VBAQMode|SceneChangeSensitivity
+ RVCE_CS(enc->enc_pic.ec.enc_pre_encode_context_buffer_offset);
+ RVCE_CS(enc->enc_pic.ec.enc_pre_encode_input_luma_buffer_offset);
+ RVCE_CS(enc->enc_pic.ec.enc_pre_encode_input_chroma_buffer_offset);
+ RVCE_CS(enc->enc_pic.ec.enc_pre_encode_mode_chromaflag_vbaqmode_scenechangesensitivity);
RVCE_END();
}
@@ -73,7 +198,7 @@ static void encode(struct rvce_encoder *enc)
if (enc->dual_inst) {
if (bs_idx == 0)
dep = 1;
- else if (enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR)
+ else if (enc->enc_pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR)
dep = 0;
else
dep = 2;
@@ -107,13 +232,13 @@ static void encode(struct rvce_encoder *enc)
}
RVCE_BEGIN(0x03000001); // encode
- RVCE_CS(enc->pic.frame_num ? 0x0 : 0x11); // insertHeaders
- RVCE_CS(0x00000000); // pictureStructure
+ RVCE_CS(enc->enc_pic.frame_num ? 0x0 : 0x11); // insertHeaders
+ RVCE_CS(enc->enc_pic.eo.picture_structure);
RVCE_CS(enc->bs_size); // allowedMaxBitstreamSize
- RVCE_CS(0x00000000); // forceRefreshMap
- RVCE_CS(0x00000000); // insertAUD
- RVCE_CS(0x00000000); // endOfSequence
- RVCE_CS(0x00000000); // endOfStream
+ RVCE_CS(enc->enc_pic.eo.force_refresh_map);
+ RVCE_CS(enc->enc_pic.eo.insert_aud);
+ RVCE_CS(enc->enc_pic.eo.end_of_sequence);
+ RVCE_CS(enc->enc_pic.eo.end_of_stream);
RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
enc->luma->level[0].offset); // inputPictureLumaAddressHi/Lo
RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM,
@@ -122,121 +247,396 @@ static void encode(struct rvce_encoder *enc)
RVCE_CS(enc->luma->level[0].pitch_bytes); // encInputPicLumaPitch
RVCE_CS(enc->chroma->level[0].pitch_bytes); // encInputPicChromaPitch
if (enc->dual_pipe)
- RVCE_CS(0x00000000); // encInputPic(Addr|Array)Mode,encDisable(TwoPipeMode|MBOffloading)
+ enc->enc_pic.eo.enc_input_pic_addr_array_disable2pipe_disablemboffload = 0x00000000;
else
- RVCE_CS(0x00010000); // encInputPic(Addr|Array)Mode,encDisable(TwoPipeMode|MBOffloading)
- RVCE_CS(0x00000000); // encInputPicTileConfig
- RVCE_CS(enc->pic.picture_type); // encPicType
- RVCE_CS(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR); // encIdrFlag
- RVCE_CS(0x00000000); // encIdrPicId
- RVCE_CS(0x00000000); // encMGSKeyPic
- RVCE_CS(!enc->pic.not_referenced); // encReferenceFlag
- RVCE_CS(0x00000000); // encTemporalLayerIndex
- RVCE_CS(0x00000000); // num_ref_idx_active_override_flag
- RVCE_CS(0x00000000); // num_ref_idx_l0_active_minus1
- RVCE_CS(0x00000000); // num_ref_idx_l1_active_minus1
-
- i = enc->pic.frame_num - enc->pic.ref_idx_l0;
- if (i > 1 && enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_P) {
- RVCE_CS(0x00000001); // encRefListModificationOp
- RVCE_CS(i - 1); // encRefListModificationNum
+ enc->enc_pic.eo.enc_input_pic_addr_array_disable2pipe_disablemboffload = 0x00010000;
+ RVCE_CS(enc->enc_pic.eo.enc_input_pic_addr_array_disable2pipe_disablemboffload);
+ RVCE_CS(enc->enc_pic.eo.enc_input_pic_tile_config);
+ RVCE_CS(enc->enc_pic.picture_type); // encPicType
+ RVCE_CS(enc->enc_pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR); // encIdrFlag
+ if ((enc->enc_pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR) && (enc->enc_pic.eo.enc_idr_pic_id !=0))
+ enc->enc_pic.eo.enc_idr_pic_id = enc->enc_pic.idr_pic_id - 1;
+ else
+ enc->enc_pic.eo.enc_idr_pic_id = 0x00000000;
+ RVCE_CS(enc->enc_pic.eo.enc_idr_pic_id);
+ RVCE_CS(enc->enc_pic.eo.enc_mgs_key_pic);
+ RVCE_CS(!enc->enc_pic.not_referenced);
+ RVCE_CS(enc->enc_pic.eo.enc_temporal_layer_index);
+ RVCE_CS(enc->enc_pic.eo.num_ref_idx_active_override_flag);
+ RVCE_CS(enc->enc_pic.eo.num_ref_idx_l0_active_minus1);
+ RVCE_CS(enc->enc_pic.eo.num_ref_idx_l1_active_minus1);
+
+ i = enc->enc_pic.frame_num - enc->enc_pic.ref_idx_l0;
+ if (i > 1 && enc->enc_pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_P) {
+ enc->enc_pic.eo.enc_ref_list_modification_op = 0x00000001;
+ enc->enc_pic.eo.enc_ref_list_modification_num = i - 1;
+ RVCE_CS(enc->enc_pic.eo.enc_ref_list_modification_op);
+ RVCE_CS(enc->enc_pic.eo.enc_ref_list_modification_num);
} else {
- RVCE_CS(0x00000000); // encRefListModificationOp
- RVCE_CS(0x00000000); // encRefListModificationNum
+ enc->enc_pic.eo.enc_ref_list_modification_op = 0x00000000;
+ enc->enc_pic.eo.enc_ref_list_modification_num = 0x00000000;
+ RVCE_CS(enc->enc_pic.eo.enc_ref_list_modification_op);
+ RVCE_CS(enc->enc_pic.eo.enc_ref_list_modification_num);
}
for (i = 0; i < 3; ++i) {
- RVCE_CS(0x00000000); // encRefListModificationOp
- RVCE_CS(0x00000000); // encRefListModificationNum
+ enc->enc_pic.eo.enc_ref_list_modification_op = 0x00000000;
+ enc->enc_pic.eo.enc_ref_list_modification_num = 0x00000000;
+ RVCE_CS(enc->enc_pic.eo.enc_ref_list_modification_op);
+ RVCE_CS(enc->enc_pic.eo.enc_ref_list_modification_num);
}
for (i = 0; i < 4; ++i) {
- RVCE_CS(0x00000000); // encDecodedPictureMarkingOp
- RVCE_CS(0x00000000); // encDecodedPictureMarkingNum
- RVCE_CS(0x00000000); // encDecodedPictureMarkingIdx
- RVCE_CS(0x00000000); // encDecodedRefBasePictureMarkingOp
- RVCE_CS(0x00000000); // encDecodedRefBasePictureMarkingNum
+ RVCE_CS(enc->enc_pic.eo.enc_decoded_picture_marking_op);
+ RVCE_CS(enc->enc_pic.eo.enc_decoded_picture_marking_num);
+ RVCE_CS(enc->enc_pic.eo.enc_decoded_picture_marking_idx);
+ RVCE_CS(enc->enc_pic.eo.enc_decoded_ref_base_picture_marking_op);
+ RVCE_CS(enc->enc_pic.eo.enc_decoded_ref_base_picture_marking_num);
}
// encReferencePictureL0[0]
RVCE_CS(0x00000000); // pictureStructure
- if(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_P ||
- enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) {
+ if(enc->enc_pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_P ||
+ enc->enc_pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) {
struct rvce_cpb_slot *l0 = l0_slot(enc);
rvce_frame_offset(enc, l0, &luma_offset, &chroma_offset);
- RVCE_CS(l0->picture_type); // encPicType
- RVCE_CS(l0->frame_num); // frameNumber
- RVCE_CS(l0->pic_order_cnt); // pictureOrderCount
- RVCE_CS(luma_offset); // lumaOffset
- RVCE_CS(chroma_offset); // chromaOffset
+ RVCE_CS(l0->picture_type);
+ RVCE_CS(l0->frame_num);
+ RVCE_CS(l0->pic_order_cnt);
+ RVCE_CS(luma_offset);
+ RVCE_CS(chroma_offset);
} else {
- RVCE_CS(0x00000000); // encPicType
- RVCE_CS(0x00000000); // frameNumber
- RVCE_CS(0x00000000); // pictureOrderCount
- RVCE_CS(0xffffffff); // lumaOffset
- RVCE_CS(0xffffffff); // chromaOffset
+ enc->enc_pic.eo.l0_enc_pic_type = 0x00000000;
+ enc->enc_pic.eo.l0_frame_number = 0x00000000;
+ enc->enc_pic.eo.l0_picture_order_count = 0x00000000;
+ enc->enc_pic.eo.l0_luma_offset = 0xffffffff;
+ enc->enc_pic.eo.l0_chroma_offset = 0xffffffff;
+ RVCE_CS(enc->enc_pic.eo.l0_enc_pic_type);
+ RVCE_CS(enc->enc_pic.eo.l0_frame_number);
+ RVCE_CS(enc->enc_pic.eo.l0_picture_order_count);
+ RVCE_CS(enc->enc_pic.eo.l0_luma_offset);
+ RVCE_CS(enc->enc_pic.eo.l0_chroma_offset);
}
// encReferencePictureL0[1]
- RVCE_CS(0x00000000); // pictureStructure
- RVCE_CS(0x00000000); // encPicType
- RVCE_CS(0x00000000); // frameNumber
- RVCE_CS(0x00000000); // pictureOrderCount
- RVCE_CS(0xffffffff); // lumaOffset
- RVCE_CS(0xffffffff); // chromaOffset
+ enc->enc_pic.eo.l0_picture_structure = 0x00000000;
+ enc->enc_pic.eo.l0_enc_pic_type = 0x00000000;
+ enc->enc_pic.eo.l0_frame_number = 0x00000000;
+ enc->enc_pic.eo.l0_picture_order_count = 0x00000000;
+ enc->enc_pic.eo.l0_luma_offset = 0xffffffff;
+ enc->enc_pic.eo.l0_chroma_offset = 0xffffffff;
+ RVCE_CS(enc->enc_pic.eo.l0_picture_structure);
+ RVCE_CS(enc->enc_pic.eo.l0_enc_pic_type);
+ RVCE_CS(enc->enc_pic.eo.l0_frame_number);
+ RVCE_CS(enc->enc_pic.eo.l0_picture_order_count);
+ RVCE_CS(enc->enc_pic.eo.l0_luma_offset);
+ RVCE_CS(enc->enc_pic.eo.l0_chroma_offset);
// encReferencePictureL1[0]
RVCE_CS(0x00000000); // pictureStructure
- if(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) {
+ if(enc->enc_pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) {
struct rvce_cpb_slot *l1 = l1_slot(enc);
rvce_frame_offset(enc, l1, &luma_offset, &chroma_offset);
- RVCE_CS(l1->picture_type); // encPicType
- RVCE_CS(l1->frame_num); // frameNumber
- RVCE_CS(l1->pic_order_cnt); // pictureOrderCount
- RVCE_CS(luma_offset); // lumaOffset
- RVCE_CS(chroma_offset); // chromaOffset
+ RVCE_CS(l1->picture_type);
+ RVCE_CS(l1->frame_num);
+ RVCE_CS(l1->pic_order_cnt);
+ RVCE_CS(luma_offset);
+ RVCE_CS(chroma_offset);
} else {
- RVCE_CS(0x00000000); // encPicType
- RVCE_CS(0x00000000); // frameNumber
- RVCE_CS(0x00000000); // pictureOrderCount
- RVCE_CS(0xffffffff); // lumaOffset
- RVCE_CS(0xffffffff); // chromaOffset
+ enc->enc_pic.eo.l1_enc_pic_type = 0x00000000;
+ enc->enc_pic.eo.l1_frame_number = 0x00000000;
+ enc->enc_pic.eo.l1_picture_order_count = 0x00000000;
+ enc->enc_pic.eo.l1_luma_offset = 0xffffffff;
+ enc->enc_pic.eo.l1_chroma_offset = 0xffffffff;
+ RVCE_CS(enc->enc_pic.eo.l1_enc_pic_type);
+ RVCE_CS(enc->enc_pic.eo.l1_frame_number);
+ RVCE_CS(enc->enc_pic.eo.l1_picture_order_count);
+ RVCE_CS(enc->enc_pic.eo.l1_luma_offset);
+ RVCE_CS(enc->enc_pic.eo.l1_chroma_offset);
}
rvce_frame_offset(enc, current_slot(enc), &luma_offset, &chroma_offset);
- RVCE_CS(luma_offset); // encReconstructedLumaOffset
- RVCE_CS(chroma_offset); // encReconstructedChromaOffset
- RVCE_CS(0x00000000); // encColocBufferOffset
- RVCE_CS(0x00000000); // encReconstructedRefBasePictureLumaOffset
- RVCE_CS(0x00000000); // encReconstructedRefBasePictureChromaOffset
- RVCE_CS(0x00000000); // encReferenceRefBasePictureLumaOffset
- RVCE_CS(0x00000000); // encReferenceRefBasePictureChromaOffset
- RVCE_CS(0x00000000); // pictureCount
- RVCE_CS(enc->pic.frame_num); // frameNumber
- RVCE_CS(enc->pic.pic_order_cnt); // pictureOrderCount
- RVCE_CS(0x00000000); // numIPicRemainInRCGOP
- RVCE_CS(0x00000000); // numPPicRemainInRCGOP
- RVCE_CS(0x00000000); // numBPicRemainInRCGOP
- RVCE_CS(0x00000000); // numIRPicRemainInRCGOP
- RVCE_CS(0x00000000); // enableIntraRefresh
-
- RVCE_CS(0x00000000); // aq_variance_en
- RVCE_CS(0x00000000); // aq_block_size
- RVCE_CS(0x00000000); // aq_mb_variance_sel
- RVCE_CS(0x00000000); // aq_frame_variance_sel
- RVCE_CS(0x00000000); // aq_param_a
- RVCE_CS(0x00000000); // aq_param_b
- RVCE_CS(0x00000000); // aq_param_c
- RVCE_CS(0x00000000); // aq_param_d
- RVCE_CS(0x00000000); // aq_param_e
-
- RVCE_CS(0x00000000); // contextInSFB
+ RVCE_CS(luma_offset);
+ RVCE_CS(chroma_offset);
+ RVCE_CS(enc->enc_pic.eo.enc_coloc_buffer_offset);
+ RVCE_CS(enc->enc_pic.eo.enc_reconstructed_ref_base_picture_luma_offset);
+ RVCE_CS(enc->enc_pic.eo.enc_reconstructed_ref_base_picture_chroma_offset);
+ RVCE_CS(enc->enc_pic.eo.enc_reference_ref_base_picture_luma_offset);
+ RVCE_CS(enc->enc_pic.eo.enc_reference_ref_base_picture_chroma_offset);
+ RVCE_CS(enc->enc_pic.frame_num_cnt-1);
+ RVCE_CS(enc->enc_pic.frame_num);
+ RVCE_CS(enc->enc_pic.pic_order_cnt);
+ RVCE_CS(enc->enc_pic.i_remain);
+ RVCE_CS(enc->enc_pic.p_remain);
+ RVCE_CS(enc->enc_pic.eo.num_b_pic_remain_in_rcgop);
+ RVCE_CS(enc->enc_pic.eo.num_ir_pic_remain_in_rcgop);
+ RVCE_CS(enc->enc_pic.eo.enable_intra_refresh);
+
+ RVCE_CS(enc->enc_pic.eo.aq_variance_en);
+ RVCE_CS(enc->enc_pic.eo.aq_block_size);
+ RVCE_CS(enc->enc_pic.eo.aq_mb_variance_sel);
+ RVCE_CS(enc->enc_pic.eo.aq_frame_variance_sel);
+ RVCE_CS(enc->enc_pic.eo.aq_param_a);
+ RVCE_CS(enc->enc_pic.eo.aq_param_b);
+ RVCE_CS(enc->enc_pic.eo.aq_param_c);
+ RVCE_CS(enc->enc_pic.eo.aq_param_d);
+ RVCE_CS(enc->enc_pic.eo.aq_param_e);
+
+ RVCE_CS(enc->enc_pic.eo.context_in_sfb);
RVCE_END();
}
-void radeon_vce_52_init(struct rvce_encoder *enc)
+static void rate_control(struct rvce_encoder *enc)
+{
+ RVCE_BEGIN(0x04000005); // rate control
+ RVCE_CS(enc->enc_pic.rc.rc_method);
+ RVCE_CS(enc->enc_pic.rc.target_bitrate);
+ RVCE_CS(enc->enc_pic.rc.peak_bitrate);
+ RVCE_CS(enc->enc_pic.rc.frame_rate_num);
+ RVCE_CS(enc->enc_pic.rc.gop_size);
+ RVCE_CS(enc->enc_pic.rc.quant_i_frames);
+ RVCE_CS(enc->enc_pic.rc.quant_p_frames);
+ RVCE_CS(enc->enc_pic.rc.quant_b_frames);
+ RVCE_CS(enc->enc_pic.rc.vbv_buffer_size);
+ RVCE_CS(enc->enc_pic.rc.frame_rate_den);
+ RVCE_CS(enc->enc_pic.rc.vbv_buf_lv);
+ RVCE_CS(enc->enc_pic.rc.max_au_size);
+ RVCE_CS(enc->enc_pic.rc.qp_initial_mode);
+ RVCE_CS(enc->enc_pic.rc.target_bits_picture);
+ RVCE_CS(enc->enc_pic.rc.peak_bits_picture_integer);
+ RVCE_CS(enc->enc_pic.rc.peak_bits_picture_fraction);
+ RVCE_CS(enc->enc_pic.rc.min_qp);
+ RVCE_CS(enc->enc_pic.rc.max_qp);
+ RVCE_CS(enc->enc_pic.rc.skip_frame_enable);
+ RVCE_CS(enc->enc_pic.rc.fill_data_enable);
+ RVCE_CS(enc->enc_pic.rc.enforce_hrd);
+ RVCE_CS(enc->enc_pic.rc.b_pics_delta_qp);
+ RVCE_CS(enc->enc_pic.rc.ref_b_pics_delta_qp);
+ RVCE_CS(enc->enc_pic.rc.rc_reinit_disable);
+ RVCE_CS(enc->enc_pic.rc.enc_lcvbr_init_qp_flag);
+ RVCE_CS(enc->enc_pic.rc.lcvbrsatd_based_nonlinear_bit_budget_flag);
+ RVCE_END();
+}
+
+static void config(struct rvce_encoder *enc)
{
- radeon_vce_50_init(enc);
+ enc->task_info(enc, 0x00000002, 0, 0xffffffff, 0);
+ enc->rate_control(enc);
+ enc->config_extension(enc);
+ enc->motion_estimation(enc);
+ enc->rdo(enc);
+ if (enc->use_vui)
+ enc->vui(enc);
+ enc->pic_control(enc);
+}
+
+static void config_extension(struct rvce_encoder *enc)
+{
+ RVCE_BEGIN(0x04000001); // config extension
+ RVCE_CS(enc->enc_pic.ce.enc_enable_perf_logging);
+ RVCE_END();
+}
+static void destroy(struct rvce_encoder *enc)
+{
+ enc->task_info(enc, 0x00000001, 0, 0, 0);
+
+ RVCE_BEGIN(0x02000001); // destroy
+ RVCE_END();
+}
+
+static void feedback(struct rvce_encoder *enc)
+{
+ RVCE_BEGIN(0x05000005); // feedback buffer
+ RVCE_WRITE(enc->fb->res->buf, enc->fb->res->domains, 0x0); // feedbackRingAddressHi/Lo
+ RVCE_CS(enc->enc_pic.fb.feedback_ring_size);
+ RVCE_END();
+}
+
+static void motion_estimation(struct rvce_encoder *enc)
+{
+ RVCE_BEGIN(0x04000007); // motion estimation
+ RVCE_CS(enc->enc_pic.me.enc_ime_decimation_search);
+ RVCE_CS(enc->enc_pic.me.motion_est_half_pixel);
+ RVCE_CS(enc->enc_pic.me.motion_est_quarter_pixel);
+ RVCE_CS(enc->enc_pic.me.disable_favor_pmv_point);
+ RVCE_CS(enc->enc_pic.me.force_zero_point_center);
+ RVCE_CS(enc->enc_pic.me.lsmvert);
+ RVCE_CS(enc->enc_pic.me.enc_search_range_x);
+ RVCE_CS(enc->enc_pic.me.enc_search_range_y);
+ RVCE_CS(enc->enc_pic.me.enc_search1_range_x);
+ RVCE_CS(enc->enc_pic.me.enc_search1_range_y);
+ RVCE_CS(enc->enc_pic.me.disable_16x16_frame1);
+ RVCE_CS(enc->enc_pic.me.disable_satd);
+ RVCE_CS(enc->enc_pic.me.enable_amd);
+ RVCE_CS(enc->enc_pic.me.enc_disable_sub_mode);
+ RVCE_CS(enc->enc_pic.me.enc_ime_skip_x);
+ RVCE_CS(enc->enc_pic.me.enc_ime_skip_y);
+ RVCE_CS(enc->enc_pic.me.enc_en_ime_overw_dis_subm);
+ RVCE_CS(enc->enc_pic.me.enc_ime_overw_dis_subm_no);
+ RVCE_CS(enc->enc_pic.me.enc_ime2_search_range_x);
+ RVCE_CS(enc->enc_pic.me.enc_ime2_search_range_y);
+ RVCE_CS(enc->enc_pic.me.parallel_mode_speedup_enable);
+ RVCE_CS(enc->enc_pic.me.fme0_enc_disable_sub_mode);
+ RVCE_CS(enc->enc_pic.me.fme1_enc_disable_sub_mode);
+ RVCE_CS(enc->enc_pic.me.ime_sw_speedup_enable);
+ RVCE_END();
+}
+
+static void pic_control(struct rvce_encoder *enc)
+{
+ RVCE_BEGIN(0x04000002); // pic control
+ RVCE_CS(enc->enc_pic.pc.enc_use_constrained_intra_pred);
+ RVCE_CS(enc->enc_pic.pc.enc_cabac_enable);
+ RVCE_CS(enc->enc_pic.pc.enc_cabac_idc);
+ RVCE_CS(enc->enc_pic.pc.enc_loop_filter_disable);
+ RVCE_CS(enc->enc_pic.pc.enc_lf_beta_offset);
+ RVCE_CS(enc->enc_pic.pc.enc_lf_alpha_c0_offset);
+ RVCE_CS(enc->enc_pic.pc.enc_crop_left_offset);
+ RVCE_CS(enc->enc_pic.pc.enc_crop_right_offset);
+ RVCE_CS(enc->enc_pic.pc.enc_crop_top_offset);
+ RVCE_CS(enc->enc_pic.pc.enc_crop_bottom_offset);
+ RVCE_CS(enc->enc_pic.pc.enc_num_mbs_per_slice);
+ RVCE_CS(enc->enc_pic.pc.enc_intra_refresh_num_mbs_per_slot);
+ RVCE_CS(enc->enc_pic.pc.enc_force_intra_refresh);
+ RVCE_CS(enc->enc_pic.pc.enc_force_imb_period);
+ RVCE_CS(enc->enc_pic.pc.enc_pic_order_cnt_type);
+ RVCE_CS(enc->enc_pic.pc.log2_max_pic_order_cnt_lsb_minus4);
+ RVCE_CS(enc->enc_pic.pc.enc_sps_id);
+ RVCE_CS(enc->enc_pic.pc.enc_pps_id);
+ RVCE_CS(enc->enc_pic.pc.enc_constraint_set_flags);
+ RVCE_CS(enc->enc_pic.pc.enc_b_pic_pattern);
+ RVCE_CS(enc->enc_pic.pc.weight_pred_mode_b_picture);
+ RVCE_CS(enc->enc_pic.pc.enc_number_of_reference_frames);
+ RVCE_CS(enc->enc_pic.pc.enc_max_num_ref_frames);
+ RVCE_CS(enc->enc_pic.pc.enc_num_default_active_ref_l0);
+ RVCE_CS(enc->enc_pic.pc.enc_num_default_active_ref_l1);
+ RVCE_CS(enc->enc_pic.pc.enc_slice_mode);
+ RVCE_CS(enc->enc_pic.pc.enc_max_slice_size);
+ RVCE_END();
+}
+
+static void rdo(struct rvce_encoder *enc)
+{
+ RVCE_BEGIN(0x04000008); // rdo
+ RVCE_CS(enc->enc_pic.rdo.enc_disable_tbe_pred_i_frame);
+ RVCE_CS(enc->enc_pic.rdo.enc_disable_tbe_pred_p_frame);
+ RVCE_CS(enc->enc_pic.rdo.use_fme_interpol_y);
+ RVCE_CS(enc->enc_pic.rdo.use_fme_interpol_uv);
+ RVCE_CS(enc->enc_pic.rdo.use_fme_intrapol_y);
+ RVCE_CS(enc->enc_pic.rdo.use_fme_intrapol_uv);
+ RVCE_CS(enc->enc_pic.rdo.use_fme_interpol_y_1);
+ RVCE_CS(enc->enc_pic.rdo.use_fme_interpol_uv_1);
+ RVCE_CS(enc->enc_pic.rdo.use_fme_intrapol_y_1);
+ RVCE_CS(enc->enc_pic.rdo.use_fme_intrapol_uv_1);
+ RVCE_CS(enc->enc_pic.rdo.enc_16x16_cost_adj);
+ RVCE_CS(enc->enc_pic.rdo.enc_skip_cost_adj);
+ RVCE_CS(enc->enc_pic.rdo.enc_force_16x16_skip);
+ RVCE_CS(enc->enc_pic.rdo.enc_disable_threshold_calc_a);
+ RVCE_CS(enc->enc_pic.rdo.enc_luma_coeff_cost);
+ RVCE_CS(enc->enc_pic.rdo.enc_luma_mb_coeff_cost);
+ RVCE_CS(enc->enc_pic.rdo.enc_chroma_coeff_cost);
+ RVCE_END();
+}
+
+static void session(struct rvce_encoder *enc)
+{
+ RVCE_BEGIN(0x00000001); // session cmd
+ RVCE_CS(enc->stream_handle);
+ RVCE_END();
+}
+
+static void task_info(struct rvce_encoder *enc, uint32_t op,
+ uint32_t dep, uint32_t fb_idx, uint32_t ring_idx)
+{
+ RVCE_BEGIN(0x00000002); // task info
+ if (op == 0x3) {
+ if (enc->task_info_idx) {
+ uint32_t offs = enc->cs->current.cdw - enc->task_info_idx + 3;
+ // Update offsetOfNextTaskInfo
+ enc->cs->current.buf[enc->task_info_idx] = offs;
+ }
+ enc->task_info_idx = enc->cs->current.cdw;
+ }
+ enc->enc_pic.ti.task_operation = op;
+ enc->enc_pic.ti.reference_picture_dependency = dep;
+ enc->enc_pic.ti.feedback_index = fb_idx;
+ enc->enc_pic.ti.video_bitstream_ring_index = ring_idx;
+ RVCE_CS(enc->enc_pic.ti.offset_of_next_task_info);
+ RVCE_CS(enc->enc_pic.ti.task_operation);
+ RVCE_CS(enc->enc_pic.ti.reference_picture_dependency);
+ RVCE_CS(enc->enc_pic.ti.collocate_flag_dependency);
+ RVCE_CS(enc->enc_pic.ti.feedback_index);
+ RVCE_CS(enc->enc_pic.ti.video_bitstream_ring_index);
+ RVCE_END();
+}
+
+static void vui(struct rvce_encoder *enc)
+{
+ int i;
+
+ if (!enc->enc_pic.enable_vui)
+ return;
+
+ RVCE_BEGIN(0x04000009); // vui
+ RVCE_CS(enc->enc_pic.vui.aspect_ratio_info_present_flag);
+ RVCE_CS(enc->enc_pic.vui.aspect_ratio_idc);
+ RVCE_CS(enc->enc_pic.vui.sar_width);
+ RVCE_CS(enc->enc_pic.vui.sar_height);
+ RVCE_CS(enc->enc_pic.vui.overscan_info_present_flag);
+ RVCE_CS(enc->enc_pic.vui.overscan_Approp_flag);
+ RVCE_CS(enc->enc_pic.vui.video_signal_type_present_flag);
+ RVCE_CS(enc->enc_pic.vui.video_format);
+ RVCE_CS(enc->enc_pic.vui.video_full_range_flag);
+ RVCE_CS(enc->enc_pic.vui.color_description_present_flag);
+ RVCE_CS(enc->enc_pic.vui.color_prim);
+ RVCE_CS(enc->enc_pic.vui.transfer_char);
+ RVCE_CS(enc->enc_pic.vui.matrix_coef);
+ RVCE_CS(enc->enc_pic.vui.chroma_loc_info_present_flag);
+ RVCE_CS(enc->enc_pic.vui.chroma_loc_top);
+ RVCE_CS(enc->enc_pic.vui.chroma_loc_bottom);
+ RVCE_CS(enc->enc_pic.vui.timing_info_present_flag);
+ RVCE_CS(enc->enc_pic.vui.num_units_in_tick);
+ RVCE_CS(enc->enc_pic.vui.time_scale);
+ RVCE_CS(enc->enc_pic.vui.fixed_frame_rate_flag);
+ RVCE_CS(enc->enc_pic.vui.nal_hrd_parameters_present_flag);
+ RVCE_CS(enc->enc_pic.vui.cpb_cnt_minus1);
+ RVCE_CS(enc->enc_pic.vui.bit_rate_scale);
+ RVCE_CS(enc->enc_pic.vui.cpb_size_scale);
+ for (i = 0; i < 32; i++) {
+ RVCE_CS(enc->enc_pic.vui.bit_rate_value_minus);
+ RVCE_CS(enc->enc_pic.vui.cpb_size_value_minus);
+ RVCE_CS(enc->enc_pic.vui.cbr_flag);
+ }
+ RVCE_CS(enc->enc_pic.vui.initial_cpb_removal_delay_length_minus1);
+ RVCE_CS(enc->enc_pic.vui.cpb_removal_delay_length_minus1);
+ RVCE_CS(enc->enc_pic.vui.dpb_output_delay_length_minus1);
+ RVCE_CS(enc->enc_pic.vui.time_offset_length);
+ RVCE_CS(enc->enc_pic.vui.low_delay_hrd_flag);
+ RVCE_CS(enc->enc_pic.vui.pic_struct_present_flag);
+ RVCE_CS(enc->enc_pic.vui.bitstream_restriction_present_flag);
+ RVCE_CS(enc->enc_pic.vui.motion_vectors_over_pic_boundaries_flag);
+ RVCE_CS(enc->enc_pic.vui.max_bytes_per_pic_denom);
+ RVCE_CS(enc->enc_pic.vui.max_bits_per_mb_denom);
+ RVCE_CS(enc->enc_pic.vui.log2_max_mv_length_hori);
+ RVCE_CS(enc->enc_pic.vui.log2_max_mv_length_vert);
+ RVCE_CS(enc->enc_pic.vui.num_reorder_frames);
+ RVCE_CS(enc->enc_pic.vui.max_dec_frame_buffering);
+ RVCE_END();
+}
+
+void radeon_vce_52_init(struct rvce_encoder *enc)
+{
+ enc->session = session;
+ enc->task_info = task_info;
enc->create = create;
+ enc->feedback = feedback;
+ enc->rate_control = rate_control;
+ enc->config_extension = config_extension;
+ enc->pic_control = pic_control;
+ enc->motion_estimation = motion_estimation;
+ enc->rdo = rdo;
+ enc->vui = vui;
+ enc->config = config;
enc->encode = encode;
+ enc->destroy = destroy;
}
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_video.c b/lib/mesa/src/gallium/drivers/radeon/radeon_video.c
index f56c6cf6c..de8e11cd8 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_video.c
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_video.c
@@ -43,6 +43,8 @@
#include "radeon_video.h"
#include "radeon_vce.h"
+#define UVD_FW_1_66_16 ((1 << 24) | (66 << 16) | (16 << 8))
+
/* generate an stream handle */
unsigned rvid_alloc_stream_handle()
{
@@ -64,8 +66,14 @@ bool rvid_create_buffer(struct pipe_screen *screen, struct rvid_buffer *buffer,
{
memset(buffer, 0, sizeof(*buffer));
buffer->usage = usage;
+
+ /* Hardware buffer placement restrictions require the kernel to be
+ * able to move buffers around individually, so request a
+ * non-sub-allocated buffer.
+ */
buffer->res = (struct r600_resource *)
- pipe_buffer_create(screen, PIPE_BIND_CUSTOM, usage, size);
+ pipe_buffer_create(screen, PIPE_BIND_CUSTOM | PIPE_BIND_SHARED,
+ usage, size);
return buffer->res != NULL;
}
@@ -73,7 +81,7 @@ bool rvid_create_buffer(struct pipe_screen *screen, struct rvid_buffer *buffer,
/* destroy a buffer */
void rvid_destroy_buffer(struct rvid_buffer *buffer)
{
- pipe_resource_reference((struct pipe_resource **)&buffer->res, NULL);
+ r600_resource_reference(&buffer->res, NULL);
}
/* reallocate a buffer, preserving its content */
@@ -89,11 +97,11 @@ bool rvid_resize_buffer(struct pipe_screen *screen, struct radeon_winsys_cs *cs,
if (!rvid_create_buffer(screen, new_buf, new_size, new_buf->usage))
goto error;
- src = ws->buffer_map(old_buf.res->cs_buf, cs, PIPE_TRANSFER_READ);
+ src = ws->buffer_map(old_buf.res->buf, cs, PIPE_TRANSFER_READ);
if (!src)
goto error;
- dst = ws->buffer_map(new_buf->res->cs_buf, cs, PIPE_TRANSFER_WRITE);
+ dst = ws->buffer_map(new_buf->res->buf, cs, PIPE_TRANSFER_WRITE);
if (!dst)
goto error;
@@ -103,14 +111,14 @@ bool rvid_resize_buffer(struct pipe_screen *screen, struct radeon_winsys_cs *cs,
dst += bytes;
memset(dst, 0, new_size);
}
- ws->buffer_unmap(new_buf->res->cs_buf);
- ws->buffer_unmap(old_buf.res->cs_buf);
+ ws->buffer_unmap(new_buf->res->buf);
+ ws->buffer_unmap(old_buf.res->buf);
rvid_destroy_buffer(&old_buf);
return true;
error:
if (src)
- ws->buffer_unmap(old_buf.res->cs_buf);
+ ws->buffer_unmap(old_buf.res->buf);
rvid_destroy_buffer(new_buf);
*new_buf = old_buf;
return false;
@@ -122,7 +130,7 @@ void rvid_clear_buffer(struct pipe_context *context, struct rvid_buffer* buffer)
struct r600_common_context *rctx = (struct r600_common_context*)context;
rctx->clear_buffer(context, &buffer->res->b.b, 0, buffer->res->buf->size,
- 0, false);
+ 0, R600_COHERENCY_NONE);
context->flush(context, NULL, 0);
}
@@ -130,7 +138,7 @@ void rvid_clear_buffer(struct pipe_context *context, struct rvid_buffer* buffer)
* join surfaces into the same buffer with identical tiling params
* sumup their sizes and replace the backend buffers with a single bo
*/
-void rvid_join_surfaces(struct radeon_winsys* ws, unsigned bind,
+void rvid_join_surfaces(struct radeon_winsys* ws,
struct pb_buffer** buffers[VL_NUM_COMPONENTS],
struct radeon_surf *surfaces[VL_NUM_COMPONENTS])
{
@@ -165,7 +173,7 @@ void rvid_join_surfaces(struct radeon_winsys* ws, unsigned bind,
/* adjust the texture layer offsets */
off = align(off, surfaces[i]->bo_alignment);
- for (j = 0; j < Elements(surfaces[i]->level); ++j)
+ for (j = 0; j < ARRAY_SIZE(surfaces[i]->level); ++j)
surfaces[i]->level[j].offset += off;
off += surfaces[i]->bo_size;
}
@@ -185,7 +193,7 @@ void rvid_join_surfaces(struct radeon_winsys* ws, unsigned bind,
/* TODO: 2D tiling workaround */
alignment *= 2;
- pb = ws->buffer_create(ws, size, alignment, bind, RADEON_DOMAIN_VRAM, 0);
+ pb = ws->buffer_create(ws, size, alignment, RADEON_DOMAIN_VRAM, 0);
if (!pb)
return;
@@ -206,30 +214,33 @@ int rvid_get_video_param(struct pipe_screen *screen,
{
struct r600_common_screen *rscreen = (struct r600_common_screen *)screen;
enum pipe_video_format codec = u_reduce_video_profile(profile);
+ struct radeon_info info;
+
+ rscreen->ws->query_info(rscreen->ws, &info);
if (entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) {
switch (param) {
case PIPE_VIDEO_CAP_SUPPORTED:
return codec == PIPE_VIDEO_FORMAT_MPEG4_AVC &&
rvce_is_fw_version_supported(rscreen);
- case PIPE_VIDEO_CAP_NPOT_TEXTURES:
- return 1;
- case PIPE_VIDEO_CAP_MAX_WIDTH:
+ case PIPE_VIDEO_CAP_NPOT_TEXTURES:
+ return 1;
+ case PIPE_VIDEO_CAP_MAX_WIDTH:
return (rscreen->family < CHIP_TONGA) ? 2048 : 4096;
- case PIPE_VIDEO_CAP_MAX_HEIGHT:
+ case PIPE_VIDEO_CAP_MAX_HEIGHT:
return (rscreen->family < CHIP_TONGA) ? 1152 : 2304;
- case PIPE_VIDEO_CAP_PREFERED_FORMAT:
- return PIPE_FORMAT_NV12;
- case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
- return false;
- case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED:
- return false;
- case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
- return true;
- case PIPE_VIDEO_CAP_STACKED_FRAMES:
+ case PIPE_VIDEO_CAP_PREFERED_FORMAT:
+ return PIPE_FORMAT_NV12;
+ case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
+ return false;
+ case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED:
+ return false;
+ case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
+ return true;
+ case PIPE_VIDEO_CAP_STACKED_FRAMES:
return (rscreen->family < CHIP_TONGA) ? 1 : 2;
- default:
- return 0;
+ default:
+ return 0;
}
}
@@ -237,18 +248,27 @@ int rvid_get_video_param(struct pipe_screen *screen,
case PIPE_VIDEO_CAP_SUPPORTED:
switch (codec) {
case PIPE_VIDEO_FORMAT_MPEG12:
+ return profile != PIPE_VIDEO_PROFILE_MPEG1;
case PIPE_VIDEO_FORMAT_MPEG4:
+ /* no support for MPEG4 on older hw */
+ return rscreen->family >= CHIP_PALM;
case PIPE_VIDEO_FORMAT_MPEG4_AVC:
- if (rscreen->family < CHIP_PALM)
- /* no support for MPEG4 */
- return codec != PIPE_VIDEO_FORMAT_MPEG4;
+ if ((rscreen->family == CHIP_POLARIS10 ||
+ rscreen->family == CHIP_POLARIS11) &&
+ info.uvd_fw_version < UVD_FW_1_66_16 ) {
+ RVID_ERR("POLARIS10/11 firmware version need to be updated.\n");
+ return false;
+ }
return true;
case PIPE_VIDEO_FORMAT_VC1:
return true;
case PIPE_VIDEO_FORMAT_HEVC:
/* Carrizo only supports HEVC Main */
- return rscreen->family >= CHIP_CARRIZO &&
- profile == PIPE_VIDEO_PROFILE_HEVC_MAIN;
+ if (rscreen->family >= CHIP_STONEY)
+ return (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN ||
+ profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10);
+ else if (rscreen->family >= CHIP_CARRIZO)
+ return profile == PIPE_VIDEO_PROFILE_HEVC_MAIN;
default:
return false;
}
@@ -257,7 +277,7 @@ int rvid_get_video_param(struct pipe_screen *screen,
case PIPE_VIDEO_CAP_MAX_WIDTH:
return (rscreen->family < CHIP_TONGA) ? 2048 : 4096;
case PIPE_VIDEO_CAP_MAX_HEIGHT:
- return (rscreen->family < CHIP_TONGA) ? 1152 : 2304;
+ return (rscreen->family < CHIP_TONGA) ? 1152 : 4096;
case PIPE_VIDEO_CAP_PREFERED_FORMAT:
return PIPE_FORMAT_NV12;
case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
@@ -294,8 +314,9 @@ int rvid_get_video_param(struct pipe_screen *screen,
case PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE:
case PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN:
case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH:
- return 41;
+ return (rscreen->family < CHIP_TONGA) ? 41 : 52;
case PIPE_VIDEO_PROFILE_HEVC_MAIN:
+ case PIPE_VIDEO_PROFILE_HEVC_MAIN_10:
return 186;
default:
return 0;
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_video.h b/lib/mesa/src/gallium/drivers/radeon/radeon_video.h
index c9ee67f07..39305b4fd 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_video.h
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_video.h
@@ -66,7 +66,7 @@ void rvid_clear_buffer(struct pipe_context *context, struct rvid_buffer* buffer)
/* join surfaces into the same buffer with identical tiling params
sumup their sizes and replace the backend buffers with a single bo */
-void rvid_join_surfaces(struct radeon_winsys* ws, unsigned bind,
+void rvid_join_surfaces(struct radeon_winsys* ws,
struct pb_buffer** buffers[VL_NUM_COMPONENTS],
struct radeon_surf *surfaces[VL_NUM_COMPONENTS]);
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_winsys.h b/lib/mesa/src/gallium/drivers/radeon/radeon_winsys.h
index f9a7f878f..8946209d3 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_winsys.h
@@ -26,25 +26,12 @@
/* The public winsys interface header for the radeon driver. */
-/* R300 features in DRM.
- *
- * 2.6.0:
- * - Hyper-Z
- * - GB_Z_PEQ_CONFIG on rv350->r4xx
- * - R500 FG_ALPHA_VALUE
- *
- * 2.8.0:
- * - R500 US_FORMAT regs
- * - R500 ARGB2101010 colorbuffer
- * - CMask and AA regs
- * - R16F/RG16F
- */
-
#include "pipebuffer/pb_buffer.h"
+#include "amd/common/amd_family.h"
+
#define RADEON_FLUSH_ASYNC (1 << 0)
-#define RADEON_FLUSH_KEEP_TILING_FLAGS (1 << 1) /* needs DRM 2.12.0 */
-#define RADEON_FLUSH_END_OF_FRAME (1 << 2)
+#define RADEON_FLUSH_END_OF_FRAME (1 << 1)
/* Tiling flags. */
enum radeon_bo_layout {
@@ -65,94 +52,18 @@ enum radeon_bo_flag { /* bitfield */
RADEON_FLAG_GTT_WC = (1 << 0),
RADEON_FLAG_CPU_ACCESS = (1 << 1),
RADEON_FLAG_NO_CPU_ACCESS = (1 << 2),
+ RADEON_FLAG_HANDLE = (1 << 3), /* the buffer must not be suballocated */
};
enum radeon_bo_usage { /* bitfield */
RADEON_USAGE_READ = 2,
RADEON_USAGE_WRITE = 4,
- RADEON_USAGE_READWRITE = RADEON_USAGE_READ | RADEON_USAGE_WRITE
-};
+ RADEON_USAGE_READWRITE = RADEON_USAGE_READ | RADEON_USAGE_WRITE,
-enum radeon_family {
- CHIP_UNKNOWN = 0,
- CHIP_R300, /* R3xx-based cores. */
- CHIP_R350,
- CHIP_RV350,
- CHIP_RV370,
- CHIP_RV380,
- CHIP_RS400,
- CHIP_RC410,
- CHIP_RS480,
- CHIP_R420, /* R4xx-based cores. */
- CHIP_R423,
- CHIP_R430,
- CHIP_R480,
- CHIP_R481,
- CHIP_RV410,
- CHIP_RS600,
- CHIP_RS690,
- CHIP_RS740,
- CHIP_RV515, /* R5xx-based cores. */
- CHIP_R520,
- CHIP_RV530,
- CHIP_R580,
- CHIP_RV560,
- CHIP_RV570,
- CHIP_R600,
- CHIP_RV610,
- CHIP_RV630,
- CHIP_RV670,
- CHIP_RV620,
- CHIP_RV635,
- CHIP_RS780,
- CHIP_RS880,
- CHIP_RV770,
- CHIP_RV730,
- CHIP_RV710,
- CHIP_RV740,
- CHIP_CEDAR,
- CHIP_REDWOOD,
- CHIP_JUNIPER,
- CHIP_CYPRESS,
- CHIP_HEMLOCK,
- CHIP_PALM,
- CHIP_SUMO,
- CHIP_SUMO2,
- CHIP_BARTS,
- CHIP_TURKS,
- CHIP_CAICOS,
- CHIP_CAYMAN,
- CHIP_ARUBA,
- CHIP_TAHITI,
- CHIP_PITCAIRN,
- CHIP_VERDE,
- CHIP_OLAND,
- CHIP_HAINAN,
- CHIP_BONAIRE,
- CHIP_KAVERI,
- CHIP_KABINI,
- CHIP_HAWAII,
- CHIP_MULLINS,
- CHIP_TONGA,
- CHIP_ICELAND,
- CHIP_CARRIZO,
- CHIP_FIJI,
- CHIP_STONEY,
- CHIP_LAST,
-};
-
-enum chip_class {
- CLASS_UNKNOWN = 0,
- R300,
- R400,
- R500,
- R600,
- R700,
- EVERGREEN,
- CAYMAN,
- SI,
- CIK,
- VI,
+ /* The winsys ensures that the CS submission will be scheduled after
+ * previously flushed CSs referencing this BO in a conflicting way.
+ */
+ RADEON_USAGE_SYNCHRONIZED = 8
};
enum ring_type {
@@ -167,10 +78,13 @@ enum ring_type {
enum radeon_value_id {
RADEON_REQUESTED_VRAM_MEMORY,
RADEON_REQUESTED_GTT_MEMORY,
+ RADEON_MAPPED_VRAM,
+ RADEON_MAPPED_GTT,
RADEON_BUFFER_WAIT_TIME_NS,
RADEON_TIMESTAMP,
RADEON_NUM_CS_FLUSHES,
RADEON_NUM_BYTES_MOVED,
+ RADEON_NUM_EVICTIONS,
RADEON_VRAM_USAGE,
RADEON_GTT_USAGE,
RADEON_GPU_TEMPERATURE, /* DRM 2.42.0 */
@@ -179,73 +93,161 @@ enum radeon_value_id {
RADEON_GPU_RESET_COUNTER, /* DRM 2.43.0 */
};
+/* Each group of four has the same priority. */
enum radeon_bo_priority {
- RADEON_PRIO_MIN,
- RADEON_PRIO_SHADER_DATA, /* shader code, resource descriptors */
- RADEON_PRIO_SHADER_BUFFER_RO, /* read-only */
- RADEON_PRIO_SHADER_TEXTURE_RO, /* read-only */
- RADEON_PRIO_SHADER_RESOURCE_RW, /* buffers, textures, streamout, GS rings, RATs; read/write */
- RADEON_PRIO_COLOR_BUFFER,
- RADEON_PRIO_DEPTH_BUFFER,
- RADEON_PRIO_SHADER_TEXTURE_MSAA,
- RADEON_PRIO_COLOR_BUFFER_MSAA,
- RADEON_PRIO_DEPTH_BUFFER_MSAA,
- RADEON_PRIO_COLOR_META,
- RADEON_PRIO_DEPTH_META,
- RADEON_PRIO_MAX /* must be <= 15 */
+ RADEON_PRIO_FENCE = 0,
+ RADEON_PRIO_TRACE,
+ RADEON_PRIO_SO_FILLED_SIZE,
+ RADEON_PRIO_QUERY,
+
+ RADEON_PRIO_IB1 = 4, /* main IB submitted to the kernel */
+ RADEON_PRIO_IB2, /* IB executed with INDIRECT_BUFFER */
+ RADEON_PRIO_DRAW_INDIRECT,
+ RADEON_PRIO_INDEX_BUFFER,
+
+ RADEON_PRIO_VCE = 8,
+ RADEON_PRIO_UVD,
+ RADEON_PRIO_SDMA_BUFFER,
+ RADEON_PRIO_SDMA_TEXTURE,
+
+ RADEON_PRIO_CP_DMA = 12,
+
+ RADEON_PRIO_CONST_BUFFER = 16,
+ RADEON_PRIO_DESCRIPTORS,
+ RADEON_PRIO_BORDER_COLORS,
+
+ RADEON_PRIO_SAMPLER_BUFFER = 20,
+ RADEON_PRIO_VERTEX_BUFFER,
+
+ RADEON_PRIO_SHADER_RW_BUFFER = 24,
+ RADEON_PRIO_COMPUTE_GLOBAL,
+
+ RADEON_PRIO_SAMPLER_TEXTURE = 28,
+ RADEON_PRIO_SHADER_RW_IMAGE,
+
+ RADEON_PRIO_SAMPLER_TEXTURE_MSAA = 32,
+
+ RADEON_PRIO_COLOR_BUFFER = 36,
+
+ RADEON_PRIO_DEPTH_BUFFER = 40,
+
+ RADEON_PRIO_COLOR_BUFFER_MSAA = 44,
+
+ RADEON_PRIO_DEPTH_BUFFER_MSAA = 48,
+
+ RADEON_PRIO_CMASK = 52,
+ RADEON_PRIO_DCC,
+ RADEON_PRIO_HTILE,
+ RADEON_PRIO_SHADER_BINARY, /* the hw can't hide instruction cache misses */
+
+ RADEON_PRIO_SHADER_RINGS = 56,
+
+ RADEON_PRIO_SCRATCH_BUFFER = 60,
+ /* 63 is the maximum value */
};
struct winsys_handle;
-struct radeon_winsys_cs_handle;
struct radeon_winsys_ctx;
+struct radeon_winsys_cs_chunk {
+ unsigned cdw; /* Number of used dwords. */
+ unsigned max_dw; /* Maximum number of dwords. */
+ uint32_t *buf; /* The base pointer of the chunk. */
+};
+
struct radeon_winsys_cs {
- unsigned cdw; /* Number of used dwords. */
- unsigned max_dw; /* Maximum number of dwords. */
- uint32_t *buf; /* The command buffer. */
- enum ring_type ring_type;
+ struct radeon_winsys_cs_chunk current;
+ struct radeon_winsys_cs_chunk *prev;
+ unsigned num_prev; /* Number of previous chunks. */
+ unsigned max_prev; /* Space in array pointed to by prev. */
+ unsigned prev_dw; /* Total number of dwords in previous chunks. */
+
+ /* Memory usage of the buffer list. These are always 0 for CE and preamble
+ * IBs. */
+ uint64_t used_vram;
+ uint64_t used_gart;
};
struct radeon_info {
+ /* PCI info: domain:bus:dev:func */
+ uint32_t pci_domain;
+ uint32_t pci_bus;
+ uint32_t pci_dev;
+ uint32_t pci_func;
+
+ /* Device info. */
uint32_t pci_id;
enum radeon_family family;
enum chip_class chip_class;
+ uint32_t gart_page_size;
uint64_t gart_size;
uint64_t vram_size;
- uint32_t max_sclk;
- uint32_t max_compute_units;
- uint32_t max_se;
- uint32_t max_sh_per_se;
+ uint64_t max_alloc_size;
+ uint32_t min_alloc_size;
+ bool has_dedicated_vram;
+ bool has_virtual_memory;
+ bool gfx_ib_pad_with_type2;
+ bool has_sdma;
+ bool has_uvd;
+ uint32_t uvd_fw_version;
+ uint32_t vce_fw_version;
+ uint32_t me_fw_version;
+ uint32_t pfp_fw_version;
+ uint32_t ce_fw_version;
+ uint32_t vce_harvest_config;
+ uint32_t clock_crystal_freq;
+ /* Kernel info. */
uint32_t drm_major; /* version */
uint32_t drm_minor;
uint32_t drm_patchlevel;
+ bool has_userptr;
- boolean has_uvd;
- uint32_t vce_fw_version;
- boolean has_userptr;
+ /* Shader cores. */
+ uint32_t r600_max_quad_pipes; /* wave size / 16 */
+ uint32_t max_shader_clock;
+ uint32_t num_good_compute_units;
+ uint32_t max_se; /* shader engines */
+ uint32_t max_sh_per_se; /* shader arrays per shader engine */
+ /* Render backends (color + depth blocks). */
uint32_t r300_num_gb_pipes;
uint32_t r300_num_z_pipes;
-
- uint32_t r600_num_backends;
- uint32_t r600_clock_crystal_freq;
- uint32_t r600_tiling_config;
- uint32_t r600_num_tile_pipes;
- uint32_t r600_max_pipes;
- boolean r600_virtual_address;
- boolean r600_has_dma;
-
- uint32_t r600_backend_map;
- boolean r600_backend_map_valid;
-
- boolean si_tile_mode_array_valid;
+ uint32_t r600_gb_backend_map; /* R600 harvest config */
+ bool r600_gb_backend_map_valid;
+ uint32_t r600_num_banks;
+ uint32_t num_render_backends;
+ uint32_t num_tile_pipes; /* pipe count from PIPE_CONFIG */
+ uint32_t pipe_interleave_bytes;
+ uint32_t enabled_rb_mask; /* GCN harvest config */
+
+ /* Tile modes. */
uint32_t si_tile_mode_array[32];
- uint32_t si_backend_enabled_mask;
-
- boolean cik_macrotile_mode_array_valid;
uint32_t cik_macrotile_mode_array[16];
- uint32_t vce_harvest_config;
+};
+
+/* Tiling info for display code, DRI sharing, and other data. */
+struct radeon_bo_metadata {
+ /* Tiling flags describing the texture layout for display code
+ * and DRI sharing.
+ */
+ enum radeon_bo_layout microtile;
+ enum radeon_bo_layout macrotile;
+ unsigned pipe_config;
+ unsigned bankw;
+ unsigned bankh;
+ unsigned tile_split;
+ unsigned mtilea;
+ unsigned num_banks;
+ unsigned stride;
+ bool scanout;
+
+ /* Additional metadata associated with the buffer, in bytes.
+ * The maximum size is 64 * 4. This is opaque for the winsys & kernel.
+ * Supported by amdgpu only.
+ */
+ uint32_t size_metadata;
+ uint32_t metadata[64];
};
enum radeon_feature_id {
@@ -265,7 +267,6 @@ enum radeon_feature_id {
#define RADEON_SURF_TYPE_2D_ARRAY 5
#define RADEON_SURF_MODE_MASK 0xFF
#define RADEON_SURF_MODE_SHIFT 8
-#define RADEON_SURF_MODE_LINEAR 0
#define RADEON_SURF_MODE_LINEAR_ALIGNED 1
#define RADEON_SURF_MODE_1D 2
#define RADEON_SURF_MODE_2D 3
@@ -276,6 +277,8 @@ enum radeon_feature_id {
#define RADEON_SURF_HAS_SBUFFER_MIPTREE (1 << 19)
#define RADEON_SURF_HAS_TILE_MODE_INDEX (1 << 20)
#define RADEON_SURF_FMASK (1 << 21)
+#define RADEON_SURF_DISABLE_DCC (1 << 22)
+#define RADEON_SURF_TC_COMPATIBLE_HTILE (1 << 23)
#define RADEON_SURF_GET(v, field) (((v) >> RADEON_SURF_ ## field ## _SHIFT) & RADEON_SURF_ ## field ## _MASK)
#define RADEON_SURF_SET(v, field) (((v) & RADEON_SURF_ ## field ## _MASK) << RADEON_SURF_ ## field ## _SHIFT)
@@ -292,6 +295,9 @@ struct radeon_surf_level {
uint32_t nblk_z;
uint32_t pitch_bytes;
uint32_t mode;
+ uint64_t dcc_offset;
+ uint64_t dcc_fast_clear_size;
+ bool dcc_enabled;
};
struct radeon_surf {
@@ -320,13 +326,34 @@ struct radeon_surf {
uint32_t mtilea;
uint32_t tile_split;
uint32_t stencil_tile_split;
- uint64_t stencil_offset;
struct radeon_surf_level level[RADEON_SURF_MAX_LEVEL];
struct radeon_surf_level stencil_level[RADEON_SURF_MAX_LEVEL];
uint32_t tiling_index[RADEON_SURF_MAX_LEVEL];
uint32_t stencil_tiling_index[RADEON_SURF_MAX_LEVEL];
uint32_t pipe_config;
uint32_t num_banks;
+ uint32_t macro_tile_index;
+ uint32_t micro_tile_mode; /* displayable, thin, depth, rotated */
+
+ /* Whether the depth miptree or stencil miptree as used by the DB are
+ * adjusted from their TC compatible form to ensure depth/stencil
+ * compatibility. If either is true, the corresponding plane cannot be
+ * sampled from.
+ */
+ bool depth_adjusted;
+ bool stencil_adjusted;
+
+ uint64_t dcc_size;
+ uint64_t dcc_alignment;
+ /* TC-compatible HTILE only. */
+ uint64_t htile_size;
+ uint64_t htile_alignment;
+};
+
+struct radeon_bo_list_item {
+ uint64_t bo_size;
+ uint64_t vm_address;
+ uint64_t priority_usage; /* mask of (1 << RADEON_PRIO_*) */
};
struct radeon_winsys {
@@ -378,15 +405,11 @@ struct radeon_winsys {
* \return The created buffer object.
*/
struct pb_buffer *(*buffer_create)(struct radeon_winsys *ws,
- unsigned size,
+ uint64_t size,
unsigned alignment,
- boolean use_reusable_pool,
enum radeon_bo_domain domain,
enum radeon_bo_flag flags);
- struct radeon_winsys_cs_handle *(*buffer_get_cs_handle)(
- struct pb_buffer *buf);
-
/**
* Map the entire data store of a buffer object into the client's address
* space.
@@ -396,7 +419,7 @@ struct radeon_winsys {
* \param usage A bitmask of the PIPE_TRANSFER_* flags.
* \return The pointer at the beginning of the buffer.
*/
- void *(*buffer_map)(struct radeon_winsys_cs_handle *buf,
+ void *(*buffer_map)(struct pb_buffer *buf,
struct radeon_winsys_cs *cs,
enum pipe_transfer_usage usage);
@@ -405,7 +428,7 @@ struct radeon_winsys {
*
* \param buf A winsys buffer object to unmap.
*/
- void (*buffer_unmap)(struct radeon_winsys_cs_handle *buf);
+ void (*buffer_unmap)(struct pb_buffer *buf);
/**
* Wait for the buffer and return true if the buffer is not used
@@ -419,45 +442,24 @@ struct radeon_winsys {
enum radeon_bo_usage usage);
/**
- * Return tiling flags describing a memory layout of a buffer object.
+ * Return buffer metadata.
+ * (tiling info for display code, DRI sharing, and other data)
*
* \param buf A winsys buffer object to get the flags from.
- * \param macrotile A pointer to the return value of the microtile flag.
- * \param microtile A pointer to the return value of the macrotile flag.
- *
- * \note microtile and macrotile are not bitmasks!
+ * \param md Metadata
*/
- void (*buffer_get_tiling)(struct pb_buffer *buf,
- enum radeon_bo_layout *microtile,
- enum radeon_bo_layout *macrotile,
- unsigned *bankw, unsigned *bankh,
- unsigned *tile_split,
- unsigned *stencil_tile_split,
- unsigned *mtilea,
- bool *scanout);
+ void (*buffer_get_metadata)(struct pb_buffer *buf,
+ struct radeon_bo_metadata *md);
/**
- * Set tiling flags describing a memory layout of a buffer object.
+ * Set buffer metadata.
+ * (tiling info for display code, DRI sharing, and other data)
*
* \param buf A winsys buffer object to set the flags for.
- * \param cs A command stream to flush if the buffer is referenced by it.
- * \param macrotile A macrotile flag.
- * \param microtile A microtile flag.
- * \param stride A stride of the buffer in bytes, for texturing.
- *
- * \note microtile and macrotile are not bitmasks!
+ * \param md Metadata
*/
- void (*buffer_set_tiling)(struct pb_buffer *buf,
- struct radeon_winsys_cs *rcs,
- enum radeon_bo_layout microtile,
- enum radeon_bo_layout macrotile,
- unsigned pipe_config,
- unsigned bankw, unsigned bankh,
- unsigned tile_split,
- unsigned stencil_tile_split,
- unsigned mtilea, unsigned num_banks,
- unsigned stride,
- bool scanout);
+ void (*buffer_set_metadata)(struct pb_buffer *buf,
+ struct radeon_bo_metadata *md);
/**
* Get a winsys buffer from a winsys handle. The internal structure
@@ -470,7 +472,7 @@ struct radeon_winsys {
*/
struct pb_buffer *(*buffer_from_handle)(struct radeon_winsys *ws,
struct winsys_handle *whandle,
- unsigned *stride);
+ unsigned *stride, unsigned *offset);
/**
* Get a winsys buffer from a user pointer. The resulting buffer can't
@@ -481,7 +483,15 @@ struct radeon_winsys {
* \param Size Size in bytes for the new buffer.
*/
struct pb_buffer *(*buffer_from_ptr)(struct radeon_winsys *ws,
- void *pointer, unsigned size);
+ void *pointer, uint64_t size);
+
+ /**
+ * Whether the buffer was created from a user pointer.
+ *
+ * \param buf A winsys buffer object
+ * \return whether \p buf was created via buffer_from_ptr
+ */
+ bool (*buffer_is_user_ptr)(struct pb_buffer *buf);
/**
* Get a winsys handle from a winsys buffer. The internal structure
@@ -490,24 +500,40 @@ struct radeon_winsys {
* \param buf A winsys buffer object to get the handle from.
* \param whandle A winsys handle pointer.
* \param stride A stride of the buffer in bytes, for texturing.
- * \return TRUE on success.
+ * \return true on success.
*/
- boolean (*buffer_get_handle)(struct pb_buffer *buf,
- unsigned stride,
- struct winsys_handle *whandle);
+ bool (*buffer_get_handle)(struct pb_buffer *buf,
+ unsigned stride, unsigned offset,
+ unsigned slice_size,
+ struct winsys_handle *whandle);
/**
* Return the virtual address of a buffer.
*
+ * When virtual memory is not in use, this is the offset relative to the
+ * relocation base (non-zero for sub-allocated buffers).
+ *
* \param buf A winsys buffer object
* \return virtual address
*/
- uint64_t (*buffer_get_virtual_address)(struct radeon_winsys_cs_handle *buf);
+ uint64_t (*buffer_get_virtual_address)(struct pb_buffer *buf);
+
+ /**
+ * Return the offset of this buffer relative to the relocation base.
+ * This is only non-zero for sub-allocated buffers.
+ *
+ * This is only supported in the radeon winsys, since amdgpu uses virtual
+ * addresses in submissions even for the video engines.
+ *
+ * \param buf A winsys buffer object
+ * \return the offset for relocations
+ */
+ unsigned (*buffer_get_reloc_offset)(struct pb_buffer *buf);
/**
* Query the initial placement of the buffer from the kernel driver.
*/
- enum radeon_bo_domain (*buffer_get_initial_domain)(struct radeon_winsys_cs_handle *buf);
+ enum radeon_bo_domain (*buffer_get_initial_domain)(struct pb_buffer *buf);
/**************************************************************************
* Command submission.
@@ -539,15 +565,43 @@ struct radeon_winsys {
* \param ring_type The ring type (GFX, DMA, UVD)
* \param flush Flush callback function associated with the command stream.
* \param user User pointer that will be passed to the flush callback.
- * \param trace_buf Trace buffer when tracing is enabled
*/
struct radeon_winsys_cs *(*cs_create)(struct radeon_winsys_ctx *ctx,
enum ring_type ring_type,
void (*flush)(void *ctx, unsigned flags,
struct pipe_fence_handle **fence),
- void *flush_ctx,
- struct radeon_winsys_cs_handle *trace_buf);
+ void *flush_ctx);
+
+ /**
+ * Add a constant engine IB to a graphics CS. This makes the graphics CS
+ * from "cs_create" a group of two IBs that share a buffer list and are
+ * flushed together.
+ *
+ * The returned constant CS is only a stream for writing packets to the new
+ * IB. Calling other winsys functions with it is not allowed, not even
+ * "cs_destroy".
+ *
+ * In order to add buffers and check memory usage, use the graphics CS.
+ * In order to flush it, use the graphics CS, which will flush both IBs.
+ * Destroying the graphics CS will destroy both of them.
+ *
+ * \param cs The graphics CS from "cs_create" that will hold the buffer
+ * list and will be used for flushing.
+ */
+ struct radeon_winsys_cs *(*cs_add_const_ib)(struct radeon_winsys_cs *cs);
+ /**
+ * Add a constant engine preamble IB to a graphics CS. This adds an extra IB
+ * in a similar manner to cs_add_const_ib. This should always be called after
+ * cs_add_const_ib.
+ *
+ * The returned IB is a constant engine IB that only gets flushed if the
+ * context changed.
+ *
+ * \param cs The graphics CS from "cs_create" that will hold the buffer
+ * list and will be used for flushing.
+ */
+ struct radeon_winsys_cs *(*cs_add_const_preamble_ib)(struct radeon_winsys_cs *cs);
/**
* Destroy a command stream.
*
@@ -556,19 +610,18 @@ struct radeon_winsys {
void (*cs_destroy)(struct radeon_winsys_cs *cs);
/**
- * Add a new buffer relocation. Every relocation must first be added
- * before it can be written.
+ * Add a buffer. Each buffer used by a CS must be added using this function.
*
- * \param cs A command stream to add buffer for validation against.
- * \param buf A winsys buffer to validate.
+ * \param cs Command stream
+ * \param buf Buffer
* \param usage Whether the buffer is used for read and/or write.
* \param domain Bitmask of the RADEON_DOMAIN_* flags.
* \param priority A higher number means a greater chance of being
* placed in the requested domain. 15 is the maximum.
- * \return Relocation index.
+ * \return Buffer index.
*/
- unsigned (*cs_add_reloc)(struct radeon_winsys_cs *cs,
- struct radeon_winsys_cs_handle *buf,
+ unsigned (*cs_add_buffer)(struct radeon_winsys_cs *cs,
+ struct pb_buffer *buf,
enum radeon_bo_usage usage,
enum radeon_bo_domain domain,
enum radeon_bo_priority priority);
@@ -576,32 +629,47 @@ struct radeon_winsys {
/**
* Return the index of an already-added buffer.
*
+ * Not supported on amdgpu. Drivers with GPUVM should not care about
+ * buffer indices.
+ *
* \param cs Command stream
* \param buf Buffer
* \return The buffer index, or -1 if the buffer has not been added.
*/
- int (*cs_get_reloc)(struct radeon_winsys_cs *cs,
- struct radeon_winsys_cs_handle *buf);
+ int (*cs_lookup_buffer)(struct radeon_winsys_cs *cs,
+ struct pb_buffer *buf);
/**
- * Return TRUE if there is enough memory in VRAM and GTT for the relocs
- * added so far. If the validation fails, all the relocations which have
+ * Return true if there is enough memory in VRAM and GTT for the buffers
+ * added so far. If the validation fails, all buffers which have
* been added since the last call of cs_validate will be removed and
- * the CS will be flushed (provided there are still any relocations).
+ * the CS will be flushed (provided there are still any buffers).
*
* \param cs A command stream to validate.
*/
- boolean (*cs_validate)(struct radeon_winsys_cs *cs);
+ bool (*cs_validate)(struct radeon_winsys_cs *cs);
/**
- * Return TRUE if there is enough memory in VRAM and GTT for the relocs
- * added so far.
+ * Check whether the given number of dwords is available in the IB.
+ * Optionally chain a new chunk of the IB if necessary and supported.
*
- * \param cs A command stream to validate.
- * \param vram VRAM memory size pending to be use
- * \param gtt GTT memory size pending to be use
+ * \param cs A command stream.
+ * \param dw Number of CS dwords requested by the caller.
*/
- boolean (*cs_memory_below_limit)(struct radeon_winsys_cs *cs, uint64_t vram, uint64_t gtt);
+ bool (*cs_check_space)(struct radeon_winsys_cs *cs, unsigned dw);
+
+ /**
+ * Return the buffer list.
+ *
+ * This is the buffer list as passed to the kernel, i.e. it only contains
+ * the parent buffers of sub-allocated buffers.
+ *
+ * \param cs Command stream
+ * \param list Returned buffer list. Set to NULL to query the count only.
+ * \return The buffer count.
+ */
+ unsigned (*cs_get_buffer_list)(struct radeon_winsys_cs *cs,
+ struct radeon_bo_list_item *list);
/**
* Flush a command stream.
@@ -610,22 +678,29 @@ struct radeon_winsys {
* \param flags, RADEON_FLUSH_ASYNC or 0.
* \param fence Pointer to a fence. If non-NULL, a fence is inserted
* after the CS and is returned through this parameter.
- * \param cs_trace_id A unique identifier of the cs, used for tracing.
+ * \return Negative POSIX error code or 0 for success.
+ * Asynchronous submissions never return an error.
*/
- void (*cs_flush)(struct radeon_winsys_cs *cs,
- unsigned flags,
- struct pipe_fence_handle **fence,
- uint32_t cs_trace_id);
+ int (*cs_flush)(struct radeon_winsys_cs *cs,
+ unsigned flags,
+ struct pipe_fence_handle **fence);
/**
- * Return TRUE if a buffer is referenced by a command stream.
+ * Create a fence before the CS is flushed.
+ * The user must flush manually to complete the initialization of the fence.
+ * The fence must not be used before the flush.
+ */
+ struct pipe_fence_handle *(*cs_get_next_fence)(struct radeon_winsys_cs *cs);
+
+ /**
+ * Return true if a buffer is referenced by a command stream.
*
* \param cs A command stream.
* \param buf A winsys buffer.
*/
- boolean (*cs_is_buffer_referenced)(struct radeon_winsys_cs *cs,
- struct radeon_winsys_cs_handle *buf,
- enum radeon_bo_usage usage);
+ bool (*cs_is_buffer_referenced)(struct radeon_winsys_cs *cs,
+ struct pb_buffer *buf,
+ enum radeon_bo_usage usage);
/**
* Request access to a feature for a command stream.
@@ -634,9 +709,9 @@ struct radeon_winsys {
* \param fid Feature ID, one of RADEON_FID_*
* \param enable Whether to enable or disable the feature.
*/
- boolean (*cs_request_feature)(struct radeon_winsys_cs *cs,
- enum radeon_feature_id fid,
- boolean enable);
+ bool (*cs_request_feature)(struct radeon_winsys_cs *cs,
+ enum radeon_feature_id fid,
+ bool enable);
/**
* Make sure all asynchronous flush of the cs have completed
*
@@ -681,21 +756,25 @@ struct radeon_winsys {
uint64_t (*query_value)(struct radeon_winsys *ws,
enum radeon_value_id value);
- void (*read_registers)(struct radeon_winsys *ws, unsigned reg_offset,
+ bool (*read_registers)(struct radeon_winsys *ws, unsigned reg_offset,
unsigned num_registers, uint32_t *out);
};
+static inline bool radeon_emitted(struct radeon_winsys_cs *cs, unsigned num_dw)
+{
+ return cs && (cs->prev_dw + cs->current.cdw > num_dw);
+}
static inline void radeon_emit(struct radeon_winsys_cs *cs, uint32_t value)
{
- cs->buf[cs->cdw++] = value;
+ cs->current.buf[cs->current.cdw++] = value;
}
static inline void radeon_emit_array(struct radeon_winsys_cs *cs,
const uint32_t *values, unsigned count)
{
- memcpy(cs->buf+cs->cdw, values, count * 4);
- cs->cdw += count;
+ memcpy(cs->current.buf + cs->current.cdw, values, count * 4);
+ cs->current.cdw += count;
}
#endif