author    | Jonathan Gray <jsg@cvs.openbsd.org> | 2019-01-29 11:52:33 +0000
committer | Jonathan Gray <jsg@cvs.openbsd.org> | 2019-01-29 11:52:33 +0000
commit    | 37bbf6a1792773f11c15a4da1588a7520ee2fb4e (patch)
tree      | 64944d4aa665a1e479cfc004e446593062254550 /lib/mesa/src/gallium/drivers/vc4
parent    | 6b139c2063623e9310025247cd966490b9aa57ea (diff)
Merge Mesa 18.3.2
Diffstat (limited to 'lib/mesa/src/gallium/drivers/vc4')
26 files changed, 1165 insertions, 626 deletions
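
One of the more self-contained changes in the patch below is in vc4_program.c, where the sin/cos lowering gains a shared range-reduction helper (ntq_shrink_sincos_input_range) so the Taylor series is always evaluated near zero. The following is only an illustrative sketch of that idea in plain C, assuming nothing beyond <math.h>; the sincos_range_reduce name is invented for the example and is not part of the patch.

```c
/*
 * Illustrative sketch only (not from Mesa): the range-reduction idea
 * behind ntq_shrink_sincos_input_range() in the diff below. The QIR
 * version does the same thing with FTOI/FSUB/FADD and condition flags.
 */
#include <math.h>

static float sincos_range_reduce(float x)
{
        /* Work in "turns": sin(x) repeats every 2*pi, so sin(x) equals
         * sin(2*pi * frac), where frac is the fractional part of
         * x / (2*pi). */
        float scaled = x / (2.0f * (float)M_PI);

        /* truncf() truncates toward zero (matching the FTOI note in the
         * patch), so frac lands in (-1, 1). */
        float frac = scaled - truncf(scaled);

        /* Pull the argument as close to 0 as possible, where the Taylor
         * series is most accurate:
         *   map [0.5, 1)   to [-0.5, 0)
         *   map (-1, -0.5) to (0, 0.5)
         */
        if (frac >= 0.5f)
                frac -= 1.0f;
        else if (frac < -0.5f)
                frac += 1.0f;

        /* Caller evaluates the Taylor series for sin/cos(2*pi*frac). */
        return frac;
}
```

Because the reduced argument is now centered on 0 rather than shifted by half a period (the old code added -0.5 to the fractional part), the patch also flips the signs of the Taylor coefficients in ntq_fsin()/ntq_fcos(), so the series now start at +2*pi*x and +1 respectively.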
diff --git a/lib/mesa/src/gallium/drivers/vc4/Makefile.am b/lib/mesa/src/gallium/drivers/vc4/Makefile.am index c3e49af97..4c7dd843d 100644 --- a/lib/mesa/src/gallium/drivers/vc4/Makefile.am +++ b/lib/mesa/src/gallium/drivers/vc4/Makefile.am @@ -30,7 +30,8 @@ AM_CFLAGS = \ -I$(top_builddir)/src/compiler/nir \ -I$(top_srcdir)/include/drm-uapi \ -I$(top_builddir)/src \ - -I$(top_srcdir)/src/broadcom/cle \ + -I$(top_srcdir)/src/broadcom \ + -I$(top_builddir)/src/broadcom \ $(LIBDRM_CFLAGS) \ $(GALLIUM_DRIVER_CFLAGS) \ $(SIM_CFLAGS) \ @@ -54,4 +55,4 @@ endif libvc4_la_LDFLAGS = $(SIM_LDFLAGS) -EXTRA_DIST = kernel/README +EXTRA_DIST = kernel/README meson.build diff --git a/lib/mesa/src/gallium/drivers/vc4/Makefile.in b/lib/mesa/src/gallium/drivers/vc4/Makefile.in index 195f7e2c1..f55b61922 100644 --- a/lib/mesa/src/gallium/drivers/vc4/Makefile.in +++ b/lib/mesa/src/gallium/drivers/vc4/Makefile.in @@ -78,15 +78,19 @@ DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \ @HAVE_LIBDRM_TRUE@am__append_1 = \ @HAVE_LIBDRM_TRUE@ $(LIBDRM_LIBS) -@HAVE_DRISW_TRUE@am__append_2 = \ +@HAVE_PLATFORM_ANDROID_TRUE@am__append_2 = \ +@HAVE_PLATFORM_ANDROID_TRUE@ $(ANDROID_LIBS) \ +@HAVE_PLATFORM_ANDROID_TRUE@ $(BACKTRACE_LIBS) + +@HAVE_DRISW_TRUE@am__append_3 = \ @HAVE_DRISW_TRUE@ $(top_builddir)/src/gallium/winsys/sw/dri/libswdri.la -@HAVE_DRISW_KMS_TRUE@am__append_3 = \ +@HAVE_DRISW_KMS_TRUE@am__append_4 = \ @HAVE_DRISW_KMS_TRUE@ $(top_builddir)/src/gallium/winsys/sw/kms-dri/libswkmsdri.la \ @HAVE_DRISW_KMS_TRUE@ $(LIBDRM_LIBS) -@HAVE_ARM_ASM_TRUE@am__append_4 = libvc4_neon.la @HAVE_ARM_ASM_TRUE@am__append_5 = libvc4_neon.la +@HAVE_ARM_ASM_TRUE@am__append_6 = libvc4_neon.la subdir = src/gallium/drivers/vc4 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 am__aclocal_m4_deps = $(top_srcdir)/m4/ax_check_compile_flag.m4 \ @@ -106,7 +110,7 @@ mkinstalldirs = $(install_sh) -d CONFIG_CLEAN_FILES = CONFIG_CLEAN_VPATH_FILES = LTLIBRARIES = $(noinst_LTLIBRARIES) -libvc4_la_DEPENDENCIES = $(am__append_5) +libvc4_la_DEPENDENCIES = $(am__append_6) am__dirstamp = $(am__leading_dot)dirstamp am__objects_1 = kernel/vc4_gem.lo kernel/vc4_render_cl.lo \ kernel/vc4_validate.lo kernel/vc4_validate_shaders.lo \ @@ -201,6 +205,8 @@ AUTOCONF = @AUTOCONF@ AUTOHEADER = @AUTOHEADER@ AUTOMAKE = @AUTOMAKE@ AWK = @AWK@ +BACKTRACE_CFLAGS = @BACKTRACE_CFLAGS@ +BACKTRACE_LIBS = @BACKTRACE_LIBS@ BSYMBOLIC = @BSYMBOLIC@ CC = @CC@ CCAS = @CCAS@ @@ -214,6 +220,7 @@ CLOVER_STD_OVERRIDE = @CLOVER_STD_OVERRIDE@ CPP = @CPP@ CPPFLAGS = @CPPFLAGS@ CXX = @CXX@ +CXX11_CXXFLAGS = @CXX11_CXXFLAGS@ CXXCPP = @CXXCPP@ CXXDEPMODE = @CXXDEPMODE@ CXXFLAGS = @CXXFLAGS@ @@ -247,8 +254,6 @@ EXEEXT = @EXEEXT@ EXPAT_CFLAGS = @EXPAT_CFLAGS@ EXPAT_LIBS = @EXPAT_LIBS@ FGREP = @FGREP@ -FREEDRENO_CFLAGS = @FREEDRENO_CFLAGS@ -FREEDRENO_LIBS = @FREEDRENO_LIBS@ GALLIUM_PIPE_LOADER_DEFINES = @GALLIUM_PIPE_LOADER_DEFINES@ GBM_PC_LIB_PRIV = @GBM_PC_LIB_PRIV@ GBM_PC_REQ_PRIV = @GBM_PC_REQ_PRIV@ @@ -267,8 +272,8 @@ GL_LIB_DEPS = @GL_LIB_DEPS@ GL_PC_CFLAGS = @GL_PC_CFLAGS@ GL_PC_LIB_PRIV = @GL_PC_LIB_PRIV@ GL_PC_REQ_PRIV = @GL_PC_REQ_PRIV@ +GL_PKGCONF_LIB = @GL_PKGCONF_LIB@ GREP = @GREP@ -HAVE_XF86VIDMODE = @HAVE_XF86VIDMODE@ I915_CFLAGS = @I915_CFLAGS@ I915_LIBS = @I915_LIBS@ INDENT = @INDENT@ @@ -280,6 +285,7 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LD_BUILD_ID = @LD_BUILD_ID@ LD_NO_UNDEFINED = @LD_NO_UNDEFINED@ LEX = @LEX@ LEXLIB = @LEXLIB@ @@ -317,7 +323,7 @@ MSVC2013_COMPAT_CFLAGS = 
@MSVC2013_COMPAT_CFLAGS@ MSVC2013_COMPAT_CXXFLAGS = @MSVC2013_COMPAT_CXXFLAGS@ NINE_MAJOR = @NINE_MAJOR@ NINE_MINOR = @NINE_MINOR@ -NINE_TINY = @NINE_TINY@ +NINE_PATCH = @NINE_PATCH@ NINE_VERSION = @NINE_VERSION@ NM = @NM@ NMEDIT = @NMEDIT@ @@ -330,6 +336,9 @@ OBJEXT = @OBJEXT@ OMX_BELLAGIO_CFLAGS = @OMX_BELLAGIO_CFLAGS@ OMX_BELLAGIO_LIBS = @OMX_BELLAGIO_LIBS@ OMX_BELLAGIO_LIB_INSTALL_DIR = @OMX_BELLAGIO_LIB_INSTALL_DIR@ +OMX_TIZONIA_CFLAGS = @OMX_TIZONIA_CFLAGS@ +OMX_TIZONIA_LIBS = @OMX_TIZONIA_LIBS@ +OMX_TIZONIA_LIB_INSTALL_DIR = @OMX_TIZONIA_LIB_INSTALL_DIR@ OPENCL_LIBNAME = @OPENCL_LIBNAME@ OPENCL_VERSION = @OPENCL_VERSION@ OSMESA_LIB = @OSMESA_LIB@ @@ -357,11 +366,16 @@ PTHREAD_CC = @PTHREAD_CC@ PTHREAD_CFLAGS = @PTHREAD_CFLAGS@ PTHREAD_LIBS = @PTHREAD_LIBS@ PWR8_CFLAGS = @PWR8_CFLAGS@ -PYTHON2 = @PYTHON2@ +PYTHON = @PYTHON@ +PYTHON_EXEC_PREFIX = @PYTHON_EXEC_PREFIX@ +PYTHON_PLATFORM = @PYTHON_PLATFORM@ +PYTHON_PREFIX = @PYTHON_PREFIX@ +PYTHON_VERSION = @PYTHON_VERSION@ RADEON_CFLAGS = @RADEON_CFLAGS@ RADEON_LIBS = @RADEON_LIBS@ RANLIB = @RANLIB@ RM = @RM@ +SCANNER_ARG = @SCANNER_ARG@ SED = @SED@ SELINUX_CFLAGS = @SELINUX_CFLAGS@ SELINUX_LIBS = @SELINUX_LIBS@ @@ -373,9 +387,10 @@ SSE41_CFLAGS = @SSE41_CFLAGS@ STRIP = @STRIP@ SWR_AVX2_CXXFLAGS = @SWR_AVX2_CXXFLAGS@ SWR_AVX_CXXFLAGS = @SWR_AVX_CXXFLAGS@ -SWR_CXX11_CXXFLAGS = @SWR_CXX11_CXXFLAGS@ SWR_KNL_CXXFLAGS = @SWR_KNL_CXXFLAGS@ SWR_SKX_CXXFLAGS = @SWR_SKX_CXXFLAGS@ +V3D_SIMULATOR_CFLAGS = @V3D_SIMULATOR_CFLAGS@ +V3D_SIMULATOR_LIBS = @V3D_SIMULATOR_LIBS@ VALGRIND_CFLAGS = @VALGRIND_CFLAGS@ VALGRIND_LIBS = @VALGRIND_LIBS@ VA_CFLAGS = @VA_CFLAGS@ @@ -383,8 +398,8 @@ VA_LIBS = @VA_LIBS@ VA_LIB_INSTALL_DIR = @VA_LIB_INSTALL_DIR@ VA_MAJOR = @VA_MAJOR@ VA_MINOR = @VA_MINOR@ -VC5_SIMULATOR_CFLAGS = @VC5_SIMULATOR_CFLAGS@ -VC5_SIMULATOR_LIBS = @VC5_SIMULATOR_LIBS@ +VC4_CFLAGS = @VC4_CFLAGS@ +VC4_LIBS = @VC4_LIBS@ VDPAU_CFLAGS = @VDPAU_CFLAGS@ VDPAU_LIBS = @VDPAU_LIBS@ VDPAU_LIB_INSTALL_DIR = @VDPAU_LIB_INSTALL_DIR@ @@ -398,7 +413,11 @@ VL_LIBS = @VL_LIBS@ VULKAN_ICD_INSTALL_DIR = @VULKAN_ICD_INSTALL_DIR@ WAYLAND_CLIENT_CFLAGS = @WAYLAND_CLIENT_CFLAGS@ WAYLAND_CLIENT_LIBS = @WAYLAND_CLIENT_LIBS@ +WAYLAND_EGL_CFLAGS = @WAYLAND_EGL_CFLAGS@ +WAYLAND_EGL_LIBS = @WAYLAND_EGL_LIBS@ +WAYLAND_PROTOCOLS_CFLAGS = @WAYLAND_PROTOCOLS_CFLAGS@ WAYLAND_PROTOCOLS_DATADIR = @WAYLAND_PROTOCOLS_DATADIR@ +WAYLAND_PROTOCOLS_LIBS = @WAYLAND_PROTOCOLS_LIBS@ WAYLAND_SCANNER = @WAYLAND_SCANNER@ WAYLAND_SCANNER_CFLAGS = @WAYLAND_SCANNER_CFLAGS@ WAYLAND_SCANNER_LIBS = @WAYLAND_SCANNER_LIBS@ @@ -408,16 +427,20 @@ WNO_OVERRIDE_INIT = @WNO_OVERRIDE_INIT@ X11_INCLUDES = @X11_INCLUDES@ XA_MAJOR = @XA_MAJOR@ XA_MINOR = @XA_MINOR@ -XA_TINY = @XA_TINY@ +XA_PATCH = @XA_PATCH@ XA_VERSION = @XA_VERSION@ XCB_DRI2_CFLAGS = @XCB_DRI2_CFLAGS@ XCB_DRI2_LIBS = @XCB_DRI2_LIBS@ XCB_DRI3_CFLAGS = @XCB_DRI3_CFLAGS@ XCB_DRI3_LIBS = @XCB_DRI3_LIBS@ -XF86VIDMODE_CFLAGS = @XF86VIDMODE_CFLAGS@ -XF86VIDMODE_LIBS = @XF86VIDMODE_LIBS@ +XCB_DRI3_MODIFIERS_CFLAGS = @XCB_DRI3_MODIFIERS_CFLAGS@ +XCB_DRI3_MODIFIERS_LIBS = @XCB_DRI3_MODIFIERS_LIBS@ +XCB_RANDR_CFLAGS = @XCB_RANDR_CFLAGS@ +XCB_RANDR_LIBS = @XCB_RANDR_LIBS@ XLIBGL_CFLAGS = @XLIBGL_CFLAGS@ XLIBGL_LIBS = @XLIBGL_LIBS@ +XLIB_RANDR_CFLAGS = @XLIB_RANDR_CFLAGS@ +XLIB_RANDR_LIBS = @XLIB_RANDR_LIBS@ XVMC_CFLAGS = @XVMC_CFLAGS@ XVMC_LIBS = @XVMC_LIBS@ XVMC_LIB_INSTALL_DIR = @XVMC_LIB_INSTALL_DIR@ @@ -472,9 +495,13 @@ mandir = @mandir@ mkdir_p = @mkdir_p@ oldincludedir = @oldincludedir@ pdfdir = @pdfdir@ +pkgpyexecdir = @pkgpyexecdir@ 
+pkgpythondir = @pkgpythondir@ prefix = @prefix@ program_transform_name = @program_transform_name@ psdir = @psdir@ +pyexecdir = @pyexecdir@ +pythondir = @pythondir@ sbindir = @sbindir@ sharedstatedir = @sharedstatedir@ srcdir = @srcdir@ @@ -601,7 +628,8 @@ GALLIUM_TARGET_CFLAGS = \ $(VISIBILITY_CFLAGS) GALLIUM_COMMON_LIB_DEPS = -lm $(LIBUNWIND_LIBS) $(LIBSENSORS_LIBS) \ - $(CLOCK_LIB) $(PTHREAD_LIBS) $(DLOPEN_LIBS) $(am__append_1) + $(CLOCK_LIB) $(PTHREAD_LIBS) $(DLOPEN_LIBS) $(am__append_1) \ + $(am__append_2) GALLIUM_WINSYS_CFLAGS = \ -I$(top_srcdir)/src \ -I$(top_srcdir)/include \ @@ -613,26 +641,27 @@ GALLIUM_WINSYS_CFLAGS = \ GALLIUM_PIPE_LOADER_WINSYS_LIBS = \ $(top_builddir)/src/gallium/winsys/sw/null/libws_null.la \ $(top_builddir)/src/gallium/winsys/sw/wrapper/libwsw.la \ - $(am__append_2) $(am__append_3) + $(am__append_3) $(am__append_4) @USE_VC4_SIMULATOR_TRUE@SIM_LDFLAGS = -lsimpenrose AM_CFLAGS = \ -I$(top_builddir)/src/compiler/nir \ -I$(top_srcdir)/include/drm-uapi \ -I$(top_builddir)/src \ - -I$(top_srcdir)/src/broadcom/cle \ + -I$(top_srcdir)/src/broadcom \ + -I$(top_builddir)/src/broadcom \ $(LIBDRM_CFLAGS) \ $(GALLIUM_DRIVER_CFLAGS) \ $(SIM_CFLAGS) \ $(VALGRIND_CFLAGS) \ $() -noinst_LTLIBRARIES = libvc4.la $(am__append_4) +noinst_LTLIBRARIES = libvc4.la $(am__append_5) libvc4_la_SOURCES = $(C_SOURCES) -libvc4_la_LIBADD = $(SIM_LIB) $() $(am__append_5) +libvc4_la_LIBADD = $(SIM_LIB) $() $(am__append_6) @HAVE_ARM_ASM_TRUE@libvc4_neon_la_SOURCES = $(NEON_C_SOURCES) @HAVE_ARM_ASM_TRUE@libvc4_neon_la_CFLAGS = $(AM_CFLAGS) -mfpu=neon libvc4_la_LDFLAGS = $(SIM_LDFLAGS) -EXTRA_DIST = kernel/README +EXTRA_DIST = kernel/README meson.build all: all-am .SUFFIXES: diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_blit.c b/lib/mesa/src/gallium/drivers/vc4/vc4_blit.c index 7f4c76968..d3cc5152a 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_blit.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_blit.c @@ -24,6 +24,7 @@ #include "util/u_format.h" #include "util/u_surface.h" #include "util/u_blitter.h" +#include "compiler/nir/nir_builder.h" #include "vc4_context.h" static struct pipe_surface * @@ -183,6 +184,231 @@ vc4_blitter_save(struct vc4_context *vc4) vc4->fragtex.num_textures, vc4->fragtex.textures); } +static void *vc4_get_yuv_vs(struct pipe_context *pctx) +{ + struct vc4_context *vc4 = vc4_context(pctx); + struct pipe_screen *pscreen = pctx->screen; + + if (vc4->yuv_linear_blit_vs) + return vc4->yuv_linear_blit_vs; + + const struct nir_shader_compiler_options *options = + pscreen->get_compiler_options(pscreen, + PIPE_SHADER_IR_NIR, + PIPE_SHADER_VERTEX); + + nir_builder b; + nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_VERTEX, options); + b.shader->info.name = ralloc_strdup(b.shader, "linear_blit_vs"); + + const struct glsl_type *vec4 = glsl_vec4_type(); + nir_variable *pos_in = nir_variable_create(b.shader, nir_var_shader_in, + vec4, "pos"); + + nir_variable *pos_out = nir_variable_create(b.shader, nir_var_shader_out, + vec4, "gl_Position"); + pos_out->data.location = VARYING_SLOT_POS; + + nir_store_var(&b, pos_out, nir_load_var(&b, pos_in), 0xf); + + struct pipe_shader_state shader_tmpl = { + .type = PIPE_SHADER_IR_NIR, + .ir.nir = b.shader, + }; + + vc4->yuv_linear_blit_vs = pctx->create_vs_state(pctx, &shader_tmpl); + + return vc4->yuv_linear_blit_vs; +} + +static void *vc4_get_yuv_fs(struct pipe_context *pctx, int cpp) +{ + struct vc4_context *vc4 = vc4_context(pctx); + struct pipe_screen *pscreen = pctx->screen; + struct pipe_shader_state **cached_shader; + const char 
*name; + + if (cpp == 1) { + cached_shader = &vc4->yuv_linear_blit_fs_8bit; + name = "linear_blit_8bit_fs"; + } else { + cached_shader = &vc4->yuv_linear_blit_fs_16bit; + name = "linear_blit_16bit_fs"; + } + + if (*cached_shader) + return *cached_shader; + + const struct nir_shader_compiler_options *options = + pscreen->get_compiler_options(pscreen, + PIPE_SHADER_IR_NIR, + PIPE_SHADER_FRAGMENT); + + nir_builder b; + nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, options); + b.shader->info.name = ralloc_strdup(b.shader, name); + + const struct glsl_type *vec4 = glsl_vec4_type(); + const struct glsl_type *glsl_int = glsl_int_type(); + + nir_variable *color_out = nir_variable_create(b.shader, nir_var_shader_out, + vec4, "f_color"); + color_out->data.location = FRAG_RESULT_COLOR; + + nir_variable *pos_in = nir_variable_create(b.shader, nir_var_shader_in, + vec4, "pos"); + pos_in->data.location = VARYING_SLOT_POS; + nir_ssa_def *pos = nir_load_var(&b, pos_in); + + nir_ssa_def *one = nir_imm_int(&b, 1); + nir_ssa_def *two = nir_imm_int(&b, 2); + + nir_ssa_def *x = nir_f2i32(&b, nir_channel(&b, pos, 0)); + nir_ssa_def *y = nir_f2i32(&b, nir_channel(&b, pos, 1)); + + nir_variable *stride_in = nir_variable_create(b.shader, nir_var_uniform, + glsl_int, "stride"); + nir_ssa_def *stride = nir_load_var(&b, stride_in); + + nir_ssa_def *x_offset; + nir_ssa_def *y_offset; + if (cpp == 1) { + nir_ssa_def *intra_utile_x_offset = + nir_ishl(&b, nir_iand(&b, x, one), two); + nir_ssa_def *inter_utile_x_offset = + nir_ishl(&b, nir_iand(&b, x, nir_imm_int(&b, ~3)), one); + + x_offset = nir_iadd(&b, + intra_utile_x_offset, + inter_utile_x_offset); + y_offset = nir_imul(&b, + nir_iadd(&b, + nir_ishl(&b, y, one), + nir_ushr(&b, nir_iand(&b, x, two), one)), + stride); + } else { + x_offset = nir_ishl(&b, x, two); + y_offset = nir_imul(&b, y, stride); + } + + nir_intrinsic_instr *load = + nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ubo); + load->num_components = 1; + nir_ssa_dest_init(&load->instr, &load->dest, load->num_components, 32, NULL); + load->src[0] = nir_src_for_ssa(one); + load->src[1] = nir_src_for_ssa(nir_iadd(&b, x_offset, y_offset)); + nir_builder_instr_insert(&b, &load->instr); + + nir_store_var(&b, color_out, + nir_unpack_unorm_4x8(&b, &load->dest.ssa), + 0xf); + + struct pipe_shader_state shader_tmpl = { + .type = PIPE_SHADER_IR_NIR, + .ir.nir = b.shader, + }; + + *cached_shader = pctx->create_fs_state(pctx, &shader_tmpl); + + return *cached_shader; +} + +static bool +vc4_yuv_blit(struct pipe_context *pctx, const struct pipe_blit_info *info) +{ + struct vc4_context *vc4 = vc4_context(pctx); + struct vc4_resource *src = vc4_resource(info->src.resource); + struct vc4_resource *dst = vc4_resource(info->dst.resource); + bool ok; + + if (src->tiled) + return false; + if (src->base.format != PIPE_FORMAT_R8_UNORM && + src->base.format != PIPE_FORMAT_R8G8_UNORM) + return false; + + /* YUV blits always turn raster-order to tiled */ + assert(dst->base.format == src->base.format); + assert(dst->tiled); + + /* Always 1:1 and at the origin */ + assert(info->src.box.x == 0 && info->dst.box.x == 0); + assert(info->src.box.y == 0 && info->dst.box.y == 0); + assert(info->src.box.width == info->dst.box.width); + assert(info->src.box.height == info->dst.box.height); + + if ((src->slices[info->src.level].offset & 3) || + (src->slices[info->src.level].stride & 3)) { + perf_debug("YUV-blit src texture offset/stride misaligned: 0x%08x/%d\n", + src->slices[info->src.level].offset, + 
src->slices[info->src.level].stride); + goto fallback; + } + + vc4_blitter_save(vc4); + + /* Create a renderable surface mapping the T-tiled shadow buffer. + */ + struct pipe_surface dst_tmpl; + util_blitter_default_dst_texture(&dst_tmpl, info->dst.resource, + info->dst.level, info->dst.box.z); + dst_tmpl.format = PIPE_FORMAT_RGBA8888_UNORM; + struct pipe_surface *dst_surf = + pctx->create_surface(pctx, info->dst.resource, &dst_tmpl); + if (!dst_surf) { + fprintf(stderr, "Failed to create YUV dst surface\n"); + util_blitter_unset_running_flag(vc4->blitter); + return false; + } + dst_surf->width /= 2; + if (dst->cpp == 1) + dst_surf->height /= 2; + + /* Set the constant buffer. */ + uint32_t stride = src->slices[info->src.level].stride; + struct pipe_constant_buffer cb_uniforms = { + .user_buffer = &stride, + .buffer_size = sizeof(stride), + }; + pctx->set_constant_buffer(pctx, PIPE_SHADER_FRAGMENT, 0, &cb_uniforms); + struct pipe_constant_buffer cb_src = { + .buffer = info->src.resource, + .buffer_offset = src->slices[info->src.level].offset, + .buffer_size = (src->bo->size - + src->slices[info->src.level].offset), + }; + pctx->set_constant_buffer(pctx, PIPE_SHADER_FRAGMENT, 1, &cb_src); + + /* Unbind the textures, to make sure we don't try to recurse into the + * shadow blit. + */ + pctx->set_sampler_views(pctx, PIPE_SHADER_FRAGMENT, 0, 0, NULL); + pctx->bind_sampler_states(pctx, PIPE_SHADER_FRAGMENT, 0, 0, NULL); + + util_blitter_custom_shader(vc4->blitter, dst_surf, + vc4_get_yuv_vs(pctx), + vc4_get_yuv_fs(pctx, src->cpp)); + + util_blitter_restore_textures(vc4->blitter); + util_blitter_restore_constant_buffer_state(vc4->blitter); + /* Restore cb1 (util_blitter doesn't handle this one). */ + struct pipe_constant_buffer cb_disabled = { 0 }; + pctx->set_constant_buffer(pctx, PIPE_SHADER_FRAGMENT, 1, &cb_disabled); + + pipe_surface_reference(&dst_surf, NULL); + + return true; + +fallback: + /* Do an immediate SW fallback, since the render blit path + * would just recurse. + */ + ok = util_try_blit_via_copy_region(pctx, info); + assert(ok); (void)ok; + + return true; +} + static bool vc4_render_blit(struct pipe_context *ctx, struct pipe_blit_info *info) { @@ -218,6 +444,9 @@ vc4_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info) { struct pipe_blit_info info = *blit_info; + if (vc4_yuv_blit(pctx, blit_info)) + return; + if (vc4_tile_blit(pctx, blit_info)) return; diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c b/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c index d06d55f86..54f9d9c26 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c @@ -30,6 +30,7 @@ #include "util/u_hash_table.h" #include "util/u_memory.h" +#include "util/u_string.h" #include "util/ralloc.h" #include "vc4_context.h" @@ -49,6 +50,13 @@ static void vc4_bo_cache_free_all(struct vc4_bo_cache *cache); void +vc4_bo_debug_describe(char* buf, const struct vc4_bo *ptr) +{ + util_sprintf(buf, "vc4_bo<%s,%u,%u>", ptr->name ? ptr->name : "?", + ptr->handle, ptr->size); +} + +void vc4_bo_label(struct vc4_screen *screen, struct vc4_bo *bo, const char *fmt, ...) 
{ /* Perform BO labeling by default on debug builds (so that you get @@ -113,35 +121,105 @@ vc4_bo_remove_from_cache(struct vc4_bo_cache *cache, struct vc4_bo *bo) cache->bo_size -= bo->size; } +static void vc4_bo_purgeable(struct vc4_bo *bo) +{ + struct drm_vc4_gem_madvise arg = { + .handle = bo->handle, + .madv = VC4_MADV_DONTNEED, + }; + + if (bo->screen->has_madvise) + vc4_ioctl(bo->screen->fd, DRM_IOCTL_VC4_GEM_MADVISE, &arg); +} + +static bool vc4_bo_unpurgeable(struct vc4_bo *bo) +{ + struct drm_vc4_gem_madvise arg = { + .handle = bo->handle, + .madv = VC4_MADV_WILLNEED, + }; + + if (!bo->screen->has_madvise) + return true; + + if (vc4_ioctl(bo->screen->fd, DRM_IOCTL_VC4_GEM_MADVISE, &arg)) + return false; + + return arg.retained; +} + +static void +vc4_bo_free(struct vc4_bo *bo) +{ + struct vc4_screen *screen = bo->screen; + + if (bo->map) { + if (using_vc4_simulator && bo->name && + strcmp(bo->name, "winsys") == 0) { + free(bo->map); + } else { + munmap(bo->map, bo->size); + VG(VALGRIND_FREELIKE_BLOCK(bo->map, 0)); + } + } + + struct drm_gem_close c; + memset(&c, 0, sizeof(c)); + c.handle = bo->handle; + int ret = vc4_ioctl(screen->fd, DRM_IOCTL_GEM_CLOSE, &c); + if (ret != 0) + fprintf(stderr, "close object %d: %s\n", bo->handle, strerror(errno)); + + screen->bo_count--; + screen->bo_size -= bo->size; + + if (dump_stats) { + fprintf(stderr, "Freed %s%s%dkb:\n", + bo->name ? bo->name : "", + bo->name ? " " : "", + bo->size / 1024); + vc4_bo_dump_stats(screen); + } + + free(bo); +} + static struct vc4_bo * vc4_bo_from_cache(struct vc4_screen *screen, uint32_t size, const char *name) { struct vc4_bo_cache *cache = &screen->bo_cache; uint32_t page_index = size / 4096 - 1; + struct vc4_bo *iter, *tmp, *bo = NULL; if (cache->size_list_size <= page_index) return NULL; - struct vc4_bo *bo = NULL; mtx_lock(&cache->lock); - if (!list_empty(&cache->size_list[page_index])) { - bo = LIST_ENTRY(struct vc4_bo, cache->size_list[page_index].next, - size_list); - - /* Check that the BO has gone idle. If not, then we want to - * allocate something new instead, since we assume that the - * user will proceed to CPU map it and fill it with stuff. + LIST_FOR_EACH_ENTRY_SAFE(iter, tmp, &cache->size_list[page_index], + size_list) { + /* Check that the BO has gone idle. If not, then none of the + * other BOs (pushed to the list after later rendering) are + * likely to be idle, either. */ - if (!vc4_bo_wait(bo, 0, NULL)) { - mtx_unlock(&cache->lock); - return NULL; - } + if (!vc4_bo_wait(iter, 0, NULL)) + break; + + if (!vc4_bo_unpurgeable(iter)) { + /* The BO has been purged. Free it and try to find + * another one in the cache. 
+ */ + vc4_bo_remove_from_cache(cache, iter); + vc4_bo_free(iter); + continue; + } + bo = iter; pipe_reference_init(&bo->reference, 1); vc4_bo_remove_from_cache(cache, bo); vc4_bo_label(screen, bo, "%s", name); bo->name = name; + break; } mtx_unlock(&cache->lock); return bo; @@ -221,42 +299,6 @@ vc4_bo_last_unreference(struct vc4_bo *bo) } static void -vc4_bo_free(struct vc4_bo *bo) -{ - struct vc4_screen *screen = bo->screen; - - if (bo->map) { - if (using_vc4_simulator && bo->name && - strcmp(bo->name, "winsys") == 0) { - free(bo->map); - } else { - munmap(bo->map, bo->size); - VG(VALGRIND_FREELIKE_BLOCK(bo->map, 0)); - } - } - - struct drm_gem_close c; - memset(&c, 0, sizeof(c)); - c.handle = bo->handle; - int ret = vc4_ioctl(screen->fd, DRM_IOCTL_GEM_CLOSE, &c); - if (ret != 0) - fprintf(stderr, "close object %d: %s\n", bo->handle, strerror(errno)); - - screen->bo_count--; - screen->bo_size -= bo->size; - - if (dump_stats) { - fprintf(stderr, "Freed %s%s%dkb:\n", - bo->name ? bo->name : "", - bo->name ? " " : "", - bo->size / 1024); - vc4_bo_dump_stats(screen); - } - - free(bo); -} - -static void free_stale_bos(struct vc4_screen *screen, time_t time) { struct vc4_bo_cache *cache = &screen->bo_cache; @@ -325,6 +367,7 @@ vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time) cache->size_list_size = page_index + 1; } + vc4_bo_purgeable(bo); bo->free_time = time; list_addtail(&bo->size_list, &cache->size_list[page_index]); list_addtail(&bo->time_list, &cache->time_list); @@ -354,7 +397,7 @@ vc4_bo_open_handle(struct vc4_screen *screen, bo = util_hash_table_get(screen->bo_handles, (void*)(uintptr_t)handle); if (bo) { - pipe_reference(NULL, &bo->reference); + vc4_bo_reference(bo); goto done; } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h b/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h index 4e7b23e08..9fa477442 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h @@ -39,6 +39,14 @@ struct vc4_bo { uint32_t handle; uint32_t size; + /* This will be read/written by multiple threads without a lock -- you + * should take a snapshot and use it to see if you happen to be in the + * CL's handles at this position, to make most lookups O(1). It's + * volatile to make sure that the compiler doesn't emit multiple loads + * from the address, which would make the lookup racy. + */ + volatile uint32_t last_hindex; + /** Entry in the linked list of buffers freed, by age. */ struct list_head time_list; /** Entry in the per-page-count linked list of buffers freed (by age). 
*/ @@ -65,18 +73,13 @@ struct vc4_bo *vc4_bo_open_dmabuf(struct vc4_screen *screen, int fd, bool vc4_bo_flink(struct vc4_bo *bo, uint32_t *name); int vc4_bo_get_dmabuf(struct vc4_bo *bo); -static inline void -vc4_bo_set_reference(struct vc4_bo **old_bo, struct vc4_bo *new_bo) -{ - if (pipe_reference(&(*old_bo)->reference, &new_bo->reference)) - vc4_bo_last_unreference(*old_bo); - *old_bo = new_bo; -} - +void vc4_bo_debug_describe(char* buf, const struct vc4_bo *ptr); static inline struct vc4_bo * vc4_bo_reference(struct vc4_bo *bo) { - pipe_reference(NULL, &bo->reference); + pipe_reference_described(NULL, &bo->reference, + (debug_reference_descriptor) + vc4_bo_debug_describe); return bo; } @@ -89,13 +92,18 @@ vc4_bo_unreference(struct vc4_bo **bo) if ((*bo)->private) { /* Avoid the mutex for private BOs */ - if (pipe_reference(&(*bo)->reference, NULL)) + if (pipe_reference_described(&(*bo)->reference, NULL, + (debug_reference_descriptor) + vc4_bo_debug_describe)) { vc4_bo_last_unreference(*bo); + } } else { screen = (*bo)->screen; mtx_lock(&screen->bo_handles_mutex); - if (pipe_reference(&(*bo)->reference, NULL)) { + if (pipe_reference_described(&(*bo)->reference, NULL, + (debug_reference_descriptor) + vc4_bo_debug_describe)) { util_hash_table_remove(screen->bo_handles, (void *)(uintptr_t)(*bo)->handle); vc4_bo_last_unreference(*bo); @@ -113,8 +121,11 @@ vc4_bo_unreference_locked_timed(struct vc4_bo **bo, time_t time) if (!*bo) return; - if (pipe_reference(&(*bo)->reference, NULL)) + if (pipe_reference_described(&(*bo)->reference, NULL, + (debug_reference_descriptor) + vc4_bo_debug_describe)) { vc4_bo_last_unreference_locked_timed(*bo, time); + } *bo = NULL; } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_cl.c b/lib/mesa/src/gallium/drivers/vc4/vc4_cl.c index 508281a27..7ae092ebc 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_cl.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_cl.c @@ -61,10 +61,19 @@ vc4_gem_hindex(struct vc4_job *job, struct vc4_bo *bo) { uint32_t hindex; uint32_t *current_handles = job->bo_handles.base; + uint32_t cl_hindex_count = cl_offset(&job->bo_handles) / 4; + uint32_t last_hindex = bo->last_hindex; /* volatile read! */ - for (hindex = 0; hindex < cl_offset(&job->bo_handles) / 4; hindex++) { - if (current_handles[hindex] == bo->handle) + if (last_hindex < cl_hindex_count && + current_handles[last_hindex] == bo->handle) { + return last_hindex; + } + + for (hindex = 0; hindex < cl_hindex_count; hindex++) { + if (current_handles[hindex] == bo->handle) { + bo->last_hindex = hindex; return hindex; + } } struct vc4_cl_out *out; @@ -79,5 +88,6 @@ vc4_gem_hindex(struct vc4_job *job, struct vc4_bo *bo) job->bo_space += bo->size; + bo->last_hindex = hindex; return hindex; } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_cl.h b/lib/mesa/src/gallium/drivers/vc4/vc4_cl.h index 8df9dbfe6..39d1d347b 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_cl.h +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_cl.h @@ -159,21 +159,6 @@ cl_aligned_f(struct vc4_cl_out **cl, float f) cl_aligned_u32(cl, fui(f)); } -static inline void -cl_start_reloc(struct vc4_cl *cl, struct vc4_cl_out **out, uint32_t n) -{ - assert(n == 1 || n == 2); - assert(cl->reloc_count == 0); -#ifndef NDEBUG - cl->reloc_count = n; -#endif - - cl_u8(out, VC4_PACKET_GEM_HANDLES); - cl->reloc_next = *out; - cl_u32(out, 0); /* Space where hindex will be written. */ - cl_u32(out, 0); /* Space where hindex will be written. 
*/ -} - static inline struct vc4_cl_out * cl_start_shader_reloc(struct vc4_cl *cl, uint32_t n) { diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_cl_dump.c b/lib/mesa/src/gallium/drivers/vc4/vc4_cl_dump.c index ca1b9a315..a6ae0cf80 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_cl_dump.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_cl_dump.c @@ -28,6 +28,7 @@ #include "kernel/vc4_packet.h" #include "broadcom/cle/v3d_decoder.h" +#include "broadcom/clif/clif_dump.h" void vc4_dump_cl(void *cl, uint32_t size, bool is_render) @@ -41,6 +42,8 @@ vc4_dump_cl(void *cl, uint32_t size, bool is_render) }; struct v3d_spec *spec = v3d_spec_load(&devinfo); + struct clif_dump *clif = clif_dump_init(&devinfo, stderr, true); + uint32_t offset = 0, hw_offset = 0; uint8_t *p = cl; @@ -60,7 +63,7 @@ vc4_dump_cl(void *cl, uint32_t size, bool is_render) fprintf(stderr, "0x%08x 0x%08x: 0x%02x %s\n", offset, hw_offset, header, v3d_group_get_name(inst)); - v3d_print_group(stderr, inst, offset, p, ""); + v3d_print_group(clif, inst, offset, p); switch (header) { case VC4_PACKET_HALT: @@ -75,5 +78,7 @@ vc4_dump_cl(void *cl, uint32_t size, bool is_render) hw_offset += length; p += length; } + + clif_dump_destroy(clif); } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_context.c b/lib/mesa/src/gallium/drivers/vc4/vc4_context.c index a9e7ff91f..ffd7d4c85 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_context.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_context.c @@ -42,7 +42,6 @@ vc4_flush(struct pipe_context *pctx) { struct vc4_context *vc4 = vc4_context(pctx); - struct hash_entry *entry; hash_table_foreach(vc4->jobs, entry) { struct vc4_job *job = entry->data; vc4_job_submit(vc4, job); @@ -59,8 +58,17 @@ vc4_pipe_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence, if (fence) { struct pipe_screen *screen = pctx->screen; + int fd = -1; + + if (flags & PIPE_FLUSH_FENCE_FD) { + /* The vc4_fence takes ownership of the returned fd. */ + drmSyncobjExportSyncFile(vc4->fd, vc4->job_syncobj, + &fd); + } + struct vc4_fence *f = vc4_fence_create(vc4->screen, - vc4->last_emit_seqno); + vc4->last_emit_seqno, + fd); screen->fence_reference(screen, fence, NULL); *fence = (struct pipe_fence_handle *)f; } @@ -115,8 +123,22 @@ vc4_context_destroy(struct pipe_context *pctx) pipe_surface_reference(&vc4->framebuffer.cbufs[0], NULL); pipe_surface_reference(&vc4->framebuffer.zsbuf, NULL); + if (vc4->yuv_linear_blit_vs) + pctx->delete_vs_state(pctx, vc4->yuv_linear_blit_vs); + if (vc4->yuv_linear_blit_fs_8bit) + pctx->delete_fs_state(pctx, vc4->yuv_linear_blit_fs_8bit); + if (vc4->yuv_linear_blit_fs_16bit) + pctx->delete_fs_state(pctx, vc4->yuv_linear_blit_fs_16bit); + vc4_program_fini(pctx); + if (vc4->screen->has_syncobj) { + drmSyncobjDestroy(vc4->fd, vc4->job_syncobj); + drmSyncobjDestroy(vc4->fd, vc4->in_syncobj); + } + if (vc4->in_fence_fd >= 0) + close(vc4->in_fence_fd); + ralloc_free(vc4); } @@ -125,6 +147,7 @@ vc4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) { struct vc4_screen *screen = vc4_screen(pscreen); struct vc4_context *vc4; + int err; /* Prevent dumping of the shaders built during context setup. 
*/ uint32_t saved_shaderdb_flag = vc4_debug & VC4_DEBUG_SHADERDB; @@ -150,10 +173,16 @@ vc4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) vc4_query_init(pctx); vc4_resource_context_init(pctx); - vc4_job_init(vc4); - vc4->fd = screen->fd; + err = vc4_job_init(vc4); + if (err) + goto fail; + + err = vc4_fence_context_init(vc4); + if (err) + goto fail; + slab_create_child(&vc4->transfer_pool, &screen->transfer_pool); vc4->uploader = u_upload_create_default(&vc4->base); diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_context.h b/lib/mesa/src/gallium/drivers/vc4/vc4_context.h index 4a1e4093f..ce8bcffac 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_context.h +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_context.h @@ -78,6 +78,7 @@ #define VC4_DIRTY_COMPILED_VS (1 << 24) #define VC4_DIRTY_COMPILED_FS (1 << 25) #define VC4_DIRTY_FS_INPUTS (1 << 26) +#define VC4_DIRTY_UBO_1_SIZE (1 << 27) struct vc4_sampler_view { struct pipe_sampler_view base; @@ -219,6 +220,13 @@ struct vc4_job_key { struct pipe_surface *zsbuf; }; +struct vc4_hwperfmon { + uint32_t id; + uint64_t last_seqno; + uint8_t events[DRM_VC4_MAX_PERF_COUNTERS]; + uint64_t counters[DRM_VC4_MAX_PERF_COUNTERS]; +}; + /** * A complete bin/render job. * @@ -243,6 +251,9 @@ struct vc4_job { */ uint32_t bo_space; + /* Last BO hindex referenced from VC4_PACKET_GEM_HANDLES. */ + uint32_t last_gem_handle_hindex; + /** @{ Surfaces to submit rendering for. */ struct pipe_surface *color_read; struct pipe_surface *color_write; @@ -306,6 +317,9 @@ struct vc4_job { /** Any flags to be passed in drm_vc4_submit_cl.flags. */ uint32_t flags; + /* Performance monitor attached to this job. */ + struct vc4_hwperfmon *perfmon; + struct vc4_job_key key; }; @@ -363,6 +377,10 @@ struct vc4_context { struct u_upload_mgr *uploader; + struct pipe_shader_state *yuv_linear_blit_vs; + struct pipe_shader_state *yuv_linear_blit_fs_8bit; + struct pipe_shader_state *yuv_linear_blit_fs_16bit; + /** @{ Current pipeline state objects */ struct pipe_scissor_state scissor; struct pipe_blend_state *blend; @@ -387,7 +405,16 @@ struct vc4_context { struct pipe_viewport_state viewport; struct vc4_constbuf_stateobj constbuf[PIPE_SHADER_TYPES]; struct vc4_vertexbuf_stateobj vertexbuf; + + struct vc4_hwperfmon *perfmon; /** @} */ + + /** Handle of syncobj containing the last submitted job fence. */ + uint32_t job_syncobj; + + int in_fence_fd; + /** Handle of the syncobj that holds in_fence_fd for submission. 
*/ + uint32_t in_syncobj; }; struct vc4_rasterizer_state { @@ -444,6 +471,12 @@ vc4_sampler_state(struct pipe_sampler_state *psampler) return (struct vc4_sampler_state *)psampler; } +int vc4_get_driver_query_group_info(struct pipe_screen *pscreen, + unsigned index, + struct pipe_driver_query_group_info *info); +int vc4_get_driver_query_info(struct pipe_screen *pscreen, unsigned index, + struct pipe_driver_query_info *info); + struct pipe_context *vc4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags); void vc4_draw_init(struct pipe_context *pctx); @@ -476,7 +509,8 @@ void vc4_write_uniforms(struct vc4_context *vc4, struct vc4_texture_stateobj *texstate); void vc4_flush(struct pipe_context *pctx); -void vc4_job_init(struct vc4_context *vc4); +int vc4_job_init(struct vc4_context *vc4); +int vc4_fence_context_init(struct vc4_context *vc4); struct vc4_job *vc4_get_job(struct vc4_context *vc4, struct pipe_surface *cbuf, struct pipe_surface *zsbuf); diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_draw.c b/lib/mesa/src/gallium/drivers/vc4/vc4_draw.c index 556855420..06785516c 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_draw.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_draw.c @@ -40,7 +40,7 @@ vc4_get_draw_cl_space(struct vc4_job *job, int vert_count) /* The SW-5891 workaround may cause us to emit multiple shader recs * and draw packets. */ - int num_draws = DIV_ROUND_UP(vert_count, 65535) + 1; + int num_draws = DIV_ROUND_UP(vert_count, 65535 - 2) + 1; /* Binner gets our packet state -- vc4_emit.c contents, * and the primitive itself. @@ -222,6 +222,8 @@ vc4_emit_gl_shader_state(struct vc4_context *vc4, attr.coordinate_shader_vpm_offset = 0; attr.vertex_shader_vpm_offset = 0; } + + vc4_bo_unreference(&bo); } cl_emit(&job->bcl, GL_SHADER_STATE, shader_state) { @@ -286,6 +288,7 @@ static void vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) { struct vc4_context *vc4 = vc4_context(pctx); + struct pipe_draw_info local_info; if (!info->count_from_stream_output && !info->indirect && !info->primitive_restart && @@ -293,11 +296,19 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) return; if (info->mode >= PIPE_PRIM_QUADS) { - util_primconvert_save_rasterizer_state(vc4->primconvert, &vc4->rasterizer->base); - util_primconvert_draw_vbo(vc4->primconvert, info); - perf_debug("Fallback conversion for %d %s vertices\n", - info->count, u_prim_name(info->mode)); - return; + if (info->mode == PIPE_PRIM_QUADS && + info->count == 4 && + !vc4->rasterizer->base.flatshade) { + local_info = *info; + local_info.mode = PIPE_PRIM_TRIANGLE_FAN; + info = &local_info; + } else { + util_primconvert_save_rasterizer_state(vc4->primconvert, &vc4->rasterizer->base); + util_primconvert_draw_vbo(vc4->primconvert, info); + perf_debug("Fallback conversion for %d %s vertices\n", + info->count, u_prim_name(info->mode)); + return; + } } /* Before setting up the draw, do any fixup blits necessary. */ @@ -377,7 +388,25 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) struct vc4_resource *rsc = vc4_resource(prsc); struct vc4_cl_out *bcl = cl_start(&job->bcl); - cl_start_reloc(&job->bcl, &bcl, 1); + + /* The original design for the VC4 kernel UABI had multiple + * packets that used relocations in the BCL (some of which + * needed two BOs), but later modifications eliminated all but + * this one usage. 
We have an arbitrary 32-bit offset value, + * and need to also supply an arbitrary 32-bit index buffer + * GEM handle, so we have this fake packet we emit in our BCL + * to be validated, which the kernel uses at validation time + * to perform the relocation in the IB packet (without + * emitting to the actual HW). + */ + uint32_t hindex = vc4_gem_hindex(job, rsc->bo); + if (job->last_gem_handle_hindex != hindex) { + cl_u8(&bcl, VC4_PACKET_GEM_HANDLES); + cl_u32(&bcl, hindex); + cl_u32(&bcl, 0); + job->last_gem_handle_hindex = hindex; + } + cl_u8(&bcl, VC4_PACKET_GL_INDEXED_PRIMITIVE); cl_u8(&bcl, info->mode | @@ -385,8 +414,9 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) VC4_INDEX_BUFFER_U16: VC4_INDEX_BUFFER_U8)); cl_u32(&bcl, info->count); - cl_reloc(job, &job->bcl, &bcl, rsc->bo, offset); + cl_u32(&bcl, offset); cl_u32(&bcl, vc4->max_index); + cl_end(&job->bcl, bcl); job->draw_calls_queued++; diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_job.c b/lib/mesa/src/gallium/drivers/vc4/vc4_job.c index 7fe20c16b..f38c46475 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_job.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_job.c @@ -90,6 +90,11 @@ vc4_job_create(struct vc4_context *vc4) job->draw_max_x = 0; job->draw_max_y = 0; + job->last_gem_handle_hindex = ~0; + + if (vc4->perfmon) + job->perfmon = vc4->perfmon; + return job; } @@ -113,7 +118,6 @@ vc4_flush_jobs_reading_resource(struct vc4_context *vc4, vc4_flush_jobs_writing_resource(vc4, prsc); - struct hash_entry *entry; hash_table_foreach(vc4->jobs, entry) { struct vc4_job *job = entry->data; @@ -453,6 +457,8 @@ vc4_job_submit(struct vc4_context *vc4, struct vc4_job *job) submit.shader_rec_count = job->shader_rec_count; submit.uniforms = (uintptr_t)job->uniforms.base; submit.uniforms_size = cl_offset(&job->uniforms); + if (job->perfmon) + submit.perfmonid = job->perfmon->id; assert(job->draw_min_x != ~0 && job->draw_min_y != ~0); submit.min_x_tile = job->draw_min_x / job->tile_width; @@ -470,6 +476,19 @@ vc4_job_submit(struct vc4_context *vc4, struct vc4_job *job) } submit.flags |= job->flags; + if (vc4->screen->has_syncobj) { + submit.out_sync = vc4->job_syncobj; + + if (vc4->in_fence_fd >= 0) { + /* This replaces the fence in the syncobj. */ + drmSyncobjImportSyncFile(vc4->fd, vc4->in_syncobj, + vc4->in_fence_fd); + submit.in_sync = vc4->in_syncobj; + close(vc4->in_fence_fd); + vc4->in_fence_fd = -1; + } + } + if (!(vc4_debug & VC4_DEBUG_NORAST)) { int ret; @@ -485,6 +504,8 @@ vc4_job_submit(struct vc4_context *vc4, struct vc4_job *job) warned = true; } else if (!ret) { vc4->last_emit_seqno = submit.seqno; + if (job->perfmon) + job->perfmon->last_seqno = submit.seqno; } } @@ -521,7 +542,7 @@ vc4_job_hash(const void *key) return _mesa_hash_data(key, sizeof(struct vc4_job_key)); } -void +int vc4_job_init(struct vc4_context *vc4) { vc4->jobs = _mesa_hash_table_create(vc4, @@ -530,5 +551,24 @@ vc4_job_init(struct vc4_context *vc4) vc4->write_jobs = _mesa_hash_table_create(vc4, _mesa_hash_pointer, _mesa_key_pointer_equal); + + if (vc4->screen->has_syncobj) { + /* Create the syncobj as signaled since with no job executed + * there is nothing to wait on. + */ + int ret = drmSyncobjCreate(vc4->fd, + DRM_SYNCOBJ_CREATE_SIGNALED, + &vc4->job_syncobj); + if (ret) { + /* If the screen indicated syncobj support, we should + * be able to create a signaled syncobj. + * At this point it is too late to pretend the screen + * has no syncobj support. 
+ */ + return ret; + } + } + + return 0; } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_program.c b/lib/mesa/src/gallium/drivers/vc4/vc4_program.c index 98cdfdf33..bc9bd76ae 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_program.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_program.c @@ -38,6 +38,7 @@ #include "vc4_context.h" #include "vc4_qpu.h" #include "vc4_qir.h" +#include "mesa/state_tracker/st_glsl_types.h" static struct qreg ntq_get_src(struct vc4_compile *c, nir_src src, int i); @@ -50,6 +51,12 @@ type_size(const struct glsl_type *type) return glsl_count_attribute_slots(type, false); } +static int +uniforms_type_size(const struct glsl_type *type) +{ + return st_glsl_storage_type_size(type, false); +} + static void resize_qreg_array(struct vc4_compile *c, struct qreg **regs, @@ -137,6 +144,32 @@ indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr) return qir_TEX_RESULT(c); } +static struct qreg +vc4_ubo_load(struct vc4_compile *c, nir_intrinsic_instr *intr) +{ + nir_const_value *buffer_index = + nir_src_as_const_value(intr->src[0]); + assert(buffer_index->u32[0] == 1); + assert(c->stage == QSTAGE_FRAG); + + struct qreg offset = ntq_get_src(c, intr->src[1], 0); + + /* Clamp to [0, array size). Note that MIN/MAX are signed. */ + offset = qir_MAX(c, offset, qir_uniform_ui(c, 0)); + offset = qir_MIN_NOIMM(c, offset, + qir_uniform_ui(c, c->fs_key->ubo_1_size - 4)); + + qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0), + offset, + qir_uniform(c, QUNIFORM_UBO_ADDR, buffer_index->u32[0])); + + c->num_texture_samples++; + + ntq_emit_thrsw(c); + + return qir_TEX_RESULT(c); +} + nir_ssa_def * vc4_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz) { @@ -287,7 +320,7 @@ static struct qreg ntq_get_alu_src(struct vc4_compile *c, nir_alu_instr *instr, unsigned src) { - assert(util_is_power_of_two(instr->dest.write_mask)); + assert(util_is_power_of_two_or_zero(instr->dest.write_mask)); unsigned chan = ffs(instr->dest.write_mask) - 1; struct qreg r = ntq_get_src(c, instr->src[src].src, instr->src[src].swizzle[chan]); @@ -654,24 +687,44 @@ ntq_fceil(struct vc4_compile *c, struct qreg src) } static struct qreg +ntq_shrink_sincos_input_range(struct vc4_compile *c, struct qreg x) +{ + /* Since we're using a Taylor approximation, we want to have a small + * number of coefficients and take advantage of sin/cos repeating + * every 2pi. We keep our x as close to 0 as we can, since the series + * will be less accurate as |x| increases. (Also, be careful of + * shifting the input x value to be tricky with sin/cos relations, + * because getting accurate values for x==0 is very important for SDL + * rendering) + */ + struct qreg scaled_x = + qir_FMUL(c, x, + qir_uniform_f(c, 1.0f / (M_PI * 2.0f))); + /* Note: FTOI truncates toward 0. 
*/ + struct qreg x_frac = qir_FSUB(c, scaled_x, + qir_ITOF(c, qir_FTOI(c, scaled_x))); + /* Map [0.5, 1] to [-0.5, 0] */ + qir_SF(c, qir_FSUB(c, x_frac, qir_uniform_f(c, 0.5))); + qir_FSUB_dest(c, x_frac, x_frac, qir_uniform_f(c, 1.0))->cond = QPU_COND_NC; + /* Map [-1, -0.5] to [0, 0.5] */ + qir_SF(c, qir_FADD(c, x_frac, qir_uniform_f(c, 0.5))); + qir_FADD_dest(c, x_frac, x_frac, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS; + + return x_frac; +} + +static struct qreg ntq_fsin(struct vc4_compile *c, struct qreg src) { float coeff[] = { - -2.0 * M_PI, - pow(2.0 * M_PI, 3) / (3 * 2 * 1), - -pow(2.0 * M_PI, 5) / (5 * 4 * 3 * 2 * 1), - pow(2.0 * M_PI, 7) / (7 * 6 * 5 * 4 * 3 * 2 * 1), - -pow(2.0 * M_PI, 9) / (9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1), + 2.0 * M_PI, + -pow(2.0 * M_PI, 3) / (3 * 2 * 1), + pow(2.0 * M_PI, 5) / (5 * 4 * 3 * 2 * 1), + -pow(2.0 * M_PI, 7) / (7 * 6 * 5 * 4 * 3 * 2 * 1), + pow(2.0 * M_PI, 9) / (9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1), }; - struct qreg scaled_x = - qir_FMUL(c, - src, - qir_uniform_f(c, 1.0 / (M_PI * 2.0))); - - struct qreg x = qir_FADD(c, - ntq_ffract(c, scaled_x), - qir_uniform_f(c, -0.5)); + struct qreg x = ntq_shrink_sincos_input_range(c, src); struct qreg x2 = qir_FMUL(c, x, x); struct qreg sum = qir_FMUL(c, x, qir_uniform_f(c, coeff[0])); for (int i = 1; i < ARRAY_SIZE(coeff); i++) { @@ -689,21 +742,15 @@ static struct qreg ntq_fcos(struct vc4_compile *c, struct qreg src) { float coeff[] = { - -1.0f, - pow(2.0 * M_PI, 2) / (2 * 1), - -pow(2.0 * M_PI, 4) / (4 * 3 * 2 * 1), - pow(2.0 * M_PI, 6) / (6 * 5 * 4 * 3 * 2 * 1), - -pow(2.0 * M_PI, 8) / (8 * 7 * 6 * 5 * 4 * 3 * 2 * 1), - pow(2.0 * M_PI, 10) / (10 * 9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1), + 1.0f, + -pow(2.0 * M_PI, 2) / (2 * 1), + pow(2.0 * M_PI, 4) / (4 * 3 * 2 * 1), + -pow(2.0 * M_PI, 6) / (6 * 5 * 4 * 3 * 2 * 1), + pow(2.0 * M_PI, 8) / (8 * 7 * 6 * 5 * 4 * 3 * 2 * 1), + -pow(2.0 * M_PI, 10) / (10 * 9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1), }; - struct qreg scaled_x = - qir_FMUL(c, src, - qir_uniform_f(c, 1.0f / (M_PI * 2.0f))); - struct qreg x_frac = qir_FADD(c, - ntq_ffract(c, scaled_x), - qir_uniform_f(c, -0.5)); - + struct qreg x_frac = ntq_shrink_sincos_input_range(c, src); struct qreg sum = qir_uniform_f(c, coeff[0]); struct qreg x2 = qir_FMUL(c, x_frac, x_frac); struct qreg x = x2; /* Current x^2, x^4, or x^6 */ @@ -711,13 +758,10 @@ ntq_fcos(struct vc4_compile *c, struct qreg src) if (i != 1) x = qir_FMUL(c, x, x2); - struct qreg mul = qir_FMUL(c, + sum = qir_FADD(c, qir_FMUL(c, x, - qir_uniform_f(c, coeff[i])); - if (i == 0) - sum = mul; - else - sum = qir_FADD(c, sum, mul); + qir_uniform_f(c, coeff[i])), + sum); } return sum; } @@ -1337,7 +1381,7 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) /* We have a scalar result, so the instruction should only have a * single channel written to. 
*/ - assert(util_is_power_of_two(instr->dest.write_mask)); + assert(util_is_power_of_two_or_zero(instr->dest.write_mask)); ntq_store_dest(c, &instr->dest.dest, ffs(instr->dest.write_mask) - 1, result); } @@ -1659,7 +1703,7 @@ static void ntq_setup_uniforms(struct vc4_compile *c) { nir_foreach_variable(var, &c->s->uniforms) { - uint32_t vec4_count = type_size(var->type); + uint32_t vec4_count = uniforms_type_size(var->type); unsigned vec4_size = 4 * sizeof(float); declare_uniform_range(c, var->data.driver_location * vec4_size, @@ -1775,6 +1819,11 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr) } break; + case nir_intrinsic_load_ubo: + assert(instr->num_components == 1); + ntq_store_dest(c, &instr->dest, 0, vc4_ubo_load(c, instr)); + break; + case nir_intrinsic_load_user_clip_plane: for (int i = 0; i < instr->num_components; i++) { ntq_store_dest(c, &instr->dest, i, @@ -2180,13 +2229,16 @@ nir_to_qir(struct vc4_compile *c) } static const nir_shader_compiler_options nir_options = { + .lower_all_io_to_temps = true, .lower_extract_byte = true, .lower_extract_word = true, + .lower_fdiv = true, .lower_ffma = true, .lower_flrp32 = true, .lower_fpow = true, .lower_fsat = true, .lower_fsqrt = true, + .lower_ldexp = true, .lower_negate = true, .native_integers = true, .max_unroll_iterations = 32, @@ -2435,9 +2487,10 @@ vc4_shader_state_create(struct pipe_context *pctx, */ s = cso->ir.nir; - NIR_PASS_V(s, nir_lower_io, nir_var_all, type_size, + NIR_PASS_V(s, nir_lower_io, nir_var_uniform, + uniforms_type_size, (nir_lower_io_options)0); - } else { + } else { assert(cso->type == PIPE_SHADER_IR_TGSI); if (vc4_debug & VC4_DEBUG_TGSI) { @@ -2449,6 +2502,10 @@ vc4_shader_state_create(struct pipe_context *pctx, s = tgsi_to_nir(cso->tokens, &nir_options); } + NIR_PASS_V(s, nir_lower_io, nir_var_all & ~nir_var_uniform, + type_size, + (nir_lower_io_options)0); + NIR_PASS_V(s, nir_opt_global_to_local); NIR_PASS_V(s, nir_lower_regs_to_ssa); NIR_PASS_V(s, nir_normalize_cubemap_coords); @@ -2724,7 +2781,8 @@ vc4_update_compiled_fs(struct vc4_context *vc4, uint8_t prim_mode) VC4_DIRTY_RASTERIZER | VC4_DIRTY_SAMPLE_MASK | VC4_DIRTY_FRAGTEX | - VC4_DIRTY_UNCOMPILED_FS))) { + VC4_DIRTY_UNCOMPILED_FS | + VC4_DIRTY_UBO_1_SIZE))) { return; } @@ -2768,6 +2826,7 @@ vc4_update_compiled_fs(struct vc4_context *vc4, uint8_t prim_mode) PIPE_SPRITE_COORD_UPPER_LEFT); } + key->ubo_1_size = vc4->constbuf[PIPE_SHADER_FRAGMENT].cb[1].buffer_size; key->light_twoside = vc4->rasterizer->base.light_twoside; struct vc4_compiled_shader *old_fs = vc4->prog.fs; @@ -2916,7 +2975,6 @@ vc4_shader_state_delete(struct pipe_context *pctx, void *hwcso) struct vc4_context *vc4 = vc4_context(pctx); struct vc4_uncompiled_shader *so = hwcso; - struct hash_entry *entry; hash_table_foreach(vc4->fs_cache, entry) { delete_from_cache_if_matches(vc4->fs_cache, &vc4->prog.fs, entry, so); @@ -2973,7 +3031,6 @@ vc4_program_fini(struct pipe_context *pctx) { struct vc4_context *vc4 = vc4_context(pctx); - struct hash_entry *entry; hash_table_foreach(vc4->fs_cache, entry) { struct vc4_compiled_shader *shader = entry->data; vc4_bo_unreference(&shader->bo); diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qir.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qir.c index c829e7f93..71f06aebf 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qir.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qir.c @@ -343,13 +343,57 @@ qir_channels_written(struct qinst *inst) unreachable("Bad pack field"); } +char * +qir_describe_uniform(enum quniform_contents contents, 
uint32_t data, + const uint32_t *uniforms) +{ + static const char *quniform_names[] = { + [QUNIFORM_VIEWPORT_X_SCALE] = "vp_x_scale", + [QUNIFORM_VIEWPORT_Y_SCALE] = "vp_y_scale", + [QUNIFORM_VIEWPORT_Z_OFFSET] = "vp_z_offset", + [QUNIFORM_VIEWPORT_Z_SCALE] = "vp_z_scale", + [QUNIFORM_TEXTURE_CONFIG_P0] = "tex_p0", + [QUNIFORM_TEXTURE_CONFIG_P1] = "tex_p1", + [QUNIFORM_TEXTURE_CONFIG_P2] = "tex_p2", + [QUNIFORM_TEXTURE_FIRST_LEVEL] = "tex_first_level", + }; + + switch (contents) { + case QUNIFORM_CONSTANT: + return ralloc_asprintf(NULL, "0x%08x / %f", data, uif(data)); + case QUNIFORM_UNIFORM: + if (uniforms) { + uint32_t unif = uniforms[data]; + return ralloc_asprintf(NULL, "unif[%d] = 0x%08x / %f", + data, unif, uif(unif)); + } else { + return ralloc_asprintf(NULL, "unif[%d]", data); + } + + case QUNIFORM_TEXTURE_CONFIG_P0: + case QUNIFORM_TEXTURE_CONFIG_P1: + case QUNIFORM_TEXTURE_CONFIG_P2: + case QUNIFORM_TEXTURE_FIRST_LEVEL: + return ralloc_asprintf(NULL, "%s[%d]", + quniform_names[contents], data); + + default: + if (contents < ARRAY_SIZE(quniform_names) && + quniform_names[contents]) { + return ralloc_asprintf(NULL, "%s", + quniform_names[contents]); + } else { + return ralloc_asprintf(NULL, "??? %d", contents); + } + } +} + static void qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write) { static const char *files[] = { [QFILE_TEMP] = "t", [QFILE_VARY] = "v", - [QFILE_UNIF] = "u", [QFILE_TLB_COLOR_WRITE] = "tlb_c", [QFILE_TLB_COLOR_WRITE_MS] = "tlb_c_ms", [QFILE_TLB_Z_WRITE] = "tlb_z", @@ -403,16 +447,18 @@ qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write) fprintf(stderr, "%s", files[reg.file]); break; - default: - fprintf(stderr, "%s%d", files[reg.file], reg.index); + case QFILE_UNIF: { + char *desc = qir_describe_uniform(c->uniform_contents[reg.index], + c->uniform_data[reg.index], + NULL); + fprintf(stderr, "u%d (%s)", reg.index, desc); + ralloc_free(desc); break; } - if (reg.file == QFILE_UNIF && - c->uniform_contents[reg.index] == QUNIFORM_CONSTANT) { - fprintf(stderr, " (0x%08x / %f)", - c->uniform_data[reg.index], - uif(c->uniform_data[reg.index])); + default: + fprintf(stderr, "%s%d", files[reg.file], reg.index); + break; } } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qir.h b/lib/mesa/src/gallium/drivers/vc4/vc4_qir.h index 90acaef28..1aa5f652f 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qir.h +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qir.h @@ -363,6 +363,7 @@ struct vc4_fs_key { uint8_t alpha_test_func; uint8_t logicop_func; uint32_t point_sprite_mask; + uint32_t ubo_1_size; struct pipe_rt_blend_state blend; }; @@ -591,6 +592,8 @@ uint8_t qir_channels_written(struct qinst *inst); void qir_dump(struct vc4_compile *c); void qir_dump_inst(struct vc4_compile *c, struct qinst *inst); +char *qir_describe_uniform(enum quniform_contents contents, uint32_t data, + const uint32_t *uniforms); const char *qir_get_stage_name(enum qstage stage); void qir_validate(struct vc4_compile *c); diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_live_variables.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_live_variables.c index 7108b3ee9..5629ce044 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_live_variables.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_live_variables.c @@ -173,8 +173,6 @@ qir_setup_def(struct vc4_compile *c, struct qblock *block, int ip, static void sf_state_clear(struct hash_table *partial_update_ht) { - struct hash_entry *entry; - hash_table_foreach(partial_update_ht, entry) { struct partial_update_state *state = 
entry->data; diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c index ad19f06d3..d7c22e75c 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c @@ -136,7 +136,6 @@ qir_lower_uniforms(struct vc4_compile *c) */ uint32_t max_count = 0; uint32_t max_index = 0; - struct hash_entry *entry; hash_table_foreach(ht, entry) { uint32_t count = (uintptr_t)entry->data; uint32_t index = (uintptr_t)entry->key - 1; diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_resource.c b/lib/mesa/src/gallium/drivers/vc4/vc4_resource.c index cdcbcc917..41e6ec5c1 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_resource.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_resource.c @@ -22,11 +22,13 @@ * IN THE SOFTWARE. */ +#include "pipe/p_defines.h" #include "util/u_blit.h" #include "util/u_memory.h" #include "util/u_format.h" #include "util/u_inlines.h" #include "util/u_surface.h" +#include "util/u_transfer_helper.h" #include "util/u_upload_mgr.h" #include "drm_fourcc.h" @@ -36,10 +38,6 @@ #include "vc4_resource.h" #include "vc4_tiling.h" -#ifndef DRM_FORMAT_MOD_INVALID -#define DRM_FORMAT_MOD_INVALID ((1ULL << 56) - 1) -#endif - static bool vc4_resource_bo_alloc(struct vc4_resource *rsc) { @@ -79,15 +77,8 @@ vc4_resource_transfer_unmap(struct pipe_context *pctx, struct vc4_transfer *trans = vc4_transfer(ptrans); if (trans->map) { - struct vc4_resource *rsc; - struct vc4_resource_slice *slice; - if (trans->ss_resource) { - rsc = vc4_resource(trans->ss_resource); - slice = &rsc->slices[0]; - } else { - rsc = vc4_resource(ptrans->resource); - slice = &rsc->slices[ptrans->level]; - } + struct vc4_resource *rsc = vc4_resource(ptrans->resource); + struct vc4_resource_slice *slice = &rsc->slices[ptrans->level]; if (ptrans->usage & PIPE_TRANSFER_WRITE) { vc4_store_tiled_image(rsc->bo->map + slice->offset + @@ -100,51 +91,10 @@ vc4_resource_transfer_unmap(struct pipe_context *pctx, free(trans->map); } - if (trans->ss_resource && (ptrans->usage & PIPE_TRANSFER_WRITE)) { - struct pipe_blit_info blit; - memset(&blit, 0, sizeof(blit)); - - blit.src.resource = trans->ss_resource; - blit.src.format = trans->ss_resource->format; - blit.src.box.width = trans->ss_box.width; - blit.src.box.height = trans->ss_box.height; - blit.src.box.depth = 1; - - blit.dst.resource = ptrans->resource; - blit.dst.format = ptrans->resource->format; - blit.dst.level = ptrans->level; - blit.dst.box = trans->ss_box; - - blit.mask = util_format_get_mask(ptrans->resource->format); - blit.filter = PIPE_TEX_FILTER_NEAREST; - - pctx->blit(pctx, &blit); - - pipe_resource_reference(&trans->ss_resource, NULL); - } - pipe_resource_reference(&ptrans->resource, NULL); slab_free(&vc4->transfer_pool, ptrans); } -static struct pipe_resource * -vc4_get_temp_resource(struct pipe_context *pctx, - struct pipe_resource *prsc, - const struct pipe_box *box) -{ - struct pipe_resource temp_setup; - - memset(&temp_setup, 0, sizeof(temp_setup)); - temp_setup.target = prsc->target; - temp_setup.format = prsc->format; - temp_setup.width0 = box->width; - temp_setup.height0 = box->height; - temp_setup.depth0 = 1; - temp_setup.array_size = 1; - - return pctx->screen->resource_create(pctx->screen, &temp_setup); -} - static void * vc4_resource_transfer_map(struct pipe_context *pctx, struct pipe_resource *prsc, @@ -164,7 +114,7 @@ vc4_resource_transfer_map(struct pipe_context *pctx, */ if ((usage & PIPE_TRANSFER_DISCARD_RANGE) && 
!(usage & PIPE_TRANSFER_UNSYNCHRONIZED) && - !(prsc->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT) && + !(prsc->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) && prsc->last_level == 0 && prsc->width0 == box->width && prsc->height0 == box->height && @@ -218,50 +168,6 @@ vc4_resource_transfer_map(struct pipe_context *pctx, ptrans->usage = usage; ptrans->box = *box; - /* If the resource is multisampled, we need to resolve to single - * sample. This seems like it should be handled at a higher layer. - */ - if (prsc->nr_samples > 1) { - trans->ss_resource = vc4_get_temp_resource(pctx, prsc, box); - if (!trans->ss_resource) - goto fail; - assert(!trans->ss_resource->nr_samples); - - /* The ptrans->box gets modified for tile alignment, so save - * the original box for unmap time. - */ - trans->ss_box = *box; - - if (usage & PIPE_TRANSFER_READ) { - struct pipe_blit_info blit; - memset(&blit, 0, sizeof(blit)); - - blit.src.resource = ptrans->resource; - blit.src.format = ptrans->resource->format; - blit.src.level = ptrans->level; - blit.src.box = trans->ss_box; - - blit.dst.resource = trans->ss_resource; - blit.dst.format = trans->ss_resource->format; - blit.dst.box.width = trans->ss_box.width; - blit.dst.box.height = trans->ss_box.height; - blit.dst.box.depth = 1; - - blit.mask = util_format_get_mask(prsc->format); - blit.filter = PIPE_TEX_FILTER_NEAREST; - - pctx->blit(pctx, &blit); - vc4_flush_jobs_writing_resource(vc4, blit.dst.resource); - } - - /* The rest of the mapping process should use our temporary. */ - prsc = trans->ss_resource; - rsc = vc4_resource(prsc); - ptrans->box.x = 0; - ptrans->box.y = 0; - ptrans->box.z = 0; - } - if (usage & PIPE_TRANSFER_UNSYNCHRONIZED) buf = vc4_bo_map_unsynchronized(rsc->bo); else @@ -275,9 +181,6 @@ vc4_resource_transfer_map(struct pipe_context *pctx, struct vc4_resource_slice *slice = &rsc->slices[level]; if (rsc->tiled) { - uint32_t utile_w = vc4_utile_width(rsc->cpp); - uint32_t utile_h = vc4_utile_height(rsc->cpp); - /* No direct mappings of tiled, since we need to manually * tile/untile. */ @@ -298,49 +201,12 @@ vc4_resource_transfer_map(struct pipe_context *pctx, ptrans->box.height = (ptrans->box.height + 3) >> 2; } - /* We need to align the box to utile boundaries, since that's - * what load/store operates on. This may cause us to need to - * read out the original contents in that border area. Right - * now we just read out the entire contents, including the - * middle area that will just get overwritten. - */ - uint32_t box_start_x = ptrans->box.x & (utile_w - 1); - uint32_t box_start_y = ptrans->box.y & (utile_h - 1); - bool needs_load = (usage & PIPE_TRANSFER_READ) != 0; - - if (box_start_x) { - ptrans->box.width += box_start_x; - ptrans->box.x -= box_start_x; - needs_load = true; - } - if (box_start_y) { - ptrans->box.height += box_start_y; - ptrans->box.y -= box_start_y; - needs_load = true; - } - if (ptrans->box.width & (utile_w - 1)) { - /* We only need to force a load if our border region - * we're extending into is actually part of the - * texture. 
- */ - uint32_t slice_width = u_minify(prsc->width0, level); - if (ptrans->box.x + ptrans->box.width != slice_width) - needs_load = true; - ptrans->box.width = align(ptrans->box.width, utile_w); - } - if (ptrans->box.height & (utile_h - 1)) { - uint32_t slice_height = u_minify(prsc->height0, level); - if (ptrans->box.y + ptrans->box.height != slice_height) - needs_load = true; - ptrans->box.height = align(ptrans->box.height, utile_h); - } - ptrans->stride = ptrans->box.width * rsc->cpp; ptrans->layer_stride = ptrans->stride * ptrans->box.height; trans->map = malloc(ptrans->layer_stride * ptrans->box.depth); - if (needs_load) { + if (usage & PIPE_TRANSFER_READ) { vc4_load_tiled_image(trans->map, ptrans->stride, buf + slice->offset + ptrans->box.z * rsc->cube_map_stride, @@ -348,9 +214,7 @@ vc4_resource_transfer_map(struct pipe_context *pctx, slice->tiling, rsc->cpp, &ptrans->box); } - return (trans->map + - box_start_x * rsc->cpp + - box_start_y * ptrans->stride); + return trans->map; } else { ptrans->stride = slice->stride; ptrans->layer_stride = ptrans->stride; @@ -368,6 +232,44 @@ fail: } static void +vc4_texture_subdata(struct pipe_context *pctx, + struct pipe_resource *prsc, + unsigned level, + unsigned usage, + const struct pipe_box *box, + const void *data, + unsigned stride, + unsigned layer_stride) +{ + struct vc4_resource *rsc = vc4_resource(prsc); + struct vc4_resource_slice *slice = &rsc->slices[level]; + + /* For a direct mapping, we can just take the u_transfer path. */ + if (!rsc->tiled || + box->depth != 1 || + (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE)) { + return u_default_texture_subdata(pctx, prsc, level, usage, box, + data, stride, layer_stride); + } + + /* Otherwise, map and store the texture data directly into the tiled + * texture. + */ + void *buf; + if (usage & PIPE_TRANSFER_UNSYNCHRONIZED) + buf = vc4_bo_map_unsynchronized(rsc->bo); + else + buf = vc4_bo_map(rsc->bo); + + vc4_store_tiled_image(buf + slice->offset + + box->z * rsc->cube_map_stride, + slice->stride, + (void *)data, stride, + slice->tiling, rsc->cpp, + box); +} + +static void vc4_resource_destroy(struct pipe_screen *pscreen, struct pipe_resource *prsc) { @@ -406,7 +308,7 @@ vc4_resource_get_handle(struct pipe_screen *pscreen, whandle->modifier = DRM_FORMAT_MOD_LINEAR; switch (whandle->type) { - case DRM_API_HANDLE_TYPE_SHARED: + case WINSYS_HANDLE_TYPE_SHARED: if (screen->ro) { /* This could probably be supported, assuming that a * control node was used for pl111. @@ -416,12 +318,12 @@ vc4_resource_get_handle(struct pipe_screen *pscreen, } return vc4_bo_flink(rsc->bo, &whandle->handle); - case DRM_API_HANDLE_TYPE_KMS: + case WINSYS_HANDLE_TYPE_KMS: if (screen->ro && renderonly_get_handle(rsc->scanout, whandle)) return TRUE; whandle->handle = rsc->bo->handle; return TRUE; - case DRM_API_HANDLE_TYPE_FD: + case WINSYS_HANDLE_TYPE_FD: /* FDs are cross-device, so we can export directly from vc4. 
*/ whandle->handle = vc4_bo_get_dmabuf(rsc->bo); @@ -564,8 +466,10 @@ get_resource_texture_format(struct pipe_resource *prsc) if (prsc->nr_samples > 1) { return ~0; } else { - assert(format == VC4_TEXTURE_TYPE_RGBA8888); - return VC4_TEXTURE_TYPE_RGBA32R; + if (format == VC4_TEXTURE_TYPE_RGBA8888) + return VC4_TEXTURE_TYPE_RGBA32R; + else + return ~0; } } @@ -668,7 +572,15 @@ vc4_resource_create_with_modifiers(struct pipe_screen *pscreen, goto fail; } - if (screen->ro && tmpl->bind & PIPE_BIND_SCANOUT) { + /* Set up the "scanout resource" (the dmabuf export of our buffer to + * the KMS handle) if the buffer might ever have + * resource_get_handle(WINSYS_HANDLE_TYPE_KMS) called on it. + * create_with_modifiers() doesn't give us usage flags, so we have to + * assume that all calls with modifiers are scanout-possible. + */ + if (screen->ro && + ((tmpl->bind & PIPE_BIND_SCANOUT) || + !(count == 1 && modifiers[0] == DRM_FORMAT_MOD_INVALID))) { rsc->scanout = renderonly_scanout_for_resource(prsc, screen->ro, NULL); if (!rsc->scanout) @@ -708,19 +620,12 @@ vc4_resource_from_handle(struct pipe_screen *pscreen, if (!rsc) return NULL; - if (whandle->offset != 0) { - fprintf(stderr, - "Attempt to import unsupported winsys offset %u\n", - whandle->offset); - return NULL; - } - switch (whandle->type) { - case DRM_API_HANDLE_TYPE_SHARED: + case WINSYS_HANDLE_TYPE_SHARED: rsc->bo = vc4_bo_open_name(screen, whandle->handle, whandle->stride); break; - case DRM_API_HANDLE_TYPE_FD: + case WINSYS_HANDLE_TYPE_FD: rsc->bo = vc4_bo_open_dmabuf(screen, whandle->handle, whandle->stride); break; @@ -766,6 +671,28 @@ vc4_resource_from_handle(struct pipe_screen *pscreen, rsc->vc4_format = get_resource_texture_format(prsc); vc4_setup_slices(rsc, "import"); + if (whandle->offset != 0) { + if (rsc->tiled) { + fprintf(stderr, + "Attempt to import unsupported " + "winsys offset %u\n", + whandle->offset); + goto fail; + } + + rsc->slices[0].offset += whandle->offset; + + if (rsc->slices[0].offset + rsc->slices[0].size > + rsc->bo->size) { + fprintf(stderr, "Attempt to import " + "with overflowing offset (%d + %d > %d)\n", + whandle->offset, + rsc->slices[0].size, + rsc->bo->size); + goto fail; + } + } + if (screen->ro) { /* Make sure that renderonly has a handle to our buffer in the * display's fd, so that a later renderonly_get_handle() @@ -779,7 +706,7 @@ vc4_resource_from_handle(struct pipe_screen *pscreen, goto fail; } - if (whandle->stride != slice->stride) { + if (rsc->tiled && whandle->stride != slice->stride) { static bool warned = false; if (!warned) { warned = true; @@ -792,6 +719,8 @@ vc4_resource_from_handle(struct pipe_screen *pscreen, slice->stride); } goto fail; + } else if (!rsc->tiled) { + slice->stride = whandle->stride; } return prsc; @@ -1187,6 +1116,14 @@ vc4_get_shadow_index_buffer(struct pipe_context *pctx, return shadow_rsc; } +static const struct u_transfer_vtbl transfer_vtbl = { + .resource_create = vc4_resource_create, + .resource_destroy = vc4_resource_destroy, + .transfer_map = vc4_resource_transfer_map, + .transfer_unmap = vc4_resource_transfer_unmap, + .transfer_flush_region = u_default_transfer_flush_region, +}; + void vc4_resource_screen_init(struct pipe_screen *pscreen) { @@ -1199,6 +1136,9 @@ vc4_resource_screen_init(struct pipe_screen *pscreen) pscreen->resource_destroy = u_resource_destroy_vtbl; pscreen->resource_get_handle = vc4_resource_get_handle; pscreen->resource_destroy = vc4_resource_destroy; + pscreen->transfer_helper = u_transfer_helper_create(&transfer_vtbl, + false, false, + 
false, true); /* Test if the kernel has GET_TILING; it will return -EINVAL if the * ioctl does not exist, but -ENOENT if we pass an impossible handle. @@ -1215,11 +1155,11 @@ vc4_resource_screen_init(struct pipe_screen *pscreen) void vc4_resource_context_init(struct pipe_context *pctx) { - pctx->transfer_map = vc4_resource_transfer_map; - pctx->transfer_flush_region = u_default_transfer_flush_region; - pctx->transfer_unmap = vc4_resource_transfer_unmap; + pctx->transfer_map = u_transfer_helper_transfer_map; + pctx->transfer_flush_region = u_transfer_helper_transfer_flush_region; + pctx->transfer_unmap = u_transfer_helper_transfer_unmap; pctx->buffer_subdata = u_default_buffer_subdata; - pctx->texture_subdata = u_default_texture_subdata; + pctx->texture_subdata = vc4_texture_subdata; pctx->create_surface = vc4_create_surface; pctx->surface_destroy = vc4_surface_destroy; pctx->resource_copy_region = util_resource_copy_region; diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_resource.h b/lib/mesa/src/gallium/drivers/vc4/vc4_resource.h index d4c491e50..8c0aadbcc 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_resource.h +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_resource.h @@ -32,9 +32,6 @@ struct vc4_transfer { struct pipe_transfer base; void *map; - - struct pipe_resource *ss_resource; - struct pipe_box ss_box; }; struct vc4_resource_slice { diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_screen.c b/lib/mesa/src/gallium/drivers/vc4/vc4_screen.c index 9879a4db1..e7f7c82c2 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_screen.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_screen.c @@ -22,7 +22,7 @@ * IN THE SOFTWARE. */ -#include "os/os_misc.h" +#include "util/os_misc.h" #include "pipe/p_defines.h" #include "pipe/p_screen.h" #include "pipe/p_state.h" @@ -32,6 +32,8 @@ #include "util/u_memory.h" #include "util/u_format.h" #include "util/u_hash_table.h" +#include "util/u_screen.h" +#include "util/u_transfer_helper.h" #include "util/ralloc.h" #include <xf86drm.h> @@ -64,7 +66,7 @@ static const struct debug_named_value debug_options[] = { "Flush after each draw call" }, { "always_sync", VC4_DEBUG_ALWAYS_SYNC, "Wait for finish after each flush" }, -#if USE_VC4_SIMULATOR +#ifdef USE_VC4_SIMULATOR { "dump", VC4_DEBUG_DUMP, "Write a GPU command stream trace file" }, #endif @@ -105,10 +107,12 @@ vc4_screen_destroy(struct pipe_screen *pscreen) slab_destroy_parent(&screen->transfer_pool); free(screen->ro); -#if USE_VC4_SIMULATOR +#ifdef USE_VC4_SIMULATOR vc4_simulator_destroy(screen); #endif + u_transfer_helper_destroy(pscreen->transfer_helper); + close(screen->fd); ralloc_free(pscreen); } @@ -140,17 +144,15 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT: case PIPE_CAP_NPOT_TEXTURES: case PIPE_CAP_SHAREABLE_SHADERS: - case PIPE_CAP_USER_CONSTANT_BUFFERS: - case PIPE_CAP_TEXTURE_SHADOW_MAP: case PIPE_CAP_BLEND_EQUATION_SEPARATE: - case PIPE_CAP_TWO_SIDED_STENCIL: case PIPE_CAP_TEXTURE_MULTISAMPLE: case PIPE_CAP_TEXTURE_SWIZZLE: - case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY: - case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION: case PIPE_CAP_TEXTURE_BARRIER: return 1; + case PIPE_CAP_NATIVE_FENCE_FD: + return screen->has_syncobj; + case PIPE_CAP_TILE_RASTER_ORDER: return vc4_has_feature(screen, DRM_VC4_PARAM_SUPPORTS_FIXED_RCL_ORDER); @@ -160,15 +162,6 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_POINT_SPRITE: return 1; - case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT: - return 256; - - 
case PIPE_CAP_GLSL_FEATURE_LEVEL: - return 120; - - case PIPE_CAP_MAX_VIEWPORTS: - return 1; - case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT: case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER: return 1; @@ -177,130 +170,6 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MIXED_COLOR_DEPTH_BITS: return 1; - /* Unsupported features. */ - case PIPE_CAP_ANISOTROPIC_FILTER: - case PIPE_CAP_TEXTURE_BUFFER_OBJECTS: - case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY: - case PIPE_CAP_CUBE_MAP_ARRAY: - case PIPE_CAP_TEXTURE_MIRROR_CLAMP: - case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: - case PIPE_CAP_MIXED_COLORBUFFER_FORMATS: - case PIPE_CAP_SEAMLESS_CUBE_MAP: - case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: - case PIPE_CAP_TGSI_INSTANCEID: - case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY: - case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY: - case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY: - case PIPE_CAP_COMPUTE: - case PIPE_CAP_START_INSTANCE: - case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: - case PIPE_CAP_SHADER_STENCIL_EXPORT: - case PIPE_CAP_TGSI_TEXCOORD: - case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: - case PIPE_CAP_CONDITIONAL_RENDER: - case PIPE_CAP_PRIMITIVE_RESTART: - case PIPE_CAP_SM3: - case PIPE_CAP_INDEP_BLEND_ENABLE: - case PIPE_CAP_INDEP_BLEND_FUNC: - case PIPE_CAP_DEPTH_CLIP_DISABLE: - case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: - case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT: - case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER: - case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS: - case PIPE_CAP_USER_VERTEX_BUFFERS: - case PIPE_CAP_QUERY_PIPELINE_STATISTICS: - case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK: - case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT: - case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: - case PIPE_CAP_TEXTURE_GATHER_SM5: - case PIPE_CAP_FAKE_SW_MSAA: - case PIPE_CAP_TEXTURE_QUERY_LOD: - case PIPE_CAP_SAMPLE_SHADING: - case PIPE_CAP_TEXTURE_GATHER_OFFSETS: - case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET: - case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION: - case PIPE_CAP_MAX_TEXEL_OFFSET: - case PIPE_CAP_MAX_VERTEX_STREAMS: - case PIPE_CAP_DRAW_INDIRECT: - case PIPE_CAP_MULTI_DRAW_INDIRECT: - case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: - case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: - case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: - case PIPE_CAP_SAMPLER_VIEW_TARGET: - case PIPE_CAP_CLIP_HALFZ: - case PIPE_CAP_VERTEXID_NOBASE: - case PIPE_CAP_POLYGON_OFFSET_CLAMP: - case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: - case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: - case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: - case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: - case PIPE_CAP_TEXTURE_FLOAT_LINEAR: - case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: - case PIPE_CAP_DEPTH_BOUNDS_TEST: - case PIPE_CAP_TGSI_TXQS: - case PIPE_CAP_FORCE_PERSAMPLE_INTERP: - case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: - case PIPE_CAP_CLEAR_TEXTURE: - case PIPE_CAP_DRAW_PARAMETERS: - case PIPE_CAP_TGSI_PACK_HALF_FLOAT: - case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: - case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: - case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: - case PIPE_CAP_INVALIDATE_BUFFER: - case PIPE_CAP_GENERATE_MIPMAP: - case PIPE_CAP_STRING_MARKER: - case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: - case PIPE_CAP_QUERY_BUFFER_OBJECT: - case PIPE_CAP_QUERY_MEMORY_INFO: - case PIPE_CAP_PCI_GROUP: - case PIPE_CAP_PCI_BUS: - case PIPE_CAP_PCI_DEVICE: - case PIPE_CAP_PCI_FUNCTION: - case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: - case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: - case 
PIPE_CAP_CULL_DISTANCE: - case PIPE_CAP_PRIMITIVE_RESTART_FOR_PATCHES: - case PIPE_CAP_TGSI_VOTE: - case PIPE_CAP_MAX_WINDOW_RECTANGLES: - case PIPE_CAP_POLYGON_OFFSET_UNITS_UNSCALED: - case PIPE_CAP_VIEWPORT_SUBPIXEL_BITS: - case PIPE_CAP_TGSI_ARRAY_COMPONENTS: - case PIPE_CAP_TGSI_CAN_READ_OUTPUTS: - case PIPE_CAP_NATIVE_FENCE_FD: - case PIPE_CAP_TGSI_FS_FBFETCH: - case PIPE_CAP_TGSI_MUL_ZERO_WINS: - case PIPE_CAP_DOUBLES: - case PIPE_CAP_INT64: - case PIPE_CAP_INT64_DIVMOD: - case PIPE_CAP_TGSI_TEX_TXF_LZ: - case PIPE_CAP_TGSI_CLOCK: - case PIPE_CAP_POLYGON_MODE_FILL_RECTANGLE: - case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE: - case PIPE_CAP_TGSI_BALLOT: - case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT: - case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX: - case PIPE_CAP_POST_DEPTH_COVERAGE: - case PIPE_CAP_BINDLESS_TEXTURE: - case PIPE_CAP_NIR_SAMPLERS_AS_DEREF: - case PIPE_CAP_QUERY_SO_OVERFLOW: - case PIPE_CAP_MEMOBJ: - case PIPE_CAP_LOAD_CONSTBUF: - case PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS: - return 0; - - /* Stream output. */ - case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: - case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: - case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS: - case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: - case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: - return 0; - - /* Geometry shader output, unsupported. */ - case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES: - case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS: - return 0; - /* Texturing. */ case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: @@ -308,35 +177,9 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: /* Note: Not supported in hardware, just faking it. */ return 5; - case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: - return 0; - - /* Render targets. */ - case PIPE_CAP_MAX_RENDER_TARGETS: - return 1; - - /* Queries. 
*/ - case PIPE_CAP_QUERY_TIME_ELAPSED: - case PIPE_CAP_QUERY_TIMESTAMP: - return 0; - - case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET: - case PIPE_CAP_MIN_TEXEL_OFFSET: - return 0; - - case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE: - return 2048; - - case PIPE_CAP_ENDIANNESS: - return PIPE_ENDIAN_LITTLE; - - case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT: - return 64; case PIPE_CAP_VENDOR_ID: return 0x14E4; - case PIPE_CAP_DEVICE_ID: - return 0xFFFFFFFF; case PIPE_CAP_ACCELERATED: return 1; case PIPE_CAP_VIDEO_MEMORY: { @@ -351,8 +194,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return 1; default: - fprintf(stderr, "unknown param %d\n", param); - return 0; + return u_pipe_screen_get_param_defaults(pscreen, param); } } @@ -372,10 +214,10 @@ vc4_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param) return 0.0f; case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS: return 0.0f; - case PIPE_CAPF_GUARD_BAND_LEFT: - case PIPE_CAPF_GUARD_BAND_TOP: - case PIPE_CAPF_GUARD_BAND_RIGHT: - case PIPE_CAPF_GUARD_BAND_BOTTOM: + + case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE: + case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE: + case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY: return 0.0f; default: fprintf(stderr, "unknown paramf %d\n", param); @@ -443,13 +285,17 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen, return PIPE_SHADER_IR_NIR; case PIPE_SHADER_CAP_SUPPORTED_IRS: return 0; - case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: - return 32; + case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: + return 32; case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: - case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD: + case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD: case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS: + case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: + case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: return 0; + case PIPE_SHADER_CAP_SCALAR_ISA: + return 1; default: fprintf(stderr, "unknown shader param %d\n", param); return 0; @@ -462,16 +308,18 @@ vc4_screen_is_format_supported(struct pipe_screen *pscreen, enum pipe_format format, enum pipe_texture_target target, unsigned sample_count, + unsigned storage_sample_count, unsigned usage) { struct vc4_screen *screen = vc4_screen(pscreen); - unsigned retval = 0; + + if (MAX2(1, sample_count) != MAX2(1, storage_sample_count)) + return false; if (sample_count > 1 && sample_count != VC4_MAX_SAMPLES) return FALSE; - if ((target >= PIPE_MAX_TEXTURE_TYPES) || - !util_format_is_supported(format, usage)) { + if (target >= PIPE_MAX_TEXTURE_TYPES) { return FALSE; } @@ -521,46 +369,36 @@ vc4_screen_is_format_supported(struct pipe_screen *pscreen, case PIPE_FORMAT_R8G8B8_SSCALED: case PIPE_FORMAT_R8G8_SSCALED: case PIPE_FORMAT_R8_SSCALED: - retval |= PIPE_BIND_VERTEX_BUFFER; break; default: - break; + return FALSE; } } if ((usage & PIPE_BIND_RENDER_TARGET) && - vc4_rt_format_supported(format)) { - retval |= PIPE_BIND_RENDER_TARGET; + !vc4_rt_format_supported(format)) { + return FALSE; } if ((usage & PIPE_BIND_SAMPLER_VIEW) && - vc4_tex_format_supported(format) && - (format != PIPE_FORMAT_ETC1_RGB8 || screen->has_etc1)) { - retval |= PIPE_BIND_SAMPLER_VIEW; + (!vc4_tex_format_supported(format) || + (format == PIPE_FORMAT_ETC1_RGB8 && !screen->has_etc1))) { + return FALSE; } if ((usage & PIPE_BIND_DEPTH_STENCIL) && - (format == PIPE_FORMAT_S8_UINT_Z24_UNORM || - format == PIPE_FORMAT_X8Z24_UNORM)) { - retval |= PIPE_BIND_DEPTH_STENCIL; + format != PIPE_FORMAT_S8_UINT_Z24_UNORM && + format != PIPE_FORMAT_X8Z24_UNORM) { + return FALSE; 
} if ((usage & PIPE_BIND_INDEX_BUFFER) && - (format == PIPE_FORMAT_I8_UINT || - format == PIPE_FORMAT_I16_UINT)) { - retval |= PIPE_BIND_INDEX_BUFFER; - } - -#if 0 - if (retval != usage) { - fprintf(stderr, - "not supported: format=%s, target=%d, sample_count=%d, " - "usage=0x%x, retval=0x%x\n", util_format_name(format), - target, sample_count, usage, retval); + format != PIPE_FORMAT_I8_UINT && + format != PIPE_FORMAT_I16_UINT) { + return FALSE; } -#endif - return retval == usage; + return TRUE; } static void @@ -659,7 +497,9 @@ struct pipe_screen * vc4_screen_create(int fd, struct renderonly *ro) { struct vc4_screen *screen = rzalloc(NULL, struct vc4_screen); + uint64_t syncobj_cap = 0; struct pipe_screen *pscreen; + int err; pscreen = &screen->base; @@ -690,6 +530,14 @@ vc4_screen_create(int fd, struct renderonly *ro) vc4_has_feature(screen, DRM_VC4_PARAM_SUPPORTS_ETC1); screen->has_threaded_fs = vc4_has_feature(screen, DRM_VC4_PARAM_SUPPORTS_THREADED_FS); + screen->has_madvise = + vc4_has_feature(screen, DRM_VC4_PARAM_SUPPORTS_MADVISE); + screen->has_perfmon_ioctl = + vc4_has_feature(screen, DRM_VC4_PARAM_SUPPORTS_PERFMON); + + err = drmGetCap(fd, DRM_CAP_SYNCOBJ, &syncobj_cap); + if (err == 0 && syncobj_cap) + screen->has_syncobj = true; if (!vc4_get_chip_info(screen)) goto fail; @@ -698,13 +546,13 @@ vc4_screen_create(int fd, struct renderonly *ro) slab_create_parent(&screen->transfer_pool, sizeof(struct vc4_transfer), 16); - vc4_fence_init(screen); + vc4_fence_screen_init(screen); vc4_debug = debug_get_option_vc4_debug(); if (vc4_debug & VC4_DEBUG_SHADERDB) vc4_debug |= VC4_DEBUG_NORAST; -#if USE_VC4_SIMULATOR +#ifdef USE_VC4_SIMULATOR vc4_simulator_init(screen); #endif @@ -716,6 +564,11 @@ vc4_screen_create(int fd, struct renderonly *ro) pscreen->get_compiler_options = vc4_screen_get_compiler_options; pscreen->query_dmabuf_modifiers = vc4_screen_query_dmabuf_modifiers; + if (screen->has_perfmon_ioctl) { + pscreen->get_driver_query_group_info = vc4_get_driver_query_group_info; + pscreen->get_driver_query_info = vc4_get_driver_query_info; + } + return pscreen; fail: diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_screen.h b/lib/mesa/src/gallium/drivers/vc4/vc4_screen.h index 85108219e..f4550d1c2 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_screen.h +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_screen.h @@ -95,7 +95,10 @@ struct vc4_screen { bool has_control_flow; bool has_etc1; bool has_threaded_fs; + bool has_madvise; bool has_tiling_ioctl; + bool has_perfmon_ioctl; + bool has_syncobj; struct vc4_simulator_file *sim_file; }; @@ -116,9 +119,9 @@ vc4_screen_get_compiler_options(struct pipe_screen *pscreen, extern uint32_t vc4_debug; void -vc4_fence_init(struct vc4_screen *screen); +vc4_fence_screen_init(struct vc4_screen *screen); struct vc4_fence * -vc4_fence_create(struct vc4_screen *screen, uint64_t seqno); +vc4_fence_create(struct vc4_screen *screen, uint64_t seqno, int fd); #endif /* VC4_SCREEN_H */ diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c b/lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c index a73e40969..37c098a04 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c @@ -619,6 +619,11 @@ vc4_simulator_get_param_ioctl(int fd, struct drm_vc4_get_param *args) args->value = true; return 0; + case DRM_VC4_PARAM_SUPPORTS_MADVISE: + case DRM_VC4_PARAM_SUPPORTS_PERFMON: + errno = -EINVAL; + return -1; + case DRM_VC4_PARAM_V3D_IDENT0: args->value = 0x02000000; return 0; diff --git 
a/lib/mesa/src/gallium/drivers/vc4/vc4_state.c b/lib/mesa/src/gallium/drivers/vc4/vc4_state.c index ed8d404a4..1e4657a79 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_state.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_state.c @@ -23,6 +23,7 @@ */ #include "pipe/p_state.h" +#include "util/u_framebuffer.h" #include "util/u_inlines.h" #include "util/u_math.h" #include "util/u_memory.h" @@ -386,8 +387,6 @@ vc4_set_constant_buffer(struct pipe_context *pctx, struct vc4_context *vc4 = vc4_context(pctx); struct vc4_constbuf_stateobj *so = &vc4->constbuf[shader]; - assert(index == 0); - /* Note that the state tracker can unbind constant buffers by * passing NULL here. */ @@ -397,7 +396,10 @@ vc4_set_constant_buffer(struct pipe_context *pctx, return; } - assert(!cb->buffer); + if (index == 1 && so->cb[index].buffer_size != cb->buffer_size) + vc4->dirty |= VC4_DIRTY_UBO_1_SIZE; + + pipe_resource_reference(&so->cb[index].buffer, cb->buffer); so->cb[index].buffer_offset = cb->buffer_offset; so->cb[index].buffer_size = cb->buffer_size; so->cb[index].user_buffer = cb->user_buffer; @@ -413,21 +415,10 @@ vc4_set_framebuffer_state(struct pipe_context *pctx, { struct vc4_context *vc4 = vc4_context(pctx); struct pipe_framebuffer_state *cso = &vc4->framebuffer; - unsigned i; vc4->job = NULL; - for (i = 0; i < framebuffer->nr_cbufs; i++) - pipe_surface_reference(&cso->cbufs[i], framebuffer->cbufs[i]); - for (; i < vc4->framebuffer.nr_cbufs; i++) - pipe_surface_reference(&cso->cbufs[i], NULL); - - cso->nr_cbufs = framebuffer->nr_cbufs; - - pipe_surface_reference(&cso->zsbuf, framebuffer->zsbuf); - - cso->width = framebuffer->width; - cso->height = framebuffer->height; + util_copy_framebuffer_state(cso, framebuffer); /* Nonzero texture mipmap levels are laid out as if they were in * power-of-two-sized spaces. The renderbuffer config infers its @@ -567,8 +558,8 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc, so->base = *cso; - pipe_reference(NULL, &prsc->reference); - so->base.texture = prsc; + so->base.texture = NULL; + pipe_resource_reference(&so->base.texture, prsc); so->base.reference.count = 1; so->base.context = pctx; @@ -581,14 +572,20 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc, */ if ((cso->u.tex.first_level && (cso->u.tex.first_level != cso->u.tex.last_level)) || - rsc->vc4_format == VC4_TEXTURE_TYPE_RGBA32R) { + rsc->vc4_format == VC4_TEXTURE_TYPE_RGBA32R || + rsc->vc4_format == ~0) { struct vc4_resource *shadow_parent = rsc; - struct pipe_resource tmpl = *prsc; - - tmpl.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET; - tmpl.width0 = u_minify(tmpl.width0, cso->u.tex.first_level); - tmpl.height0 = u_minify(tmpl.height0, cso->u.tex.first_level); - tmpl.last_level = cso->u.tex.last_level - cso->u.tex.first_level; + struct pipe_resource tmpl = { + .target = prsc->target, + .format = prsc->format, + .width0 = u_minify(prsc->width0, + cso->u.tex.first_level), + .height0 = u_minify(prsc->height0, + cso->u.tex.first_level), + .bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET, + .last_level = cso->u.tex.last_level - cso->u.tex.first_level, + .nr_samples = prsc->nr_samples, + }; /* Create the shadow texture. The rest of the texture * parameter setup will use the shadow. 
@@ -617,7 +614,9 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc, } so->texture_p0 = - (VC4_SET_FIELD(rsc->slices[0].offset >> 12, VC4_TEX_P0_OFFSET) | + (VC4_SET_FIELD((rsc->slices[0].offset + + cso->u.tex.first_layer * + rsc->cube_map_stride) >> 12, VC4_TEX_P0_OFFSET) | VC4_SET_FIELD(rsc->vc4_format & 15, VC4_TEX_P0_TYPE) | VC4_SET_FIELD(so->force_first_level ? cso->u.tex.last_level : diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.c b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.c index 07e1c9c5f..2da520eb4 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.c @@ -63,15 +63,6 @@ vc4_size_is_lt(uint32_t width, uint32_t height, int cpp) height <= 4 * vc4_utile_height(cpp)); } -static void -check_box_utile_alignment(const struct pipe_box *box, int cpp) -{ - assert(!(box->x & (vc4_utile_width(cpp) - 1))); - assert(!(box->y & (vc4_utile_height(cpp) - 1))); - assert(!(box->width & (vc4_utile_width(cpp) - 1))); - assert(!(box->height & (vc4_utile_height(cpp) - 1))); -} - /** * Takes a utile x and y (and the number of utiles of width of the image) and * returns the offset to the utile within a VC4_TILING_FORMAT_TF image. @@ -216,8 +207,6 @@ vc4_load_tiled_image(void *dst, uint32_t dst_stride, uint8_t tiling_format, int cpp, const struct pipe_box *box) { - check_box_utile_alignment(box, cpp); - if (tiling_format == VC4_TILING_FORMAT_LT) { vc4_load_lt_image(dst, dst_stride, src, src_stride, @@ -240,8 +229,6 @@ vc4_store_tiled_image(void *dst, uint32_t dst_stride, uint8_t tiling_format, int cpp, const struct pipe_box *box) { - check_box_utile_alignment(box, cpp); - if (tiling_format == VC4_TILING_FORMAT_LT) { vc4_store_lt_image(dst, dst_stride, src, src_stride, diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c index 4a76c0ff7..ec42a3dc2 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c @@ -41,6 +41,12 @@ #define NEON_TAG(x) x ## _base #endif +static inline uint32_t +align_down(uint32_t val, uint32_t align) +{ + return val & ~(align - 1); +} + /** Returns the stride in bytes of a 64-byte microtile. */ static uint32_t vc4_utile_stride(int cpp) @@ -252,11 +258,78 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp) #endif } +/** + * Returns the X value into the address bits for LT tiling. + * + * The LT tile load/stores rely on the X bits not intersecting with the Y + * bits. Because of this, we have to choose to put the utile index within the + * LT tile into one of the two values, and we do so in swizzle_lt_x() to make + * NPOT handling easier. + */ +static uint32_t +swizzle_lt_x(int x, int cpp) +{ + switch (cpp) { + case 1: + /* 8x8 inside of 4x4 */ + return ((x & 0x7) << (0 - 0) | + (x & ~0x7) << (6 - 3)); + case 2: + /* 8x4 inside of 4x4 */ + return ((x & 0x7) << (1 - 0) | + (x & ~0x7) << (6 - 3)); + case 4: + /* 4x4 inside of 4x4 */ + return ((x & 0x3) << (2 - 0) | + (x & ~0x3) << (6 - 2)); + case 8: + /* 2x4 inside of 4x4 */ + return ((x & 0x1) << (3 - 0) | + (x & ~0x1) << (6 - 1)); + default: + unreachable("bad cpp"); + } +} -void -NEON_TAG(vc4_load_lt_image)(void *dst, uint32_t dst_stride, - void *src, uint32_t src_stride, - int cpp, const struct pipe_box *box) +/** + * Returns the Y value into the address bits for LT tiling. + * + * The LT tile load/stores rely on the X bits not intersecting with the Y + * bits. 
+ */ +static uint32_t +swizzle_lt_y(int y, int cpp) +{ + + switch (cpp) { + case 1: + /* 8x8 inside of 4x4 */ + return ((y & 0x7) << 3); + case 2: + /* 8x4 inside of 4x4 */ + return ((y & 0x3) << 4); + case 4: + /* 4x4 inside of 4x4 */ + return ((y & 0x3) << 4); + case 8: + /* 2x4 inside of 4x4 */ + return ((y & 0x3) << 4); + default: + unreachable("bad cpp"); + } +} + +/** + * Helper for loading or storing to an LT image, where the box is aligned + * to utiles. + * + * This just breaks the box down into calls to the fast + * vc4_load_utile/vc4_store_utile helpers. + */ +static inline void +vc4_lt_image_aligned(void *gpu, uint32_t gpu_stride, + void *cpu, uint32_t cpu_stride, + int cpp, const struct pipe_box *box, bool to_cpu) { uint32_t utile_w = vc4_utile_width(cpp); uint32_t utile_h = vc4_utile_height(cpp); @@ -264,33 +337,149 @@ NEON_TAG(vc4_load_lt_image)(void *dst, uint32_t dst_stride, uint32_t ystart = box->y; for (uint32_t y = 0; y < box->height; y += utile_h) { - for (int x = 0; x < box->width; x += utile_w) { - vc4_load_utile(dst + (dst_stride * y + - x * cpp), - src + ((ystart + y) * src_stride + - (xstart + x) * 64 / utile_w), - dst_stride, cpp); + for (uint32_t x = 0; x < box->width; x += utile_w) { + void *gpu_tile = gpu + ((ystart + y) * gpu_stride + + (xstart + x) * 64 / utile_w); + if (to_cpu) { + vc4_load_utile(cpu + (cpu_stride * y + + x * cpp), + gpu_tile, + cpu_stride, cpp); + } else { + vc4_store_utile(gpu_tile, + cpu + (cpu_stride * y + + x * cpp), + cpu_stride, cpp); + } + } + } +} + +/** + * Helper for loading or storing to an LT image, where the box is not aligned + * to utiles. + * + * This walks through the raster-order data, copying to/from the corresponding + * tiled pixel. This means we don't get write-combining on stores, but the + * loop is very few CPU instructions since the memcpy will be inlined. + */ +static inline void +vc4_lt_image_unaligned(void *gpu, uint32_t gpu_stride, + void *cpu, uint32_t cpu_stride, + int cpp, const struct pipe_box *box, bool to_cpu) +{ + + /* These are the address bits for the start of the box, split out into + * x/y so that they can be incremented separately in their loops. + */ + uint32_t offs_x0 = swizzle_lt_x(box->x, cpp); + uint32_t offs_y = swizzle_lt_y(box->y, cpp); + /* The *_mask values are "what bits of the address are from x or y" */ + uint32_t x_mask = swizzle_lt_x(~0, cpp); + uint32_t y_mask = swizzle_lt_y(~0, cpp); + uint32_t incr_y = swizzle_lt_x(gpu_stride / cpp, cpp); + + assert(!(x_mask & y_mask)); + + offs_x0 += incr_y * (box->y / vc4_utile_height(cpp)); + + for (uint32_t y = 0; y < box->height; y++) { + void *gpu_row = gpu + offs_y; + + uint32_t offs_x = offs_x0; + + for (uint32_t x = 0; x < box->width; x++) { + /* Use a memcpy here to move a pixel's worth of data. + * We're relying on this function to be inlined, so + * this will get expanded into the appropriate 1, 2, + * or 4-byte move. + */ + if (to_cpu) { + memcpy(cpu + x * cpp, gpu_row + offs_x, cpp); + } else { + memcpy(gpu_row + offs_x, cpu + x * cpp, cpp); + } + + /* This math trick with x_mask increments offs_x by 1 + * in x. + */ + offs_x = (offs_x - x_mask) & x_mask; } + + offs_y = (offs_y - y_mask) & y_mask; + /* When offs_y wraps (we hit the end of the utile), we + * increment offs_x0 by effectively the utile stride. + */ + if (!offs_y) + offs_x0 += incr_y; + + cpu += cpu_stride; + } +} + +/** + * General LT image load/store helper. 
+ */ +static inline void +vc4_lt_image_helper(void *gpu, uint32_t gpu_stride, + void *cpu, uint32_t cpu_stride, + int cpp, const struct pipe_box *box, bool to_cpu) +{ + if (box->x & (vc4_utile_width(cpp) - 1) || + box->y & (vc4_utile_height(cpp) - 1) || + box->width & (vc4_utile_width(cpp) - 1) || + box->height & (vc4_utile_height(cpp) - 1)) { + vc4_lt_image_unaligned(gpu, gpu_stride, + cpu, cpu_stride, + cpp, box, to_cpu); + } else { + vc4_lt_image_aligned(gpu, gpu_stride, + cpu, cpu_stride, + cpp, box, to_cpu); + } +} + +static inline void +vc4_lt_image_cpp_helper(void *gpu, uint32_t gpu_stride, + void *cpu, uint32_t cpu_stride, + int cpp, const struct pipe_box *box, bool to_cpu) +{ + switch (cpp) { + case 1: + vc4_lt_image_helper(gpu, gpu_stride, cpu, cpu_stride, 1, box, + to_cpu); + break; + case 2: + vc4_lt_image_helper(gpu, gpu_stride, cpu, cpu_stride, 2, box, + to_cpu); + break; + case 4: + vc4_lt_image_helper(gpu, gpu_stride, cpu, cpu_stride, 4, box, + to_cpu); + break; + case 8: + vc4_lt_image_helper(gpu, gpu_stride, cpu, cpu_stride, 8, box, + to_cpu); + break; + default: + unreachable("bad cpp"); } } void +NEON_TAG(vc4_load_lt_image)(void *dst, uint32_t dst_stride, + void *src, uint32_t src_stride, + int cpp, const struct pipe_box *box) +{ + vc4_lt_image_cpp_helper(src, src_stride, dst, dst_stride, cpp, box, + true); +} + +void NEON_TAG(vc4_store_lt_image)(void *dst, uint32_t dst_stride, void *src, uint32_t src_stride, int cpp, const struct pipe_box *box) { - uint32_t utile_w = vc4_utile_width(cpp); - uint32_t utile_h = vc4_utile_height(cpp); - uint32_t xstart = box->x; - uint32_t ystart = box->y; - - for (uint32_t y = 0; y < box->height; y += utile_h) { - for (int x = 0; x < box->width; x += utile_w) { - vc4_store_utile(dst + ((ystart + y) * dst_stride + - (xstart + x) * 64 / utile_w), - src + (src_stride * y + - x * cpp), - src_stride, cpp); - } - } + vc4_lt_image_cpp_helper(dst, dst_stride, src, src_stride, cpp, box, + false); } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_uniforms.c b/lib/mesa/src/gallium/drivers/vc4/vc4_uniforms.c index 12e6504bb..3801fbc8f 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_uniforms.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_uniforms.c @@ -224,14 +224,16 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, uinfo->num_texture_samples); for (int i = 0; i < uinfo->count; i++) { + enum quniform_contents contents = uinfo->contents[i]; + uint32_t data = uinfo->data[i]; - switch (uinfo->contents[i]) { + switch (contents) { case QUNIFORM_CONSTANT: - cl_aligned_u32(&uniforms, uinfo->data[i]); + cl_aligned_u32(&uniforms, data); break; case QUNIFORM_UNIFORM: cl_aligned_u32(&uniforms, - gallium_uniforms[uinfo->data[i]]); + gallium_uniforms[data]); break; case QUNIFORM_VIEWPORT_X_SCALE: cl_aligned_f(&uniforms, vc4->viewport.scale[0] * 16.0f); @@ -249,41 +251,49 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, case QUNIFORM_USER_CLIP_PLANE: cl_aligned_f(&uniforms, - vc4->clip.ucp[uinfo->data[i] / 4][uinfo->data[i] % 4]); + vc4->clip.ucp[data / 4][data % 4]); break; case QUNIFORM_TEXTURE_CONFIG_P0: - write_texture_p0(job, &uniforms, texstate, - uinfo->data[i]); + write_texture_p0(job, &uniforms, texstate, data); break; case QUNIFORM_TEXTURE_CONFIG_P1: - write_texture_p1(job, &uniforms, texstate, - uinfo->data[i]); + write_texture_p1(job, &uniforms, texstate, data); break; case QUNIFORM_TEXTURE_CONFIG_P2: - write_texture_p2(job, &uniforms, texstate, - uinfo->data[i]); + 
write_texture_p2(job, &uniforms, texstate, data); break; case QUNIFORM_TEXTURE_FIRST_LEVEL: write_texture_first_level(job, &uniforms, texstate, - uinfo->data[i]); + data); break; case QUNIFORM_UBO_ADDR: - cl_aligned_reloc(job, &job->uniforms, &uniforms, ubo, 0); + if (data == 0) { + cl_aligned_reloc(job, &job->uniforms, + &uniforms, ubo, 0); + } else { + struct pipe_constant_buffer *c = + &cb->cb[data]; + struct vc4_resource *rsc = + vc4_resource(c->buffer); + + cl_aligned_reloc(job, &job->uniforms, + &uniforms, + rsc->bo, c->buffer_offset); + } break; case QUNIFORM_TEXTURE_MSAA_ADDR: - write_texture_msaa_addr(job, &uniforms, - texstate, uinfo->data[i]); + write_texture_msaa_addr(job, &uniforms, texstate, data); break; case QUNIFORM_TEXTURE_BORDER_COLOR: write_texture_border_color(job, &uniforms, - texstate, uinfo->data[i]); + texstate, data); break; case QUNIFORM_TEXRECT_SCALE_X: @@ -291,7 +301,7 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, cl_aligned_u32(&uniforms, get_texrect_scale(texstate, uinfo->contents[i], - uinfo->data[i])); + data)); break; case QUNIFORM_BLEND_CONST_COLOR_X: @@ -330,9 +340,9 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, case QUNIFORM_STENCIL: cl_aligned_u32(&uniforms, - vc4->zsa->stencil_uniforms[uinfo->data[i]] | - (uinfo->data[i] <= 1 ? - (vc4->stencil_ref.ref_value[uinfo->data[i]] << 8) : + vc4->zsa->stencil_uniforms[data] | + (data <= 1 ? + (vc4->stencil_ref.ref_value[data] << 8) : 0)); break; @@ -350,11 +360,18 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, cl_aligned_u32(&uniforms, 0xd0d0d0d0); break; } -#if 0 - uint32_t written_val = *((uint32_t *)uniforms - 1); - fprintf(stderr, "%p: %d / 0x%08x (%f)\n", - shader, i, written_val, uif(written_val)); -#endif + + if (false) { + uint32_t written_val = *((uint32_t *)uniforms - 1); + char *desc = qir_describe_uniform(uinfo->contents[i], + uinfo->data[i], + gallium_uniforms); + + fprintf(stderr, "%p/%d: 0x%08x %s\n", + shader, i, written_val, desc); + + ralloc_free(desc); + } } cl_end(&job->uniforms, uniforms); |
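The LT-tiling rework in vc4_tiling_lt.c above drops the requirement that transfer boxes be utile-aligned: swizzle_lt_x()/swizzle_lt_y() scatter the x and y coordinates into disjoint address bits, and vc4_lt_image_unaligned() then advances the x part with the "(offs - mask) & mask" trick. The following standalone sketch is not part of the patch; it mirrors only the cpp == 4 bit layout from the diff, with hypothetical helper names and a self-test in main(), to show why the masked increment walks the x address bits without disturbing the y bits.

/* Illustrative sketch only -- not part of the patch.  Mirrors the
 * cpp == 4 case of swizzle_lt_x()/swizzle_lt_y() and the
 * "(offs - mask) & mask" increment used by vc4_lt_image_unaligned().
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Scatter a pixel x coordinate into the LT address bits (cpp == 4):
 * bits 2-3 select the pixel within the 4x4 utile, bits 6 and up select
 * the utile column.  Bits 4-5 are left free for y.
 */
static uint32_t swizzle_x_cpp4(uint32_t x)
{
        return ((x & 0x3) << 2) | ((x & ~0x3u) << 4);
}

/* Scatter a y coordinate within a utile row into bits 4-5. */
static uint32_t swizzle_y_cpp4(uint32_t y)
{
        return (y & 0x3) << 4;
}

int main(void)
{
        const uint32_t x_mask = swizzle_x_cpp4(~0u);
        const uint32_t y_mask = swizzle_y_cpp4(~0u);

        /* The trick only works because the x and y bits never overlap,
         * matching the assert(!(x_mask & y_mask)) in the patch.
         */
        assert((x_mask & y_mask) == 0);

        /* Walk x = 0..15 with the masked increment and check that it
         * matches recomputing the swizzle from scratch each time.
         */
        uint32_t offs = swizzle_x_cpp4(0);
        for (uint32_t x = 0; x < 16; x++) {
                assert(offs == swizzle_x_cpp4(x));
                /* "Add 1", but only to the bits covered by x_mask. */
                offs = (offs - x_mask) & x_mask;
        }

        printf("x_mask=0x%08x y_mask=0x%08x: masked increment OK\n",
               x_mask, y_mask);
        return 0;
}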
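The vc4_screen.c hunks above make PIPE_CAP_NATIVE_FENCE_FD conditional on kernel syncobj support, detected with drmGetCap(fd, DRM_CAP_SYNCOBJ, ...) in vc4_screen_create(). A minimal standalone probe in the same spirit is sketched below; the device path and error handling are illustrative assumptions, only the drmGetCap() call itself comes from the patch.

/* Sketch of the DRM_CAP_SYNCOBJ probe done in vc4_screen_create();
 * the device path and error handling are illustrative assumptions.
 */
#include <fcntl.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <xf86drm.h>

int main(void)
{
        int fd = open("/dev/dri/card0", O_RDWR | O_CLOEXEC);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        uint64_t syncobj_cap = 0;
        bool has_syncobj = false;

        /* drmGetCap() returns 0 on success; a zero capability value
         * means the kernel driver does not expose syncobjs.
         */
        if (drmGetCap(fd, DRM_CAP_SYNCOBJ, &syncobj_cap) == 0 && syncobj_cap)
                has_syncobj = true;

        printf("syncobj support: %s\n", has_syncobj ? "yes" : "no");
        close(fd);
        return 0;
}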