author    | Jonathan Gray <jsg@cvs.openbsd.org> | 2019-01-29 11:52:33 +0000
committer | Jonathan Gray <jsg@cvs.openbsd.org> | 2019-01-29 11:52:33 +0000
commit    | 37bbf6a1792773f11c15a4da1588a7520ee2fb4e (patch)
tree      | 64944d4aa665a1e479cfc004e446593062254550 /lib/mesa/src/gallium/drivers/vc4
parent    | 6b139c2063623e9310025247cd966490b9aa57ea (diff)
Merge Mesa 18.3.2
Diffstat (limited to 'lib/mesa/src/gallium/drivers/vc4')
26 files changed, 1165 insertions, 626 deletions
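
One of the more self-contained changes in the patch below is in vc4_program.c, where the sin/cos lowering gains a shared range-reduction helper (ntq_shrink_sincos_input_range) so the Taylor series is always evaluated near zero. The following is only an illustrative sketch of that idea in plain C, assuming nothing beyond <math.h>; the sincos_range_reduce name is invented for the example and is not part of the patch.

```c
/*
 * Illustrative sketch only (not from Mesa): the range-reduction idea
 * behind ntq_shrink_sincos_input_range() in the diff below. The QIR
 * version does the same thing with FTOI/FSUB/FADD and condition flags.
 */
#include <math.h>

static float sincos_range_reduce(float x)
{
        /* Work in "turns": sin(x) repeats every 2*pi, so sin(x) equals
         * sin(2*pi * frac), where frac is the fractional part of
         * x / (2*pi). */
        float scaled = x / (2.0f * (float)M_PI);

        /* truncf() truncates toward zero (matching the FTOI note in the
         * patch), so frac lands in (-1, 1). */
        float frac = scaled - truncf(scaled);

        /* Pull the argument as close to 0 as possible, where the Taylor
         * series is most accurate:
         *   map [0.5, 1)   to [-0.5, 0)
         *   map (-1, -0.5) to (0, 0.5)
         */
        if (frac >= 0.5f)
                frac -= 1.0f;
        else if (frac < -0.5f)
                frac += 1.0f;

        /* Caller evaluates the Taylor series for sin/cos(2*pi*frac). */
        return frac;
}
```

Because the reduced argument is now centered on 0 rather than shifted by half a period (the old code added -0.5 to the fractional part), the patch also flips the signs of the Taylor coefficients in ntq_fsin()/ntq_fcos(), so the series now start at +2*pi*x and +1 respectively.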
diff --git a/lib/mesa/src/gallium/drivers/vc4/Makefile.am b/lib/mesa/src/gallium/drivers/vc4/Makefile.am index c3e49af97..4c7dd843d 100644 --- a/lib/mesa/src/gallium/drivers/vc4/Makefile.am +++ b/lib/mesa/src/gallium/drivers/vc4/Makefile.am @@ -30,7 +30,8 @@ AM_CFLAGS = \ -I$(top_builddir)/src/compiler/nir \ -I$(top_srcdir)/include/drm-uapi \ -I$(top_builddir)/src \ - -I$(top_srcdir)/src/broadcom/cle \ + -I$(top_srcdir)/src/broadcom \ + -I$(top_builddir)/src/broadcom \ $(LIBDRM_CFLAGS) \ $(GALLIUM_DRIVER_CFLAGS) \ $(SIM_CFLAGS) \ @@ -54,4 +55,4 @@ endif libvc4_la_LDFLAGS = $(SIM_LDFLAGS) -EXTRA_DIST = kernel/README +EXTRA_DIST = kernel/README meson.build diff --git a/lib/mesa/src/gallium/drivers/vc4/Makefile.in b/lib/mesa/src/gallium/drivers/vc4/Makefile.in index 195f7e2c1..f55b61922 100644 --- a/lib/mesa/src/gallium/drivers/vc4/Makefile.in +++ b/lib/mesa/src/gallium/drivers/vc4/Makefile.in @@ -78,15 +78,19 @@ DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \ @HAVE_LIBDRM_TRUE@am__append_1 = \ @HAVE_LIBDRM_TRUE@ $(LIBDRM_LIBS) -@HAVE_DRISW_TRUE@am__append_2 = \ +@HAVE_PLATFORM_ANDROID_TRUE@am__append_2 = \ +@HAVE_PLATFORM_ANDROID_TRUE@ $(ANDROID_LIBS) \ +@HAVE_PLATFORM_ANDROID_TRUE@ $(BACKTRACE_LIBS) + +@HAVE_DRISW_TRUE@am__append_3 = \ @HAVE_DRISW_TRUE@ $(top_builddir)/src/gallium/winsys/sw/dri/libswdri.la -@HAVE_DRISW_KMS_TRUE@am__append_3 = \ +@HAVE_DRISW_KMS_TRUE@am__append_4 = \ @HAVE_DRISW_KMS_TRUE@ $(top_builddir)/src/gallium/winsys/sw/kms-dri/libswkmsdri.la \ @HAVE_DRISW_KMS_TRUE@ $(LIBDRM_LIBS) -@HAVE_ARM_ASM_TRUE@am__append_4 = libvc4_neon.la @HAVE_ARM_ASM_TRUE@am__append_5 = libvc4_neon.la +@HAVE_ARM_ASM_TRUE@am__append_6 = libvc4_neon.la subdir = src/gallium/drivers/vc4 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 am__aclocal_m4_deps = $(top_srcdir)/m4/ax_check_compile_flag.m4 \ @@ -106,7 +110,7 @@ mkinstalldirs = $(install_sh) -d CONFIG_CLEAN_FILES = CONFIG_CLEAN_VPATH_FILES = LTLIBRARIES = $(noinst_LTLIBRARIES) -libvc4_la_DEPENDENCIES = $(am__append_5) +libvc4_la_DEPENDENCIES = $(am__append_6) am__dirstamp = $(am__leading_dot)dirstamp am__objects_1 = kernel/vc4_gem.lo kernel/vc4_render_cl.lo \ kernel/vc4_validate.lo kernel/vc4_validate_shaders.lo \ @@ -201,6 +205,8 @@ AUTOCONF = @AUTOCONF@ AUTOHEADER = @AUTOHEADER@ AUTOMAKE = @AUTOMAKE@ AWK = @AWK@ +BACKTRACE_CFLAGS = @BACKTRACE_CFLAGS@ +BACKTRACE_LIBS = @BACKTRACE_LIBS@ BSYMBOLIC = @BSYMBOLIC@ CC = @CC@ CCAS = @CCAS@ @@ -214,6 +220,7 @@ CLOVER_STD_OVERRIDE = @CLOVER_STD_OVERRIDE@ CPP = @CPP@ CPPFLAGS = @CPPFLAGS@ CXX = @CXX@ +CXX11_CXXFLAGS = @CXX11_CXXFLAGS@ CXXCPP = @CXXCPP@ CXXDEPMODE = @CXXDEPMODE@ CXXFLAGS = @CXXFLAGS@ @@ -247,8 +254,6 @@ EXEEXT = @EXEEXT@ EXPAT_CFLAGS = @EXPAT_CFLAGS@ EXPAT_LIBS = @EXPAT_LIBS@ FGREP = @FGREP@ -FREEDRENO_CFLAGS = @FREEDRENO_CFLAGS@ -FREEDRENO_LIBS = @FREEDRENO_LIBS@ GALLIUM_PIPE_LOADER_DEFINES = @GALLIUM_PIPE_LOADER_DEFINES@ GBM_PC_LIB_PRIV = @GBM_PC_LIB_PRIV@ GBM_PC_REQ_PRIV = @GBM_PC_REQ_PRIV@ @@ -267,8 +272,8 @@ GL_LIB_DEPS = @GL_LIB_DEPS@ GL_PC_CFLAGS = @GL_PC_CFLAGS@ GL_PC_LIB_PRIV = @GL_PC_LIB_PRIV@ GL_PC_REQ_PRIV = @GL_PC_REQ_PRIV@ +GL_PKGCONF_LIB = @GL_PKGCONF_LIB@ GREP = @GREP@ -HAVE_XF86VIDMODE = @HAVE_XF86VIDMODE@ I915_CFLAGS = @I915_CFLAGS@ I915_LIBS = @I915_LIBS@ INDENT = @INDENT@ @@ -280,6 +285,7 @@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ LD = @LD@ LDFLAGS = @LDFLAGS@ +LD_BUILD_ID = @LD_BUILD_ID@ LD_NO_UNDEFINED = @LD_NO_UNDEFINED@ LEX = @LEX@ LEXLIB = @LEXLIB@ @@ -317,7 +323,7 @@ MSVC2013_COMPAT_CFLAGS = 
@MSVC2013_COMPAT_CFLAGS@ MSVC2013_COMPAT_CXXFLAGS = @MSVC2013_COMPAT_CXXFLAGS@ NINE_MAJOR = @NINE_MAJOR@ NINE_MINOR = @NINE_MINOR@ -NINE_TINY = @NINE_TINY@ +NINE_PATCH = @NINE_PATCH@ NINE_VERSION = @NINE_VERSION@ NM = @NM@ NMEDIT = @NMEDIT@ @@ -330,6 +336,9 @@ OBJEXT = @OBJEXT@ OMX_BELLAGIO_CFLAGS = @OMX_BELLAGIO_CFLAGS@ OMX_BELLAGIO_LIBS = @OMX_BELLAGIO_LIBS@ OMX_BELLAGIO_LIB_INSTALL_DIR = @OMX_BELLAGIO_LIB_INSTALL_DIR@ +OMX_TIZONIA_CFLAGS = @OMX_TIZONIA_CFLAGS@ +OMX_TIZONIA_LIBS = @OMX_TIZONIA_LIBS@ +OMX_TIZONIA_LIB_INSTALL_DIR = @OMX_TIZONIA_LIB_INSTALL_DIR@ OPENCL_LIBNAME = @OPENCL_LIBNAME@ OPENCL_VERSION = @OPENCL_VERSION@ OSMESA_LIB = @OSMESA_LIB@ @@ -357,11 +366,16 @@ PTHREAD_CC = @PTHREAD_CC@ PTHREAD_CFLAGS = @PTHREAD_CFLAGS@ PTHREAD_LIBS = @PTHREAD_LIBS@ PWR8_CFLAGS = @PWR8_CFLAGS@ -PYTHON2 = @PYTHON2@ +PYTHON = @PYTHON@ +PYTHON_EXEC_PREFIX = @PYTHON_EXEC_PREFIX@ +PYTHON_PLATFORM = @PYTHON_PLATFORM@ +PYTHON_PREFIX = @PYTHON_PREFIX@ +PYTHON_VERSION = @PYTHON_VERSION@ RADEON_CFLAGS = @RADEON_CFLAGS@ RADEON_LIBS = @RADEON_LIBS@ RANLIB = @RANLIB@ RM = @RM@ +SCANNER_ARG = @SCANNER_ARG@ SED = @SED@ SELINUX_CFLAGS = @SELINUX_CFLAGS@ SELINUX_LIBS = @SELINUX_LIBS@ @@ -373,9 +387,10 @@ SSE41_CFLAGS = @SSE41_CFLAGS@ STRIP = @STRIP@ SWR_AVX2_CXXFLAGS = @SWR_AVX2_CXXFLAGS@ SWR_AVX_CXXFLAGS = @SWR_AVX_CXXFLAGS@ -SWR_CXX11_CXXFLAGS = @SWR_CXX11_CXXFLAGS@ SWR_KNL_CXXFLAGS = @SWR_KNL_CXXFLAGS@ SWR_SKX_CXXFLAGS = @SWR_SKX_CXXFLAGS@ +V3D_SIMULATOR_CFLAGS = @V3D_SIMULATOR_CFLAGS@ +V3D_SIMULATOR_LIBS = @V3D_SIMULATOR_LIBS@ VALGRIND_CFLAGS = @VALGRIND_CFLAGS@ VALGRIND_LIBS = @VALGRIND_LIBS@ VA_CFLAGS = @VA_CFLAGS@ @@ -383,8 +398,8 @@ VA_LIBS = @VA_LIBS@ VA_LIB_INSTALL_DIR = @VA_LIB_INSTALL_DIR@ VA_MAJOR = @VA_MAJOR@ VA_MINOR = @VA_MINOR@ -VC5_SIMULATOR_CFLAGS = @VC5_SIMULATOR_CFLAGS@ -VC5_SIMULATOR_LIBS = @VC5_SIMULATOR_LIBS@ +VC4_CFLAGS = @VC4_CFLAGS@ +VC4_LIBS = @VC4_LIBS@ VDPAU_CFLAGS = @VDPAU_CFLAGS@ VDPAU_LIBS = @VDPAU_LIBS@ VDPAU_LIB_INSTALL_DIR = @VDPAU_LIB_INSTALL_DIR@ @@ -398,7 +413,11 @@ VL_LIBS = @VL_LIBS@ VULKAN_ICD_INSTALL_DIR = @VULKAN_ICD_INSTALL_DIR@ WAYLAND_CLIENT_CFLAGS = @WAYLAND_CLIENT_CFLAGS@ WAYLAND_CLIENT_LIBS = @WAYLAND_CLIENT_LIBS@ +WAYLAND_EGL_CFLAGS = @WAYLAND_EGL_CFLAGS@ +WAYLAND_EGL_LIBS = @WAYLAND_EGL_LIBS@ +WAYLAND_PROTOCOLS_CFLAGS = @WAYLAND_PROTOCOLS_CFLAGS@ WAYLAND_PROTOCOLS_DATADIR = @WAYLAND_PROTOCOLS_DATADIR@ +WAYLAND_PROTOCOLS_LIBS = @WAYLAND_PROTOCOLS_LIBS@ WAYLAND_SCANNER = @WAYLAND_SCANNER@ WAYLAND_SCANNER_CFLAGS = @WAYLAND_SCANNER_CFLAGS@ WAYLAND_SCANNER_LIBS = @WAYLAND_SCANNER_LIBS@ @@ -408,16 +427,20 @@ WNO_OVERRIDE_INIT = @WNO_OVERRIDE_INIT@ X11_INCLUDES = @X11_INCLUDES@ XA_MAJOR = @XA_MAJOR@ XA_MINOR = @XA_MINOR@ -XA_TINY = @XA_TINY@ +XA_PATCH = @XA_PATCH@ XA_VERSION = @XA_VERSION@ XCB_DRI2_CFLAGS = @XCB_DRI2_CFLAGS@ XCB_DRI2_LIBS = @XCB_DRI2_LIBS@ XCB_DRI3_CFLAGS = @XCB_DRI3_CFLAGS@ XCB_DRI3_LIBS = @XCB_DRI3_LIBS@ -XF86VIDMODE_CFLAGS = @XF86VIDMODE_CFLAGS@ -XF86VIDMODE_LIBS = @XF86VIDMODE_LIBS@ +XCB_DRI3_MODIFIERS_CFLAGS = @XCB_DRI3_MODIFIERS_CFLAGS@ +XCB_DRI3_MODIFIERS_LIBS = @XCB_DRI3_MODIFIERS_LIBS@ +XCB_RANDR_CFLAGS = @XCB_RANDR_CFLAGS@ +XCB_RANDR_LIBS = @XCB_RANDR_LIBS@ XLIBGL_CFLAGS = @XLIBGL_CFLAGS@ XLIBGL_LIBS = @XLIBGL_LIBS@ +XLIB_RANDR_CFLAGS = @XLIB_RANDR_CFLAGS@ +XLIB_RANDR_LIBS = @XLIB_RANDR_LIBS@ XVMC_CFLAGS = @XVMC_CFLAGS@ XVMC_LIBS = @XVMC_LIBS@ XVMC_LIB_INSTALL_DIR = @XVMC_LIB_INSTALL_DIR@ @@ -472,9 +495,13 @@ mandir = @mandir@ mkdir_p = @mkdir_p@ oldincludedir = @oldincludedir@ pdfdir = @pdfdir@ +pkgpyexecdir = @pkgpyexecdir@ 
+pkgpythondir = @pkgpythondir@ prefix = @prefix@ program_transform_name = @program_transform_name@ psdir = @psdir@ +pyexecdir = @pyexecdir@ +pythondir = @pythondir@ sbindir = @sbindir@ sharedstatedir = @sharedstatedir@ srcdir = @srcdir@ @@ -601,7 +628,8 @@ GALLIUM_TARGET_CFLAGS = \ $(VISIBILITY_CFLAGS) GALLIUM_COMMON_LIB_DEPS = -lm $(LIBUNWIND_LIBS) $(LIBSENSORS_LIBS) \ - $(CLOCK_LIB) $(PTHREAD_LIBS) $(DLOPEN_LIBS) $(am__append_1) + $(CLOCK_LIB) $(PTHREAD_LIBS) $(DLOPEN_LIBS) $(am__append_1) \ + $(am__append_2) GALLIUM_WINSYS_CFLAGS = \ -I$(top_srcdir)/src \ -I$(top_srcdir)/include \ @@ -613,26 +641,27 @@ GALLIUM_WINSYS_CFLAGS = \ GALLIUM_PIPE_LOADER_WINSYS_LIBS = \ $(top_builddir)/src/gallium/winsys/sw/null/libws_null.la \ $(top_builddir)/src/gallium/winsys/sw/wrapper/libwsw.la \ - $(am__append_2) $(am__append_3) + $(am__append_3) $(am__append_4) @USE_VC4_SIMULATOR_TRUE@SIM_LDFLAGS = -lsimpenrose AM_CFLAGS = \ -I$(top_builddir)/src/compiler/nir \ -I$(top_srcdir)/include/drm-uapi \ -I$(top_builddir)/src \ - -I$(top_srcdir)/src/broadcom/cle \ + -I$(top_srcdir)/src/broadcom \ + -I$(top_builddir)/src/broadcom \ $(LIBDRM_CFLAGS) \ $(GALLIUM_DRIVER_CFLAGS) \ $(SIM_CFLAGS) \ $(VALGRIND_CFLAGS) \ $() -noinst_LTLIBRARIES = libvc4.la $(am__append_4) +noinst_LTLIBRARIES = libvc4.la $(am__append_5) libvc4_la_SOURCES = $(C_SOURCES) -libvc4_la_LIBADD = $(SIM_LIB) $() $(am__append_5) +libvc4_la_LIBADD = $(SIM_LIB) $() $(am__append_6) @HAVE_ARM_ASM_TRUE@libvc4_neon_la_SOURCES = $(NEON_C_SOURCES) @HAVE_ARM_ASM_TRUE@libvc4_neon_la_CFLAGS = $(AM_CFLAGS) -mfpu=neon libvc4_la_LDFLAGS = $(SIM_LDFLAGS) -EXTRA_DIST = kernel/README +EXTRA_DIST = kernel/README meson.build all: all-am .SUFFIXES: diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_blit.c b/lib/mesa/src/gallium/drivers/vc4/vc4_blit.c index 7f4c76968..d3cc5152a 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_blit.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_blit.c @@ -24,6 +24,7 @@ #include "util/u_format.h" #include "util/u_surface.h" #include "util/u_blitter.h" +#include "compiler/nir/nir_builder.h" #include "vc4_context.h" static struct pipe_surface * @@ -183,6 +184,231 @@ vc4_blitter_save(struct vc4_context *vc4) vc4->fragtex.num_textures, vc4->fragtex.textures); } +static void *vc4_get_yuv_vs(struct pipe_context *pctx) +{ + struct vc4_context *vc4 = vc4_context(pctx); + struct pipe_screen *pscreen = pctx->screen; + + if (vc4->yuv_linear_blit_vs) + return vc4->yuv_linear_blit_vs; + + const struct nir_shader_compiler_options *options = + pscreen->get_compiler_options(pscreen, + PIPE_SHADER_IR_NIR, + PIPE_SHADER_VERTEX); + + nir_builder b; + nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_VERTEX, options); + b.shader->info.name = ralloc_strdup(b.shader, "linear_blit_vs"); + + const struct glsl_type *vec4 = glsl_vec4_type(); + nir_variable *pos_in = nir_variable_create(b.shader, nir_var_shader_in, + vec4, "pos"); + + nir_variable *pos_out = nir_variable_create(b.shader, nir_var_shader_out, + vec4, "gl_Position"); + pos_out->data.location = VARYING_SLOT_POS; + + nir_store_var(&b, pos_out, nir_load_var(&b, pos_in), 0xf); + + struct pipe_shader_state shader_tmpl = { + .type = PIPE_SHADER_IR_NIR, + .ir.nir = b.shader, + }; + + vc4->yuv_linear_blit_vs = pctx->create_vs_state(pctx, &shader_tmpl); + + return vc4->yuv_linear_blit_vs; +} + +static void *vc4_get_yuv_fs(struct pipe_context *pctx, int cpp) +{ + struct vc4_context *vc4 = vc4_context(pctx); + struct pipe_screen *pscreen = pctx->screen; + struct pipe_shader_state **cached_shader; + const char 
*name; + + if (cpp == 1) { + cached_shader = &vc4->yuv_linear_blit_fs_8bit; + name = "linear_blit_8bit_fs"; + } else { + cached_shader = &vc4->yuv_linear_blit_fs_16bit; + name = "linear_blit_16bit_fs"; + } + + if (*cached_shader) + return *cached_shader; + + const struct nir_shader_compiler_options *options = + pscreen->get_compiler_options(pscreen, + PIPE_SHADER_IR_NIR, + PIPE_SHADER_FRAGMENT); + + nir_builder b; + nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, options); + b.shader->info.name = ralloc_strdup(b.shader, name); + + const struct glsl_type *vec4 = glsl_vec4_type(); + const struct glsl_type *glsl_int = glsl_int_type(); + + nir_variable *color_out = nir_variable_create(b.shader, nir_var_shader_out, + vec4, "f_color"); + color_out->data.location = FRAG_RESULT_COLOR; + + nir_variable *pos_in = nir_variable_create(b.shader, nir_var_shader_in, + vec4, "pos"); + pos_in->data.location = VARYING_SLOT_POS; + nir_ssa_def *pos = nir_load_var(&b, pos_in); + + nir_ssa_def *one = nir_imm_int(&b, 1); + nir_ssa_def *two = nir_imm_int(&b, 2); + + nir_ssa_def *x = nir_f2i32(&b, nir_channel(&b, pos, 0)); + nir_ssa_def *y = nir_f2i32(&b, nir_channel(&b, pos, 1)); + + nir_variable *stride_in = nir_variable_create(b.shader, nir_var_uniform, + glsl_int, "stride"); + nir_ssa_def *stride = nir_load_var(&b, stride_in); + + nir_ssa_def *x_offset; + nir_ssa_def *y_offset; + if (cpp == 1) { + nir_ssa_def *intra_utile_x_offset = + nir_ishl(&b, nir_iand(&b, x, one), two); + nir_ssa_def *inter_utile_x_offset = + nir_ishl(&b, nir_iand(&b, x, nir_imm_int(&b, ~3)), one); + + x_offset = nir_iadd(&b, + intra_utile_x_offset, + inter_utile_x_offset); + y_offset = nir_imul(&b, + nir_iadd(&b, + nir_ishl(&b, y, one), + nir_ushr(&b, nir_iand(&b, x, two), one)), + stride); + } else { + x_offset = nir_ishl(&b, x, two); + y_offset = nir_imul(&b, y, stride); + } + + nir_intrinsic_instr *load = + nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ubo); + load->num_components = 1; + nir_ssa_dest_init(&load->instr, &load->dest, load->num_components, 32, NULL); + load->src[0] = nir_src_for_ssa(one); + load->src[1] = nir_src_for_ssa(nir_iadd(&b, x_offset, y_offset)); + nir_builder_instr_insert(&b, &load->instr); + + nir_store_var(&b, color_out, + nir_unpack_unorm_4x8(&b, &load->dest.ssa), + 0xf); + + struct pipe_shader_state shader_tmpl = { + .type = PIPE_SHADER_IR_NIR, + .ir.nir = b.shader, + }; + + *cached_shader = pctx->create_fs_state(pctx, &shader_tmpl); + + return *cached_shader; +} + +static bool +vc4_yuv_blit(struct pipe_context *pctx, const struct pipe_blit_info *info) +{ + struct vc4_context *vc4 = vc4_context(pctx); + struct vc4_resource *src = vc4_resource(info->src.resource); + struct vc4_resource *dst = vc4_resource(info->dst.resource); + bool ok; + + if (src->tiled) + return false; + if (src->base.format != PIPE_FORMAT_R8_UNORM && + src->base.format != PIPE_FORMAT_R8G8_UNORM) + return false; + + /* YUV blits always turn raster-order to tiled */ + assert(dst->base.format == src->base.format); + assert(dst->tiled); + + /* Always 1:1 and at the origin */ + assert(info->src.box.x == 0 && info->dst.box.x == 0); + assert(info->src.box.y == 0 && info->dst.box.y == 0); + assert(info->src.box.width == info->dst.box.width); + assert(info->src.box.height == info->dst.box.height); + + if ((src->slices[info->src.level].offset & 3) || + (src->slices[info->src.level].stride & 3)) { + perf_debug("YUV-blit src texture offset/stride misaligned: 0x%08x/%d\n", + src->slices[info->src.level].offset, + 
src->slices[info->src.level].stride); + goto fallback; + } + + vc4_blitter_save(vc4); + + /* Create a renderable surface mapping the T-tiled shadow buffer. + */ + struct pipe_surface dst_tmpl; + util_blitter_default_dst_texture(&dst_tmpl, info->dst.resource, + info->dst.level, info->dst.box.z); + dst_tmpl.format = PIPE_FORMAT_RGBA8888_UNORM; + struct pipe_surface *dst_surf = + pctx->create_surface(pctx, info->dst.resource, &dst_tmpl); + if (!dst_surf) { + fprintf(stderr, "Failed to create YUV dst surface\n"); + util_blitter_unset_running_flag(vc4->blitter); + return false; + } + dst_surf->width /= 2; + if (dst->cpp == 1) + dst_surf->height /= 2; + + /* Set the constant buffer. */ + uint32_t stride = src->slices[info->src.level].stride; + struct pipe_constant_buffer cb_uniforms = { + .user_buffer = &stride, + .buffer_size = sizeof(stride), + }; + pctx->set_constant_buffer(pctx, PIPE_SHADER_FRAGMENT, 0, &cb_uniforms); + struct pipe_constant_buffer cb_src = { + .buffer = info->src.resource, + .buffer_offset = src->slices[info->src.level].offset, + .buffer_size = (src->bo->size - + src->slices[info->src.level].offset), + }; + pctx->set_constant_buffer(pctx, PIPE_SHADER_FRAGMENT, 1, &cb_src); + + /* Unbind the textures, to make sure we don't try to recurse into the + * shadow blit. + */ + pctx->set_sampler_views(pctx, PIPE_SHADER_FRAGMENT, 0, 0, NULL); + pctx->bind_sampler_states(pctx, PIPE_SHADER_FRAGMENT, 0, 0, NULL); + + util_blitter_custom_shader(vc4->blitter, dst_surf, + vc4_get_yuv_vs(pctx), + vc4_get_yuv_fs(pctx, src->cpp)); + + util_blitter_restore_textures(vc4->blitter); + util_blitter_restore_constant_buffer_state(vc4->blitter); + /* Restore cb1 (util_blitter doesn't handle this one). */ + struct pipe_constant_buffer cb_disabled = { 0 }; + pctx->set_constant_buffer(pctx, PIPE_SHADER_FRAGMENT, 1, &cb_disabled); + + pipe_surface_reference(&dst_surf, NULL); + + return true; + +fallback: + /* Do an immediate SW fallback, since the render blit path + * would just recurse. + */ + ok = util_try_blit_via_copy_region(pctx, info); + assert(ok); (void)ok; + + return true; +} + static bool vc4_render_blit(struct pipe_context *ctx, struct pipe_blit_info *info) { @@ -218,6 +444,9 @@ vc4_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info) { struct pipe_blit_info info = *blit_info; + if (vc4_yuv_blit(pctx, blit_info)) + return; + if (vc4_tile_blit(pctx, blit_info)) return; diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c b/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c index d06d55f86..54f9d9c26 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c @@ -30,6 +30,7 @@ #include "util/u_hash_table.h" #include "util/u_memory.h" +#include "util/u_string.h" #include "util/ralloc.h" #include "vc4_context.h" @@ -49,6 +50,13 @@ static void vc4_bo_cache_free_all(struct vc4_bo_cache *cache); void +vc4_bo_debug_describe(char* buf, const struct vc4_bo *ptr) +{ + util_sprintf(buf, "vc4_bo<%s,%u,%u>", ptr->name ? ptr->name : "?", + ptr->handle, ptr->size); +} + +void vc4_bo_label(struct vc4_screen *screen, struct vc4_bo *bo, const char *fmt, ...) 
{ /* Perform BO labeling by default on debug builds (so that you get @@ -113,35 +121,105 @@ vc4_bo_remove_from_cache(struct vc4_bo_cache *cache, struct vc4_bo *bo) cache->bo_size -= bo->size; } +static void vc4_bo_purgeable(struct vc4_bo *bo) +{ + struct drm_vc4_gem_madvise arg = { + .handle = bo->handle, + .madv = VC4_MADV_DONTNEED, + }; + + if (bo->screen->has_madvise) + vc4_ioctl(bo->screen->fd, DRM_IOCTL_VC4_GEM_MADVISE, &arg); +} + +static bool vc4_bo_unpurgeable(struct vc4_bo *bo) +{ + struct drm_vc4_gem_madvise arg = { + .handle = bo->handle, + .madv = VC4_MADV_WILLNEED, + }; + + if (!bo->screen->has_madvise) + return true; + + if (vc4_ioctl(bo->screen->fd, DRM_IOCTL_VC4_GEM_MADVISE, &arg)) + return false; + + return arg.retained; +} + +static void +vc4_bo_free(struct vc4_bo *bo) +{ + struct vc4_screen *screen = bo->screen; + + if (bo->map) { + if (using_vc4_simulator && bo->name && + strcmp(bo->name, "winsys") == 0) { + free(bo->map); + } else { + munmap(bo->map, bo->size); + VG(VALGRIND_FREELIKE_BLOCK(bo->map, 0)); + } + } + + struct drm_gem_close c; + memset(&c, 0, sizeof(c)); + c.handle = bo->handle; + int ret = vc4_ioctl(screen->fd, DRM_IOCTL_GEM_CLOSE, &c); + if (ret != 0) + fprintf(stderr, "close object %d: %s\n", bo->handle, strerror(errno)); + + screen->bo_count--; + screen->bo_size -= bo->size; + + if (dump_stats) { + fprintf(stderr, "Freed %s%s%dkb:\n", + bo->name ? bo->name : "", + bo->name ? " " : "", + bo->size / 1024); + vc4_bo_dump_stats(screen); + } + + free(bo); +} + static struct vc4_bo * vc4_bo_from_cache(struct vc4_screen *screen, uint32_t size, const char *name) { struct vc4_bo_cache *cache = &screen->bo_cache; uint32_t page_index = size / 4096 - 1; + struct vc4_bo *iter, *tmp, *bo = NULL; if (cache->size_list_size <= page_index) return NULL; - struct vc4_bo *bo = NULL; mtx_lock(&cache->lock); - if (!list_empty(&cache->size_list[page_index])) { - bo = LIST_ENTRY(struct vc4_bo, cache->size_list[page_index].next, - size_list); - - /* Check that the BO has gone idle. If not, then we want to - * allocate something new instead, since we assume that the - * user will proceed to CPU map it and fill it with stuff. + LIST_FOR_EACH_ENTRY_SAFE(iter, tmp, &cache->size_list[page_index], + size_list) { + /* Check that the BO has gone idle. If not, then none of the + * other BOs (pushed to the list after later rendering) are + * likely to be idle, either. */ - if (!vc4_bo_wait(bo, 0, NULL)) { - mtx_unlock(&cache->lock); - return NULL; - } + if (!vc4_bo_wait(iter, 0, NULL)) + break; + + if (!vc4_bo_unpurgeable(iter)) { + /* The BO has been purged. Free it and try to find + * another one in the cache. 
+ */ + vc4_bo_remove_from_cache(cache, iter); + vc4_bo_free(iter); + continue; + } + bo = iter; pipe_reference_init(&bo->reference, 1); vc4_bo_remove_from_cache(cache, bo); vc4_bo_label(screen, bo, "%s", name); bo->name = name; + break; } mtx_unlock(&cache->lock); return bo; @@ -221,42 +299,6 @@ vc4_bo_last_unreference(struct vc4_bo *bo) } static void -vc4_bo_free(struct vc4_bo *bo) -{ - struct vc4_screen *screen = bo->screen; - - if (bo->map) { - if (using_vc4_simulator && bo->name && - strcmp(bo->name, "winsys") == 0) { - free(bo->map); - } else { - munmap(bo->map, bo->size); - VG(VALGRIND_FREELIKE_BLOCK(bo->map, 0)); - } - } - - struct drm_gem_close c; - memset(&c, 0, sizeof(c)); - c.handle = bo->handle; - int ret = vc4_ioctl(screen->fd, DRM_IOCTL_GEM_CLOSE, &c); - if (ret != 0) - fprintf(stderr, "close object %d: %s\n", bo->handle, strerror(errno)); - - screen->bo_count--; - screen->bo_size -= bo->size; - - if (dump_stats) { - fprintf(stderr, "Freed %s%s%dkb:\n", - bo->name ? bo->name : "", - bo->name ? " " : "", - bo->size / 1024); - vc4_bo_dump_stats(screen); - } - - free(bo); -} - -static void free_stale_bos(struct vc4_screen *screen, time_t time) { struct vc4_bo_cache *cache = &screen->bo_cache; @@ -325,6 +367,7 @@ vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time) cache->size_list_size = page_index + 1; } + vc4_bo_purgeable(bo); bo->free_time = time; list_addtail(&bo->size_list, &cache->size_list[page_index]); list_addtail(&bo->time_list, &cache->time_list); @@ -354,7 +397,7 @@ vc4_bo_open_handle(struct vc4_screen *screen, bo = util_hash_table_get(screen->bo_handles, (void*)(uintptr_t)handle); if (bo) { - pipe_reference(NULL, &bo->reference); + vc4_bo_reference(bo); goto done; } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h b/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h index 4e7b23e08..9fa477442 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h @@ -39,6 +39,14 @@ struct vc4_bo { uint32_t handle; uint32_t size; + /* This will be read/written by multiple threads without a lock -- you + * should take a snapshot and use it to see if you happen to be in the + * CL's handles at this position, to make most lookups O(1). It's + * volatile to make sure that the compiler doesn't emit multiple loads + * from the address, which would make the lookup racy. + */ + volatile uint32_t last_hindex; + /** Entry in the linked list of buffers freed, by age. */ struct list_head time_list; /** Entry in the per-page-count linked list of buffers freed (by age). 
*/ @@ -65,18 +73,13 @@ struct vc4_bo *vc4_bo_open_dmabuf(struct vc4_screen *screen, int fd, bool vc4_bo_flink(struct vc4_bo *bo, uint32_t *name); int vc4_bo_get_dmabuf(struct vc4_bo *bo); -static inline void -vc4_bo_set_reference(struct vc4_bo **old_bo, struct vc4_bo *new_bo) -{ - if (pipe_reference(&(*old_bo)->reference, &new_bo->reference)) - vc4_bo_last_unreference(*old_bo); - *old_bo = new_bo; -} - +void vc4_bo_debug_describe(char* buf, const struct vc4_bo *ptr); static inline struct vc4_bo * vc4_bo_reference(struct vc4_bo *bo) { - pipe_reference(NULL, &bo->reference); + pipe_reference_described(NULL, &bo->reference, + (debug_reference_descriptor) + vc4_bo_debug_describe); return bo; } @@ -89,13 +92,18 @@ vc4_bo_unreference(struct vc4_bo **bo) if ((*bo)->private) { /* Avoid the mutex for private BOs */ - if (pipe_reference(&(*bo)->reference, NULL)) + if (pipe_reference_described(&(*bo)->reference, NULL, + (debug_reference_descriptor) + vc4_bo_debug_describe)) { vc4_bo_last_unreference(*bo); + } } else { screen = (*bo)->screen; mtx_lock(&screen->bo_handles_mutex); - if (pipe_reference(&(*bo)->reference, NULL)) { + if (pipe_reference_described(&(*bo)->reference, NULL, + (debug_reference_descriptor) + vc4_bo_debug_describe)) { util_hash_table_remove(screen->bo_handles, (void *)(uintptr_t)(*bo)->handle); vc4_bo_last_unreference(*bo); @@ -113,8 +121,11 @@ vc4_bo_unreference_locked_timed(struct vc4_bo **bo, time_t time) if (!*bo) return; - if (pipe_reference(&(*bo)->reference, NULL)) + if (pipe_reference_described(&(*bo)->reference, NULL, + (debug_reference_descriptor) + vc4_bo_debug_describe)) { vc4_bo_last_unreference_locked_timed(*bo, time); + } *bo = NULL; } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_cl.c b/lib/mesa/src/gallium/drivers/vc4/vc4_cl.c index 508281a27..7ae092ebc 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_cl.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_cl.c @@ -61,10 +61,19 @@ vc4_gem_hindex(struct vc4_job *job, struct vc4_bo *bo) { uint32_t hindex; uint32_t *current_handles = job->bo_handles.base; + uint32_t cl_hindex_count = cl_offset(&job->bo_handles) / 4; + uint32_t last_hindex = bo->last_hindex; /* volatile read! */ - for (hindex = 0; hindex < cl_offset(&job->bo_handles) / 4; hindex++) { - if (current_handles[hindex] == bo->handle) + if (last_hindex < cl_hindex_count && + current_handles[last_hindex] == bo->handle) { + return last_hindex; + } + + for (hindex = 0; hindex < cl_hindex_count; hindex++) { + if (current_handles[hindex] == bo->handle) { + bo->last_hindex = hindex; return hindex; + } } struct vc4_cl_out *out; @@ -79,5 +88,6 @@ vc4_gem_hindex(struct vc4_job *job, struct vc4_bo *bo) job->bo_space += bo->size; + bo->last_hindex = hindex; return hindex; } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_cl.h b/lib/mesa/src/gallium/drivers/vc4/vc4_cl.h index 8df9dbfe6..39d1d347b 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_cl.h +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_cl.h @@ -159,21 +159,6 @@ cl_aligned_f(struct vc4_cl_out **cl, float f) cl_aligned_u32(cl, fui(f)); } -static inline void -cl_start_reloc(struct vc4_cl *cl, struct vc4_cl_out **out, uint32_t n) -{ - assert(n == 1 || n == 2); - assert(cl->reloc_count == 0); -#ifndef NDEBUG - cl->reloc_count = n; -#endif - - cl_u8(out, VC4_PACKET_GEM_HANDLES); - cl->reloc_next = *out; - cl_u32(out, 0); /* Space where hindex will be written. */ - cl_u32(out, 0); /* Space where hindex will be written. 
*/ -} - static inline struct vc4_cl_out * cl_start_shader_reloc(struct vc4_cl *cl, uint32_t n) { diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_cl_dump.c b/lib/mesa/src/gallium/drivers/vc4/vc4_cl_dump.c index ca1b9a315..a6ae0cf80 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_cl_dump.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_cl_dump.c @@ -28,6 +28,7 @@ #include "kernel/vc4_packet.h" #include "broadcom/cle/v3d_decoder.h" +#include "broadcom/clif/clif_dump.h" void vc4_dump_cl(void *cl, uint32_t size, bool is_render) @@ -41,6 +42,8 @@ vc4_dump_cl(void *cl, uint32_t size, bool is_render) }; struct v3d_spec *spec = v3d_spec_load(&devinfo); + struct clif_dump *clif = clif_dump_init(&devinfo, stderr, true); + uint32_t offset = 0, hw_offset = 0; uint8_t *p = cl; @@ -60,7 +63,7 @@ vc4_dump_cl(void *cl, uint32_t size, bool is_render) fprintf(stderr, "0x%08x 0x%08x: 0x%02x %s\n", offset, hw_offset, header, v3d_group_get_name(inst)); - v3d_print_group(stderr, inst, offset, p, ""); + v3d_print_group(clif, inst, offset, p); switch (header) { case VC4_PACKET_HALT: @@ -75,5 +78,7 @@ vc4_dump_cl(void *cl, uint32_t size, bool is_render) hw_offset += length; p += length; } + + clif_dump_destroy(clif); } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_context.c b/lib/mesa/src/gallium/drivers/vc4/vc4_context.c index a9e7ff91f..ffd7d4c85 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_context.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_context.c @@ -42,7 +42,6 @@ vc4_flush(struct pipe_context *pctx) { struct vc4_context *vc4 = vc4_context(pctx); - struct hash_entry *entry; hash_table_foreach(vc4->jobs, entry) { struct vc4_job *job = entry->data; vc4_job_submit(vc4, job); @@ -59,8 +58,17 @@ vc4_pipe_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence, if (fence) { struct pipe_screen *screen = pctx->screen; + int fd = -1; + + if (flags & PIPE_FLUSH_FENCE_FD) { + /* The vc4_fence takes ownership of the returned fd. */ + drmSyncobjExportSyncFile(vc4->fd, vc4->job_syncobj, + &fd); + } + struct vc4_fence *f = vc4_fence_create(vc4->screen, - vc4->last_emit_seqno); + vc4->last_emit_seqno, + fd); screen->fence_reference(screen, fence, NULL); *fence = (struct pipe_fence_handle *)f; } @@ -115,8 +123,22 @@ vc4_context_destroy(struct pipe_context *pctx) pipe_surface_reference(&vc4->framebuffer.cbufs[0], NULL); pipe_surface_reference(&vc4->framebuffer.zsbuf, NULL); + if (vc4->yuv_linear_blit_vs) + pctx->delete_vs_state(pctx, vc4->yuv_linear_blit_vs); + if (vc4->yuv_linear_blit_fs_8bit) + pctx->delete_fs_state(pctx, vc4->yuv_linear_blit_fs_8bit); + if (vc4->yuv_linear_blit_fs_16bit) + pctx->delete_fs_state(pctx, vc4->yuv_linear_blit_fs_16bit); + vc4_program_fini(pctx); + if (vc4->screen->has_syncobj) { + drmSyncobjDestroy(vc4->fd, vc4->job_syncobj); + drmSyncobjDestroy(vc4->fd, vc4->in_syncobj); + } + if (vc4->in_fence_fd >= 0) + close(vc4->in_fence_fd); + ralloc_free(vc4); } @@ -125,6 +147,7 @@ vc4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) { struct vc4_screen *screen = vc4_screen(pscreen); struct vc4_context *vc4; + int err; /* Prevent dumping of the shaders built during context setup. 
*/ uint32_t saved_shaderdb_flag = vc4_debug & VC4_DEBUG_SHADERDB; @@ -150,10 +173,16 @@ vc4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) vc4_query_init(pctx); vc4_resource_context_init(pctx); - vc4_job_init(vc4); - vc4->fd = screen->fd; + err = vc4_job_init(vc4); + if (err) + goto fail; + + err = vc4_fence_context_init(vc4); + if (err) + goto fail; + slab_create_child(&vc4->transfer_pool, &screen->transfer_pool); vc4->uploader = u_upload_create_default(&vc4->base); diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_context.h b/lib/mesa/src/gallium/drivers/vc4/vc4_context.h index 4a1e4093f..ce8bcffac 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_context.h +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_context.h @@ -78,6 +78,7 @@ #define VC4_DIRTY_COMPILED_VS (1 << 24) #define VC4_DIRTY_COMPILED_FS (1 << 25) #define VC4_DIRTY_FS_INPUTS (1 << 26) +#define VC4_DIRTY_UBO_1_SIZE (1 << 27) struct vc4_sampler_view { struct pipe_sampler_view base; @@ -219,6 +220,13 @@ struct vc4_job_key { struct pipe_surface *zsbuf; }; +struct vc4_hwperfmon { + uint32_t id; + uint64_t last_seqno; + uint8_t events[DRM_VC4_MAX_PERF_COUNTERS]; + uint64_t counters[DRM_VC4_MAX_PERF_COUNTERS]; +}; + /** * A complete bin/render job. * @@ -243,6 +251,9 @@ struct vc4_job { */ uint32_t bo_space; + /* Last BO hindex referenced from VC4_PACKET_GEM_HANDLES. */ + uint32_t last_gem_handle_hindex; + /** @{ Surfaces to submit rendering for. */ struct pipe_surface *color_read; struct pipe_surface *color_write; @@ -306,6 +317,9 @@ struct vc4_job { /** Any flags to be passed in drm_vc4_submit_cl.flags. */ uint32_t flags; + /* Performance monitor attached to this job. */ + struct vc4_hwperfmon *perfmon; + struct vc4_job_key key; }; @@ -363,6 +377,10 @@ struct vc4_context { struct u_upload_mgr *uploader; + struct pipe_shader_state *yuv_linear_blit_vs; + struct pipe_shader_state *yuv_linear_blit_fs_8bit; + struct pipe_shader_state *yuv_linear_blit_fs_16bit; + /** @{ Current pipeline state objects */ struct pipe_scissor_state scissor; struct pipe_blend_state *blend; @@ -387,7 +405,16 @@ struct vc4_context { struct pipe_viewport_state viewport; struct vc4_constbuf_stateobj constbuf[PIPE_SHADER_TYPES]; struct vc4_vertexbuf_stateobj vertexbuf; + + struct vc4_hwperfmon *perfmon; /** @} */ + + /** Handle of syncobj containing the last submitted job fence. */ + uint32_t job_syncobj; + + int in_fence_fd; + /** Handle of the syncobj that holds in_fence_fd for submission. 
*/ + uint32_t in_syncobj; }; struct vc4_rasterizer_state { @@ -444,6 +471,12 @@ vc4_sampler_state(struct pipe_sampler_state *psampler) return (struct vc4_sampler_state *)psampler; } +int vc4_get_driver_query_group_info(struct pipe_screen *pscreen, + unsigned index, + struct pipe_driver_query_group_info *info); +int vc4_get_driver_query_info(struct pipe_screen *pscreen, unsigned index, + struct pipe_driver_query_info *info); + struct pipe_context *vc4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags); void vc4_draw_init(struct pipe_context *pctx); @@ -476,7 +509,8 @@ void vc4_write_uniforms(struct vc4_context *vc4, struct vc4_texture_stateobj *texstate); void vc4_flush(struct pipe_context *pctx); -void vc4_job_init(struct vc4_context *vc4); +int vc4_job_init(struct vc4_context *vc4); +int vc4_fence_context_init(struct vc4_context *vc4); struct vc4_job *vc4_get_job(struct vc4_context *vc4, struct pipe_surface *cbuf, struct pipe_surface *zsbuf); diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_draw.c b/lib/mesa/src/gallium/drivers/vc4/vc4_draw.c index 556855420..06785516c 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_draw.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_draw.c @@ -40,7 +40,7 @@ vc4_get_draw_cl_space(struct vc4_job *job, int vert_count) /* The SW-5891 workaround may cause us to emit multiple shader recs * and draw packets. */ - int num_draws = DIV_ROUND_UP(vert_count, 65535) + 1; + int num_draws = DIV_ROUND_UP(vert_count, 65535 - 2) + 1; /* Binner gets our packet state -- vc4_emit.c contents, * and the primitive itself. @@ -222,6 +222,8 @@ vc4_emit_gl_shader_state(struct vc4_context *vc4, attr.coordinate_shader_vpm_offset = 0; attr.vertex_shader_vpm_offset = 0; } + + vc4_bo_unreference(&bo); } cl_emit(&job->bcl, GL_SHADER_STATE, shader_state) { @@ -286,6 +288,7 @@ static void vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) { struct vc4_context *vc4 = vc4_context(pctx); + struct pipe_draw_info local_info; if (!info->count_from_stream_output && !info->indirect && !info->primitive_restart && @@ -293,11 +296,19 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) return; if (info->mode >= PIPE_PRIM_QUADS) { - util_primconvert_save_rasterizer_state(vc4->primconvert, &vc4->rasterizer->base); - util_primconvert_draw_vbo(vc4->primconvert, info); - perf_debug("Fallback conversion for %d %s vertices\n", - info->count, u_prim_name(info->mode)); - return; + if (info->mode == PIPE_PRIM_QUADS && + info->count == 4 && + !vc4->rasterizer->base.flatshade) { + local_info = *info; + local_info.mode = PIPE_PRIM_TRIANGLE_FAN; + info = &local_info; + } else { + util_primconvert_save_rasterizer_state(vc4->primconvert, &vc4->rasterizer->base); + util_primconvert_draw_vbo(vc4->primconvert, info); + perf_debug("Fallback conversion for %d %s vertices\n", + info->count, u_prim_name(info->mode)); + return; + } } /* Before setting up the draw, do any fixup blits necessary. */ @@ -377,7 +388,25 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) struct vc4_resource *rsc = vc4_resource(prsc); struct vc4_cl_out *bcl = cl_start(&job->bcl); - cl_start_reloc(&job->bcl, &bcl, 1); + + /* The original design for the VC4 kernel UABI had multiple + * packets that used relocations in the BCL (some of which + * needed two BOs), but later modifications eliminated all but + * this one usage. 
We have an arbitrary 32-bit offset value, + * and need to also supply an arbitrary 32-bit index buffer + * GEM handle, so we have this fake packet we emit in our BCL + * to be validated, which the kernel uses at validation time + * to perform the relocation in the IB packet (without + * emitting to the actual HW). + */ + uint32_t hindex = vc4_gem_hindex(job, rsc->bo); + if (job->last_gem_handle_hindex != hindex) { + cl_u8(&bcl, VC4_PACKET_GEM_HANDLES); + cl_u32(&bcl, hindex); + cl_u32(&bcl, 0); + job->last_gem_handle_hindex = hindex; + } + cl_u8(&bcl, VC4_PACKET_GL_INDEXED_PRIMITIVE); cl_u8(&bcl, info->mode | @@ -385,8 +414,9 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) VC4_INDEX_BUFFER_U16: VC4_INDEX_BUFFER_U8)); cl_u32(&bcl, info->count); - cl_reloc(job, &job->bcl, &bcl, rsc->bo, offset); + cl_u32(&bcl, offset); cl_u32(&bcl, vc4->max_index); + cl_end(&job->bcl, bcl); job->draw_calls_queued++; diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_job.c b/lib/mesa/src/gallium/drivers/vc4/vc4_job.c index 7fe20c16b..f38c46475 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_job.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_job.c @@ -90,6 +90,11 @@ vc4_job_create(struct vc4_context *vc4) job->draw_max_x = 0; job->draw_max_y = 0; + job->last_gem_handle_hindex = ~0; + + if (vc4->perfmon) + job->perfmon = vc4->perfmon; + return job; } @@ -113,7 +118,6 @@ vc4_flush_jobs_reading_resource(struct vc4_context *vc4, vc4_flush_jobs_writing_resource(vc4, prsc); - struct hash_entry *entry; hash_table_foreach(vc4->jobs, entry) { struct vc4_job *job = entry->data; @@ -453,6 +457,8 @@ vc4_job_submit(struct vc4_context *vc4, struct vc4_job *job) submit.shader_rec_count = job->shader_rec_count; submit.uniforms = (uintptr_t)job->uniforms.base; submit.uniforms_size = cl_offset(&job->uniforms); + if (job->perfmon) + submit.perfmonid = job->perfmon->id; assert(job->draw_min_x != ~0 && job->draw_min_y != ~0); submit.min_x_tile = job->draw_min_x / job->tile_width; @@ -470,6 +476,19 @@ vc4_job_submit(struct vc4_context *vc4, struct vc4_job *job) } submit.flags |= job->flags; + if (vc4->screen->has_syncobj) { + submit.out_sync = vc4->job_syncobj; + + if (vc4->in_fence_fd >= 0) { + /* This replaces the fence in the syncobj. */ + drmSyncobjImportSyncFile(vc4->fd, vc4->in_syncobj, + vc4->in_fence_fd); + submit.in_sync = vc4->in_syncobj; + close(vc4->in_fence_fd); + vc4->in_fence_fd = -1; + } + } + if (!(vc4_debug & VC4_DEBUG_NORAST)) { int ret; @@ -485,6 +504,8 @@ vc4_job_submit(struct vc4_context *vc4, struct vc4_job *job) warned = true; } else if (!ret) { vc4->last_emit_seqno = submit.seqno; + if (job->perfmon) + job->perfmon->last_seqno = submit.seqno; } } @@ -521,7 +542,7 @@ vc4_job_hash(const void *key) return _mesa_hash_data(key, sizeof(struct vc4_job_key)); } -void +int vc4_job_init(struct vc4_context *vc4) { vc4->jobs = _mesa_hash_table_create(vc4, @@ -530,5 +551,24 @@ vc4_job_init(struct vc4_context *vc4) vc4->write_jobs = _mesa_hash_table_create(vc4, _mesa_hash_pointer, _mesa_key_pointer_equal); + + if (vc4->screen->has_syncobj) { + /* Create the syncobj as signaled since with no job executed + * there is nothing to wait on. + */ + int ret = drmSyncobjCreate(vc4->fd, + DRM_SYNCOBJ_CREATE_SIGNALED, + &vc4->job_syncobj); + if (ret) { + /* If the screen indicated syncobj support, we should + * be able to create a signaled syncobj. + * At this point it is too late to pretend the screen + * has no syncobj support. 
+ */ + return ret; + } + } + + return 0; } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_program.c b/lib/mesa/src/gallium/drivers/vc4/vc4_program.c index 98cdfdf33..bc9bd76ae 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_program.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_program.c @@ -38,6 +38,7 @@ #include "vc4_context.h" #include "vc4_qpu.h" #include "vc4_qir.h" +#include "mesa/state_tracker/st_glsl_types.h" static struct qreg ntq_get_src(struct vc4_compile *c, nir_src src, int i); @@ -50,6 +51,12 @@ type_size(const struct glsl_type *type) return glsl_count_attribute_slots(type, false); } +static int +uniforms_type_size(const struct glsl_type *type) +{ + return st_glsl_storage_type_size(type, false); +} + static void resize_qreg_array(struct vc4_compile *c, struct qreg **regs, @@ -137,6 +144,32 @@ indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr) return qir_TEX_RESULT(c); } +static struct qreg +vc4_ubo_load(struct vc4_compile *c, nir_intrinsic_instr *intr) +{ + nir_const_value *buffer_index = + nir_src_as_const_value(intr->src[0]); + assert(buffer_index->u32[0] == 1); + assert(c->stage == QSTAGE_FRAG); + + struct qreg offset = ntq_get_src(c, intr->src[1], 0); + + /* Clamp to [0, array size). Note that MIN/MAX are signed. */ + offset = qir_MAX(c, offset, qir_uniform_ui(c, 0)); + offset = qir_MIN_NOIMM(c, offset, + qir_uniform_ui(c, c->fs_key->ubo_1_size - 4)); + + qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0), + offset, + qir_uniform(c, QUNIFORM_UBO_ADDR, buffer_index->u32[0])); + + c->num_texture_samples++; + + ntq_emit_thrsw(c); + + return qir_TEX_RESULT(c); +} + nir_ssa_def * vc4_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz) { @@ -287,7 +320,7 @@ static struct qreg ntq_get_alu_src(struct vc4_compile *c, nir_alu_instr *instr, unsigned src) { - assert(util_is_power_of_two(instr->dest.write_mask)); + assert(util_is_power_of_two_or_zero(instr->dest.write_mask)); unsigned chan = ffs(instr->dest.write_mask) - 1; struct qreg r = ntq_get_src(c, instr->src[src].src, instr->src[src].swizzle[chan]); @@ -654,24 +687,44 @@ ntq_fceil(struct vc4_compile *c, struct qreg src) } static struct qreg +ntq_shrink_sincos_input_range(struct vc4_compile *c, struct qreg x) +{ + /* Since we're using a Taylor approximation, we want to have a small + * number of coefficients and take advantage of sin/cos repeating + * every 2pi. We keep our x as close to 0 as we can, since the series + * will be less accurate as |x| increases. (Also, be careful of + * shifting the input x value to be tricky with sin/cos relations, + * because getting accurate values for x==0 is very important for SDL + * rendering) + */ + struct qreg scaled_x = + qir_FMUL(c, x, + qir_uniform_f(c, 1.0f / (M_PI * 2.0f))); + /* Note: FTOI truncates toward 0. 
*/ + struct qreg x_frac = qir_FSUB(c, scaled_x, + qir_ITOF(c, qir_FTOI(c, scaled_x))); + /* Map [0.5, 1] to [-0.5, 0] */ + qir_SF(c, qir_FSUB(c, x_frac, qir_uniform_f(c, 0.5))); + qir_FSUB_dest(c, x_frac, x_frac, qir_uniform_f(c, 1.0))->cond = QPU_COND_NC; + /* Map [-1, -0.5] to [0, 0.5] */ + qir_SF(c, qir_FADD(c, x_frac, qir_uniform_f(c, 0.5))); + qir_FADD_dest(c, x_frac, x_frac, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS; + + return x_frac; +} + +static struct qreg ntq_fsin(struct vc4_compile *c, struct qreg src) { float coeff[] = { - -2.0 * M_PI, - pow(2.0 * M_PI, 3) / (3 * 2 * 1), - -pow(2.0 * M_PI, 5) / (5 * 4 * 3 * 2 * 1), - pow(2.0 * M_PI, 7) / (7 * 6 * 5 * 4 * 3 * 2 * 1), - -pow(2.0 * M_PI, 9) / (9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1), + 2.0 * M_PI, + -pow(2.0 * M_PI, 3) / (3 * 2 * 1), + pow(2.0 * M_PI, 5) / (5 * 4 * 3 * 2 * 1), + -pow(2.0 * M_PI, 7) / (7 * 6 * 5 * 4 * 3 * 2 * 1), + pow(2.0 * M_PI, 9) / (9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1), }; - struct qreg scaled_x = - qir_FMUL(c, - src, - qir_uniform_f(c, 1.0 / (M_PI * 2.0))); - - struct qreg x = qir_FADD(c, - ntq_ffract(c, scaled_x), - qir_uniform_f(c, -0.5)); + struct qreg x = ntq_shrink_sincos_input_range(c, src); struct qreg x2 = qir_FMUL(c, x, x); struct qreg sum = qir_FMUL(c, x, qir_uniform_f(c, coeff[0])); for (int i = 1; i < ARRAY_SIZE(coeff); i++) { @@ -689,21 +742,15 @@ static struct qreg ntq_fcos(struct vc4_compile *c, struct qreg src) { float coeff[] = { - -1.0f, - pow(2.0 * M_PI, 2) / (2 * 1), - -pow(2.0 * M_PI, 4) / (4 * 3 * 2 * 1), - pow(2.0 * M_PI, 6) / (6 * 5 * 4 * 3 * 2 * 1), - -pow(2.0 * M_PI, 8) / (8 * 7 * 6 * 5 * 4 * 3 * 2 * 1), - pow(2.0 * M_PI, 10) / (10 * 9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1), + 1.0f, + -pow(2.0 * M_PI, 2) / (2 * 1), + pow(2.0 * M_PI, 4) / (4 * 3 * 2 * 1), + -pow(2.0 * M_PI, 6) / (6 * 5 * 4 * 3 * 2 * 1), + pow(2.0 * M_PI, 8) / (8 * 7 * 6 * 5 * 4 * 3 * 2 * 1), + -pow(2.0 * M_PI, 10) / (10 * 9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1), }; - struct qreg scaled_x = - qir_FMUL(c, src, - qir_uniform_f(c, 1.0f / (M_PI * 2.0f))); - struct qreg x_frac = qir_FADD(c, - ntq_ffract(c, scaled_x), - qir_uniform_f(c, -0.5)); - + struct qreg x_frac = ntq_shrink_sincos_input_range(c, src); struct qreg sum = qir_uniform_f(c, coeff[0]); struct qreg x2 = qir_FMUL(c, x_frac, x_frac); struct qreg x = x2; /* Current x^2, x^4, or x^6 */ @@ -711,13 +758,10 @@ ntq_fcos(struct vc4_compile *c, struct qreg src) if (i != 1) x = qir_FMUL(c, x, x2); - struct qreg mul = qir_FMUL(c, + sum = qir_FADD(c, qir_FMUL(c, x, - qir_uniform_f(c, coeff[i])); - if (i == 0) - sum = mul; - else - sum = qir_FADD(c, sum, mul); + qir_uniform_f(c, coeff[i])), + sum); } return sum; } @@ -1337,7 +1381,7 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) /* We have a scalar result, so the instruction should only have a * single channel written to. 
*/ - assert(util_is_power_of_two(instr->dest.write_mask)); + assert(util_is_power_of_two_or_zero(instr->dest.write_mask)); ntq_store_dest(c, &instr->dest.dest, ffs(instr->dest.write_mask) - 1, result); } @@ -1659,7 +1703,7 @@ static void ntq_setup_uniforms(struct vc4_compile *c) { nir_foreach_variable(var, &c->s->uniforms) { - uint32_t vec4_count = type_size(var->type); + uint32_t vec4_count = uniforms_type_size(var->type); unsigned vec4_size = 4 * sizeof(float); declare_uniform_range(c, var->data.driver_location * vec4_size, @@ -1775,6 +1819,11 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr) } break; + case nir_intrinsic_load_ubo: + assert(instr->num_components == 1); + ntq_store_dest(c, &instr->dest, 0, vc4_ubo_load(c, instr)); + break; + case nir_intrinsic_load_user_clip_plane: for (int i = 0; i < instr->num_components; i++) { ntq_store_dest(c, &instr->dest, i, @@ -2180,13 +2229,16 @@ nir_to_qir(struct vc4_compile *c) } static const nir_shader_compiler_options nir_options = { + .lower_all_io_to_temps = true, .lower_extract_byte = true, .lower_extract_word = true, + .lower_fdiv = true, .lower_ffma = true, .lower_flrp32 = true, .lower_fpow = true, .lower_fsat = true, .lower_fsqrt = true, + .lower_ldexp = true, .lower_negate = true, .native_integers = true, .max_unroll_iterations = 32, @@ -2435,9 +2487,10 @@ vc4_shader_state_create(struct pipe_context *pctx, */ s = cso->ir.nir; - NIR_PASS_V(s, nir_lower_io, nir_var_all, type_size, + NIR_PASS_V(s, nir_lower_io, nir_var_uniform, + uniforms_type_size, (nir_lower_io_options)0); - } else { + } else { assert(cso->type == PIPE_SHADER_IR_TGSI); if (vc4_debug & VC4_DEBUG_TGSI) { @@ -2449,6 +2502,10 @@ vc4_shader_state_create(struct pipe_context *pctx, s = tgsi_to_nir(cso->tokens, &nir_options); } + NIR_PASS_V(s, nir_lower_io, nir_var_all & ~nir_var_uniform, + type_size, + (nir_lower_io_options)0); + NIR_PASS_V(s, nir_opt_global_to_local); NIR_PASS_V(s, nir_lower_regs_to_ssa); NIR_PASS_V(s, nir_normalize_cubemap_coords); @@ -2724,7 +2781,8 @@ vc4_update_compiled_fs(struct vc4_context *vc4, uint8_t prim_mode) VC4_DIRTY_RASTERIZER | VC4_DIRTY_SAMPLE_MASK | VC4_DIRTY_FRAGTEX | - VC4_DIRTY_UNCOMPILED_FS))) { + VC4_DIRTY_UNCOMPILED_FS | + VC4_DIRTY_UBO_1_SIZE))) { return; } @@ -2768,6 +2826,7 @@ vc4_update_compiled_fs(struct vc4_context *vc4, uint8_t prim_mode) PIPE_SPRITE_COORD_UPPER_LEFT); } + key->ubo_1_size = vc4->constbuf[PIPE_SHADER_FRAGMENT].cb[1].buffer_size; key->light_twoside = vc4->rasterizer->base.light_twoside; struct vc4_compiled_shader *old_fs = vc4->prog.fs; @@ -2916,7 +2975,6 @@ vc4_shader_state_delete(struct pipe_context *pctx, void *hwcso) struct vc4_context *vc4 = vc4_context(pctx); struct vc4_uncompiled_shader *so = hwcso; - struct hash_entry *entry; hash_table_foreach(vc4->fs_cache, entry) { delete_from_cache_if_matches(vc4->fs_cache, &vc4->prog.fs, entry, so); @@ -2973,7 +3031,6 @@ vc4_program_fini(struct pipe_context *pctx) { struct vc4_context *vc4 = vc4_context(pctx); - struct hash_entry *entry; hash_table_foreach(vc4->fs_cache, entry) { struct vc4_compiled_shader *shader = entry->data; vc4_bo_unreference(&shader->bo); diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qir.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qir.c index c829e7f93..71f06aebf 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qir.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qir.c @@ -343,13 +343,57 @@ qir_channels_written(struct qinst *inst) unreachable("Bad pack field"); } +char * +qir_describe_uniform(enum quniform_contents contents, 
uint32_t data, + const uint32_t *uniforms) +{ + static const char *quniform_names[] = { + [QUNIFORM_VIEWPORT_X_SCALE] = "vp_x_scale", + [QUNIFORM_VIEWPORT_Y_SCALE] = "vp_y_scale", + [QUNIFORM_VIEWPORT_Z_OFFSET] = "vp_z_offset", + [QUNIFORM_VIEWPORT_Z_SCALE] = "vp_z_scale", + [QUNIFORM_TEXTURE_CONFIG_P0] = "tex_p0", + [QUNIFORM_TEXTURE_CONFIG_P1] = "tex_p1", + [QUNIFORM_TEXTURE_CONFIG_P2] = "tex_p2", + [QUNIFORM_TEXTURE_FIRST_LEVEL] = "tex_first_level", + }; + + switch (contents) { + case QUNIFORM_CONSTANT: + return ralloc_asprintf(NULL, "0x%08x / %f", data, uif(data)); + case QUNIFORM_UNIFORM: + if (uniforms) { + uint32_t unif = uniforms[data]; + return ralloc_asprintf(NULL, "unif[%d] = 0x%08x / %f", + data, unif, uif(unif)); + } else { + return ralloc_asprintf(NULL, "unif[%d]", data); + } + + case QUNIFORM_TEXTURE_CONFIG_P0: + case QUNIFORM_TEXTURE_CONFIG_P1: + case QUNIFORM_TEXTURE_CONFIG_P2: + case QUNIFORM_TEXTURE_FIRST_LEVEL: + return ralloc_asprintf(NULL, "%s[%d]", + quniform_names[contents], data); + + default: + if (contents < ARRAY_SIZE(quniform_names) && + quniform_names[contents]) { + return ralloc_asprintf(NULL, "%s", + quniform_names[contents]); + } else { + return ralloc_asprintf(NULL, "??? %d", contents); + } + } +} + static void qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write) { static const char *files[] = { [QFILE_TEMP] = "t", [QFILE_VARY] = "v", - [QFILE_UNIF] = "u", [QFILE_TLB_COLOR_WRITE] = "tlb_c", [QFILE_TLB_COLOR_WRITE_MS] = "tlb_c_ms", [QFILE_TLB_Z_WRITE] = "tlb_z", @@ -403,16 +447,18 @@ qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write) fprintf(stderr, "%s", files[reg.file]); break; - default: - fprintf(stderr, "%s%d", files[reg.file], reg.index); + case QFILE_UNIF: { + char *desc = qir_describe_uniform(c->uniform_contents[reg.index], + c->uniform_data[reg.index], + NULL); + fprintf(stderr, "u%d (%s)", reg.index, desc); + ralloc_free(desc); break; } - if (reg.file == QFILE_UNIF && - c->uniform_contents[reg.index] == QUNIFORM_CONSTANT) { - fprintf(stderr, " (0x%08x / %f)", - c->uniform_data[reg.index], - uif(c->uniform_data[reg.index])); + default: + fprintf(stderr, "%s%d", files[reg.file], reg.index); + break; } } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qir.h b/lib/mesa/src/gallium/drivers/vc4/vc4_qir.h index 90acaef28..1aa5f652f 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qir.h +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qir.h @@ -363,6 +363,7 @@ struct vc4_fs_key { uint8_t alpha_test_func; uint8_t logicop_func; uint32_t point_sprite_mask; + uint32_t ubo_1_size; struct pipe_rt_blend_state blend; }; @@ -591,6 +592,8 @@ uint8_t qir_channels_written(struct qinst *inst); void qir_dump(struct vc4_compile *c); void qir_dump_inst(struct vc4_compile *c, struct qinst *inst); +char *qir_describe_uniform(enum quniform_contents contents, uint32_t data, + const uint32_t *uniforms); const char *qir_get_stage_name(enum qstage stage); void qir_validate(struct vc4_compile *c); diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_live_variables.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_live_variables.c index 7108b3ee9..5629ce044 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_live_variables.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_live_variables.c @@ -173,8 +173,6 @@ qir_setup_def(struct vc4_compile *c, struct qblock *block, int ip, static void sf_state_clear(struct hash_table *partial_update_ht) { - struct hash_entry *entry; - hash_table_foreach(partial_update_ht, entry) { struct partial_update_state *state = 
entry->data; diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c index ad19f06d3..d7c22e75c 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c @@ -136,7 +136,6 @@ qir_lower_uniforms(struct vc4_compile *c) */ uint32_t max_count = 0; uint32_t max_index = 0; - struct hash_entry *entry; hash_table_foreach(ht, entry) { uint32_t count = (uintptr_t)entry->data; uint32_t index = (uintptr_t)entry->key - 1; diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_resource.c b/lib/mesa/src/gallium/drivers/vc4/vc4_resource.c index cdcbcc917..41e6ec5c1 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_resource.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_resource.c @@ -22,11 +22,13 @@ * IN THE SOFTWARE. */ +#include "pipe/p_defines.h" #include "util/u_blit.h" #include "util/u_memory.h" #include "util/u_format.h" #include "util/u_inlines.h" #include "util/u_surface.h" +#include "util/u_transfer_helper.h" #include "util/u_upload_mgr.h" #include "drm_fourcc.h" @@ -36,10 +38,6 @@ #include "vc4_resource.h" #include "vc4_tiling.h" -#ifndef DRM_FORMAT_MOD_INVALID -#define DRM_FORMAT_MOD_INVALID ((1ULL << 56) - 1) -#endif - static bool vc4_resource_bo_alloc(struct vc4_resource *rsc) { @@ -79,15 +77,8 @@ vc4_resource_transfer_unmap(struct pipe_context *pctx, struct vc4_transfer *trans = vc4_transfer(ptrans); if (trans->map) { - struct vc4_resource *rsc; - struct vc4_resource_slice *slice; - if (trans->ss_resource) { - rsc = vc4_resource(trans->ss_resource); - slice = &rsc->slices[0]; - } else { - rsc = vc4_resource(ptrans->resource); - slice = &rsc->slices[ptrans->level]; - } + struct vc4_resource *rsc = vc4_resource(ptrans->resource); + struct vc4_resource_slice *slice = &rsc->slices[ptrans->level]; if (ptrans->usage & PIPE_TRANSFER_WRITE) { vc4_store_tiled_image(rsc->bo->map + slice->offset + @@ -100,51 +91,10 @@ vc4_resource_transfer_unmap(struct pipe_context *pctx, free(trans->map); } - if (trans->ss_resource && (ptrans->usage & PIPE_TRANSFER_WRITE)) { - struct pipe_blit_info blit; - memset(&blit, 0, sizeof(blit)); - - blit.src.resource = trans->ss_resource; - blit.src.format = trans->ss_resource->format; - blit.src.box.width = trans->ss_box.width; - blit.src.box.height = trans->ss_box.height; - blit.src.box.depth = 1; - - blit.dst.resource = ptrans->resource; - blit.dst.format = ptrans->resource->format; - blit.dst.level = ptrans->level; - blit.dst.box = trans->ss_box; - - blit.mask = util_format_get_mask(ptrans->resource->format); - blit.filter = PIPE_TEX_FILTER_NEAREST; - - pctx->blit(pctx, &blit); - - pipe_resource_reference(&trans->ss_resource, NULL); - } - pipe_resource_reference(&ptrans->resource, NULL); slab_free(&vc4->transfer_pool, ptrans); } -static struct pipe_resource * -vc4_get_temp_resource(struct pipe_context *pctx, - struct pipe_resource *prsc, - const struct pipe_box *box) -{ - struct pipe_resource temp_setup; - - memset(&temp_setup, 0, sizeof(temp_setup)); - temp_setup.target = prsc->target; - temp_setup.format = prsc->format; - temp_setup.width0 = box->width; - temp_setup.height0 = box->height; - temp_setup.depth0 = 1; - temp_setup.array_size = 1; - - return pctx->screen->resource_create(pctx->screen, &temp_setup); -} - static void * vc4_resource_transfer_map(struct pipe_context *pctx, struct pipe_resource *prsc, @@ -164,7 +114,7 @@ vc4_resource_transfer_map(struct pipe_context *pctx, */ if ((usage & PIPE_TRANSFER_DISCARD_RANGE) && 
!(usage & PIPE_TRANSFER_UNSYNCHRONIZED) && - !(prsc->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT) && + !(prsc->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) && prsc->last_level == 0 && prsc->width0 == box->width && prsc->height0 == box->height && @@ -218,50 +168,6 @@ vc4_resource_transfer_map(struct pipe_context *pctx, ptrans->usage = usage; ptrans->box = *box; - /* If the resource is multisampled, we need to resolve to single - * sample. This seems like it should be handled at a higher layer. - */ - if (prsc->nr_samples > 1) { - trans->ss_resource = vc4_get_temp_resource(pctx, prsc, box); - if (!trans->ss_resource) - goto fail; - assert(!trans->ss_resource->nr_samples); - - /* The ptrans->box gets modified for tile alignment, so save - * the original box for unmap time. - */ - trans->ss_box = *box; - - if (usage & PIPE_TRANSFER_READ) { - struct pipe_blit_info blit; - memset(&blit, 0, sizeof(blit)); - - blit.src.resource = ptrans->resource; - blit.src.format = ptrans->resource->format; - blit.src.level = ptrans->level; - blit.src.box = trans->ss_box; - - blit.dst.resource = trans->ss_resource; - blit.dst.format = trans->ss_resource->format; - blit.dst.box.width = trans->ss_box.width; - blit.dst.box.height = trans->ss_box.height; - blit.dst.box.depth = 1; - - blit.mask = util_format_get_mask(prsc->format); - blit.filter = PIPE_TEX_FILTER_NEAREST; - - pctx->blit(pctx, &blit); - vc4_flush_jobs_writing_resource(vc4, blit.dst.resource); - } - - /* The rest of the mapping process should use our temporary. */ - prsc = trans->ss_resource; - rsc = vc4_resource(prsc); - ptrans->box.x = 0; - ptrans->box.y = 0; - ptrans->box.z = 0; - } - if (usage & PIPE_TRANSFER_UNSYNCHRONIZED) buf = vc4_bo_map_unsynchronized(rsc->bo); else @@ -275,9 +181,6 @@ vc4_resource_transfer_map(struct pipe_context *pctx, struct vc4_resource_slice *slice = &rsc->slices[level]; if (rsc->tiled) { - uint32_t utile_w = vc4_utile_width(rsc->cpp); - uint32_t utile_h = vc4_utile_height(rsc->cpp); - /* No direct mappings of tiled, since we need to manually * tile/untile. */ @@ -298,49 +201,12 @@ vc4_resource_transfer_map(struct pipe_context *pctx, ptrans->box.height = (ptrans->box.height + 3) >> 2; } - /* We need to align the box to utile boundaries, since that's - * what load/store operates on. This may cause us to need to - * read out the original contents in that border area. Right - * now we just read out the entire contents, including the - * middle area that will just get overwritten. - */ - uint32_t box_start_x = ptrans->box.x & (utile_w - 1); - uint32_t box_start_y = ptrans->box.y & (utile_h - 1); - bool needs_load = (usage & PIPE_TRANSFER_READ) != 0; - - if (box_start_x) { - ptrans->box.width += box_start_x; - ptrans->box.x -= box_start_x; - needs_load = true; - } - if (box_start_y) { - ptrans->box.height += box_start_y; - ptrans->box.y -= box_start_y; - needs_load = true; - } - if (ptrans->box.width & (utile_w - 1)) { - /* We only need to force a load if our border region - * we're extending into is actually part of the - * texture. 
- */ - uint32_t slice_width = u_minify(prsc->width0, level); - if (ptrans->box.x + ptrans->box.width != slice_width) - needs_load = true; - ptrans->box.width = align(ptrans->box.width, utile_w); - } - if (ptrans->box.height & (utile_h - 1)) { - uint32_t slice_height = u_minify(prsc->height0, level); - if (ptrans->box.y + ptrans->box.height != slice_height) - needs_load = true; - ptrans->box.height = align(ptrans->box.height, utile_h); - } - ptrans->stride = ptrans->box.width * rsc->cpp; ptrans->layer_stride = ptrans->stride * ptrans->box.height; trans->map = malloc(ptrans->layer_stride * ptrans->box.depth); - if (needs_load) { + if (usage & PIPE_TRANSFER_READ) { vc4_load_tiled_image(trans->map, ptrans->stride, buf + slice->offset + ptrans->box.z * rsc->cube_map_stride, @@ -348,9 +214,7 @@ vc4_resource_transfer_map(struct pipe_context *pctx, slice->tiling, rsc->cpp, &ptrans->box); } - return (trans->map + - box_start_x * rsc->cpp + - box_start_y * ptrans->stride); + return trans->map; } else { ptrans->stride = slice->stride; ptrans->layer_stride = ptrans->stride; @@ -368,6 +232,44 @@ fail: } static void +vc4_texture_subdata(struct pipe_context *pctx, + struct pipe_resource *prsc, + unsigned level, + unsigned usage, + const struct pipe_box *box, + const void *data, + unsigned stride, + unsigned layer_stride) +{ + struct vc4_resource *rsc = vc4_resource(prsc); + struct vc4_resource_slice *slice = &rsc->slices[level]; + + /* For a direct mapping, we can just take the u_transfer path. */ + if (!rsc->tiled || + box->depth != 1 || + (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE)) { + return u_default_texture_subdata(pctx, prsc, level, usage, box, + data, stride, layer_stride); + } + + /* Otherwise, map and store the texture data directly into the tiled + * texture. + */ + void *buf; + if (usage & PIPE_TRANSFER_UNSYNCHRONIZED) + buf = vc4_bo_map_unsynchronized(rsc->bo); + else + buf = vc4_bo_map(rsc->bo); + + vc4_store_tiled_image(buf + slice->offset + + box->z * rsc->cube_map_stride, + slice->stride, + (void *)data, stride, + slice->tiling, rsc->cpp, + box); +} + +static void vc4_resource_destroy(struct pipe_screen *pscreen, struct pipe_resource *prsc) { @@ -406,7 +308,7 @@ vc4_resource_get_handle(struct pipe_screen *pscreen, whandle->modifier = DRM_FORMAT_MOD_LINEAR; switch (whandle->type) { - case DRM_API_HANDLE_TYPE_SHARED: + case WINSYS_HANDLE_TYPE_SHARED: if (screen->ro) { /* This could probably be supported, assuming that a * control node was used for pl111. @@ -416,12 +318,12 @@ vc4_resource_get_handle(struct pipe_screen *pscreen, } return vc4_bo_flink(rsc->bo, &whandle->handle); - case DRM_API_HANDLE_TYPE_KMS: + case WINSYS_HANDLE_TYPE_KMS: if (screen->ro && renderonly_get_handle(rsc->scanout, whandle)) return TRUE; whandle->handle = rsc->bo->handle; return TRUE; - case DRM_API_HANDLE_TYPE_FD: + case WINSYS_HANDLE_TYPE_FD: /* FDs are cross-device, so we can export directly from vc4. 
*/ whandle->handle = vc4_bo_get_dmabuf(rsc->bo); @@ -564,8 +466,10 @@ get_resource_texture_format(struct pipe_resource *prsc) if (prsc->nr_samples > 1) { return ~0; } else { - assert(format == VC4_TEXTURE_TYPE_RGBA8888); - return VC4_TEXTURE_TYPE_RGBA32R; + if (format == VC4_TEXTURE_TYPE_RGBA8888) + return VC4_TEXTURE_TYPE_RGBA32R; + else + return ~0; } } @@ -668,7 +572,15 @@ vc4_resource_create_with_modifiers(struct pipe_screen *pscreen, goto fail; } - if (screen->ro && tmpl->bind & PIPE_BIND_SCANOUT) { + /* Set up the "scanout resource" (the dmabuf export of our buffer to + * the KMS handle) if the buffer might ever have + * resource_get_handle(WINSYS_HANDLE_TYPE_KMS) called on it. + * create_with_modifiers() doesn't give us usage flags, so we have to + * assume that all calls with modifiers are scanout-possible. + */ + if (screen->ro && + ((tmpl->bind & PIPE_BIND_SCANOUT) || + !(count == 1 && modifiers[0] == DRM_FORMAT_MOD_INVALID))) { rsc->scanout = renderonly_scanout_for_resource(prsc, screen->ro, NULL); if (!rsc->scanout) @@ -708,19 +620,12 @@ vc4_resource_from_handle(struct pipe_screen *pscreen, if (!rsc) return NULL; - if (whandle->offset != 0) { - fprintf(stderr, - "Attempt to import unsupported winsys offset %u\n", - whandle->offset); - return NULL; - } - switch (whandle->type) { - case DRM_API_HANDLE_TYPE_SHARED: + case WINSYS_HANDLE_TYPE_SHARED: rsc->bo = vc4_bo_open_name(screen, whandle->handle, whandle->stride); break; - case DRM_API_HANDLE_TYPE_FD: + case WINSYS_HANDLE_TYPE_FD: rsc->bo = vc4_bo_open_dmabuf(screen, whandle->handle, whandle->stride); break; @@ -766,6 +671,28 @@ vc4_resource_from_handle(struct pipe_screen *pscreen, rsc->vc4_format = get_resource_texture_format(prsc); vc4_setup_slices(rsc, "import"); + if (whandle->offset != 0) { + if (rsc->tiled) { + fprintf(stderr, + "Attempt to import unsupported " + "winsys offset %u\n", + whandle->offset); + goto fail; + } + + rsc->slices[0].offset += whandle->offset; + + if (rsc->slices[0].offset + rsc->slices[0].size > + rsc->bo->size) { + fprintf(stderr, "Attempt to import " + "with overflowing offset (%d + %d > %d)\n", + whandle->offset, + rsc->slices[0].size, + rsc->bo->size); + goto fail; + } + } + if (screen->ro) { /* Make sure that renderonly has a handle to our buffer in the * display's fd, so that a later renderonly_get_handle() @@ -779,7 +706,7 @@ vc4_resource_from_handle(struct pipe_screen *pscreen, goto fail; } - if (whandle->stride != slice->stride) { + if (rsc->tiled && whandle->stride != slice->stride) { static bool warned = false; if (!warned) { warned = true; @@ -792,6 +719,8 @@ vc4_resource_from_handle(struct pipe_screen *pscreen, slice->stride); } goto fail; + } else if (!rsc->tiled) { + slice->stride = whandle->stride; } return prsc; @@ -1187,6 +1116,14 @@ vc4_get_shadow_index_buffer(struct pipe_context *pctx, return shadow_rsc; } +static const struct u_transfer_vtbl transfer_vtbl = { + .resource_create = vc4_resource_create, + .resource_destroy = vc4_resource_destroy, + .transfer_map = vc4_resource_transfer_map, + .transfer_unmap = vc4_resource_transfer_unmap, + .transfer_flush_region = u_default_transfer_flush_region, +}; + void vc4_resource_screen_init(struct pipe_screen *pscreen) { @@ -1199,6 +1136,9 @@ vc4_resource_screen_init(struct pipe_screen *pscreen) pscreen->resource_destroy = u_resource_destroy_vtbl; pscreen->resource_get_handle = vc4_resource_get_handle; pscreen->resource_destroy = vc4_resource_destroy; + pscreen->transfer_helper = u_transfer_helper_create(&transfer_vtbl, + false, false, + 
false, true); /* Test if the kernel has GET_TILING; it will return -EINVAL if the * ioctl does not exist, but -ENOENT if we pass an impossible handle. @@ -1215,11 +1155,11 @@ vc4_resource_screen_init(struct pipe_screen *pscreen) void vc4_resource_context_init(struct pipe_context *pctx) { - pctx->transfer_map = vc4_resource_transfer_map; - pctx->transfer_flush_region = u_default_transfer_flush_region; - pctx->transfer_unmap = vc4_resource_transfer_unmap; + pctx->transfer_map = u_transfer_helper_transfer_map; + pctx->transfer_flush_region = u_transfer_helper_transfer_flush_region; + pctx->transfer_unmap = u_transfer_helper_transfer_unmap; pctx->buffer_subdata = u_default_buffer_subdata; - pctx->texture_subdata = u_default_texture_subdata; + pctx->texture_subdata = vc4_texture_subdata; pctx->create_surface = vc4_create_surface; pctx->surface_destroy = vc4_surface_destroy; pctx->resource_copy_region = util_resource_copy_region; diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_resource.h b/lib/mesa/src/gallium/drivers/vc4/vc4_resource.h index d4c491e50..8c0aadbcc 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_resource.h +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_resource.h @@ -32,9 +32,6 @@ struct vc4_transfer { struct pipe_transfer base; void *map; - - struct pipe_resource *ss_resource; - struct pipe_box ss_box; }; struct vc4_resource_slice { diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_screen.c b/lib/mesa/src/gallium/drivers/vc4/vc4_screen.c index 9879a4db1..e7f7c82c2 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_screen.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_screen.c @@ -22,7 +22,7 @@ * IN THE SOFTWARE. */ -#include "os/os_misc.h" +#include "util/os_misc.h" #include "pipe/p_defines.h" #include "pipe/p_screen.h" #include "pipe/p_state.h" @@ -32,6 +32,8 @@ #include "util/u_memory.h" #include "util/u_format.h" #include "util/u_hash_table.h" +#include "util/u_screen.h" +#include "util/u_transfer_helper.h" #include "util/ralloc.h" #include <xf86drm.h> @@ -64,7 +66,7 @@ static const struct debug_named_value debug_options[] = { "Flush after each draw call" }, { "always_sync", VC4_DEBUG_ALWAYS_SYNC, "Wait for finish after each flush" }, -#if USE_VC4_SIMULATOR +#ifdef USE_VC4_SIMULATOR { "dump", VC4_DEBUG_DUMP, "Write a GPU command stream trace file" }, #endif @@ -105,10 +107,12 @@ vc4_screen_destroy(struct pipe_screen *pscreen) slab_destroy_parent(&screen->transfer_pool); free(screen->ro); -#if USE_VC4_SIMULATOR +#ifdef USE_VC4_SIMULATOR vc4_simulator_destroy(screen); #endif + u_transfer_helper_destroy(pscreen->transfer_helper); + close(screen->fd); ralloc_free(pscreen); } @@ -140,17 +144,15 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT: case PIPE_CAP_NPOT_TEXTURES: case PIPE_CAP_SHAREABLE_SHADERS: - case PIPE_CAP_USER_CONSTANT_BUFFERS: - case PIPE_CAP_TEXTURE_SHADOW_MAP: case PIPE_CAP_BLEND_EQUATION_SEPARATE: - case PIPE_CAP_TWO_SIDED_STENCIL: case PIPE_CAP_TEXTURE_MULTISAMPLE: case PIPE_CAP_TEXTURE_SWIZZLE: - case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY: - case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION: case PIPE_CAP_TEXTURE_BARRIER: return 1; + case PIPE_CAP_NATIVE_FENCE_FD: + return screen->has_syncobj; + case PIPE_CAP_TILE_RASTER_ORDER: return vc4_has_feature(screen, DRM_VC4_PARAM_SUPPORTS_FIXED_RCL_ORDER); @@ -160,15 +162,6 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_POINT_SPRITE: return 1; - case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT: - return 256; - - 
case PIPE_CAP_GLSL_FEATURE_LEVEL: - return 120; - - case PIPE_CAP_MAX_VIEWPORTS: - return 1; - case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT: case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER: return 1; @@ -177,130 +170,6 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MIXED_COLOR_DEPTH_BITS: return 1; - /* Unsupported features. */ - case PIPE_CAP_ANISOTROPIC_FILTER: - case PIPE_CAP_TEXTURE_BUFFER_OBJECTS: - case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY: - case PIPE_CAP_CUBE_MAP_ARRAY: - case PIPE_CAP_TEXTURE_MIRROR_CLAMP: - case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: - case PIPE_CAP_MIXED_COLORBUFFER_FORMATS: - case PIPE_CAP_SEAMLESS_CUBE_MAP: - case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: - case PIPE_CAP_TGSI_INSTANCEID: - case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY: - case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY: - case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY: - case PIPE_CAP_COMPUTE: - case PIPE_CAP_START_INSTANCE: - case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: - case PIPE_CAP_SHADER_STENCIL_EXPORT: - case PIPE_CAP_TGSI_TEXCOORD: - case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: - case PIPE_CAP_CONDITIONAL_RENDER: - case PIPE_CAP_PRIMITIVE_RESTART: - case PIPE_CAP_SM3: - case PIPE_CAP_INDEP_BLEND_ENABLE: - case PIPE_CAP_INDEP_BLEND_FUNC: - case PIPE_CAP_DEPTH_CLIP_DISABLE: - case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: - case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT: - case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER: - case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS: - case PIPE_CAP_USER_VERTEX_BUFFERS: - case PIPE_CAP_QUERY_PIPELINE_STATISTICS: - case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK: - case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT: - case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: - case PIPE_CAP_TEXTURE_GATHER_SM5: - case PIPE_CAP_FAKE_SW_MSAA: - case PIPE_CAP_TEXTURE_QUERY_LOD: - case PIPE_CAP_SAMPLE_SHADING: - case PIPE_CAP_TEXTURE_GATHER_OFFSETS: - case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET: - case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION: - case PIPE_CAP_MAX_TEXEL_OFFSET: - case PIPE_CAP_MAX_VERTEX_STREAMS: - case PIPE_CAP_DRAW_INDIRECT: - case PIPE_CAP_MULTI_DRAW_INDIRECT: - case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: - case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: - case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: - case PIPE_CAP_SAMPLER_VIEW_TARGET: - case PIPE_CAP_CLIP_HALFZ: - case PIPE_CAP_VERTEXID_NOBASE: - case PIPE_CAP_POLYGON_OFFSET_CLAMP: - case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: - case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: - case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: - case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: - case PIPE_CAP_TEXTURE_FLOAT_LINEAR: - case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: - case PIPE_CAP_DEPTH_BOUNDS_TEST: - case PIPE_CAP_TGSI_TXQS: - case PIPE_CAP_FORCE_PERSAMPLE_INTERP: - case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: - case PIPE_CAP_CLEAR_TEXTURE: - case PIPE_CAP_DRAW_PARAMETERS: - case PIPE_CAP_TGSI_PACK_HALF_FLOAT: - case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: - case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: - case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: - case PIPE_CAP_INVALIDATE_BUFFER: - case PIPE_CAP_GENERATE_MIPMAP: - case PIPE_CAP_STRING_MARKER: - case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: - case PIPE_CAP_QUERY_BUFFER_OBJECT: - case PIPE_CAP_QUERY_MEMORY_INFO: - case PIPE_CAP_PCI_GROUP: - case PIPE_CAP_PCI_BUS: - case PIPE_CAP_PCI_DEVICE: - case PIPE_CAP_PCI_FUNCTION: - case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: - case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: - case 
PIPE_CAP_CULL_DISTANCE: - case PIPE_CAP_PRIMITIVE_RESTART_FOR_PATCHES: - case PIPE_CAP_TGSI_VOTE: - case PIPE_CAP_MAX_WINDOW_RECTANGLES: - case PIPE_CAP_POLYGON_OFFSET_UNITS_UNSCALED: - case PIPE_CAP_VIEWPORT_SUBPIXEL_BITS: - case PIPE_CAP_TGSI_ARRAY_COMPONENTS: - case PIPE_CAP_TGSI_CAN_READ_OUTPUTS: - case PIPE_CAP_NATIVE_FENCE_FD: - case PIPE_CAP_TGSI_FS_FBFETCH: - case PIPE_CAP_TGSI_MUL_ZERO_WINS: - case PIPE_CAP_DOUBLES: - case PIPE_CAP_INT64: - case PIPE_CAP_INT64_DIVMOD: - case PIPE_CAP_TGSI_TEX_TXF_LZ: - case PIPE_CAP_TGSI_CLOCK: - case PIPE_CAP_POLYGON_MODE_FILL_RECTANGLE: - case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE: - case PIPE_CAP_TGSI_BALLOT: - case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT: - case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX: - case PIPE_CAP_POST_DEPTH_COVERAGE: - case PIPE_CAP_BINDLESS_TEXTURE: - case PIPE_CAP_NIR_SAMPLERS_AS_DEREF: - case PIPE_CAP_QUERY_SO_OVERFLOW: - case PIPE_CAP_MEMOBJ: - case PIPE_CAP_LOAD_CONSTBUF: - case PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS: - return 0; - - /* Stream output. */ - case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: - case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: - case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS: - case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: - case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: - return 0; - - /* Geometry shader output, unsupported. */ - case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES: - case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS: - return 0; - /* Texturing. */ case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: @@ -308,35 +177,9 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: /* Note: Not supported in hardware, just faking it. */ return 5; - case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: - return 0; - - /* Render targets. */ - case PIPE_CAP_MAX_RENDER_TARGETS: - return 1; - - /* Queries. 
*/ - case PIPE_CAP_QUERY_TIME_ELAPSED: - case PIPE_CAP_QUERY_TIMESTAMP: - return 0; - - case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET: - case PIPE_CAP_MIN_TEXEL_OFFSET: - return 0; - - case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE: - return 2048; - - case PIPE_CAP_ENDIANNESS: - return PIPE_ENDIAN_LITTLE; - - case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT: - return 64; case PIPE_CAP_VENDOR_ID: return 0x14E4; - case PIPE_CAP_DEVICE_ID: - return 0xFFFFFFFF; case PIPE_CAP_ACCELERATED: return 1; case PIPE_CAP_VIDEO_MEMORY: { @@ -351,8 +194,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return 1; default: - fprintf(stderr, "unknown param %d\n", param); - return 0; + return u_pipe_screen_get_param_defaults(pscreen, param); } } @@ -372,10 +214,10 @@ vc4_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param) return 0.0f; case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS: return 0.0f; - case PIPE_CAPF_GUARD_BAND_LEFT: - case PIPE_CAPF_GUARD_BAND_TOP: - case PIPE_CAPF_GUARD_BAND_RIGHT: - case PIPE_CAPF_GUARD_BAND_BOTTOM: + + case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE: + case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE: + case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY: return 0.0f; default: fprintf(stderr, "unknown paramf %d\n", param); @@ -443,13 +285,17 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen, return PIPE_SHADER_IR_NIR; case PIPE_SHADER_CAP_SUPPORTED_IRS: return 0; - case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: - return 32; + case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: + return 32; case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: - case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD: + case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD: case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS: + case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: + case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: return 0; + case PIPE_SHADER_CAP_SCALAR_ISA: + return 1; default: fprintf(stderr, "unknown shader param %d\n", param); return 0; @@ -462,16 +308,18 @@ vc4_screen_is_format_supported(struct pipe_screen *pscreen, enum pipe_format format, enum pipe_texture_target target, unsigned sample_count, + unsigned storage_sample_count, unsigned usage) { struct vc4_screen *screen = vc4_screen(pscreen); - unsigned retval = 0; + + if (MAX2(1, sample_count) != MAX2(1, storage_sample_count)) + return false; if (sample_count > 1 && sample_count != VC4_MAX_SAMPLES) return FALSE; - if ((target >= PIPE_MAX_TEXTURE_TYPES) || - !util_format_is_supported(format, usage)) { + if (target >= PIPE_MAX_TEXTURE_TYPES) { return FALSE; } @@ -521,46 +369,36 @@ vc4_screen_is_format_supported(struct pipe_screen *pscreen, case PIPE_FORMAT_R8G8B8_SSCALED: case PIPE_FORMAT_R8G8_SSCALED: case PIPE_FORMAT_R8_SSCALED: - retval |= PIPE_BIND_VERTEX_BUFFER; break; default: - break; + return FALSE; } } if ((usage & PIPE_BIND_RENDER_TARGET) && - vc4_rt_format_supported(format)) { - retval |= PIPE_BIND_RENDER_TARGET; + !vc4_rt_format_supported(format)) { + return FALSE; } if ((usage & PIPE_BIND_SAMPLER_VIEW) && - vc4_tex_format_supported(format) && - (format != PIPE_FORMAT_ETC1_RGB8 || screen->has_etc1)) { - retval |= PIPE_BIND_SAMPLER_VIEW; + (!vc4_tex_format_supported(format) || + (format == PIPE_FORMAT_ETC1_RGB8 && !screen->has_etc1))) { + return FALSE; } if ((usage & PIPE_BIND_DEPTH_STENCIL) && - (format == PIPE_FORMAT_S8_UINT_Z24_UNORM || - format == PIPE_FORMAT_X8Z24_UNORM)) { - retval |= PIPE_BIND_DEPTH_STENCIL; + format != PIPE_FORMAT_S8_UINT_Z24_UNORM && + format != PIPE_FORMAT_X8Z24_UNORM) { + return FALSE; 
} if ((usage & PIPE_BIND_INDEX_BUFFER) && - (format == PIPE_FORMAT_I8_UINT || - format == PIPE_FORMAT_I16_UINT)) { - retval |= PIPE_BIND_INDEX_BUFFER; - } - -#if 0 - if (retval != usage) { - fprintf(stderr, - "not supported: format=%s, target=%d, sample_count=%d, " - "usage=0x%x, retval=0x%x\n", util_format_name(format), - target, sample_count, usage, retval); + format != PIPE_FORMAT_I8_UINT && + format != PIPE_FORMAT_I16_UINT) { + return FALSE; } -#endif - return retval == usage; + return TRUE; } static void @@ -659,7 +497,9 @@ struct pipe_screen * vc4_screen_create(int fd, struct renderonly *ro) { struct vc4_screen *screen = rzalloc(NULL, struct vc4_screen); + uint64_t syncobj_cap = 0; struct pipe_screen *pscreen; + int err; pscreen = &screen->base; @@ -690,6 +530,14 @@ vc4_screen_create(int fd, struct renderonly *ro) vc4_has_feature(screen, DRM_VC4_PARAM_SUPPORTS_ETC1); screen->has_threaded_fs = vc4_has_feature(screen, DRM_VC4_PARAM_SUPPORTS_THREADED_FS); + screen->has_madvise = + vc4_has_feature(screen, DRM_VC4_PARAM_SUPPORTS_MADVISE); + screen->has_perfmon_ioctl = + vc4_has_feature(screen, DRM_VC4_PARAM_SUPPORTS_PERFMON); + + err = drmGetCap(fd, DRM_CAP_SYNCOBJ, &syncobj_cap); + if (err == 0 && syncobj_cap) + screen->has_syncobj = true; if (!vc4_get_chip_info(screen)) goto fail; @@ -698,13 +546,13 @@ vc4_screen_create(int fd, struct renderonly *ro) slab_create_parent(&screen->transfer_pool, sizeof(struct vc4_transfer), 16); - vc4_fence_init(screen); + vc4_fence_screen_init(screen); vc4_debug = debug_get_option_vc4_debug(); if (vc4_debug & VC4_DEBUG_SHADERDB) vc4_debug |= VC4_DEBUG_NORAST; -#if USE_VC4_SIMULATOR +#ifdef USE_VC4_SIMULATOR vc4_simulator_init(screen); #endif @@ -716,6 +564,11 @@ vc4_screen_create(int fd, struct renderonly *ro) pscreen->get_compiler_options = vc4_screen_get_compiler_options; pscreen->query_dmabuf_modifiers = vc4_screen_query_dmabuf_modifiers; + if (screen->has_perfmon_ioctl) { + pscreen->get_driver_query_group_info = vc4_get_driver_query_group_info; + pscreen->get_driver_query_info = vc4_get_driver_query_info; + } + return pscreen; fail: diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_screen.h b/lib/mesa/src/gallium/drivers/vc4/vc4_screen.h index 85108219e..f4550d1c2 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_screen.h +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_screen.h @@ -95,7 +95,10 @@ struct vc4_screen { bool has_control_flow; bool has_etc1; bool has_threaded_fs; + bool has_madvise; bool has_tiling_ioctl; + bool has_perfmon_ioctl; + bool has_syncobj; struct vc4_simulator_file *sim_file; }; @@ -116,9 +119,9 @@ vc4_screen_get_compiler_options(struct pipe_screen *pscreen, extern uint32_t vc4_debug; void -vc4_fence_init(struct vc4_screen *screen); +vc4_fence_screen_init(struct vc4_screen *screen); struct vc4_fence * -vc4_fence_create(struct vc4_screen *screen, uint64_t seqno); +vc4_fence_create(struct vc4_screen *screen, uint64_t seqno, int fd); #endif /* VC4_SCREEN_H */ diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c b/lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c index a73e40969..37c098a04 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c @@ -619,6 +619,11 @@ vc4_simulator_get_param_ioctl(int fd, struct drm_vc4_get_param *args) args->value = true; return 0; + case DRM_VC4_PARAM_SUPPORTS_MADVISE: + case DRM_VC4_PARAM_SUPPORTS_PERFMON: + errno = -EINVAL; + return -1; + case DRM_VC4_PARAM_V3D_IDENT0: args->value = 0x02000000; return 0; diff --git 
a/lib/mesa/src/gallium/drivers/vc4/vc4_state.c b/lib/mesa/src/gallium/drivers/vc4/vc4_state.c index ed8d404a4..1e4657a79 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_state.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_state.c @@ -23,6 +23,7 @@ */ #include "pipe/p_state.h" +#include "util/u_framebuffer.h" #include "util/u_inlines.h" #include "util/u_math.h" #include "util/u_memory.h" @@ -386,8 +387,6 @@ vc4_set_constant_buffer(struct pipe_context *pctx, struct vc4_context *vc4 = vc4_context(pctx); struct vc4_constbuf_stateobj *so = &vc4->constbuf[shader]; - assert(index == 0); - /* Note that the state tracker can unbind constant buffers by * passing NULL here. */ @@ -397,7 +396,10 @@ vc4_set_constant_buffer(struct pipe_context *pctx, return; } - assert(!cb->buffer); + if (index == 1 && so->cb[index].buffer_size != cb->buffer_size) + vc4->dirty |= VC4_DIRTY_UBO_1_SIZE; + + pipe_resource_reference(&so->cb[index].buffer, cb->buffer); so->cb[index].buffer_offset = cb->buffer_offset; so->cb[index].buffer_size = cb->buffer_size; so->cb[index].user_buffer = cb->user_buffer; @@ -413,21 +415,10 @@ vc4_set_framebuffer_state(struct pipe_context *pctx, { struct vc4_context *vc4 = vc4_context(pctx); struct pipe_framebuffer_state *cso = &vc4->framebuffer; - unsigned i; vc4->job = NULL; - for (i = 0; i < framebuffer->nr_cbufs; i++) - pipe_surface_reference(&cso->cbufs[i], framebuffer->cbufs[i]); - for (; i < vc4->framebuffer.nr_cbufs; i++) - pipe_surface_reference(&cso->cbufs[i], NULL); - - cso->nr_cbufs = framebuffer->nr_cbufs; - - pipe_surface_reference(&cso->zsbuf, framebuffer->zsbuf); - - cso->width = framebuffer->width; - cso->height = framebuffer->height; + util_copy_framebuffer_state(cso, framebuffer); /* Nonzero texture mipmap levels are laid out as if they were in * power-of-two-sized spaces. The renderbuffer config infers its @@ -567,8 +558,8 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc, so->base = *cso; - pipe_reference(NULL, &prsc->reference); - so->base.texture = prsc; + so->base.texture = NULL; + pipe_resource_reference(&so->base.texture, prsc); so->base.reference.count = 1; so->base.context = pctx; @@ -581,14 +572,20 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc, */ if ((cso->u.tex.first_level && (cso->u.tex.first_level != cso->u.tex.last_level)) || - rsc->vc4_format == VC4_TEXTURE_TYPE_RGBA32R) { + rsc->vc4_format == VC4_TEXTURE_TYPE_RGBA32R || + rsc->vc4_format == ~0) { struct vc4_resource *shadow_parent = rsc; - struct pipe_resource tmpl = *prsc; - - tmpl.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET; - tmpl.width0 = u_minify(tmpl.width0, cso->u.tex.first_level); - tmpl.height0 = u_minify(tmpl.height0, cso->u.tex.first_level); - tmpl.last_level = cso->u.tex.last_level - cso->u.tex.first_level; + struct pipe_resource tmpl = { + .target = prsc->target, + .format = prsc->format, + .width0 = u_minify(prsc->width0, + cso->u.tex.first_level), + .height0 = u_minify(prsc->height0, + cso->u.tex.first_level), + .bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET, + .last_level = cso->u.tex.last_level - cso->u.tex.first_level, + .nr_samples = prsc->nr_samples, + }; /* Create the shadow texture. The rest of the texture * parameter setup will use the shadow. 
@@ -617,7 +614,9 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc, } so->texture_p0 = - (VC4_SET_FIELD(rsc->slices[0].offset >> 12, VC4_TEX_P0_OFFSET) | + (VC4_SET_FIELD((rsc->slices[0].offset + + cso->u.tex.first_layer * + rsc->cube_map_stride) >> 12, VC4_TEX_P0_OFFSET) | VC4_SET_FIELD(rsc->vc4_format & 15, VC4_TEX_P0_TYPE) | VC4_SET_FIELD(so->force_first_level ? cso->u.tex.last_level : diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.c b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.c index 07e1c9c5f..2da520eb4 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.c @@ -63,15 +63,6 @@ vc4_size_is_lt(uint32_t width, uint32_t height, int cpp) height <= 4 * vc4_utile_height(cpp)); } -static void -check_box_utile_alignment(const struct pipe_box *box, int cpp) -{ - assert(!(box->x & (vc4_utile_width(cpp) - 1))); - assert(!(box->y & (vc4_utile_height(cpp) - 1))); - assert(!(box->width & (vc4_utile_width(cpp) - 1))); - assert(!(box->height & (vc4_utile_height(cpp) - 1))); -} - /** * Takes a utile x and y (and the number of utiles of width of the image) and * returns the offset to the utile within a VC4_TILING_FORMAT_TF image. @@ -216,8 +207,6 @@ vc4_load_tiled_image(void *dst, uint32_t dst_stride, uint8_t tiling_format, int cpp, const struct pipe_box *box) { - check_box_utile_alignment(box, cpp); - if (tiling_format == VC4_TILING_FORMAT_LT) { vc4_load_lt_image(dst, dst_stride, src, src_stride, @@ -240,8 +229,6 @@ vc4_store_tiled_image(void *dst, uint32_t dst_stride, uint8_t tiling_format, int cpp, const struct pipe_box *box) { - check_box_utile_alignment(box, cpp); - if (tiling_format == VC4_TILING_FORMAT_LT) { vc4_store_lt_image(dst, dst_stride, src, src_stride, diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c index 4a76c0ff7..ec42a3dc2 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c @@ -41,6 +41,12 @@ #define NEON_TAG(x) x ## _base #endif +static inline uint32_t +align_down(uint32_t val, uint32_t align) +{ + return val & ~(align - 1); +} + /** Returns the stride in bytes of a 64-byte microtile. */ static uint32_t vc4_utile_stride(int cpp) @@ -252,11 +258,78 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp) #endif } +/** + * Returns the X value into the address bits for LT tiling. + * + * The LT tile load/stores rely on the X bits not intersecting with the Y + * bits. Because of this, we have to choose to put the utile index within the + * LT tile into one of the two values, and we do so in swizzle_lt_x() to make + * NPOT handling easier. + */ +static uint32_t +swizzle_lt_x(int x, int cpp) +{ + switch (cpp) { + case 1: + /* 8x8 inside of 4x4 */ + return ((x & 0x7) << (0 - 0) | + (x & ~0x7) << (6 - 3)); + case 2: + /* 8x4 inside of 4x4 */ + return ((x & 0x7) << (1 - 0) | + (x & ~0x7) << (6 - 3)); + case 4: + /* 4x4 inside of 4x4 */ + return ((x & 0x3) << (2 - 0) | + (x & ~0x3) << (6 - 2)); + case 8: + /* 2x4 inside of 4x4 */ + return ((x & 0x1) << (3 - 0) | + (x & ~0x1) << (6 - 1)); + default: + unreachable("bad cpp"); + } +} -void -NEON_TAG(vc4_load_lt_image)(void *dst, uint32_t dst_stride, - void *src, uint32_t src_stride, - int cpp, const struct pipe_box *box) +/** + * Returns the Y value into the address bits for LT tiling. + * + * The LT tile load/stores rely on the X bits not intersecting with the Y + * bits. 
+ */ +static uint32_t +swizzle_lt_y(int y, int cpp) +{ + + switch (cpp) { + case 1: + /* 8x8 inside of 4x4 */ + return ((y & 0x7) << 3); + case 2: + /* 8x4 inside of 4x4 */ + return ((y & 0x3) << 4); + case 4: + /* 4x4 inside of 4x4 */ + return ((y & 0x3) << 4); + case 8: + /* 2x4 inside of 4x4 */ + return ((y & 0x3) << 4); + default: + unreachable("bad cpp"); + } +} + +/** + * Helper for loading or storing to an LT image, where the box is aligned + * to utiles. + * + * This just breaks the box down into calls to the fast + * vc4_load_utile/vc4_store_utile helpers. + */ +static inline void +vc4_lt_image_aligned(void *gpu, uint32_t gpu_stride, + void *cpu, uint32_t cpu_stride, + int cpp, const struct pipe_box *box, bool to_cpu) { uint32_t utile_w = vc4_utile_width(cpp); uint32_t utile_h = vc4_utile_height(cpp); @@ -264,33 +337,149 @@ NEON_TAG(vc4_load_lt_image)(void *dst, uint32_t dst_stride, uint32_t ystart = box->y; for (uint32_t y = 0; y < box->height; y += utile_h) { - for (int x = 0; x < box->width; x += utile_w) { - vc4_load_utile(dst + (dst_stride * y + - x * cpp), - src + ((ystart + y) * src_stride + - (xstart + x) * 64 / utile_w), - dst_stride, cpp); + for (uint32_t x = 0; x < box->width; x += utile_w) { + void *gpu_tile = gpu + ((ystart + y) * gpu_stride + + (xstart + x) * 64 / utile_w); + if (to_cpu) { + vc4_load_utile(cpu + (cpu_stride * y + + x * cpp), + gpu_tile, + cpu_stride, cpp); + } else { + vc4_store_utile(gpu_tile, + cpu + (cpu_stride * y + + x * cpp), + cpu_stride, cpp); + } + } + } +} + +/** + * Helper for loading or storing to an LT image, where the box is not aligned + * to utiles. + * + * This walks through the raster-order data, copying to/from the corresponding + * tiled pixel. This means we don't get write-combining on stores, but the + * loop is very few CPU instructions since the memcpy will be inlined. + */ +static inline void +vc4_lt_image_unaligned(void *gpu, uint32_t gpu_stride, + void *cpu, uint32_t cpu_stride, + int cpp, const struct pipe_box *box, bool to_cpu) +{ + + /* These are the address bits for the start of the box, split out into + * x/y so that they can be incremented separately in their loops. + */ + uint32_t offs_x0 = swizzle_lt_x(box->x, cpp); + uint32_t offs_y = swizzle_lt_y(box->y, cpp); + /* The *_mask values are "what bits of the address are from x or y" */ + uint32_t x_mask = swizzle_lt_x(~0, cpp); + uint32_t y_mask = swizzle_lt_y(~0, cpp); + uint32_t incr_y = swizzle_lt_x(gpu_stride / cpp, cpp); + + assert(!(x_mask & y_mask)); + + offs_x0 += incr_y * (box->y / vc4_utile_height(cpp)); + + for (uint32_t y = 0; y < box->height; y++) { + void *gpu_row = gpu + offs_y; + + uint32_t offs_x = offs_x0; + + for (uint32_t x = 0; x < box->width; x++) { + /* Use a memcpy here to move a pixel's worth of data. + * We're relying on this function to be inlined, so + * this will get expanded into the appropriate 1, 2, + * or 4-byte move. + */ + if (to_cpu) { + memcpy(cpu + x * cpp, gpu_row + offs_x, cpp); + } else { + memcpy(gpu_row + offs_x, cpu + x * cpp, cpp); + } + + /* This math trick with x_mask increments offs_x by 1 + * in x. + */ + offs_x = (offs_x - x_mask) & x_mask; } + + offs_y = (offs_y - y_mask) & y_mask; + /* When offs_y wraps (we hit the end of the utile), we + * increment offs_x0 by effectively the utile stride. + */ + if (!offs_y) + offs_x0 += incr_y; + + cpu += cpu_stride; + } +} + +/** + * General LT image load/store helper. 
+ */ +static inline void +vc4_lt_image_helper(void *gpu, uint32_t gpu_stride, + void *cpu, uint32_t cpu_stride, + int cpp, const struct pipe_box *box, bool to_cpu) +{ + if (box->x & (vc4_utile_width(cpp) - 1) || + box->y & (vc4_utile_height(cpp) - 1) || + box->width & (vc4_utile_width(cpp) - 1) || + box->height & (vc4_utile_height(cpp) - 1)) { + vc4_lt_image_unaligned(gpu, gpu_stride, + cpu, cpu_stride, + cpp, box, to_cpu); + } else { + vc4_lt_image_aligned(gpu, gpu_stride, + cpu, cpu_stride, + cpp, box, to_cpu); + } +} + +static inline void +vc4_lt_image_cpp_helper(void *gpu, uint32_t gpu_stride, + void *cpu, uint32_t cpu_stride, + int cpp, const struct pipe_box *box, bool to_cpu) +{ + switch (cpp) { + case 1: + vc4_lt_image_helper(gpu, gpu_stride, cpu, cpu_stride, 1, box, + to_cpu); + break; + case 2: + vc4_lt_image_helper(gpu, gpu_stride, cpu, cpu_stride, 2, box, + to_cpu); + break; + case 4: + vc4_lt_image_helper(gpu, gpu_stride, cpu, cpu_stride, 4, box, + to_cpu); + break; + case 8: + vc4_lt_image_helper(gpu, gpu_stride, cpu, cpu_stride, 8, box, + to_cpu); + break; + default: + unreachable("bad cpp"); } } void +NEON_TAG(vc4_load_lt_image)(void *dst, uint32_t dst_stride, + void *src, uint32_t src_stride, + int cpp, const struct pipe_box *box) +{ + vc4_lt_image_cpp_helper(src, src_stride, dst, dst_stride, cpp, box, + true); +} + +void NEON_TAG(vc4_store_lt_image)(void *dst, uint32_t dst_stride, void *src, uint32_t src_stride, int cpp, const struct pipe_box *box) { - uint32_t utile_w = vc4_utile_width(cpp); - uint32_t utile_h = vc4_utile_height(cpp); - uint32_t xstart = box->x; - uint32_t ystart = box->y; - - for (uint32_t y = 0; y < box->height; y += utile_h) { - for (int x = 0; x < box->width; x += utile_w) { - vc4_store_utile(dst + ((ystart + y) * dst_stride + - (xstart + x) * 64 / utile_w), - src + (src_stride * y + - x * cpp), - src_stride, cpp); - } - } + vc4_lt_image_cpp_helper(dst, dst_stride, src, src_stride, cpp, box, + false); } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_uniforms.c b/lib/mesa/src/gallium/drivers/vc4/vc4_uniforms.c index 12e6504bb..3801fbc8f 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_uniforms.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_uniforms.c @@ -224,14 +224,16 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, uinfo->num_texture_samples); for (int i = 0; i < uinfo->count; i++) { + enum quniform_contents contents = uinfo->contents[i]; + uint32_t data = uinfo->data[i]; - switch (uinfo->contents[i]) { + switch (contents) { case QUNIFORM_CONSTANT: - cl_aligned_u32(&uniforms, uinfo->data[i]); + cl_aligned_u32(&uniforms, data); break; case QUNIFORM_UNIFORM: cl_aligned_u32(&uniforms, - gallium_uniforms[uinfo->data[i]]); + gallium_uniforms[data]); break; case QUNIFORM_VIEWPORT_X_SCALE: cl_aligned_f(&uniforms, vc4->viewport.scale[0] * 16.0f); @@ -249,41 +251,49 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, case QUNIFORM_USER_CLIP_PLANE: cl_aligned_f(&uniforms, - vc4->clip.ucp[uinfo->data[i] / 4][uinfo->data[i] % 4]); + vc4->clip.ucp[data / 4][data % 4]); break; case QUNIFORM_TEXTURE_CONFIG_P0: - write_texture_p0(job, &uniforms, texstate, - uinfo->data[i]); + write_texture_p0(job, &uniforms, texstate, data); break; case QUNIFORM_TEXTURE_CONFIG_P1: - write_texture_p1(job, &uniforms, texstate, - uinfo->data[i]); + write_texture_p1(job, &uniforms, texstate, data); break; case QUNIFORM_TEXTURE_CONFIG_P2: - write_texture_p2(job, &uniforms, texstate, - uinfo->data[i]); + 
write_texture_p2(job, &uniforms, texstate, data); break; case QUNIFORM_TEXTURE_FIRST_LEVEL: write_texture_first_level(job, &uniforms, texstate, - uinfo->data[i]); + data); break; case QUNIFORM_UBO_ADDR: - cl_aligned_reloc(job, &job->uniforms, &uniforms, ubo, 0); + if (data == 0) { + cl_aligned_reloc(job, &job->uniforms, + &uniforms, ubo, 0); + } else { + struct pipe_constant_buffer *c = + &cb->cb[data]; + struct vc4_resource *rsc = + vc4_resource(c->buffer); + + cl_aligned_reloc(job, &job->uniforms, + &uniforms, + rsc->bo, c->buffer_offset); + } break; case QUNIFORM_TEXTURE_MSAA_ADDR: - write_texture_msaa_addr(job, &uniforms, - texstate, uinfo->data[i]); + write_texture_msaa_addr(job, &uniforms, texstate, data); break; case QUNIFORM_TEXTURE_BORDER_COLOR: write_texture_border_color(job, &uniforms, - texstate, uinfo->data[i]); + texstate, data); break; case QUNIFORM_TEXRECT_SCALE_X: @@ -291,7 +301,7 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, cl_aligned_u32(&uniforms, get_texrect_scale(texstate, uinfo->contents[i], - uinfo->data[i])); + data)); break; case QUNIFORM_BLEND_CONST_COLOR_X: @@ -330,9 +340,9 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, case QUNIFORM_STENCIL: cl_aligned_u32(&uniforms, - vc4->zsa->stencil_uniforms[uinfo->data[i]] | - (uinfo->data[i] <= 1 ? - (vc4->stencil_ref.ref_value[uinfo->data[i]] << 8) : + vc4->zsa->stencil_uniforms[data] | + (data <= 1 ? + (vc4->stencil_ref.ref_value[data] << 8) : 0)); break; @@ -350,11 +360,18 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, cl_aligned_u32(&uniforms, 0xd0d0d0d0); break; } -#if 0 - uint32_t written_val = *((uint32_t *)uniforms - 1); - fprintf(stderr, "%p: %d / 0x%08x (%f)\n", - shader, i, written_val, uif(written_val)); -#endif + + if (false) { + uint32_t written_val = *((uint32_t *)uniforms - 1); + char *desc = qir_describe_uniform(uinfo->contents[i], + uinfo->data[i], + gallium_uniforms); + + fprintf(stderr, "%p/%d: 0x%08x %s\n", + shader, i, written_val, desc); + + ralloc_free(desc); + } } cl_end(&job->uniforms, uniforms); |
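The LT-tiling rework in vc4_tiling_lt.c above drops the requirement that transfer boxes be utile-aligned: swizzle_lt_x()/swizzle_lt_y() scatter the x and y coordinates into disjoint address bits, and vc4_lt_image_unaligned() then advances the x part with the "(offs - mask) & mask" trick. The following standalone sketch is not part of the patch; it mirrors only the cpp == 4 bit layout from the diff, with hypothetical helper names and a self-test in main(), to show why the masked increment walks the x address bits without disturbing the y bits.

/* Illustrative sketch only -- not part of the patch.  Mirrors the
 * cpp == 4 case of swizzle_lt_x()/swizzle_lt_y() and the
 * "(offs - mask) & mask" increment used by vc4_lt_image_unaligned().
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Scatter a pixel x coordinate into the LT address bits (cpp == 4):
 * bits 2-3 select the pixel within the 4x4 utile, bits 6 and up select
 * the utile column.  Bits 4-5 are left free for y.
 */
static uint32_t swizzle_x_cpp4(uint32_t x)
{
        return ((x & 0x3) << 2) | ((x & ~0x3u) << 4);
}

/* Scatter a y coordinate within a utile row into bits 4-5. */
static uint32_t swizzle_y_cpp4(uint32_t y)
{
        return (y & 0x3) << 4;
}

int main(void)
{
        const uint32_t x_mask = swizzle_x_cpp4(~0u);
        const uint32_t y_mask = swizzle_y_cpp4(~0u);

        /* The trick only works because the x and y bits never overlap,
         * matching the assert(!(x_mask & y_mask)) in the patch.
         */
        assert((x_mask & y_mask) == 0);

        /* Walk x = 0..15 with the masked increment and check that it
         * matches recomputing the swizzle from scratch each time.
         */
        uint32_t offs = swizzle_x_cpp4(0);
        for (uint32_t x = 0; x < 16; x++) {
                assert(offs == swizzle_x_cpp4(x));
                /* "Add 1", but only to the bits covered by x_mask. */
                offs = (offs - x_mask) & x_mask;
        }

        printf("x_mask=0x%08x y_mask=0x%08x: masked increment OK\n",
               x_mask, y_mask);
        return 0;
}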
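The vc4_screen.c hunks above make PIPE_CAP_NATIVE_FENCE_FD conditional on kernel syncobj support, detected with drmGetCap(fd, DRM_CAP_SYNCOBJ, ...) in vc4_screen_create(). A minimal standalone probe in the same spirit is sketched below; the device path and error handling are illustrative assumptions, only the drmGetCap() call itself comes from the patch.

/* Sketch of the DRM_CAP_SYNCOBJ probe done in vc4_screen_create();
 * the device path and error handling are illustrative assumptions.
 */
#include <fcntl.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <xf86drm.h>

int main(void)
{
        int fd = open("/dev/dri/card0", O_RDWR | O_CLOEXEC);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        uint64_t syncobj_cap = 0;
        bool has_syncobj = false;

        /* drmGetCap() returns 0 on success; a zero capability value
         * means the kernel driver does not expose syncobjs.
         */
        if (drmGetCap(fd, DRM_CAP_SYNCOBJ, &syncobj_cap) == 0 && syncobj_cap)
                has_syncobj = true;

        printf("syncobj support: %s\n", has_syncobj ? "yes" : "no");
        close(fd);
        return 0;
}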