Merge Mesa 19.0.5

author: Jonathan Gray <jsg@cvs.openbsd.org> 2019-05-23 05:33:34 +0000
committer: Jonathan Gray <jsg@cvs.openbsd.org> 2019-05-23 05:33:34 +0000
commit: 9886815a25d84be79f51e65ebd8e458bb5d26ca8 (patch)
tree: a65edf018dd992543337433f7303fb29a6c8e8cf /lib/mesa/src/gallium/drivers/vc4
parent: e2a3acb64af2657b1181806818eacad061103c23 (diff)
14 files changed, 147 insertions, 435 deletions
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c b/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c
index 54f9d9c26..716ca50ea 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c
@@ -386,7 +386,6 @@ vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time)
 
 static struct vc4_bo *
 vc4_bo_open_handle(struct vc4_screen *screen,
-                   uint32_t winsys_stride,
                    uint32_t handle, uint32_t size)
 {
         struct vc4_bo *bo;
@@ -410,8 +409,7 @@ vc4_bo_open_handle(struct vc4_screen *screen,
         bo->private = false;
 
 #ifdef USE_VC4_SIMULATOR
-        vc4_simulator_open_from_handle(screen->fd, winsys_stride,
-                                       bo->handle, bo->size);
+        vc4_simulator_open_from_handle(screen->fd, bo->handle, bo->size);
         bo->map = malloc(bo->size);
 #endif
 
@@ -423,8 +421,7 @@ done:
 }
 
 struct vc4_bo *
-vc4_bo_open_name(struct vc4_screen *screen, uint32_t name,
-                 uint32_t winsys_stride)
+vc4_bo_open_name(struct vc4_screen *screen, uint32_t name)
 {
         struct drm_gem_open o = {
                 .name = name
@@ -436,11 +433,11 @@ vc4_bo_open_name(struct vc4_screen *screen, uint32_t name,
                 return NULL;
         }
 
-        return vc4_bo_open_handle(screen, winsys_stride, o.handle, o.size);
+        return vc4_bo_open_handle(screen, o.handle, o.size);
 }
 
 struct vc4_bo *
-vc4_bo_open_dmabuf(struct vc4_screen *screen, int fd, uint32_t winsys_stride)
+vc4_bo_open_dmabuf(struct vc4_screen *screen, int fd)
 {
         uint32_t handle;
         int ret = drmPrimeFDToHandle(screen->fd, fd, &handle);
@@ -457,7 +454,7 @@ vc4_bo_open_dmabuf(struct vc4_screen *screen, int fd, uint32_t winsys_stride)
                 return NULL;
         }
 
-        return vc4_bo_open_handle(screen, winsys_stride, handle, size);
+        return vc4_bo_open_handle(screen, handle, size);
 }
 
 int
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h b/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h
index 9fa477442..30a388ee5 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h
@@ -66,10 +66,8 @@ struct vc4_bo *vc4_bo_alloc_shader(struct vc4_screen *screen, const void *data,
                                    uint32_t size);
 void vc4_bo_last_unreference(struct vc4_bo *bo);
 void vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time);
-struct vc4_bo *vc4_bo_open_name(struct vc4_screen *screen, uint32_t name,
-                                uint32_t winsys_stride);
-struct vc4_bo *vc4_bo_open_dmabuf(struct vc4_screen *screen, int fd,
-                                  uint32_t winsys_stride);
+struct vc4_bo *vc4_bo_open_name(struct vc4_screen *screen, uint32_t name);
+struct vc4_bo *vc4_bo_open_dmabuf(struct vc4_screen *screen, int fd);
 bool vc4_bo_flink(struct vc4_bo *bo, uint32_t *name);
 int vc4_bo_get_dmabuf(struct vc4_bo *bo);
 
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_context.c b/lib/mesa/src/gallium/drivers/vc4/vc4_context.c
index ffd7d4c85..94969dcb1 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_context.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_context.c
@@ -85,6 +85,18 @@ vc4_texture_barrier(struct pipe_context *pctx, unsigned flags)
 }
 
 static void
+vc4_set_debug_callback(struct pipe_context *pctx,
+                       const struct pipe_debug_callback *cb)
+{
+        struct vc4_context *vc4 = vc4_context(pctx);
+
+        if (cb)
+                vc4->debug = *cb;
+        else
+                memset(&vc4->debug, 0, sizeof(vc4->debug));
+}
+
+static void
 vc4_invalidate_resource(struct pipe_context *pctx, struct pipe_resource *prsc)
 {
         struct vc4_context *vc4 = vc4_context(pctx);
@@ -164,6 +176,7 @@ vc4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
         pctx->priv = priv;
         pctx->destroy = vc4_context_destroy;
         pctx->flush = vc4_pipe_flush;
+        pctx->set_debug_callback = vc4_set_debug_callback;
         pctx->invalidate_resource = vc4_invalidate_resource;
         pctx->texture_barrier = vc4_texture_barrier;
 
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_context.h b/lib/mesa/src/gallium/drivers/vc4/vc4_context.h
index ce8bcffac..1d3179c71 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_context.h
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_context.h
@@ -405,6 +405,7 @@ struct vc4_context {
         struct pipe_viewport_state viewport;
         struct vc4_constbuf_stateobj constbuf[PIPE_SHADER_TYPES];
         struct vc4_vertexbuf_stateobj vertexbuf;
+        struct pipe_debug_callback debug;
 
         struct vc4_hwperfmon *perfmon;
         /** @} */
@@ -451,6 +452,8 @@ struct vc4_depth_stencil_alpha_state {
 #define perf_debug(...) do {                            \
         if (unlikely(vc4_debug & VC4_DEBUG_PERF))       \
                 fprintf(stderr, __VA_ARGS__);           \
+        if (unlikely(vc4->debug.debug_message))         \
+                pipe_debug_message(&vc4->debug, PERF_INFO, __VA_ARGS__);    \
 } while (0)
 
 static inline struct vc4_context *
@@ -486,12 +489,8 @@ void vc4_program_fini(struct pipe_context *pctx);
 void vc4_query_init(struct pipe_context *pctx);
 void vc4_simulator_init(struct vc4_screen *screen);
 void vc4_simulator_destroy(struct vc4_screen *screen);
-int vc4_simulator_flush(struct vc4_context *vc4,
-                        struct drm_vc4_submit_cl *args,
-                        struct vc4_job *job);
 int vc4_simulator_ioctl(int fd, unsigned long request, void *arg);
-void vc4_simulator_open_from_handle(int fd, uint32_t winsys_stride,
-                                    int handle, uint32_t size);
+void vc4_simulator_open_from_handle(int fd, int handle, uint32_t size);
 
 static inline int
 vc4_ioctl(int fd, unsigned long request, void *arg)
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_job.c b/lib/mesa/src/gallium/drivers/vc4/vc4_job.c
index f38c46475..2b87a00df 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_job.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_job.c
@@ -492,11 +492,7 @@ vc4_job_submit(struct vc4_context *vc4, struct vc4_job *job)
         if (!(vc4_debug & VC4_DEBUG_NORAST)) {
                 int ret;
 
-#ifndef USE_VC4_SIMULATOR
-                ret = drmIoctl(vc4->fd, DRM_IOCTL_VC4_SUBMIT_CL, &submit);
-#else
-                ret = vc4_simulator_flush(vc4, &submit, job);
-#endif
+                ret = vc4_ioctl(vc4->fd, DRM_IOCTL_VC4_SUBMIT_CL, &submit);
                 static bool warned = false;
                 if (ret && !warned) {
                         fprintf(stderr, "Draw call returned %s.  "
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_blend.c b/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
index 60eccb4fc..ff6268f47 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
@@ -42,6 +42,7 @@
 #include "util/u_format.h"
 #include "vc4_qir.h"
 #include "compiler/nir/nir_builder.h"
+#include "compiler/nir/nir_format_convert.h"
 #include "vc4_context.h"
 
 static bool
@@ -67,37 +68,6 @@ vc4_nir_get_dst_color(nir_builder *b, int sample)
         return &load->dest.ssa;
 }
 
-static  nir_ssa_def *
-vc4_nir_srgb_decode(nir_builder *b, nir_ssa_def *srgb)
-{
-        nir_ssa_def *is_low = nir_flt(b, srgb, nir_imm_float(b, 0.04045));
-        nir_ssa_def *low = nir_fmul(b, srgb, nir_imm_float(b, 1.0 / 12.92));
-        nir_ssa_def *high = nir_fpow(b,
-                                     nir_fmul(b,
-                                              nir_fadd(b, srgb,
-                                                       nir_imm_float(b, 0.055)),
-                                              nir_imm_float(b, 1.0 / 1.055)),
-                                     nir_imm_float(b, 2.4));
-
-        return nir_bcsel(b, is_low, low, high);
-}
-
-static  nir_ssa_def *
-vc4_nir_srgb_encode(nir_builder *b, nir_ssa_def *linear)
-{
-        nir_ssa_def *is_low = nir_flt(b, linear, nir_imm_float(b, 0.0031308));
-        nir_ssa_def *low = nir_fmul(b, linear, nir_imm_float(b, 12.92));
-        nir_ssa_def *high = nir_fsub(b,
-                                     nir_fmul(b,
-                                              nir_imm_float(b, 1.055),
-                                              nir_fpow(b,
-                                                       linear,
-                                                       nir_imm_float(b, 0.41666))),
-                                     nir_imm_float(b, 0.055));
-
-        return nir_bcsel(b, is_low, low, high);
-}
-
 static nir_ssa_def *
 vc4_blend_channel_f(nir_builder *b,
                     nir_ssa_def **src,
@@ -130,7 +100,7 @@ vc4_blend_channel_f(nir_builder *b,
                 return nir_load_system_value(b,
                                              nir_intrinsic_load_blend_const_color_r_float +
                                              channel,
-                                             0);
+                                             0, 32);
         case PIPE_BLENDFACTOR_CONST_ALPHA:
                 return nir_load_blend_const_color_a_float(b);
         case PIPE_BLENDFACTOR_ZERO:
@@ -148,7 +118,7 @@ vc4_blend_channel_f(nir_builder *b,
                                 nir_load_system_value(b,
                                                       nir_intrinsic_load_blend_const_color_r_float +
                                                       channel,
-                                                      0));
+                                                      0, 32));
         case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
                 return nir_fsub(b, nir_imm_float(b, 1.0),
                                 nir_load_blend_const_color_a_float(b));
@@ -501,14 +471,14 @@ vc4_nir_blend_pipeline(struct vc4_compile *c, nir_builder *b, nir_ssa_def *src,
 
                 /* Turn dst color to linear. */
                 for (int i = 0; i < 3; i++)
-                        dst_color[i] = vc4_nir_srgb_decode(b, dst_color[i]);
+                        dst_color[i] = nir_format_srgb_to_linear(b, dst_color[i]);
 
                 nir_ssa_def *blend_color[4];
                 vc4_do_blending_f(c, b, blend_color, src_color, dst_color);
 
                 /* sRGB encode the output color */
                 for (int i = 0; i < 3; i++)
-                        blend_color[i] = vc4_nir_srgb_encode(b, blend_color[i]);
+                        blend_color[i] = nir_format_linear_to_srgb(b, blend_color[i]);
 
                 packed_color = vc4_nir_swizzle_and_pack(c, b, blend_color);
         } else {
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_io.c
index b7969a562..fc2baee1b 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_io.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_io.c
@@ -330,7 +330,8 @@ vc4_nir_lower_uniform(struct vc4_compile *c, nir_builder *b,
                 nir_intrinsic_instr *intr_comp =
                         nir_intrinsic_instr_create(c->s, intr->intrinsic);
                 intr_comp->num_components = 1;
-                nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, 32, NULL);
+                nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1,
+                                  intr->dest.ssa.bit_size, NULL);
 
                 /* Convert the uniform offset to bytes.  If it happens
                  * to be a constant, constant-folding will clean up
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_program.c b/lib/mesa/src/gallium/drivers/vc4/vc4_program.c
index bc9bd76ae..8f1e561c4 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_program.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_program.c
@@ -1004,24 +1004,24 @@ ntq_emit_comparison(struct vc4_compile *c, struct qreg *dest,
         enum qpu_cond cond;
 
         switch (compare_instr->op) {
-        case nir_op_feq:
-        case nir_op_ieq:
+        case nir_op_feq32:
+        case nir_op_ieq32:
         case nir_op_seq:
                 cond = QPU_COND_ZS;
                 break;
-        case nir_op_fne:
-        case nir_op_ine:
+        case nir_op_fne32:
+        case nir_op_ine32:
         case nir_op_sne:
                 cond = QPU_COND_ZC;
                 break;
-        case nir_op_fge:
-        case nir_op_ige:
-        case nir_op_uge:
+        case nir_op_fge32:
+        case nir_op_ige32:
+        case nir_op_uge32:
         case nir_op_sge:
                 cond = QPU_COND_NC;
                 break;
-        case nir_op_flt:
-        case nir_op_ilt:
+        case nir_op_flt32:
+        case nir_op_ilt32:
         case nir_op_slt:
                 cond = QPU_COND_NS;
                 break;
@@ -1048,7 +1048,7 @@ ntq_emit_comparison(struct vc4_compile *c, struct qreg *dest,
                                 qir_uniform_f(c, 1.0), qir_uniform_f(c, 0.0));
                 break;
 
-        case nir_op_bcsel:
+        case nir_op_b32csel:
                 *dest = qir_SEL(c, cond,
                                 ntq_get_alu_src(c, sel_instr, 1),
                                 ntq_get_alu_src(c, sel_instr, 2));
@@ -1208,14 +1208,14 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
         case nir_op_u2f32:
                 result = qir_ITOF(c, src[0]);
                 break;
-        case nir_op_b2f:
+        case nir_op_b2f32:
                 result = qir_AND(c, src[0], qir_uniform_f(c, 1.0));
                 break;
-        case nir_op_b2i:
+        case nir_op_b2i32:
                 result = qir_AND(c, src[0], qir_uniform_ui(c, 1));
                 break;
-        case nir_op_i2b:
-        case nir_op_f2b:
+        case nir_op_i2b32:
+        case nir_op_f2b32:
                 qir_SF(c, src[0]);
                 result = qir_MOV(c, qir_SEL(c, QPU_COND_ZC,
                                             qir_uniform_ui(c, ~0),
@@ -1264,21 +1264,21 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
         case nir_op_sne:
         case nir_op_sge:
         case nir_op_slt:
-        case nir_op_feq:
-        case nir_op_fne:
-        case nir_op_fge:
-        case nir_op_flt:
-        case nir_op_ieq:
-        case nir_op_ine:
-        case nir_op_ige:
-        case nir_op_uge:
-        case nir_op_ilt:
+        case nir_op_feq32:
+        case nir_op_fne32:
+        case nir_op_fge32:
+        case nir_op_flt32:
+        case nir_op_ieq32:
+        case nir_op_ine32:
+        case nir_op_ige32:
+        case nir_op_uge32:
+        case nir_op_ilt32:
                 if (!ntq_emit_comparison(c, &result, instr, instr)) {
                         fprintf(stderr, "Bad comparison instruction\n");
                 }
                 break;
 
-        case nir_op_bcsel:
+        case nir_op_b32csel:
                 result = ntq_emit_bcsel(c, instr, src);
                 break;
         case nir_op_fcsel:
@@ -1591,14 +1591,14 @@ vc4_optimize_nir(struct nir_shader *s)
                 NIR_PASS(progress, s, nir_opt_dce);
                 NIR_PASS(progress, s, nir_opt_dead_cf);
                 NIR_PASS(progress, s, nir_opt_cse);
-                NIR_PASS(progress, s, nir_opt_peephole_select, 8);
+                NIR_PASS(progress, s, nir_opt_peephole_select, 8, true);
                 NIR_PASS(progress, s, nir_opt_algebraic);
                 NIR_PASS(progress, s, nir_opt_constant_folding);
                 NIR_PASS(progress, s, nir_opt_undef);
                 NIR_PASS(progress, s, nir_opt_loop_unroll,
                          nir_var_shader_in |
                          nir_var_shader_out |
-                         nir_var_local);
+                         nir_var_function_temp);
         } while (progress);
 }
 
@@ -2363,7 +2363,8 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
                 if (stage == QSTAGE_FRAG) {
                         NIR_PASS_V(c->s, nir_lower_clip_fs, c->key->ucp_enables);
                 } else {
-                        NIR_PASS_V(c->s, nir_lower_clip_vs, c->key->ucp_enables);
+                        NIR_PASS_V(c->s, nir_lower_clip_vs,
+				   c->key->ucp_enables, false);
                         NIR_PASS_V(c->s, nir_lower_io_to_scalar,
                                    nir_var_shader_out);
                 }
@@ -2384,6 +2385,8 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
 
         vc4_optimize_nir(c->s);
 
+        NIR_PASS_V(c->s, nir_lower_bool_to_int32);
+
         NIR_PASS_V(c->s, nir_convert_from_ssa, true);
 
         if (vc4_debug & VC4_DEBUG_SHADERDB) {
@@ -2514,7 +2517,7 @@ vc4_shader_state_create(struct pipe_context *pctx,
 
         vc4_optimize_nir(s);
 
-        NIR_PASS_V(s, nir_remove_dead_variables, nir_var_local);
+        NIR_PASS_V(s, nir_remove_dead_variables, nir_var_function_temp);
 
         /* Garbage collect dead instructions */
         nir_sweep(s);
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_resource.c b/lib/mesa/src/gallium/drivers/vc4/vc4_resource.c
index 41e6ec5c1..a4d1b903b 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_resource.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_resource.c
@@ -319,8 +319,10 @@ vc4_resource_get_handle(struct pipe_screen *pscreen,
 
                 return vc4_bo_flink(rsc->bo, &whandle->handle);
         case WINSYS_HANDLE_TYPE_KMS:
-                if (screen->ro && renderonly_get_handle(rsc->scanout, whandle))
-                        return TRUE;
+                if (screen->ro) {
+                        assert(rsc->scanout);
+                        return renderonly_get_handle(rsc->scanout, whandle);
+                }
                 whandle->handle = rsc->bo->handle;
                 return TRUE;
         case WINSYS_HANDLE_TYPE_FD:
@@ -622,12 +624,10 @@ vc4_resource_from_handle(struct pipe_screen *pscreen,
 
         switch (whandle->type) {
         case WINSYS_HANDLE_TYPE_SHARED:
-                rsc->bo = vc4_bo_open_name(screen,
-                                           whandle->handle, whandle->stride);
+                rsc->bo = vc4_bo_open_name(screen, whandle->handle);
                 break;
         case WINSYS_HANDLE_TYPE_FD:
-                rsc->bo = vc4_bo_open_dmabuf(screen,
-                                             whandle->handle, whandle->stride);
+                rsc->bo = vc4_bo_open_dmabuf(screen, whandle->handle);
                 break;
         default:
                 fprintf(stderr,
@@ -1013,6 +1013,7 @@ void
 vc4_update_shadow_baselevel_texture(struct pipe_context *pctx,
                                     struct pipe_sampler_view *pview)
 {
+        struct vc4_context *vc4 = vc4_context(pctx);
         struct vc4_sampler_view *view = vc4_sampler_view(pview);
         struct vc4_resource *shadow = vc4_resource(view->texture);
         struct vc4_resource *orig = vc4_resource(pview->texture);
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_screen.c b/lib/mesa/src/gallium/drivers/vc4/vc4_screen.c
index e7f7c82c2..acb4a1feb 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_screen.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_screen.c
@@ -178,6 +178,9 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
                 /* Note: Not supported in hardware, just faking it. */
                 return 5;
 
+        case PIPE_CAP_MAX_VARYINGS:
+                return 8;
+
         case PIPE_CAP_VENDOR_ID:
                 return 0x14E4;
         case PIPE_CAP_ACCELERATED:
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c b/lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c
index 37c098a04..2ce5a7596 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c
@@ -99,10 +99,13 @@ struct vc4_simulator_bo {
 
         /** Area for this BO within sim_state->mem */
         struct mem_block *block;
-        void *winsys_map;
-        uint32_t winsys_stride;
 
         int handle;
+
+        /* Mapping of the underlying GEM object that we copy in/out of
+         * simulator memory.
+         */
+        void *gem_vaddr;
 };
 
 static void *
@@ -143,6 +146,7 @@ vc4_create_simulator_bo(int fd, int handle, unsigned size)
         sim_bo->file = file;
         sim_bo->handle = handle;
 
+        /* Allocate space for the buffer in simulator memory. */
         mtx_lock(&sim_state.mutex);
         sim_bo->block = u_mmAllocMem(sim_state.heap, size + 4, PAGE_ALIGN2, 0);
         mtx_unlock(&sim_state.mutex);
@@ -162,6 +166,25 @@ vc4_create_simulator_bo(int fd, int handle, unsigned size)
                 mtx_lock(&sim_state.mutex);
                 _mesa_hash_table_insert(file->bo_map, int_to_key(handle), bo);
                 mtx_unlock(&sim_state.mutex);
+
+                /* Map the GEM buffer for copy in/out to the simulator. */
+                struct drm_mode_map_dumb map = {
+                        .handle = handle,
+                };
+                int ret = drmIoctl(fd, DRM_IOCTL_MODE_MAP_DUMB, &map);
+                if (ret) {
+                        fprintf(stderr, "Failed to get MMAP offset: %d\n",
+                                errno);
+                        abort();
+                }
+                sim_bo->gem_vaddr = mmap(NULL, obj->base.size,
+                                         PROT_READ | PROT_WRITE, MAP_SHARED,
+                                         fd, map.offset);
+                if (sim_bo->gem_vaddr == MAP_FAILED) {
+                        fprintf(stderr, "mmap of bo %d (offset 0x%016llx, size %d) failed\n",
+                                handle, (long long)map.offset, (int)obj->base.size);
+                        abort();
+                }
         }
 
         return sim_bo;
@@ -174,16 +197,19 @@ vc4_free_simulator_bo(struct vc4_simulator_bo *sim_bo)
         struct drm_vc4_bo *bo = &sim_bo->base;
         struct drm_gem_cma_object *obj = &bo->base;
 
-        if (sim_bo->winsys_map)
-                munmap(sim_bo->winsys_map, obj->base.size);
+        if (bo->validated_shader) {
+                free(bo->validated_shader->texture_samples);
+                free(bo->validated_shader);
+        }
+
+        if (sim_bo->gem_vaddr)
+                munmap(sim_bo->gem_vaddr, obj->base.size);
 
         mtx_lock(&sim_state.mutex);
         u_mmFreeMem(sim_bo->block);
         if (sim_bo->handle) {
-                struct hash_entry *entry =
-                        _mesa_hash_table_search(sim_file->bo_map,
-                                                int_to_key(sim_bo->handle));
-                _mesa_hash_table_remove(sim_file->bo_map, entry);
+                _mesa_hash_table_remove_key(sim_file->bo_map,
+                                            int_to_key(sim_bo->handle));
         }
         mtx_unlock(&sim_state.mutex);
         ralloc_free(sim_bo);
@@ -210,41 +236,23 @@ drm_gem_cma_create(struct drm_device *dev, size_t size)
 }
 
 static int
-vc4_simulator_pin_bos(struct drm_device *dev, struct vc4_job *job,
+vc4_simulator_pin_bos(struct vc4_simulator_file *file,
                       struct vc4_exec_info *exec)
 {
-        int fd = dev->screen->fd;
-        struct vc4_simulator_file *file = vc4_get_simulator_file_for_fd(fd);
         struct drm_vc4_submit_cl *args = exec->args;
-        struct vc4_bo **bos = job->bo_pointers.base;
+        uint32_t *bo_handles = (uint32_t *)(uintptr_t)args->bo_handles;
 
         exec->bo_count = args->bo_handle_count;
         exec->bo = calloc(exec->bo_count, sizeof(void *));
         for (int i = 0; i < exec->bo_count; i++) {
-                struct vc4_bo *bo = bos[i];
                 struct vc4_simulator_bo *sim_bo =
-                        vc4_get_simulator_bo(file, bo->handle);
+                        vc4_get_simulator_bo(file, bo_handles[i]);
                 struct drm_vc4_bo *drm_bo = &sim_bo->base;
                 struct drm_gem_cma_object *obj = &drm_bo->base;
 
-                drm_bo->bo = bo;
-#if 0
-                fprintf(stderr, "bo hindex %d: %s\n", i, bo->name);
-#endif
-
-                vc4_bo_map(bo);
-                memcpy(obj->vaddr, bo->map, bo->size);
+                memcpy(obj->vaddr, sim_bo->gem_vaddr, obj->base.size);
 
                 exec->bo[i] = obj;
-
-                /* The kernel does this validation at shader create ioctl
-                 * time.
-                 */
-                if (strcmp(bo->name, "code") == 0) {
-                        drm_bo->validated_shader = vc4_validate_shader(obj);
-                        if (!drm_bo->validated_shader)
-                                abort();
-                }
         }
         return 0;
 }
@@ -255,16 +263,13 @@ vc4_simulator_unpin_bos(struct vc4_exec_info *exec)
         for (int i = 0; i < exec->bo_count; i++) {
                 struct drm_gem_cma_object *obj = exec->bo[i];
                 struct drm_vc4_bo *drm_bo = to_vc4_bo(&obj->base);
-                struct vc4_bo *bo = drm_bo->bo;
+                struct vc4_simulator_bo *sim_bo =
+                        (struct vc4_simulator_bo *)drm_bo;
 
                 assert(*(uint32_t *)(obj->vaddr +
                                      obj->base.size) == BO_SENTINEL);
-                memcpy(bo->map, obj->vaddr, bo->size);
-
-                if (drm_bo->validated_shader) {
-                        free(drm_bo->validated_shader->texture_samples);
-                        free(drm_bo->validated_shader);
-                }
+                if (sim_bo->gem_vaddr)
+                        memcpy(sim_bo->gem_vaddr, obj->vaddr, obj->base.size);
         }
 
         free(exec->bo);
@@ -359,19 +364,10 @@ vc4_dump_to_file(struct vc4_exec_info *exec)
         fclose(f);
 }
 
-int
-vc4_simulator_flush(struct vc4_context *vc4,
-                    struct drm_vc4_submit_cl *args, struct vc4_job *job)
+static int
+vc4_simulator_submit_cl_ioctl(int fd, struct drm_vc4_submit_cl *args)
 {
-        struct vc4_screen *screen = vc4->screen;
-        int fd = screen->fd;
         struct vc4_simulator_file *file = vc4_get_simulator_file_for_fd(fd);
-        struct vc4_surface *csurf = vc4_surface(vc4->framebuffer.cbufs[0]);
-        struct vc4_resource *ctex = csurf ? vc4_resource(csurf->base.texture) : NULL;
-        struct vc4_simulator_bo *csim_bo = ctex ? vc4_get_simulator_bo(file, ctex->bo->handle) : NULL;
-        uint32_t winsys_stride = ctex ? csim_bo->winsys_stride : 0;
-        uint32_t sim_stride = ctex ? ctex->slices[0].stride : 0;
-        uint32_t row_len = MIN2(sim_stride, winsys_stride);
         struct vc4_exec_info exec;
         struct drm_device *dev = &file->dev;
         int ret;
@@ -379,25 +375,9 @@ vc4_simulator_flush(struct vc4_context *vc4,
         memset(&exec, 0, sizeof(exec));
         list_inithead(&exec.unref_list);
 
-        if (ctex && csim_bo->winsys_map) {
-#if 0
-                fprintf(stderr, "%dx%d %d %d %d\n",
-                        ctex->base.b.width0, ctex->base.b.height0,
-                        winsys_stride,
-                        sim_stride,
-                        ctex->bo->size);
-#endif
-
-                for (int y = 0; y < ctex->base.height0; y++) {
-                        memcpy(ctex->bo->map + y * sim_stride,
-                               csim_bo->winsys_map + y * winsys_stride,
-                               row_len);
-                }
-        }
-
         exec.args = args;
 
-        ret = vc4_simulator_pin_bos(dev, job, &exec);
+        ret = vc4_simulator_pin_bos(file, &exec);
         if (ret)
                 return ret;
 
@@ -448,65 +428,19 @@ vc4_simulator_flush(struct vc4_context *vc4,
                 vc4_free_simulator_bo(sim_bo);
         }
 
-        if (ctex && csim_bo->winsys_map) {
-                for (int y = 0; y < ctex->base.height0; y++) {
-                        memcpy(csim_bo->winsys_map + y * winsys_stride,
-                               ctex->bo->map + y * sim_stride,
-                               row_len);
-                }
-        }
-
         return 0;
 }
 
 /**
- * Map the underlying GEM object from the real hardware GEM handle.
- */
-static void *
-vc4_simulator_map_winsys_bo(int fd, struct vc4_simulator_bo *sim_bo)
-{
-        struct drm_vc4_bo *bo = &sim_bo->base;
-        struct drm_gem_cma_object *obj = &bo->base;
-        int ret;
-        void *map;
-
-        struct drm_mode_map_dumb map_dumb = {
-                .handle = sim_bo->handle,
-        };
-        ret = drmIoctl(fd, DRM_IOCTL_MODE_MAP_DUMB, &map_dumb);
-        if (ret != 0) {
-                fprintf(stderr, "map ioctl failure\n");
-                abort();
-        }
-
-        map = mmap(NULL, obj->base.size, PROT_READ | PROT_WRITE, MAP_SHARED,
-                   fd, map_dumb.offset);
-        if (map == MAP_FAILED) {
-                fprintf(stderr,
-                        "mmap of bo %d (offset 0x%016llx, size %d) failed\n",
-                        sim_bo->handle, (long long)map_dumb.offset,
-                        (int)obj->base.size);
-                abort();
-        }
-
-        return map;
-}
-
-/**
  * Do fixups after a BO has been opened from a handle.
  *
  * This could be done at DRM_IOCTL_GEM_OPEN/DRM_IOCTL_GEM_PRIME_FD_TO_HANDLE
  * time, but we're still using drmPrimeFDToHandle() so we have this helper to
  * be called afterward instead.
  */
-void vc4_simulator_open_from_handle(int fd, uint32_t winsys_stride,
-                                    int handle, uint32_t size)
+void vc4_simulator_open_from_handle(int fd, int handle, uint32_t size)
 {
-        struct vc4_simulator_bo *sim_bo =
-                vc4_create_simulator_bo(fd, handle, size);
-
-        sim_bo->winsys_stride = winsys_stride;
-        sim_bo->winsys_map = vc4_simulator_map_winsys_bo(fd, sim_bo);
+        vc4_create_simulator_bo(fd, handle, size);
 }
 
 /**
@@ -558,19 +492,22 @@ vc4_simulator_create_shader_bo_ioctl(int fd,
 
         args->handle = create.handle;
 
-        vc4_create_simulator_bo(fd, create.handle, args->size);
+        struct vc4_simulator_bo *sim_bo =
+                vc4_create_simulator_bo(fd, create.handle, args->size);
+        struct drm_vc4_bo *drm_bo = &sim_bo->base;
+        struct drm_gem_cma_object *obj = &drm_bo->base;
 
-        struct drm_mode_map_dumb map = {
-                .handle = create.handle
-        };
-        ret = drmIoctl(fd, DRM_IOCTL_MODE_MAP_DUMB, &map);
-        if (ret)
-                return ret;
+        /* Copy into the simulator's BO for validation. */
+        memcpy(obj->vaddr, (void *)(uintptr_t)args->data, args->size);
+
+        /* Also copy into the GEM BO: vc4_simulator_pin_bos() copies the
+         * GEM contents over the simulator's BO at submit time.
+         */
+        memcpy(sim_bo->gem_vaddr, (void *)(uintptr_t)args->data, args->size);
 
-        void *shader = mmap(NULL, args->size, PROT_READ | PROT_WRITE, MAP_SHARED,
-                            fd, map.offset);
-        memcpy(shader, (void *)(uintptr_t)args->data, args->size);
-        munmap(shader, args->size);
+        drm_bo->validated_shader = vc4_validate_shader(obj);
+        if (!drm_bo->validated_shader)
+                return -EINVAL;
 
         return 0;
 }
@@ -643,6 +580,8 @@ int
 vc4_simulator_ioctl(int fd, unsigned long request, void *args)
 {
         switch (request) {
+        case DRM_IOCTL_VC4_SUBMIT_CL:
+                return vc4_simulator_submit_cl_ioctl(fd, args);
         case DRM_IOCTL_VC4_CREATE_BO:
                 return vc4_simulator_create_bo_ioctl(fd, args);
         case DRM_IOCTL_VC4_CREATE_SHADER_BO:
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_simulator_validate.h b/lib/mesa/src/gallium/drivers/vc4/vc4_simulator_validate.h
index d507b5fb6..e2777cd54 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_simulator_validate.h
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_simulator_validate.h
@@ -94,7 +94,6 @@ struct drm_gem_cma_object {
 
 struct drm_vc4_bo {
         struct drm_gem_cma_object base;
-        struct vc4_bo *bo;
         struct vc4_validated_shader_info *validated_shader;
         struct list_head unref_head;
 };
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c
index 167161fdf..d2a84bb35 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c
@@ -26,7 +26,7 @@
  * Helper functions from vc4_tiling.c that will be compiled for using NEON
  * assembly or not.
  *
- * If VC4_BUILD_NEON is set, then the functions will be suffixed with _neon.
+ * If V3D_BUILD_NEON is set, then the functions will be suffixed with _neon.
  * They will only use NEON assembly if __ARM_ARCH is also set, to keep the x86
  * sim build working.
  */
@@ -34,8 +34,9 @@
 #include <string.h>
 #include "pipe/p_state.h"
 #include "vc4_tiling.h"
+#include "broadcom/common/v3d_cpu_tiling.h"
 
-#ifdef VC4_BUILD_NEON
+#ifdef V3D_BUILD_NEON
 #define NEON_TAG(x) x ## _neon
 #else
 #define NEON_TAG(x) x ## _base
@@ -63,217 +64,6 @@ vc4_utile_stride(int cpp)
         }
 }
 
-static void
-vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
-{
-        uint32_t gpu_stride = vc4_utile_stride(cpp);
-#if defined(VC4_BUILD_NEON) && defined(PIPE_ARCH_ARM)
-        if (gpu_stride == 8) {
-                __asm__ volatile (
-                        /* Load from the GPU in one shot, no interleave, to
-                         * d0-d7.
-                         */
-                        "vldm %[gpu], {q0, q1, q2, q3}\n"
-                        /* Store each 8-byte line to cpu-side destination,
-                         * incrementing it by the stride each time.
-                         */
-                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
-                        "vst1.8 d1, [%[cpu]], %[cpu_stride]\n"
-                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
-                        "vst1.8 d3, [%[cpu]], %[cpu_stride]\n"
-                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
-                        "vst1.8 d5, [%[cpu]], %[cpu_stride]\n"
-                        "vst1.8 d6, [%[cpu]], %[cpu_stride]\n"
-                        "vst1.8 d7, [%[cpu]]\n"
-                        : [cpu]         "+r"(cpu)
-                        : [gpu]         "r"(gpu),
-                          [cpu_stride]  "r"(cpu_stride)
-                        : "q0", "q1", "q2", "q3");
-        } else {
-                assert(gpu_stride == 16);
-                void *cpu2 = cpu + 8;
-                __asm__ volatile (
-                        /* Load from the GPU in one shot, no interleave, to
-                         * d0-d7.
-                         */
-                        "vldm %[gpu], {q0, q1, q2, q3};\n"
-                        /* Store each 16-byte line in 2 parts to the cpu-side
-                         * destination.  (vld1 can only store one d-register
-                         * at a time).
-                         */
-                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
-                        "vst1.8 d1, [%[cpu2]],%[cpu_stride]\n"
-                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
-                        "vst1.8 d3, [%[cpu2]],%[cpu_stride]\n"
-                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
-                        "vst1.8 d5, [%[cpu2]],%[cpu_stride]\n"
-                        "vst1.8 d6, [%[cpu]]\n"
-                        "vst1.8 d7, [%[cpu2]]\n"
-                        : [cpu]         "+r"(cpu),
-                          [cpu2]        "+r"(cpu2)
-                        : [gpu]         "r"(gpu),
-                          [cpu_stride]  "r"(cpu_stride)
-                        : "q0", "q1", "q2", "q3");
-        }
-#elif defined (PIPE_ARCH_AARCH64)
-	if (gpu_stride == 8) {
-                __asm__ volatile (
-                        /* Load from the GPU in one shot, no interleave, to
-                         * d0-d7.
-                         */
-                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
-                        /* Store each 8-byte line to cpu-side destination,
-                         * incrementing it by the stride each time.
-                         */
-                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
-                        "st1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
-                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
-                        "st1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
-                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
-                        "st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
-                        "st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
-                        "st1 {v3.D}[1], [%[cpu]]\n"
-                        : [cpu]         "+r"(cpu)
-                        : [gpu]         "r"(gpu),
-                          [cpu_stride]  "r"(cpu_stride)
-                        : "v0", "v1", "v2", "v3");
-        } else {
-                assert(gpu_stride == 16);
-                void *cpu2 = cpu + 8;
-                __asm__ volatile (
-                        /* Load from the GPU in one shot, no interleave, to
-                         * d0-d7.
-                         */
-                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
-                        /* Store each 16-byte line in 2 parts to the cpu-side
-                         * destination.  (vld1 can only store one d-register
-                         * at a time).
-                         */
-                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
-                        "st1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
-                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
-                        "st1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
-                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
-                        "st1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
-                        "st1 {v3.D}[0], [%[cpu]]\n"
-                        "st1 {v3.D}[1], [%[cpu2]]\n"
-                        : [cpu]         "+r"(cpu),
-                          [cpu2]        "+r"(cpu2)
-                        : [gpu]         "r"(gpu),
-                          [cpu_stride]  "r"(cpu_stride)
-                        : "v0", "v1", "v2", "v3");
-        }
-#else
-        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
-                memcpy(cpu, gpu + gpu_offset, gpu_stride);
-                cpu += cpu_stride;
-        }
-#endif
-}
-
-static void
-vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
-{
-        uint32_t gpu_stride = vc4_utile_stride(cpp);
-
-#if defined(VC4_BUILD_NEON) && defined(PIPE_ARCH_ARM)
-        if (gpu_stride == 8) {
-                __asm__ volatile (
-                        /* Load each 8-byte line from cpu-side source,
-                         * incrementing it by the stride each time.
-                         */
-                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
-                        "vld1.8 d1, [%[cpu]], %[cpu_stride]\n"
-                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
-                        "vld1.8 d3, [%[cpu]], %[cpu_stride]\n"
-                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
-                        "vld1.8 d5, [%[cpu]], %[cpu_stride]\n"
-                        "vld1.8 d6, [%[cpu]], %[cpu_stride]\n"
-                        "vld1.8 d7, [%[cpu]]\n"
-                        /* Load from the GPU in one shot, no interleave, to
-                         * d0-d7.
-                         */
-                        "vstm %[gpu], {q0, q1, q2, q3}\n"
-                        : [cpu]         "+r"(cpu)
-                        : [gpu]         "r"(gpu),
-                          [cpu_stride]  "r"(cpu_stride)
-                        : "q0", "q1", "q2", "q3");
-        } else {
-                assert(gpu_stride == 16);
-                void *cpu2 = cpu + 8;
-                __asm__ volatile (
-                        /* Load each 16-byte line in 2 parts from the cpu-side
-                         * destination.  (vld1 can only store one d-register
-                         * at a time).
-                         */
-                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
-                        "vld1.8 d1, [%[cpu2]],%[cpu_stride]\n"
-                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
-                        "vld1.8 d3, [%[cpu2]],%[cpu_stride]\n"
-                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
-                        "vld1.8 d5, [%[cpu2]],%[cpu_stride]\n"
-                        "vld1.8 d6, [%[cpu]]\n"
-                        "vld1.8 d7, [%[cpu2]]\n"
-                        /* Store to the GPU in one shot, no interleave. */
-                        "vstm %[gpu], {q0, q1, q2, q3}\n"
-                        : [cpu]         "+r"(cpu),
-                          [cpu2]        "+r"(cpu2)
-                        : [gpu]         "r"(gpu),
-                          [cpu_stride]  "r"(cpu_stride)
-                        : "q0", "q1", "q2", "q3");
-        }
-#elif defined (PIPE_ARCH_AARCH64)
-	if (gpu_stride == 8) {
-                __asm__ volatile (
-                        /* Load each 8-byte line from cpu-side source,
-                         * incrementing it by the stride each time.
-                         */
-                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
-                        "ld1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
-                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
-                        "ld1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
-                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
-                        "ld1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
-                        "ld1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
-                        "ld1 {v3.D}[1], [%[cpu]]\n"
-                        /* Store to the GPU in one shot, no interleave. */
-                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
-                        : [cpu]         "+r"(cpu)
-                        : [gpu]         "r"(gpu),
-                          [cpu_stride]  "r"(cpu_stride)
-                        : "v0", "v1", "v2", "v3");
-        } else {
-                assert(gpu_stride == 16);
-                void *cpu2 = cpu + 8;
-                __asm__ volatile (
-                        /* Load each 16-byte line in 2 parts from the cpu-side
-                         * destination.  (vld1 can only store one d-register
-                         * at a time).
-                         */
-                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
-                        "ld1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
-                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
-                        "ld1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
-                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
-                        "ld1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
-                        "ld1 {v3.D}[0], [%[cpu]]\n"
-                        "ld1 {v3.D}[1], [%[cpu2]]\n"
-                        /* Store to the GPU in one shot, no interleave. */
-                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
-                        : [cpu]         "+r"(cpu),
-                          [cpu2]        "+r"(cpu2)
-                        : [gpu]         "r"(gpu),
-                          [cpu_stride]  "r"(cpu_stride)
-                        : "v0", "v1", "v2", "v3");
-        }
-#else
-        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
-                memcpy(gpu + gpu_offset, cpu, gpu_stride);
-                cpu += cpu_stride;
-        }
-#endif
-
-}
 /**
  * Returns the X value into the address bits for LT tiling.
  *
@@ -349,6 +139,7 @@ vc4_lt_image_aligned(void *gpu, uint32_t gpu_stride,
 {
         uint32_t utile_w = vc4_utile_width(cpp);
         uint32_t utile_h = vc4_utile_height(cpp);
+        uint32_t utile_stride = vc4_utile_stride(cpp);
         uint32_t xstart = box->x;
         uint32_t ystart = box->y;
 
@@ -357,15 +148,17 @@ vc4_lt_image_aligned(void *gpu, uint32_t gpu_stride,
                         void *gpu_tile = gpu + ((ystart + y) * gpu_stride +
                                                 (xstart + x) * 64 / utile_w);
                         if (to_cpu) {
-                                vc4_load_utile(cpu + (cpu_stride * y +
+                                v3d_load_utile(cpu + (cpu_stride * y +
                                                       x * cpp),
+                                               cpu_stride,
                                                gpu_tile,
-                                               cpu_stride, cpp);
+                                               utile_stride);
                         } else {
-                                vc4_store_utile(gpu_tile,
+                                v3d_store_utile(gpu_tile,
+                                                utile_stride,
                                                 cpu + (cpu_stride * y +
                                                        x * cpp),
-                                                cpu_stride, cpp);
+                                                cpu_stride);
                         }
                 }
         }
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt_neon.c b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt_neon.c
index 7ba66ae4c..9efec3799 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt_neon.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt_neon.c
@@ -26,5 +26,5 @@
  * single file.
  */
 
-#define VC4_BUILD_NEON
+#define V3D_BUILD_NEON
 #include "vc4_tiling_lt.c"
author	Jonathan Gray <jsg@cvs.openbsd.org>	2019-05-23 05:33:34 +0000
committer	Jonathan Gray <jsg@cvs.openbsd.org>	2019-05-23 05:33:34 +0000
commit	9886815a25d84be79f51e65ebd8e458bb5d26ca8 (patch)
tree	a65edf018dd992543337433f7303fb29a6c8e8cf /lib/mesa/src/gallium/drivers/vc4
parent	e2a3acb64af2657b1181806818eacad061103c23 (diff)