diff options
author | Jonathan Gray <jsg@cvs.openbsd.org> | 2019-05-23 05:33:34 +0000 |
---|---|---|
committer | Jonathan Gray <jsg@cvs.openbsd.org> | 2019-05-23 05:33:34 +0000 |
commit | 9886815a25d84be79f51e65ebd8e458bb5d26ca8 (patch) | |
tree | a65edf018dd992543337433f7303fb29a6c8e8cf /lib/mesa/src/gallium/drivers/vc4 | |
parent | e2a3acb64af2657b1181806818eacad061103c23 (diff) |
Merge Mesa 19.0.5
Diffstat (limited to 'lib/mesa/src/gallium/drivers/vc4')
-rw-r--r-- | lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c | 13 | ||||
-rw-r--r-- | lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h | 6 | ||||
-rw-r--r-- | lib/mesa/src/gallium/drivers/vc4/vc4_context.c | 13 | ||||
-rw-r--r-- | lib/mesa/src/gallium/drivers/vc4/vc4_context.h | 9 | ||||
-rw-r--r-- | lib/mesa/src/gallium/drivers/vc4/vc4_job.c | 6 | ||||
-rw-r--r-- | lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_blend.c | 40 | ||||
-rw-r--r-- | lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_io.c | 3 | ||||
-rw-r--r-- | lib/mesa/src/gallium/drivers/vc4/vc4_program.c | 59 | ||||
-rw-r--r-- | lib/mesa/src/gallium/drivers/vc4/vc4_resource.c | 13 | ||||
-rw-r--r-- | lib/mesa/src/gallium/drivers/vc4/vc4_screen.c | 3 | ||||
-rw-r--r-- | lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c | 187 | ||||
-rw-r--r-- | lib/mesa/src/gallium/drivers/vc4/vc4_simulator_validate.h | 1 | ||||
-rw-r--r-- | lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c | 227 | ||||
-rw-r--r-- | lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt_neon.c | 2 |
14 files changed, 147 insertions, 435 deletions
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c b/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c index 54f9d9c26..716ca50ea 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c @@ -386,7 +386,6 @@ vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time) static struct vc4_bo * vc4_bo_open_handle(struct vc4_screen *screen, - uint32_t winsys_stride, uint32_t handle, uint32_t size) { struct vc4_bo *bo; @@ -410,8 +409,7 @@ vc4_bo_open_handle(struct vc4_screen *screen, bo->private = false; #ifdef USE_VC4_SIMULATOR - vc4_simulator_open_from_handle(screen->fd, winsys_stride, - bo->handle, bo->size); + vc4_simulator_open_from_handle(screen->fd, bo->handle, bo->size); bo->map = malloc(bo->size); #endif @@ -423,8 +421,7 @@ done: } struct vc4_bo * -vc4_bo_open_name(struct vc4_screen *screen, uint32_t name, - uint32_t winsys_stride) +vc4_bo_open_name(struct vc4_screen *screen, uint32_t name) { struct drm_gem_open o = { .name = name @@ -436,11 +433,11 @@ vc4_bo_open_name(struct vc4_screen *screen, uint32_t name, return NULL; } - return vc4_bo_open_handle(screen, winsys_stride, o.handle, o.size); + return vc4_bo_open_handle(screen, o.handle, o.size); } struct vc4_bo * -vc4_bo_open_dmabuf(struct vc4_screen *screen, int fd, uint32_t winsys_stride) +vc4_bo_open_dmabuf(struct vc4_screen *screen, int fd) { uint32_t handle; int ret = drmPrimeFDToHandle(screen->fd, fd, &handle); @@ -457,7 +454,7 @@ vc4_bo_open_dmabuf(struct vc4_screen *screen, int fd, uint32_t winsys_stride) return NULL; } - return vc4_bo_open_handle(screen, winsys_stride, handle, size); + return vc4_bo_open_handle(screen, handle, size); } int diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h b/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h index 9fa477442..30a388ee5 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h @@ -66,10 +66,8 @@ struct vc4_bo *vc4_bo_alloc_shader(struct vc4_screen *screen, const void *data, uint32_t size); void vc4_bo_last_unreference(struct vc4_bo *bo); void vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time); -struct vc4_bo *vc4_bo_open_name(struct vc4_screen *screen, uint32_t name, - uint32_t winsys_stride); -struct vc4_bo *vc4_bo_open_dmabuf(struct vc4_screen *screen, int fd, - uint32_t winsys_stride); +struct vc4_bo *vc4_bo_open_name(struct vc4_screen *screen, uint32_t name); +struct vc4_bo *vc4_bo_open_dmabuf(struct vc4_screen *screen, int fd); bool vc4_bo_flink(struct vc4_bo *bo, uint32_t *name); int vc4_bo_get_dmabuf(struct vc4_bo *bo); diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_context.c b/lib/mesa/src/gallium/drivers/vc4/vc4_context.c index ffd7d4c85..94969dcb1 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_context.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_context.c @@ -85,6 +85,18 @@ vc4_texture_barrier(struct pipe_context *pctx, unsigned flags) } static void +vc4_set_debug_callback(struct pipe_context *pctx, + const struct pipe_debug_callback *cb) +{ + struct vc4_context *vc4 = vc4_context(pctx); + + if (cb) + vc4->debug = *cb; + else + memset(&vc4->debug, 0, sizeof(vc4->debug)); +} + +static void vc4_invalidate_resource(struct pipe_context *pctx, struct pipe_resource *prsc) { struct vc4_context *vc4 = vc4_context(pctx); @@ -164,6 +176,7 @@ vc4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) pctx->priv = priv; pctx->destroy = vc4_context_destroy; pctx->flush = vc4_pipe_flush; + pctx->set_debug_callback = vc4_set_debug_callback; pctx->invalidate_resource = vc4_invalidate_resource; pctx->texture_barrier = vc4_texture_barrier; diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_context.h b/lib/mesa/src/gallium/drivers/vc4/vc4_context.h index ce8bcffac..1d3179c71 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_context.h +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_context.h @@ -405,6 +405,7 @@ struct vc4_context { struct pipe_viewport_state viewport; struct vc4_constbuf_stateobj constbuf[PIPE_SHADER_TYPES]; struct vc4_vertexbuf_stateobj vertexbuf; + struct pipe_debug_callback debug; struct vc4_hwperfmon *perfmon; /** @} */ @@ -451,6 +452,8 @@ struct vc4_depth_stencil_alpha_state { #define perf_debug(...) do { \ if (unlikely(vc4_debug & VC4_DEBUG_PERF)) \ fprintf(stderr, __VA_ARGS__); \ + if (unlikely(vc4->debug.debug_message)) \ + pipe_debug_message(&vc4->debug, PERF_INFO, __VA_ARGS__); \ } while (0) static inline struct vc4_context * @@ -486,12 +489,8 @@ void vc4_program_fini(struct pipe_context *pctx); void vc4_query_init(struct pipe_context *pctx); void vc4_simulator_init(struct vc4_screen *screen); void vc4_simulator_destroy(struct vc4_screen *screen); -int vc4_simulator_flush(struct vc4_context *vc4, - struct drm_vc4_submit_cl *args, - struct vc4_job *job); int vc4_simulator_ioctl(int fd, unsigned long request, void *arg); -void vc4_simulator_open_from_handle(int fd, uint32_t winsys_stride, - int handle, uint32_t size); +void vc4_simulator_open_from_handle(int fd, int handle, uint32_t size); static inline int vc4_ioctl(int fd, unsigned long request, void *arg) diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_job.c b/lib/mesa/src/gallium/drivers/vc4/vc4_job.c index f38c46475..2b87a00df 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_job.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_job.c @@ -492,11 +492,7 @@ vc4_job_submit(struct vc4_context *vc4, struct vc4_job *job) if (!(vc4_debug & VC4_DEBUG_NORAST)) { int ret; -#ifndef USE_VC4_SIMULATOR - ret = drmIoctl(vc4->fd, DRM_IOCTL_VC4_SUBMIT_CL, &submit); -#else - ret = vc4_simulator_flush(vc4, &submit, job); -#endif + ret = vc4_ioctl(vc4->fd, DRM_IOCTL_VC4_SUBMIT_CL, &submit); static bool warned = false; if (ret && !warned) { fprintf(stderr, "Draw call returned %s. " diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_blend.c b/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_blend.c index 60eccb4fc..ff6268f47 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_blend.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_blend.c @@ -42,6 +42,7 @@ #include "util/u_format.h" #include "vc4_qir.h" #include "compiler/nir/nir_builder.h" +#include "compiler/nir/nir_format_convert.h" #include "vc4_context.h" static bool @@ -67,37 +68,6 @@ vc4_nir_get_dst_color(nir_builder *b, int sample) return &load->dest.ssa; } -static nir_ssa_def * -vc4_nir_srgb_decode(nir_builder *b, nir_ssa_def *srgb) -{ - nir_ssa_def *is_low = nir_flt(b, srgb, nir_imm_float(b, 0.04045)); - nir_ssa_def *low = nir_fmul(b, srgb, nir_imm_float(b, 1.0 / 12.92)); - nir_ssa_def *high = nir_fpow(b, - nir_fmul(b, - nir_fadd(b, srgb, - nir_imm_float(b, 0.055)), - nir_imm_float(b, 1.0 / 1.055)), - nir_imm_float(b, 2.4)); - - return nir_bcsel(b, is_low, low, high); -} - -static nir_ssa_def * -vc4_nir_srgb_encode(nir_builder *b, nir_ssa_def *linear) -{ - nir_ssa_def *is_low = nir_flt(b, linear, nir_imm_float(b, 0.0031308)); - nir_ssa_def *low = nir_fmul(b, linear, nir_imm_float(b, 12.92)); - nir_ssa_def *high = nir_fsub(b, - nir_fmul(b, - nir_imm_float(b, 1.055), - nir_fpow(b, - linear, - nir_imm_float(b, 0.41666))), - nir_imm_float(b, 0.055)); - - return nir_bcsel(b, is_low, low, high); -} - static nir_ssa_def * vc4_blend_channel_f(nir_builder *b, nir_ssa_def **src, @@ -130,7 +100,7 @@ vc4_blend_channel_f(nir_builder *b, return nir_load_system_value(b, nir_intrinsic_load_blend_const_color_r_float + channel, - 0); + 0, 32); case PIPE_BLENDFACTOR_CONST_ALPHA: return nir_load_blend_const_color_a_float(b); case PIPE_BLENDFACTOR_ZERO: @@ -148,7 +118,7 @@ vc4_blend_channel_f(nir_builder *b, nir_load_system_value(b, nir_intrinsic_load_blend_const_color_r_float + channel, - 0)); + 0, 32)); case PIPE_BLENDFACTOR_INV_CONST_ALPHA: return nir_fsub(b, nir_imm_float(b, 1.0), nir_load_blend_const_color_a_float(b)); @@ -501,14 +471,14 @@ vc4_nir_blend_pipeline(struct vc4_compile *c, nir_builder *b, nir_ssa_def *src, /* Turn dst color to linear. */ for (int i = 0; i < 3; i++) - dst_color[i] = vc4_nir_srgb_decode(b, dst_color[i]); + dst_color[i] = nir_format_srgb_to_linear(b, dst_color[i]); nir_ssa_def *blend_color[4]; vc4_do_blending_f(c, b, blend_color, src_color, dst_color); /* sRGB encode the output color */ for (int i = 0; i < 3; i++) - blend_color[i] = vc4_nir_srgb_encode(b, blend_color[i]); + blend_color[i] = nir_format_linear_to_srgb(b, blend_color[i]); packed_color = vc4_nir_swizzle_and_pack(c, b, blend_color); } else { diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_io.c index b7969a562..fc2baee1b 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_io.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_io.c @@ -330,7 +330,8 @@ vc4_nir_lower_uniform(struct vc4_compile *c, nir_builder *b, nir_intrinsic_instr *intr_comp = nir_intrinsic_instr_create(c->s, intr->intrinsic); intr_comp->num_components = 1; - nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, 32, NULL); + nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, + intr->dest.ssa.bit_size, NULL); /* Convert the uniform offset to bytes. If it happens * to be a constant, constant-folding will clean up diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_program.c b/lib/mesa/src/gallium/drivers/vc4/vc4_program.c index bc9bd76ae..8f1e561c4 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_program.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_program.c @@ -1004,24 +1004,24 @@ ntq_emit_comparison(struct vc4_compile *c, struct qreg *dest, enum qpu_cond cond; switch (compare_instr->op) { - case nir_op_feq: - case nir_op_ieq: + case nir_op_feq32: + case nir_op_ieq32: case nir_op_seq: cond = QPU_COND_ZS; break; - case nir_op_fne: - case nir_op_ine: + case nir_op_fne32: + case nir_op_ine32: case nir_op_sne: cond = QPU_COND_ZC; break; - case nir_op_fge: - case nir_op_ige: - case nir_op_uge: + case nir_op_fge32: + case nir_op_ige32: + case nir_op_uge32: case nir_op_sge: cond = QPU_COND_NC; break; - case nir_op_flt: - case nir_op_ilt: + case nir_op_flt32: + case nir_op_ilt32: case nir_op_slt: cond = QPU_COND_NS; break; @@ -1048,7 +1048,7 @@ ntq_emit_comparison(struct vc4_compile *c, struct qreg *dest, qir_uniform_f(c, 1.0), qir_uniform_f(c, 0.0)); break; - case nir_op_bcsel: + case nir_op_b32csel: *dest = qir_SEL(c, cond, ntq_get_alu_src(c, sel_instr, 1), ntq_get_alu_src(c, sel_instr, 2)); @@ -1208,14 +1208,14 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) case nir_op_u2f32: result = qir_ITOF(c, src[0]); break; - case nir_op_b2f: + case nir_op_b2f32: result = qir_AND(c, src[0], qir_uniform_f(c, 1.0)); break; - case nir_op_b2i: + case nir_op_b2i32: result = qir_AND(c, src[0], qir_uniform_ui(c, 1)); break; - case nir_op_i2b: - case nir_op_f2b: + case nir_op_i2b32: + case nir_op_f2b32: qir_SF(c, src[0]); result = qir_MOV(c, qir_SEL(c, QPU_COND_ZC, qir_uniform_ui(c, ~0), @@ -1264,21 +1264,21 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) case nir_op_sne: case nir_op_sge: case nir_op_slt: - case nir_op_feq: - case nir_op_fne: - case nir_op_fge: - case nir_op_flt: - case nir_op_ieq: - case nir_op_ine: - case nir_op_ige: - case nir_op_uge: - case nir_op_ilt: + case nir_op_feq32: + case nir_op_fne32: + case nir_op_fge32: + case nir_op_flt32: + case nir_op_ieq32: + case nir_op_ine32: + case nir_op_ige32: + case nir_op_uge32: + case nir_op_ilt32: if (!ntq_emit_comparison(c, &result, instr, instr)) { fprintf(stderr, "Bad comparison instruction\n"); } break; - case nir_op_bcsel: + case nir_op_b32csel: result = ntq_emit_bcsel(c, instr, src); break; case nir_op_fcsel: @@ -1591,14 +1591,14 @@ vc4_optimize_nir(struct nir_shader *s) NIR_PASS(progress, s, nir_opt_dce); NIR_PASS(progress, s, nir_opt_dead_cf); NIR_PASS(progress, s, nir_opt_cse); - NIR_PASS(progress, s, nir_opt_peephole_select, 8); + NIR_PASS(progress, s, nir_opt_peephole_select, 8, true); NIR_PASS(progress, s, nir_opt_algebraic); NIR_PASS(progress, s, nir_opt_constant_folding); NIR_PASS(progress, s, nir_opt_undef); NIR_PASS(progress, s, nir_opt_loop_unroll, nir_var_shader_in | nir_var_shader_out | - nir_var_local); + nir_var_function_temp); } while (progress); } @@ -2363,7 +2363,8 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage, if (stage == QSTAGE_FRAG) { NIR_PASS_V(c->s, nir_lower_clip_fs, c->key->ucp_enables); } else { - NIR_PASS_V(c->s, nir_lower_clip_vs, c->key->ucp_enables); + NIR_PASS_V(c->s, nir_lower_clip_vs, + c->key->ucp_enables, false); NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out); } @@ -2384,6 +2385,8 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage, vc4_optimize_nir(c->s); + NIR_PASS_V(c->s, nir_lower_bool_to_int32); + NIR_PASS_V(c->s, nir_convert_from_ssa, true); if (vc4_debug & VC4_DEBUG_SHADERDB) { @@ -2514,7 +2517,7 @@ vc4_shader_state_create(struct pipe_context *pctx, vc4_optimize_nir(s); - NIR_PASS_V(s, nir_remove_dead_variables, nir_var_local); + NIR_PASS_V(s, nir_remove_dead_variables, nir_var_function_temp); /* Garbage collect dead instructions */ nir_sweep(s); diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_resource.c b/lib/mesa/src/gallium/drivers/vc4/vc4_resource.c index 41e6ec5c1..a4d1b903b 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_resource.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_resource.c @@ -319,8 +319,10 @@ vc4_resource_get_handle(struct pipe_screen *pscreen, return vc4_bo_flink(rsc->bo, &whandle->handle); case WINSYS_HANDLE_TYPE_KMS: - if (screen->ro && renderonly_get_handle(rsc->scanout, whandle)) - return TRUE; + if (screen->ro) { + assert(rsc->scanout); + return renderonly_get_handle(rsc->scanout, whandle); + } whandle->handle = rsc->bo->handle; return TRUE; case WINSYS_HANDLE_TYPE_FD: @@ -622,12 +624,10 @@ vc4_resource_from_handle(struct pipe_screen *pscreen, switch (whandle->type) { case WINSYS_HANDLE_TYPE_SHARED: - rsc->bo = vc4_bo_open_name(screen, - whandle->handle, whandle->stride); + rsc->bo = vc4_bo_open_name(screen, whandle->handle); break; case WINSYS_HANDLE_TYPE_FD: - rsc->bo = vc4_bo_open_dmabuf(screen, - whandle->handle, whandle->stride); + rsc->bo = vc4_bo_open_dmabuf(screen, whandle->handle); break; default: fprintf(stderr, @@ -1013,6 +1013,7 @@ void vc4_update_shadow_baselevel_texture(struct pipe_context *pctx, struct pipe_sampler_view *pview) { + struct vc4_context *vc4 = vc4_context(pctx); struct vc4_sampler_view *view = vc4_sampler_view(pview); struct vc4_resource *shadow = vc4_resource(view->texture); struct vc4_resource *orig = vc4_resource(pview->texture); diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_screen.c b/lib/mesa/src/gallium/drivers/vc4/vc4_screen.c index e7f7c82c2..acb4a1feb 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_screen.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_screen.c @@ -178,6 +178,9 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) /* Note: Not supported in hardware, just faking it. */ return 5; + case PIPE_CAP_MAX_VARYINGS: + return 8; + case PIPE_CAP_VENDOR_ID: return 0x14E4; case PIPE_CAP_ACCELERATED: diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c b/lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c index 37c098a04..2ce5a7596 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c @@ -99,10 +99,13 @@ struct vc4_simulator_bo { /** Area for this BO within sim_state->mem */ struct mem_block *block; - void *winsys_map; - uint32_t winsys_stride; int handle; + + /* Mapping of the underlying GEM object that we copy in/out of + * simulator memory. + */ + void *gem_vaddr; }; static void * @@ -143,6 +146,7 @@ vc4_create_simulator_bo(int fd, int handle, unsigned size) sim_bo->file = file; sim_bo->handle = handle; + /* Allocate space for the buffer in simulator memory. */ mtx_lock(&sim_state.mutex); sim_bo->block = u_mmAllocMem(sim_state.heap, size + 4, PAGE_ALIGN2, 0); mtx_unlock(&sim_state.mutex); @@ -162,6 +166,25 @@ vc4_create_simulator_bo(int fd, int handle, unsigned size) mtx_lock(&sim_state.mutex); _mesa_hash_table_insert(file->bo_map, int_to_key(handle), bo); mtx_unlock(&sim_state.mutex); + + /* Map the GEM buffer for copy in/out to the simulator. */ + struct drm_mode_map_dumb map = { + .handle = handle, + }; + int ret = drmIoctl(fd, DRM_IOCTL_MODE_MAP_DUMB, &map); + if (ret) { + fprintf(stderr, "Failed to get MMAP offset: %d\n", + errno); + abort(); + } + sim_bo->gem_vaddr = mmap(NULL, obj->base.size, + PROT_READ | PROT_WRITE, MAP_SHARED, + fd, map.offset); + if (sim_bo->gem_vaddr == MAP_FAILED) { + fprintf(stderr, "mmap of bo %d (offset 0x%016llx, size %d) failed\n", + handle, (long long)map.offset, (int)obj->base.size); + abort(); + } } return sim_bo; @@ -174,16 +197,19 @@ vc4_free_simulator_bo(struct vc4_simulator_bo *sim_bo) struct drm_vc4_bo *bo = &sim_bo->base; struct drm_gem_cma_object *obj = &bo->base; - if (sim_bo->winsys_map) - munmap(sim_bo->winsys_map, obj->base.size); + if (bo->validated_shader) { + free(bo->validated_shader->texture_samples); + free(bo->validated_shader); + } + + if (sim_bo->gem_vaddr) + munmap(sim_bo->gem_vaddr, obj->base.size); mtx_lock(&sim_state.mutex); u_mmFreeMem(sim_bo->block); if (sim_bo->handle) { - struct hash_entry *entry = - _mesa_hash_table_search(sim_file->bo_map, - int_to_key(sim_bo->handle)); - _mesa_hash_table_remove(sim_file->bo_map, entry); + _mesa_hash_table_remove_key(sim_file->bo_map, + int_to_key(sim_bo->handle)); } mtx_unlock(&sim_state.mutex); ralloc_free(sim_bo); @@ -210,41 +236,23 @@ drm_gem_cma_create(struct drm_device *dev, size_t size) } static int -vc4_simulator_pin_bos(struct drm_device *dev, struct vc4_job *job, +vc4_simulator_pin_bos(struct vc4_simulator_file *file, struct vc4_exec_info *exec) { - int fd = dev->screen->fd; - struct vc4_simulator_file *file = vc4_get_simulator_file_for_fd(fd); struct drm_vc4_submit_cl *args = exec->args; - struct vc4_bo **bos = job->bo_pointers.base; + uint32_t *bo_handles = (uint32_t *)(uintptr_t)args->bo_handles; exec->bo_count = args->bo_handle_count; exec->bo = calloc(exec->bo_count, sizeof(void *)); for (int i = 0; i < exec->bo_count; i++) { - struct vc4_bo *bo = bos[i]; struct vc4_simulator_bo *sim_bo = - vc4_get_simulator_bo(file, bo->handle); + vc4_get_simulator_bo(file, bo_handles[i]); struct drm_vc4_bo *drm_bo = &sim_bo->base; struct drm_gem_cma_object *obj = &drm_bo->base; - drm_bo->bo = bo; -#if 0 - fprintf(stderr, "bo hindex %d: %s\n", i, bo->name); -#endif - - vc4_bo_map(bo); - memcpy(obj->vaddr, bo->map, bo->size); + memcpy(obj->vaddr, sim_bo->gem_vaddr, obj->base.size); exec->bo[i] = obj; - - /* The kernel does this validation at shader create ioctl - * time. - */ - if (strcmp(bo->name, "code") == 0) { - drm_bo->validated_shader = vc4_validate_shader(obj); - if (!drm_bo->validated_shader) - abort(); - } } return 0; } @@ -255,16 +263,13 @@ vc4_simulator_unpin_bos(struct vc4_exec_info *exec) for (int i = 0; i < exec->bo_count; i++) { struct drm_gem_cma_object *obj = exec->bo[i]; struct drm_vc4_bo *drm_bo = to_vc4_bo(&obj->base); - struct vc4_bo *bo = drm_bo->bo; + struct vc4_simulator_bo *sim_bo = + (struct vc4_simulator_bo *)drm_bo; assert(*(uint32_t *)(obj->vaddr + obj->base.size) == BO_SENTINEL); - memcpy(bo->map, obj->vaddr, bo->size); - - if (drm_bo->validated_shader) { - free(drm_bo->validated_shader->texture_samples); - free(drm_bo->validated_shader); - } + if (sim_bo->gem_vaddr) + memcpy(sim_bo->gem_vaddr, obj->vaddr, obj->base.size); } free(exec->bo); @@ -359,19 +364,10 @@ vc4_dump_to_file(struct vc4_exec_info *exec) fclose(f); } -int -vc4_simulator_flush(struct vc4_context *vc4, - struct drm_vc4_submit_cl *args, struct vc4_job *job) +static int +vc4_simulator_submit_cl_ioctl(int fd, struct drm_vc4_submit_cl *args) { - struct vc4_screen *screen = vc4->screen; - int fd = screen->fd; struct vc4_simulator_file *file = vc4_get_simulator_file_for_fd(fd); - struct vc4_surface *csurf = vc4_surface(vc4->framebuffer.cbufs[0]); - struct vc4_resource *ctex = csurf ? vc4_resource(csurf->base.texture) : NULL; - struct vc4_simulator_bo *csim_bo = ctex ? vc4_get_simulator_bo(file, ctex->bo->handle) : NULL; - uint32_t winsys_stride = ctex ? csim_bo->winsys_stride : 0; - uint32_t sim_stride = ctex ? ctex->slices[0].stride : 0; - uint32_t row_len = MIN2(sim_stride, winsys_stride); struct vc4_exec_info exec; struct drm_device *dev = &file->dev; int ret; @@ -379,25 +375,9 @@ vc4_simulator_flush(struct vc4_context *vc4, memset(&exec, 0, sizeof(exec)); list_inithead(&exec.unref_list); - if (ctex && csim_bo->winsys_map) { -#if 0 - fprintf(stderr, "%dx%d %d %d %d\n", - ctex->base.b.width0, ctex->base.b.height0, - winsys_stride, - sim_stride, - ctex->bo->size); -#endif - - for (int y = 0; y < ctex->base.height0; y++) { - memcpy(ctex->bo->map + y * sim_stride, - csim_bo->winsys_map + y * winsys_stride, - row_len); - } - } - exec.args = args; - ret = vc4_simulator_pin_bos(dev, job, &exec); + ret = vc4_simulator_pin_bos(file, &exec); if (ret) return ret; @@ -448,65 +428,19 @@ vc4_simulator_flush(struct vc4_context *vc4, vc4_free_simulator_bo(sim_bo); } - if (ctex && csim_bo->winsys_map) { - for (int y = 0; y < ctex->base.height0; y++) { - memcpy(csim_bo->winsys_map + y * winsys_stride, - ctex->bo->map + y * sim_stride, - row_len); - } - } - return 0; } /** - * Map the underlying GEM object from the real hardware GEM handle. - */ -static void * -vc4_simulator_map_winsys_bo(int fd, struct vc4_simulator_bo *sim_bo) -{ - struct drm_vc4_bo *bo = &sim_bo->base; - struct drm_gem_cma_object *obj = &bo->base; - int ret; - void *map; - - struct drm_mode_map_dumb map_dumb = { - .handle = sim_bo->handle, - }; - ret = drmIoctl(fd, DRM_IOCTL_MODE_MAP_DUMB, &map_dumb); - if (ret != 0) { - fprintf(stderr, "map ioctl failure\n"); - abort(); - } - - map = mmap(NULL, obj->base.size, PROT_READ | PROT_WRITE, MAP_SHARED, - fd, map_dumb.offset); - if (map == MAP_FAILED) { - fprintf(stderr, - "mmap of bo %d (offset 0x%016llx, size %d) failed\n", - sim_bo->handle, (long long)map_dumb.offset, - (int)obj->base.size); - abort(); - } - - return map; -} - -/** * Do fixups after a BO has been opened from a handle. * * This could be done at DRM_IOCTL_GEM_OPEN/DRM_IOCTL_GEM_PRIME_FD_TO_HANDLE * time, but we're still using drmPrimeFDToHandle() so we have this helper to * be called afterward instead. */ -void vc4_simulator_open_from_handle(int fd, uint32_t winsys_stride, - int handle, uint32_t size) +void vc4_simulator_open_from_handle(int fd, int handle, uint32_t size) { - struct vc4_simulator_bo *sim_bo = - vc4_create_simulator_bo(fd, handle, size); - - sim_bo->winsys_stride = winsys_stride; - sim_bo->winsys_map = vc4_simulator_map_winsys_bo(fd, sim_bo); + vc4_create_simulator_bo(fd, handle, size); } /** @@ -558,19 +492,22 @@ vc4_simulator_create_shader_bo_ioctl(int fd, args->handle = create.handle; - vc4_create_simulator_bo(fd, create.handle, args->size); + struct vc4_simulator_bo *sim_bo = + vc4_create_simulator_bo(fd, create.handle, args->size); + struct drm_vc4_bo *drm_bo = &sim_bo->base; + struct drm_gem_cma_object *obj = &drm_bo->base; - struct drm_mode_map_dumb map = { - .handle = create.handle - }; - ret = drmIoctl(fd, DRM_IOCTL_MODE_MAP_DUMB, &map); - if (ret) - return ret; + /* Copy into the simulator's BO for validation. */ + memcpy(obj->vaddr, (void *)(uintptr_t)args->data, args->size); + + /* Copy into the GEM BO to prevent the simulator_pin_bos() from + * smashing it. + */ + memcpy(sim_bo->gem_vaddr, (void *)(uintptr_t)args->data, args->size); - void *shader = mmap(NULL, args->size, PROT_READ | PROT_WRITE, MAP_SHARED, - fd, map.offset); - memcpy(shader, (void *)(uintptr_t)args->data, args->size); - munmap(shader, args->size); + drm_bo->validated_shader = vc4_validate_shader(obj); + if (!drm_bo->validated_shader) + return -EINVAL; return 0; } @@ -643,6 +580,8 @@ int vc4_simulator_ioctl(int fd, unsigned long request, void *args) { switch (request) { + case DRM_IOCTL_VC4_SUBMIT_CL: + return vc4_simulator_submit_cl_ioctl(fd, args); case DRM_IOCTL_VC4_CREATE_BO: return vc4_simulator_create_bo_ioctl(fd, args); case DRM_IOCTL_VC4_CREATE_SHADER_BO: diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_simulator_validate.h b/lib/mesa/src/gallium/drivers/vc4/vc4_simulator_validate.h index d507b5fb6..e2777cd54 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_simulator_validate.h +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_simulator_validate.h @@ -94,7 +94,6 @@ struct drm_gem_cma_object { struct drm_vc4_bo { struct drm_gem_cma_object base; - struct vc4_bo *bo; struct vc4_validated_shader_info *validated_shader; struct list_head unref_head; }; diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c index 167161fdf..d2a84bb35 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c @@ -26,7 +26,7 @@ * Helper functions from vc4_tiling.c that will be compiled for using NEON * assembly or not. * - * If VC4_BUILD_NEON is set, then the functions will be suffixed with _neon. + * If V3D_BUILD_NEON is set, then the functions will be suffixed with _neon. * They will only use NEON assembly if __ARM_ARCH is also set, to keep the x86 * sim build working. */ @@ -34,8 +34,9 @@ #include <string.h> #include "pipe/p_state.h" #include "vc4_tiling.h" +#include "broadcom/common/v3d_cpu_tiling.h" -#ifdef VC4_BUILD_NEON +#ifdef V3D_BUILD_NEON #define NEON_TAG(x) x ## _neon #else #define NEON_TAG(x) x ## _base @@ -63,217 +64,6 @@ vc4_utile_stride(int cpp) } } -static void -vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp) -{ - uint32_t gpu_stride = vc4_utile_stride(cpp); -#if defined(VC4_BUILD_NEON) && defined(PIPE_ARCH_ARM) - if (gpu_stride == 8) { - __asm__ volatile ( - /* Load from the GPU in one shot, no interleave, to - * d0-d7. - */ - "vldm %[gpu], {q0, q1, q2, q3}\n" - /* Store each 8-byte line to cpu-side destination, - * incrementing it by the stride each time. - */ - "vst1.8 d0, [%[cpu]], %[cpu_stride]\n" - "vst1.8 d1, [%[cpu]], %[cpu_stride]\n" - "vst1.8 d2, [%[cpu]], %[cpu_stride]\n" - "vst1.8 d3, [%[cpu]], %[cpu_stride]\n" - "vst1.8 d4, [%[cpu]], %[cpu_stride]\n" - "vst1.8 d5, [%[cpu]], %[cpu_stride]\n" - "vst1.8 d6, [%[cpu]], %[cpu_stride]\n" - "vst1.8 d7, [%[cpu]]\n" - : [cpu] "+r"(cpu) - : [gpu] "r"(gpu), - [cpu_stride] "r"(cpu_stride) - : "q0", "q1", "q2", "q3"); - } else { - assert(gpu_stride == 16); - void *cpu2 = cpu + 8; - __asm__ volatile ( - /* Load from the GPU in one shot, no interleave, to - * d0-d7. - */ - "vldm %[gpu], {q0, q1, q2, q3};\n" - /* Store each 16-byte line in 2 parts to the cpu-side - * destination. (vld1 can only store one d-register - * at a time). - */ - "vst1.8 d0, [%[cpu]], %[cpu_stride]\n" - "vst1.8 d1, [%[cpu2]],%[cpu_stride]\n" - "vst1.8 d2, [%[cpu]], %[cpu_stride]\n" - "vst1.8 d3, [%[cpu2]],%[cpu_stride]\n" - "vst1.8 d4, [%[cpu]], %[cpu_stride]\n" - "vst1.8 d5, [%[cpu2]],%[cpu_stride]\n" - "vst1.8 d6, [%[cpu]]\n" - "vst1.8 d7, [%[cpu2]]\n" - : [cpu] "+r"(cpu), - [cpu2] "+r"(cpu2) - : [gpu] "r"(gpu), - [cpu_stride] "r"(cpu_stride) - : "q0", "q1", "q2", "q3"); - } -#elif defined (PIPE_ARCH_AARCH64) - if (gpu_stride == 8) { - __asm__ volatile ( - /* Load from the GPU in one shot, no interleave, to - * d0-d7. - */ - "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n" - /* Store each 8-byte line to cpu-side destination, - * incrementing it by the stride each time. - */ - "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n" - "st1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n" - "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n" - "st1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n" - "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n" - "st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n" - "st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n" - "st1 {v3.D}[1], [%[cpu]]\n" - : [cpu] "+r"(cpu) - : [gpu] "r"(gpu), - [cpu_stride] "r"(cpu_stride) - : "v0", "v1", "v2", "v3"); - } else { - assert(gpu_stride == 16); - void *cpu2 = cpu + 8; - __asm__ volatile ( - /* Load from the GPU in one shot, no interleave, to - * d0-d7. - */ - "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n" - /* Store each 16-byte line in 2 parts to the cpu-side - * destination. (vld1 can only store one d-register - * at a time). - */ - "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n" - "st1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n" - "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n" - "st1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n" - "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n" - "st1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n" - "st1 {v3.D}[0], [%[cpu]]\n" - "st1 {v3.D}[1], [%[cpu2]]\n" - : [cpu] "+r"(cpu), - [cpu2] "+r"(cpu2) - : [gpu] "r"(gpu), - [cpu_stride] "r"(cpu_stride) - : "v0", "v1", "v2", "v3"); - } -#else - for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) { - memcpy(cpu, gpu + gpu_offset, gpu_stride); - cpu += cpu_stride; - } -#endif -} - -static void -vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp) -{ - uint32_t gpu_stride = vc4_utile_stride(cpp); - -#if defined(VC4_BUILD_NEON) && defined(PIPE_ARCH_ARM) - if (gpu_stride == 8) { - __asm__ volatile ( - /* Load each 8-byte line from cpu-side source, - * incrementing it by the stride each time. - */ - "vld1.8 d0, [%[cpu]], %[cpu_stride]\n" - "vld1.8 d1, [%[cpu]], %[cpu_stride]\n" - "vld1.8 d2, [%[cpu]], %[cpu_stride]\n" - "vld1.8 d3, [%[cpu]], %[cpu_stride]\n" - "vld1.8 d4, [%[cpu]], %[cpu_stride]\n" - "vld1.8 d5, [%[cpu]], %[cpu_stride]\n" - "vld1.8 d6, [%[cpu]], %[cpu_stride]\n" - "vld1.8 d7, [%[cpu]]\n" - /* Load from the GPU in one shot, no interleave, to - * d0-d7. - */ - "vstm %[gpu], {q0, q1, q2, q3}\n" - : [cpu] "+r"(cpu) - : [gpu] "r"(gpu), - [cpu_stride] "r"(cpu_stride) - : "q0", "q1", "q2", "q3"); - } else { - assert(gpu_stride == 16); - void *cpu2 = cpu + 8; - __asm__ volatile ( - /* Load each 16-byte line in 2 parts from the cpu-side - * destination. (vld1 can only store one d-register - * at a time). - */ - "vld1.8 d0, [%[cpu]], %[cpu_stride]\n" - "vld1.8 d1, [%[cpu2]],%[cpu_stride]\n" - "vld1.8 d2, [%[cpu]], %[cpu_stride]\n" - "vld1.8 d3, [%[cpu2]],%[cpu_stride]\n" - "vld1.8 d4, [%[cpu]], %[cpu_stride]\n" - "vld1.8 d5, [%[cpu2]],%[cpu_stride]\n" - "vld1.8 d6, [%[cpu]]\n" - "vld1.8 d7, [%[cpu2]]\n" - /* Store to the GPU in one shot, no interleave. */ - "vstm %[gpu], {q0, q1, q2, q3}\n" - : [cpu] "+r"(cpu), - [cpu2] "+r"(cpu2) - : [gpu] "r"(gpu), - [cpu_stride] "r"(cpu_stride) - : "q0", "q1", "q2", "q3"); - } -#elif defined (PIPE_ARCH_AARCH64) - if (gpu_stride == 8) { - __asm__ volatile ( - /* Load each 8-byte line from cpu-side source, - * incrementing it by the stride each time. - */ - "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n" - "ld1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n" - "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n" - "ld1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n" - "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n" - "ld1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n" - "ld1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n" - "ld1 {v3.D}[1], [%[cpu]]\n" - /* Store to the GPU in one shot, no interleave. */ - "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n" - : [cpu] "+r"(cpu) - : [gpu] "r"(gpu), - [cpu_stride] "r"(cpu_stride) - : "v0", "v1", "v2", "v3"); - } else { - assert(gpu_stride == 16); - void *cpu2 = cpu + 8; - __asm__ volatile ( - /* Load each 16-byte line in 2 parts from the cpu-side - * destination. (vld1 can only store one d-register - * at a time). - */ - "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n" - "ld1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n" - "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n" - "ld1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n" - "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n" - "ld1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n" - "ld1 {v3.D}[0], [%[cpu]]\n" - "ld1 {v3.D}[1], [%[cpu2]]\n" - /* Store to the GPU in one shot, no interleave. */ - "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n" - : [cpu] "+r"(cpu), - [cpu2] "+r"(cpu2) - : [gpu] "r"(gpu), - [cpu_stride] "r"(cpu_stride) - : "v0", "v1", "v2", "v3"); - } -#else - for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) { - memcpy(gpu + gpu_offset, cpu, gpu_stride); - cpu += cpu_stride; - } -#endif - -} /** * Returns the X value into the address bits for LT tiling. * @@ -349,6 +139,7 @@ vc4_lt_image_aligned(void *gpu, uint32_t gpu_stride, { uint32_t utile_w = vc4_utile_width(cpp); uint32_t utile_h = vc4_utile_height(cpp); + uint32_t utile_stride = vc4_utile_stride(cpp); uint32_t xstart = box->x; uint32_t ystart = box->y; @@ -357,15 +148,17 @@ vc4_lt_image_aligned(void *gpu, uint32_t gpu_stride, void *gpu_tile = gpu + ((ystart + y) * gpu_stride + (xstart + x) * 64 / utile_w); if (to_cpu) { - vc4_load_utile(cpu + (cpu_stride * y + + v3d_load_utile(cpu + (cpu_stride * y + x * cpp), + cpu_stride, gpu_tile, - cpu_stride, cpp); + utile_stride); } else { - vc4_store_utile(gpu_tile, + v3d_store_utile(gpu_tile, + utile_stride, cpu + (cpu_stride * y + x * cpp), - cpu_stride, cpp); + cpu_stride); } } } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt_neon.c b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt_neon.c index 7ba66ae4c..9efec3799 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt_neon.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt_neon.c @@ -26,5 +26,5 @@ * single file. */ -#define VC4_BUILD_NEON +#define V3D_BUILD_NEON #include "vc4_tiling_lt.c" |