diff options
Diffstat (limited to 'lib/mesa/src/gallium/drivers')
12 files changed, 148 insertions, 119 deletions
diff --git a/lib/mesa/src/gallium/drivers/etnaviv/etnaviv_context.c b/lib/mesa/src/gallium/drivers/etnaviv/etnaviv_context.c index 3038d210e..303dff583 100644 --- a/lib/mesa/src/gallium/drivers/etnaviv/etnaviv_context.c +++ b/lib/mesa/src/gallium/drivers/etnaviv/etnaviv_context.c @@ -60,6 +60,8 @@ etna_context_destroy(struct pipe_context *pctx) { struct etna_context *ctx = etna_context(pctx); + util_copy_framebuffer_state(&ctx->framebuffer_s, NULL); + if (ctx->primconvert) util_primconvert_destroy(ctx->primconvert); @@ -296,10 +298,10 @@ etna_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) if (DBG_ENABLED(ETNA_DBG_FLUSH_ALL)) pctx->flush(pctx, NULL, 0); - if (ctx->framebuffer.cbuf) - etna_resource(ctx->framebuffer.cbuf->texture)->seqno++; - if (ctx->framebuffer.zsbuf) - etna_resource(ctx->framebuffer.zsbuf->texture)->seqno++; + if (ctx->framebuffer_s.cbufs[0]) + etna_resource(ctx->framebuffer_s.cbufs[0]->texture)->seqno++; + if (ctx->framebuffer_s.zsbuf) + etna_resource(ctx->framebuffer_s.zsbuf->texture)->seqno++; if (info->index_size && indexbuf != info->index.resource) pipe_resource_reference(&indexbuf, NULL); } diff --git a/lib/mesa/src/gallium/drivers/etnaviv/etnaviv_internal.h b/lib/mesa/src/gallium/drivers/etnaviv/etnaviv_internal.h index 3424d8a77..77214d9cc 100644 --- a/lib/mesa/src/gallium/drivers/etnaviv/etnaviv_internal.h +++ b/lib/mesa/src/gallium/drivers/etnaviv/etnaviv_internal.h @@ -182,7 +182,6 @@ struct compiled_viewport_state { /* Compiled pipe_framebuffer_state */ struct compiled_framebuffer_state { - struct pipe_surface *cbuf, *zsbuf; /* keep reference to surfaces */ uint32_t GL_MULTI_SAMPLE_CONFIG; uint32_t PE_COLOR_FORMAT; uint32_t PE_DEPTH_CONFIG; diff --git a/lib/mesa/src/gallium/drivers/etnaviv/etnaviv_state.c b/lib/mesa/src/gallium/drivers/etnaviv/etnaviv_state.c index 87ba10b0d..520cc5a77 100644 --- a/lib/mesa/src/gallium/drivers/etnaviv/etnaviv_state.c +++ b/lib/mesa/src/gallium/drivers/etnaviv/etnaviv_state.c @@ -37,6 +37,7 @@ #include "etnaviv_surface.h" #include "etnaviv_translate.h" #include "etnaviv_util.h" +#include "util/u_framebuffer.h" #include "util/u_helpers.h" #include "util/u_inlines.h" #include "util/u_math.h" @@ -130,7 +131,6 @@ etna_set_framebuffer_state(struct pipe_context *pctx, assert(res->layout & ETNA_LAYOUT_BIT_TILE); /* Cannot render to linear surfaces */ etna_update_render_resource(pctx, cbuf->base.texture); - pipe_surface_reference(&cs->cbuf, &cbuf->base); cs->PE_COLOR_FORMAT = VIVS_PE_COLOR_FORMAT_FORMAT(translate_rs_format(cbuf->base.format)) | VIVS_PE_COLOR_FORMAT_COMPONENTS__MASK | @@ -182,7 +182,6 @@ etna_set_framebuffer_state(struct pipe_context *pctx, nr_samples_color = cbuf->base.texture->nr_samples; } else { - pipe_surface_reference(&cs->cbuf, NULL); /* Clearing VIVS_PE_COLOR_FORMAT_COMPONENTS__MASK and * VIVS_PE_COLOR_FORMAT_OVERWRITE prevents us from overwriting the * color target */ @@ -201,7 +200,6 @@ etna_set_framebuffer_state(struct pipe_context *pctx, etna_update_render_resource(pctx, zsbuf->base.texture); - pipe_surface_reference(&cs->zsbuf, &zsbuf->base); assert(res->layout &ETNA_LAYOUT_BIT_TILE); /* Cannot render to linear surfaces */ uint32_t depth_format = translate_depth_format(zsbuf->base.format); @@ -252,7 +250,6 @@ etna_set_framebuffer_state(struct pipe_context *pctx, nr_samples_depth = zsbuf->base.texture->nr_samples; } else { - pipe_surface_reference(&cs->zsbuf, NULL); cs->PE_DEPTH_CONFIG = VIVS_PE_DEPTH_CONFIG_DEPTH_MODE_NONE; cs->PE_DEPTH_ADDR.bo = NULL; cs->PE_DEPTH_STRIDE = 0; @@ -325,7 +322,8 @@ etna_set_framebuffer_state(struct pipe_context *pctx, */ cs->PE_LOGIC_OP = VIVS_PE_LOGIC_OP_SINGLE_BUFFER(ctx->specs.single_buffer ? 3 : 0); - ctx->framebuffer_s = *sv; /* keep copy of original structure */ + /* keep copy of original structure */ + util_copy_framebuffer_state(&ctx->framebuffer_s, sv); ctx->dirty |= ETNA_DIRTY_FRAMEBUFFER | ETNA_DIRTY_DERIVE_TS; } diff --git a/lib/mesa/src/gallium/drivers/freedreno/freedreno_resource.c b/lib/mesa/src/gallium/drivers/freedreno/freedreno_resource.c index 54d738589..eeeda1cf6 100644 --- a/lib/mesa/src/gallium/drivers/freedreno/freedreno_resource.c +++ b/lib/mesa/src/gallium/drivers/freedreno/freedreno_resource.c @@ -839,8 +839,7 @@ fd_resource_create(struct pipe_screen *pscreen, rsc->internal_format = format; rsc->cpp = util_format_get_blocksize(format); - prsc->nr_samples = MAX2(1, prsc->nr_samples); - rsc->cpp *= prsc->nr_samples; + rsc->cpp *= fd_resource_nr_samples(prsc); assert(rsc->cpp); @@ -924,9 +923,9 @@ fd_resource_from_handle(struct pipe_screen *pscreen, if (!rsc->bo) goto fail; - prsc->nr_samples = MAX2(1, prsc->nr_samples); rsc->internal_format = tmpl->format; - rsc->cpp = prsc->nr_samples * util_format_get_blocksize(tmpl->format); + rsc->cpp = util_format_get_blocksize(tmpl->format); + rsc->cpp *= fd_resource_nr_samples(prsc); slice->pitch = handle->stride / rsc->cpp; slice->offset = handle->offset; slice->size0 = handle->stride * prsc->height0; diff --git a/lib/mesa/src/gallium/drivers/freedreno/freedreno_resource.h b/lib/mesa/src/gallium/drivers/freedreno/freedreno_resource.h index 09abb512d..6790352f9 100644 --- a/lib/mesa/src/gallium/drivers/freedreno/freedreno_resource.h +++ b/lib/mesa/src/gallium/drivers/freedreno/freedreno_resource.h @@ -178,6 +178,15 @@ fd_resource_level_linear(struct pipe_resource *prsc, int level) return false; } +/* access # of samples, with 0 normalized to 1 (which is what we care about + * most of the time) + */ +static inline unsigned +fd_resource_nr_samples(struct pipe_resource *prsc) +{ + return MAX2(1, prsc->nr_samples); +} + void fd_blitter_pipe_begin(struct fd_context *ctx, bool render_cond, bool discard, enum fd_render_stage stage); void fd_blitter_pipe_end(struct fd_context *ctx); diff --git a/lib/mesa/src/gallium/drivers/freedreno/freedreno_texture.c b/lib/mesa/src/gallium/drivers/freedreno/freedreno_texture.c index d92298d2e..84b4df6c1 100644 --- a/lib/mesa/src/gallium/drivers/freedreno/freedreno_texture.c +++ b/lib/mesa/src/gallium/drivers/freedreno/freedreno_texture.c @@ -31,6 +31,7 @@ #include "freedreno_texture.h" #include "freedreno_context.h" +#include "freedreno_resource.h" #include "freedreno_util.h" static void @@ -83,7 +84,7 @@ static void set_sampler_views(struct fd_texture_stateobj *tex, tex->num_textures = util_last_bit(tex->valid_textures); for (i = 0; i < tex->num_textures; i++) { - uint nr_samples = tex->textures[i]->texture->nr_samples; + uint nr_samples = fd_resource_nr_samples(tex->textures[i]->texture); samplers |= (nr_samples >> 1) << (i * 2); } diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index ca0192a9c..997df1607 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -1044,7 +1044,7 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) break; } case OP_MUL: - if (i->dType == TYPE_F32) + if (i->dType == TYPE_F32 && !i->precise) tryCollapseChainedMULs(i, s, imm0); if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) { diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index d0d39aa53..9059d4e48 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -1279,8 +1279,8 @@ nvc0_screen_create(struct nouveau_device *dev) for (i = 0; i < NVC0_MAX_VIEWPORTS; i++) { BEGIN_NVC0(push, NVC0_3D(SCISSOR_ENABLE(i)), 3); PUSH_DATA (push, 1); - PUSH_DATA (push, 8192 << 16); - PUSH_DATA (push, 8192 << 16); + PUSH_DATA (push, 16384 << 16); + PUSH_DATA (push, 16384 << 16); } #define MK_MACRO(m, n) i = nvc0_graph_set_macro(screen, m, i, sizeof(n), n); diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_state_draw.c b/lib/mesa/src/gallium/drivers/radeonsi/si_state_draw.c index 612ca910c..413cbc627 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_state_draw.c @@ -348,20 +348,11 @@ si_get_init_multi_vgt_param(struct si_screen *sscreen, key->u.uses_gs) partial_vs_wave = true; - /* Needed for 028B6C_DISTRIBUTION_MODE != 0 */ + /* Needed for 028B6C_DISTRIBUTION_MODE != 0. (implies >= VI) */ if (sscreen->has_distributed_tess) { if (key->u.uses_gs) { - if (sscreen->info.chip_class <= VI) + if (sscreen->info.chip_class == VI) partial_es_wave = true; - - /* GPU hang workaround. */ - if (sscreen->info.family == CHIP_TONGA || - sscreen->info.family == CHIP_FIJI || - sscreen->info.family == CHIP_POLARIS10 || - sscreen->info.family == CHIP_POLARIS11 || - sscreen->info.family == CHIP_POLARIS12 || - sscreen->info.family == CHIP_VEGAM) - partial_vs_wave = true; } else { partial_vs_wave = true; } @@ -417,6 +408,18 @@ si_get_init_multi_vgt_param(struct si_screen *sscreen, if (sscreen->info.max_se == 4 && !wd_switch_on_eop) ia_switch_on_eoi = true; + /* HW engineers suggested that PARTIAL_VS_WAVE_ON should be set + * to work around a GS hang. + */ + if (key->u.uses_gs && + (sscreen->info.family == CHIP_TONGA || + sscreen->info.family == CHIP_FIJI || + sscreen->info.family == CHIP_POLARIS10 || + sscreen->info.family == CHIP_POLARIS11 || + sscreen->info.family == CHIP_POLARIS12 || + sscreen->info.family == CHIP_VEGAM)) + partial_vs_wave = true; + /* Required by Hawaii and, for some special cases, by VI. */ if (ia_switch_on_eoi && (sscreen->info.family == CHIP_HAWAII || diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c b/lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c index ad7d21e78..950fb41a5 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1662,7 +1662,7 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, key->part.ps.epilog.alpha_func = si_get_alpha_test_func(sctx); /* ps_uses_fbfetch is true only if the color buffer is bound. */ - if (sctx->ps_uses_fbfetch) { + if (sctx->ps_uses_fbfetch && !sctx->blitter->running) { struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0]; struct pipe_resource *tex = cb0->texture; diff --git a/lib/mesa/src/gallium/drivers/swr/swr_fence.cpp b/lib/mesa/src/gallium/drivers/swr/swr_fence.cpp index b05ac8cec..074d82a3b 100644 --- a/lib/mesa/src/gallium/drivers/swr/swr_fence.cpp +++ b/lib/mesa/src/gallium/drivers/swr/swr_fence.cpp @@ -50,7 +50,9 @@ swr_fence_cb(uint64_t userData, uint64_t userData2, uint64_t userData3) swr_fence_do_work(fence); /* Correct value is in SwrSync data, and not the fence write field. */ - fence->read = userData2; + /* Contexts may not finish in order, but fence value always increases */ + if (fence->read < userData2) + fence->read = userData2; } /* diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c index ec42a3dc2..167161fdf 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c @@ -73,42 +73,46 @@ vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp) /* Load from the GPU in one shot, no interleave, to * d0-d7. */ - "vldm %0, {q0, q1, q2, q3}\n" + "vldm %[gpu], {q0, q1, q2, q3}\n" /* Store each 8-byte line to cpu-side destination, * incrementing it by the stride each time. */ - "vst1.8 d0, [%1], %2\n" - "vst1.8 d1, [%1], %2\n" - "vst1.8 d2, [%1], %2\n" - "vst1.8 d3, [%1], %2\n" - "vst1.8 d4, [%1], %2\n" - "vst1.8 d5, [%1], %2\n" - "vst1.8 d6, [%1], %2\n" - "vst1.8 d7, [%1]\n" - : - : "r"(gpu), "r"(cpu), "r"(cpu_stride) + "vst1.8 d0, [%[cpu]], %[cpu_stride]\n" + "vst1.8 d1, [%[cpu]], %[cpu_stride]\n" + "vst1.8 d2, [%[cpu]], %[cpu_stride]\n" + "vst1.8 d3, [%[cpu]], %[cpu_stride]\n" + "vst1.8 d4, [%[cpu]], %[cpu_stride]\n" + "vst1.8 d5, [%[cpu]], %[cpu_stride]\n" + "vst1.8 d6, [%[cpu]], %[cpu_stride]\n" + "vst1.8 d7, [%[cpu]]\n" + : [cpu] "+r"(cpu) + : [gpu] "r"(gpu), + [cpu_stride] "r"(cpu_stride) : "q0", "q1", "q2", "q3"); } else { assert(gpu_stride == 16); + void *cpu2 = cpu + 8; __asm__ volatile ( /* Load from the GPU in one shot, no interleave, to * d0-d7. */ - "vldm %0, {q0, q1, q2, q3};\n" + "vldm %[gpu], {q0, q1, q2, q3};\n" /* Store each 16-byte line in 2 parts to the cpu-side * destination. (vld1 can only store one d-register * at a time). */ - "vst1.8 d0, [%1], %3\n" - "vst1.8 d1, [%2], %3\n" - "vst1.8 d2, [%1], %3\n" - "vst1.8 d3, [%2], %3\n" - "vst1.8 d4, [%1], %3\n" - "vst1.8 d5, [%2], %3\n" - "vst1.8 d6, [%1]\n" - "vst1.8 d7, [%2]\n" - : - : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride) + "vst1.8 d0, [%[cpu]], %[cpu_stride]\n" + "vst1.8 d1, [%[cpu2]],%[cpu_stride]\n" + "vst1.8 d2, [%[cpu]], %[cpu_stride]\n" + "vst1.8 d3, [%[cpu2]],%[cpu_stride]\n" + "vst1.8 d4, [%[cpu]], %[cpu_stride]\n" + "vst1.8 d5, [%[cpu2]],%[cpu_stride]\n" + "vst1.8 d6, [%[cpu]]\n" + "vst1.8 d7, [%[cpu2]]\n" + : [cpu] "+r"(cpu), + [cpu2] "+r"(cpu2) + : [gpu] "r"(gpu), + [cpu_stride] "r"(cpu_stride) : "q0", "q1", "q2", "q3"); } #elif defined (PIPE_ARCH_AARCH64) @@ -117,42 +121,46 @@ vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp) /* Load from the GPU in one shot, no interleave, to * d0-d7. */ - "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n" + "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n" /* Store each 8-byte line to cpu-side destination, * incrementing it by the stride each time. */ - "st1 {v0.D}[0], [%1], %2\n" - "st1 {v0.D}[1], [%1], %2\n" - "st1 {v1.D}[0], [%1], %2\n" - "st1 {v1.D}[1], [%1], %2\n" - "st1 {v2.D}[0], [%1], %2\n" - "st1 {v2.D}[1], [%1], %2\n" - "st1 {v3.D}[0], [%1], %2\n" - "st1 {v3.D}[1], [%1]\n" - : - : "r"(gpu), "r"(cpu), "r"(cpu_stride) + "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n" + "st1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n" + "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n" + "st1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n" + "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n" + "st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n" + "st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n" + "st1 {v3.D}[1], [%[cpu]]\n" + : [cpu] "+r"(cpu) + : [gpu] "r"(gpu), + [cpu_stride] "r"(cpu_stride) : "v0", "v1", "v2", "v3"); } else { assert(gpu_stride == 16); + void *cpu2 = cpu + 8; __asm__ volatile ( /* Load from the GPU in one shot, no interleave, to * d0-d7. */ - "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n" + "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n" /* Store each 16-byte line in 2 parts to the cpu-side * destination. (vld1 can only store one d-register * at a time). */ - "st1 {v0.D}[0], [%1], %3\n" - "st1 {v0.D}[1], [%2], %3\n" - "st1 {v1.D}[0], [%1], %3\n" - "st1 {v1.D}[1], [%2], %3\n" - "st1 {v2.D}[0], [%1], %3\n" - "st1 {v2.D}[1], [%2], %3\n" - "st1 {v3.D}[0], [%1]\n" - "st1 {v3.D}[1], [%2]\n" - : - : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride) + "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n" + "st1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n" + "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n" + "st1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n" + "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n" + "st1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n" + "st1 {v3.D}[0], [%[cpu]]\n" + "st1 {v3.D}[1], [%[cpu2]]\n" + : [cpu] "+r"(cpu), + [cpu2] "+r"(cpu2) + : [gpu] "r"(gpu), + [cpu_stride] "r"(cpu_stride) : "v0", "v1", "v2", "v3"); } #else @@ -174,40 +182,44 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp) /* Load each 8-byte line from cpu-side source, * incrementing it by the stride each time. */ - "vld1.8 d0, [%1], %2\n" - "vld1.8 d1, [%1], %2\n" - "vld1.8 d2, [%1], %2\n" - "vld1.8 d3, [%1], %2\n" - "vld1.8 d4, [%1], %2\n" - "vld1.8 d5, [%1], %2\n" - "vld1.8 d6, [%1], %2\n" - "vld1.8 d7, [%1]\n" + "vld1.8 d0, [%[cpu]], %[cpu_stride]\n" + "vld1.8 d1, [%[cpu]], %[cpu_stride]\n" + "vld1.8 d2, [%[cpu]], %[cpu_stride]\n" + "vld1.8 d3, [%[cpu]], %[cpu_stride]\n" + "vld1.8 d4, [%[cpu]], %[cpu_stride]\n" + "vld1.8 d5, [%[cpu]], %[cpu_stride]\n" + "vld1.8 d6, [%[cpu]], %[cpu_stride]\n" + "vld1.8 d7, [%[cpu]]\n" /* Load from the GPU in one shot, no interleave, to * d0-d7. */ - "vstm %0, {q0, q1, q2, q3}\n" - : - : "r"(gpu), "r"(cpu), "r"(cpu_stride) + "vstm %[gpu], {q0, q1, q2, q3}\n" + : [cpu] "+r"(cpu) + : [gpu] "r"(gpu), + [cpu_stride] "r"(cpu_stride) : "q0", "q1", "q2", "q3"); } else { assert(gpu_stride == 16); + void *cpu2 = cpu + 8; __asm__ volatile ( /* Load each 16-byte line in 2 parts from the cpu-side * destination. (vld1 can only store one d-register * at a time). */ - "vld1.8 d0, [%1], %3\n" - "vld1.8 d1, [%2], %3\n" - "vld1.8 d2, [%1], %3\n" - "vld1.8 d3, [%2], %3\n" - "vld1.8 d4, [%1], %3\n" - "vld1.8 d5, [%2], %3\n" - "vld1.8 d6, [%1]\n" - "vld1.8 d7, [%2]\n" + "vld1.8 d0, [%[cpu]], %[cpu_stride]\n" + "vld1.8 d1, [%[cpu2]],%[cpu_stride]\n" + "vld1.8 d2, [%[cpu]], %[cpu_stride]\n" + "vld1.8 d3, [%[cpu2]],%[cpu_stride]\n" + "vld1.8 d4, [%[cpu]], %[cpu_stride]\n" + "vld1.8 d5, [%[cpu2]],%[cpu_stride]\n" + "vld1.8 d6, [%[cpu]]\n" + "vld1.8 d7, [%[cpu2]]\n" /* Store to the GPU in one shot, no interleave. */ - "vstm %0, {q0, q1, q2, q3}\n" - : - : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride) + "vstm %[gpu], {q0, q1, q2, q3}\n" + : [cpu] "+r"(cpu), + [cpu2] "+r"(cpu2) + : [gpu] "r"(gpu), + [cpu_stride] "r"(cpu_stride) : "q0", "q1", "q2", "q3"); } #elif defined (PIPE_ARCH_AARCH64) @@ -216,38 +228,42 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp) /* Load each 8-byte line from cpu-side source, * incrementing it by the stride each time. */ - "ld1 {v0.D}[0], [%1], %2\n" - "ld1 {v0.D}[1], [%1], %2\n" - "ld1 {v1.D}[0], [%1], %2\n" - "ld1 {v1.D}[1], [%1], %2\n" - "ld1 {v2.D}[0], [%1], %2\n" - "ld1 {v2.D}[1], [%1], %2\n" - "ld1 {v3.D}[0], [%1], %2\n" - "ld1 {v3.D}[1], [%1]\n" + "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n" + "ld1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n" + "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n" + "ld1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n" + "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n" + "ld1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n" + "ld1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n" + "ld1 {v3.D}[1], [%[cpu]]\n" /* Store to the GPU in one shot, no interleave. */ - "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n" - : - : "r"(gpu), "r"(cpu), "r"(cpu_stride) + "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n" + : [cpu] "+r"(cpu) + : [gpu] "r"(gpu), + [cpu_stride] "r"(cpu_stride) : "v0", "v1", "v2", "v3"); } else { assert(gpu_stride == 16); + void *cpu2 = cpu + 8; __asm__ volatile ( /* Load each 16-byte line in 2 parts from the cpu-side * destination. (vld1 can only store one d-register * at a time). */ - "ld1 {v0.D}[0], [%1], %3\n" - "ld1 {v0.D}[1], [%2], %3\n" - "ld1 {v1.D}[0], [%1], %3\n" - "ld1 {v1.D}[1], [%2], %3\n" - "ld1 {v2.D}[0], [%1], %3\n" - "ld1 {v2.D}[1], [%2], %3\n" - "ld1 {v3.D}[0], [%1]\n" - "ld1 {v3.D}[1], [%2]\n" + "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n" + "ld1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n" + "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n" + "ld1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n" + "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n" + "ld1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n" + "ld1 {v3.D}[0], [%[cpu]]\n" + "ld1 {v3.D}[1], [%[cpu2]]\n" /* Store to the GPU in one shot, no interleave. */ - "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n" - : - : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride) + "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n" + : [cpu] "+r"(cpu), + [cpu2] "+r"(cpu2) + : [gpu] "r"(gpu), + [cpu_stride] "r"(cpu_stride) : "v0", "v1", "v2", "v3"); } #else |