summary refs log tree commit diff
path: root/lib/mesa/src/gallium/drivers
diff options
context:
space:
mode:
Diffstat (limited to 'lib/mesa/src/gallium/drivers')
-rw-r--r-- lib/mesa/src/gallium/drivers/etnaviv/etnaviv_context.c 10
-rw-r--r-- lib/mesa/src/gallium/drivers/etnaviv/etnaviv_internal.h 1
-rw-r--r-- lib/mesa/src/gallium/drivers/etnaviv/etnaviv_state.c 8
-rw-r--r-- lib/mesa/src/gallium/drivers/freedreno/freedreno_resource.c 7
-rw-r--r-- lib/mesa/src/gallium/drivers/freedreno/freedreno_resource.h 9
-rw-r--r-- lib/mesa/src/gallium/drivers/freedreno/freedreno_texture.c 3
-rw-r--r-- lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp 2
-rw-r--r-- lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c 4
-rw-r--r-- lib/mesa/src/gallium/drivers/radeonsi/si_state_draw.c 25
-rw-r--r-- lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c 2
-rw-r--r-- lib/mesa/src/gallium/drivers/swr/swr_fence.cpp 4
-rw-r--r-- lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c 192
12 files changed, 148 insertions, 119 deletions
diff --git a/lib/mesa/src/gallium/drivers/etnaviv/etnaviv_context.c b/lib/mesa/src/gallium/drivers/etnaviv/etnaviv_context.c
index 3038d210e..303dff583 100644
--- a/lib/mesa/src/gallium/drivers/etnaviv/etnaviv_context.c
+++ b/lib/mesa/src/gallium/drivers/etnaviv/etnaviv_context.c
@@ -60,6 +60,8 @@ etna_context_destroy(struct pipe_context *pctx)
{
struct etna_context *ctx = etna_context(pctx);
+ util_copy_framebuffer_state(&ctx->framebuffer_s, NULL);
+
if (ctx->primconvert)
util_primconvert_destroy(ctx->primconvert);
@@ -296,10 +298,10 @@ etna_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
if (DBG_ENABLED(ETNA_DBG_FLUSH_ALL))
pctx->flush(pctx, NULL, 0);
- if (ctx->framebuffer.cbuf)
- etna_resource(ctx->framebuffer.cbuf->texture)->seqno++;
- if (ctx->framebuffer.zsbuf)
- etna_resource(ctx->framebuffer.zsbuf->texture)->seqno++;
+ if (ctx->framebuffer_s.cbufs[0])
+ etna_resource(ctx->framebuffer_s.cbufs[0]->texture)->seqno++;
+ if (ctx->framebuffer_s.zsbuf)
+ etna_resource(ctx->framebuffer_s.zsbuf->texture)->seqno++;
if (info->index_size && indexbuf != info->index.resource)
pipe_resource_reference(&indexbuf, NULL);
}
diff --git a/lib/mesa/src/gallium/drivers/etnaviv/etnaviv_internal.h b/lib/mesa/src/gallium/drivers/etnaviv/etnaviv_internal.h
index 3424d8a77..77214d9cc 100644
--- a/lib/mesa/src/gallium/drivers/etnaviv/etnaviv_internal.h
+++ b/lib/mesa/src/gallium/drivers/etnaviv/etnaviv_internal.h
@@ -182,7 +182,6 @@ struct compiled_viewport_state {
/* Compiled pipe_framebuffer_state */
struct compiled_framebuffer_state {
- struct pipe_surface *cbuf, *zsbuf; /* keep reference to surfaces */
uint32_t GL_MULTI_SAMPLE_CONFIG;
uint32_t PE_COLOR_FORMAT;
uint32_t PE_DEPTH_CONFIG;
diff --git a/lib/mesa/src/gallium/drivers/etnaviv/etnaviv_state.c b/lib/mesa/src/gallium/drivers/etnaviv/etnaviv_state.c
index 87ba10b0d..520cc5a77 100644
--- a/lib/mesa/src/gallium/drivers/etnaviv/etnaviv_state.c
+++ b/lib/mesa/src/gallium/drivers/etnaviv/etnaviv_state.c
@@ -37,6 +37,7 @@
#include "etnaviv_surface.h"
#include "etnaviv_translate.h"
#include "etnaviv_util.h"
+#include "util/u_framebuffer.h"
#include "util/u_helpers.h"
#include "util/u_inlines.h"
#include "util/u_math.h"
@@ -130,7 +131,6 @@ etna_set_framebuffer_state(struct pipe_context *pctx,
assert(res->layout & ETNA_LAYOUT_BIT_TILE); /* Cannot render to linear surfaces */
etna_update_render_resource(pctx, cbuf->base.texture);
- pipe_surface_reference(&cs->cbuf, &cbuf->base);
cs->PE_COLOR_FORMAT =
VIVS_PE_COLOR_FORMAT_FORMAT(translate_rs_format(cbuf->base.format)) |
VIVS_PE_COLOR_FORMAT_COMPONENTS__MASK |
@@ -182,7 +182,6 @@ etna_set_framebuffer_state(struct pipe_context *pctx,
nr_samples_color = cbuf->base.texture->nr_samples;
} else {
- pipe_surface_reference(&cs->cbuf, NULL);
/* Clearing VIVS_PE_COLOR_FORMAT_COMPONENTS__MASK and
* VIVS_PE_COLOR_FORMAT_OVERWRITE prevents us from overwriting the
* color target */
@@ -201,7 +200,6 @@ etna_set_framebuffer_state(struct pipe_context *pctx,
etna_update_render_resource(pctx, zsbuf->base.texture);
- pipe_surface_reference(&cs->zsbuf, &zsbuf->base);
assert(res->layout &ETNA_LAYOUT_BIT_TILE); /* Cannot render to linear surfaces */
uint32_t depth_format = translate_depth_format(zsbuf->base.format);
@@ -252,7 +250,6 @@ etna_set_framebuffer_state(struct pipe_context *pctx,
nr_samples_depth = zsbuf->base.texture->nr_samples;
} else {
- pipe_surface_reference(&cs->zsbuf, NULL);
cs->PE_DEPTH_CONFIG = VIVS_PE_DEPTH_CONFIG_DEPTH_MODE_NONE;
cs->PE_DEPTH_ADDR.bo = NULL;
cs->PE_DEPTH_STRIDE = 0;
@@ -325,7 +322,8 @@ etna_set_framebuffer_state(struct pipe_context *pctx,
*/
cs->PE_LOGIC_OP = VIVS_PE_LOGIC_OP_SINGLE_BUFFER(ctx->specs.single_buffer ? 3 : 0);
- ctx->framebuffer_s = *sv; /* keep copy of original structure */
+ /* keep copy of original structure */
+ util_copy_framebuffer_state(&ctx->framebuffer_s, sv);
ctx->dirty |= ETNA_DIRTY_FRAMEBUFFER | ETNA_DIRTY_DERIVE_TS;
}
diff --git a/lib/mesa/src/gallium/drivers/freedreno/freedreno_resource.c b/lib/mesa/src/gallium/drivers/freedreno/freedreno_resource.c
index 54d738589..eeeda1cf6 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/freedreno_resource.c
+++ b/lib/mesa/src/gallium/drivers/freedreno/freedreno_resource.c
@@ -839,8 +839,7 @@ fd_resource_create(struct pipe_screen *pscreen,
rsc->internal_format = format;
rsc->cpp = util_format_get_blocksize(format);
- prsc->nr_samples = MAX2(1, prsc->nr_samples);
- rsc->cpp *= prsc->nr_samples;
+ rsc->cpp *= fd_resource_nr_samples(prsc);
assert(rsc->cpp);
@@ -924,9 +923,9 @@ fd_resource_from_handle(struct pipe_screen *pscreen,
if (!rsc->bo)
goto fail;
- prsc->nr_samples = MAX2(1, prsc->nr_samples);
rsc->internal_format = tmpl->format;
- rsc->cpp = prsc->nr_samples * util_format_get_blocksize(tmpl->format);
+ rsc->cpp = util_format_get_blocksize(tmpl->format);
+ rsc->cpp *= fd_resource_nr_samples(prsc);
slice->pitch = handle->stride / rsc->cpp;
slice->offset = handle->offset;
slice->size0 = handle->stride * prsc->height0;
diff --git a/lib/mesa/src/gallium/drivers/freedreno/freedreno_resource.h b/lib/mesa/src/gallium/drivers/freedreno/freedreno_resource.h
index 09abb512d..6790352f9 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/freedreno_resource.h
+++ b/lib/mesa/src/gallium/drivers/freedreno/freedreno_resource.h
@@ -178,6 +178,15 @@ fd_resource_level_linear(struct pipe_resource *prsc, int level)
return false;
}
+/* Return prsc->nr_samples with a stored count of 0 normalized to 1 (the
+ * value most callers want); only reads prsc, never writes it back.
+ */
+static inline unsigned
+fd_resource_nr_samples(struct pipe_resource *prsc)
+{
+ return MAX2(1, prsc->nr_samples);
+}
+
void fd_blitter_pipe_begin(struct fd_context *ctx, bool render_cond, bool discard,
enum fd_render_stage stage);
void fd_blitter_pipe_end(struct fd_context *ctx);
diff --git a/lib/mesa/src/gallium/drivers/freedreno/freedreno_texture.c b/lib/mesa/src/gallium/drivers/freedreno/freedreno_texture.c
index d92298d2e..84b4df6c1 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/freedreno_texture.c
+++ b/lib/mesa/src/gallium/drivers/freedreno/freedreno_texture.c
@@ -31,6 +31,7 @@
#include "freedreno_texture.h"
#include "freedreno_context.h"
+#include "freedreno_resource.h"
#include "freedreno_util.h"
static void
@@ -83,7 +84,7 @@ static void set_sampler_views(struct fd_texture_stateobj *tex,
tex->num_textures = util_last_bit(tex->valid_textures);
for (i = 0; i < tex->num_textures; i++) {
- uint nr_samples = tex->textures[i]->texture->nr_samples;
+ uint nr_samples = fd_resource_nr_samples(tex->textures[i]->texture);
samplers |= (nr_samples >> 1) << (i * 2);
}
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index ca0192a9c..997df1607 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -1044,7 +1044,7 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
break;
}
case OP_MUL:
- if (i->dType == TYPE_F32)
+ if (i->dType == TYPE_F32 && !i->precise)
tryCollapseChainedMULs(i, s, imm0);
if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index d0d39aa53..9059d4e48 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -1279,8 +1279,8 @@ nvc0_screen_create(struct nouveau_device *dev)
for (i = 0; i < NVC0_MAX_VIEWPORTS; i++) {
BEGIN_NVC0(push, NVC0_3D(SCISSOR_ENABLE(i)), 3);
PUSH_DATA (push, 1);
- PUSH_DATA (push, 8192 << 16);
- PUSH_DATA (push, 8192 << 16);
+ PUSH_DATA (push, 16384 << 16);
+ PUSH_DATA (push, 16384 << 16);
}
#define MK_MACRO(m, n) i = nvc0_graph_set_macro(screen, m, i, sizeof(n), n);
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_state_draw.c b/lib/mesa/src/gallium/drivers/radeonsi/si_state_draw.c
index 612ca910c..413cbc627 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -348,20 +348,11 @@ si_get_init_multi_vgt_param(struct si_screen *sscreen,
key->u.uses_gs)
partial_vs_wave = true;
- /* Needed for 028B6C_DISTRIBUTION_MODE != 0 */
+ /* Needed for 028B6C_DISTRIBUTION_MODE != 0. (implies >= VI) */
if (sscreen->has_distributed_tess) {
if (key->u.uses_gs) {
- if (sscreen->info.chip_class <= VI)
+ if (sscreen->info.chip_class == VI)
partial_es_wave = true;
-
- /* GPU hang workaround. */
- if (sscreen->info.family == CHIP_TONGA ||
- sscreen->info.family == CHIP_FIJI ||
- sscreen->info.family == CHIP_POLARIS10 ||
- sscreen->info.family == CHIP_POLARIS11 ||
- sscreen->info.family == CHIP_POLARIS12 ||
- sscreen->info.family == CHIP_VEGAM)
- partial_vs_wave = true;
} else {
partial_vs_wave = true;
}
@@ -417,6 +408,18 @@ si_get_init_multi_vgt_param(struct si_screen *sscreen,
if (sscreen->info.max_se == 4 && !wd_switch_on_eop)
ia_switch_on_eoi = true;
+ /* HW engineers suggested that PARTIAL_VS_WAVE_ON should be set
+ * to work around a GS hang.
+ */
+ if (key->u.uses_gs &&
+ (sscreen->info.family == CHIP_TONGA ||
+ sscreen->info.family == CHIP_FIJI ||
+ sscreen->info.family == CHIP_POLARIS10 ||
+ sscreen->info.family == CHIP_POLARIS11 ||
+ sscreen->info.family == CHIP_POLARIS12 ||
+ sscreen->info.family == CHIP_VEGAM))
+ partial_vs_wave = true;
+
/* Required by Hawaii and, for some special cases, by VI. */
if (ia_switch_on_eoi &&
(sscreen->info.family == CHIP_HAWAII ||
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c b/lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c
index ad7d21e78..950fb41a5 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1662,7 +1662,7 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
key->part.ps.epilog.alpha_func = si_get_alpha_test_func(sctx);
/* ps_uses_fbfetch is true only if the color buffer is bound. */
- if (sctx->ps_uses_fbfetch) {
+ if (sctx->ps_uses_fbfetch && !sctx->blitter->running) {
struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0];
struct pipe_resource *tex = cb0->texture;
diff --git a/lib/mesa/src/gallium/drivers/swr/swr_fence.cpp b/lib/mesa/src/gallium/drivers/swr/swr_fence.cpp
index b05ac8cec..074d82a3b 100644
--- a/lib/mesa/src/gallium/drivers/swr/swr_fence.cpp
+++ b/lib/mesa/src/gallium/drivers/swr/swr_fence.cpp
@@ -50,7 +50,9 @@ swr_fence_cb(uint64_t userData, uint64_t userData2, uint64_t userData3)
swr_fence_do_work(fence);
/* Correct value is in SwrSync data, and not the fence write field. */
- fence->read = userData2;
+ /* Contexts may not finish in order, but fence value always increases */
+ if (fence->read < userData2)
+ fence->read = userData2;
}
/*
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c
index ec42a3dc2..167161fdf 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c
@@ -73,42 +73,46 @@ vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
/* Load from the GPU in one shot, no interleave, to
* d0-d7.
*/
- "vldm %0, {q0, q1, q2, q3}\n"
+ "vldm %[gpu], {q0, q1, q2, q3}\n"
/* Store each 8-byte line to cpu-side destination,
* incrementing it by the stride each time.
*/
- "vst1.8 d0, [%1], %2\n"
- "vst1.8 d1, [%1], %2\n"
- "vst1.8 d2, [%1], %2\n"
- "vst1.8 d3, [%1], %2\n"
- "vst1.8 d4, [%1], %2\n"
- "vst1.8 d5, [%1], %2\n"
- "vst1.8 d6, [%1], %2\n"
- "vst1.8 d7, [%1]\n"
- :
- : "r"(gpu), "r"(cpu), "r"(cpu_stride)
+ "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
+ "vst1.8 d1, [%[cpu]], %[cpu_stride]\n"
+ "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
+ "vst1.8 d3, [%[cpu]], %[cpu_stride]\n"
+ "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
+ "vst1.8 d5, [%[cpu]], %[cpu_stride]\n"
+ "vst1.8 d6, [%[cpu]], %[cpu_stride]\n"
+ "vst1.8 d7, [%[cpu]]\n"
+ : [cpu] "+r"(cpu)
+ : [gpu] "r"(gpu),
+ [cpu_stride] "r"(cpu_stride)
: "q0", "q1", "q2", "q3");
} else {
assert(gpu_stride == 16);
+ void *cpu2 = cpu + 8;
__asm__ volatile (
/* Load from the GPU in one shot, no interleave, to
* d0-d7.
*/
- "vldm %0, {q0, q1, q2, q3};\n"
+ "vldm %[gpu], {q0, q1, q2, q3};\n"
/* Store each 16-byte line in 2 parts to the cpu-side
* destination. (Each vst1 below stores one d-register
* at a time).
*/
- "vst1.8 d0, [%1], %3\n"
- "vst1.8 d1, [%2], %3\n"
- "vst1.8 d2, [%1], %3\n"
- "vst1.8 d3, [%2], %3\n"
- "vst1.8 d4, [%1], %3\n"
- "vst1.8 d5, [%2], %3\n"
- "vst1.8 d6, [%1]\n"
- "vst1.8 d7, [%2]\n"
- :
- : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
+ "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
+ "vst1.8 d1, [%[cpu2]],%[cpu_stride]\n"
+ "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
+ "vst1.8 d3, [%[cpu2]],%[cpu_stride]\n"
+ "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
+ "vst1.8 d5, [%[cpu2]],%[cpu_stride]\n"
+ "vst1.8 d6, [%[cpu]]\n"
+ "vst1.8 d7, [%[cpu2]]\n"
+ : [cpu] "+r"(cpu),
+ [cpu2] "+r"(cpu2)
+ : [gpu] "r"(gpu),
+ [cpu_stride] "r"(cpu_stride)
: "q0", "q1", "q2", "q3");
}
#elif defined (PIPE_ARCH_AARCH64)
@@ -117,42 +121,46 @@ vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
/* Load from the GPU in one shot, no interleave, to
* v0-v3.
*/
- "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
+ "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
/* Store each 8-byte line to cpu-side destination,
* incrementing it by the stride each time.
*/
- "st1 {v0.D}[0], [%1], %2\n"
- "st1 {v0.D}[1], [%1], %2\n"
- "st1 {v1.D}[0], [%1], %2\n"
- "st1 {v1.D}[1], [%1], %2\n"
- "st1 {v2.D}[0], [%1], %2\n"
- "st1 {v2.D}[1], [%1], %2\n"
- "st1 {v3.D}[0], [%1], %2\n"
- "st1 {v3.D}[1], [%1]\n"
- :
- : "r"(gpu), "r"(cpu), "r"(cpu_stride)
+ "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
+ "st1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
+ "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
+ "st1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
+ "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
+ "st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
+ "st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
+ "st1 {v3.D}[1], [%[cpu]]\n"
+ : [cpu] "+r"(cpu)
+ : [gpu] "r"(gpu),
+ [cpu_stride] "r"(cpu_stride)
: "v0", "v1", "v2", "v3");
} else {
assert(gpu_stride == 16);
+ void *cpu2 = cpu + 8;
__asm__ volatile (
/* Load from the GPU in one shot, no interleave, to
* v0-v3.
*/
- "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
+ "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
/* Store each 16-byte line in 2 parts to the cpu-side
* destination. (Each st1 below stores one 64-bit lane
* at a time).
*/
- "st1 {v0.D}[0], [%1], %3\n"
- "st1 {v0.D}[1], [%2], %3\n"
- "st1 {v1.D}[0], [%1], %3\n"
- "st1 {v1.D}[1], [%2], %3\n"
- "st1 {v2.D}[0], [%1], %3\n"
- "st1 {v2.D}[1], [%2], %3\n"
- "st1 {v3.D}[0], [%1]\n"
- "st1 {v3.D}[1], [%2]\n"
- :
- : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
+ "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
+ "st1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
+ "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
+ "st1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
+ "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
+ "st1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
+ "st1 {v3.D}[0], [%[cpu]]\n"
+ "st1 {v3.D}[1], [%[cpu2]]\n"
+ : [cpu] "+r"(cpu),
+ [cpu2] "+r"(cpu2)
+ : [gpu] "r"(gpu),
+ [cpu_stride] "r"(cpu_stride)
: "v0", "v1", "v2", "v3");
}
#else
@@ -174,40 +182,44 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
/* Load each 8-byte line from cpu-side source,
* incrementing it by the stride each time.
*/
- "vld1.8 d0, [%1], %2\n"
- "vld1.8 d1, [%1], %2\n"
- "vld1.8 d2, [%1], %2\n"
- "vld1.8 d3, [%1], %2\n"
- "vld1.8 d4, [%1], %2\n"
- "vld1.8 d5, [%1], %2\n"
- "vld1.8 d6, [%1], %2\n"
- "vld1.8 d7, [%1]\n"
+ "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
+ "vld1.8 d1, [%[cpu]], %[cpu_stride]\n"
+ "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
+ "vld1.8 d3, [%[cpu]], %[cpu_stride]\n"
+ "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
+ "vld1.8 d5, [%[cpu]], %[cpu_stride]\n"
+ "vld1.8 d6, [%[cpu]], %[cpu_stride]\n"
+ "vld1.8 d7, [%[cpu]]\n"
/* Store to the GPU in one shot, no interleave, from
* d0-d7.
*/
- "vstm %0, {q0, q1, q2, q3}\n"
- :
- : "r"(gpu), "r"(cpu), "r"(cpu_stride)
+ "vstm %[gpu], {q0, q1, q2, q3}\n"
+ : [cpu] "+r"(cpu)
+ : [gpu] "r"(gpu),
+ [cpu_stride] "r"(cpu_stride)
: "q0", "q1", "q2", "q3");
} else {
assert(gpu_stride == 16);
+ void *cpu2 = cpu + 8;
__asm__ volatile (
/* Load each 16-byte line in 2 parts from the cpu-side
* source. (Each vld1 below loads one d-register
* at a time).
*/
- "vld1.8 d0, [%1], %3\n"
- "vld1.8 d1, [%2], %3\n"
- "vld1.8 d2, [%1], %3\n"
- "vld1.8 d3, [%2], %3\n"
- "vld1.8 d4, [%1], %3\n"
- "vld1.8 d5, [%2], %3\n"
- "vld1.8 d6, [%1]\n"
- "vld1.8 d7, [%2]\n"
+ "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
+ "vld1.8 d1, [%[cpu2]],%[cpu_stride]\n"
+ "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
+ "vld1.8 d3, [%[cpu2]],%[cpu_stride]\n"
+ "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
+ "vld1.8 d5, [%[cpu2]],%[cpu_stride]\n"
+ "vld1.8 d6, [%[cpu]]\n"
+ "vld1.8 d7, [%[cpu2]]\n"
/* Store to the GPU in one shot, no interleave. */
- "vstm %0, {q0, q1, q2, q3}\n"
- :
- : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
+ "vstm %[gpu], {q0, q1, q2, q3}\n"
+ : [cpu] "+r"(cpu),
+ [cpu2] "+r"(cpu2)
+ : [gpu] "r"(gpu),
+ [cpu_stride] "r"(cpu_stride)
: "q0", "q1", "q2", "q3");
}
#elif defined (PIPE_ARCH_AARCH64)
@@ -216,38 +228,42 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
/* Load each 8-byte line from cpu-side source,
* incrementing it by the stride each time.
*/
- "ld1 {v0.D}[0], [%1], %2\n"
- "ld1 {v0.D}[1], [%1], %2\n"
- "ld1 {v1.D}[0], [%1], %2\n"
- "ld1 {v1.D}[1], [%1], %2\n"
- "ld1 {v2.D}[0], [%1], %2\n"
- "ld1 {v2.D}[1], [%1], %2\n"
- "ld1 {v3.D}[0], [%1], %2\n"
- "ld1 {v3.D}[1], [%1]\n"
+ "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
+ "ld1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
+ "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
+ "ld1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
+ "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
+ "ld1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
+ "ld1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
+ "ld1 {v3.D}[1], [%[cpu]]\n"
/* Store to the GPU in one shot, no interleave. */
- "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
- :
- : "r"(gpu), "r"(cpu), "r"(cpu_stride)
+ "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
+ : [cpu] "+r"(cpu)
+ : [gpu] "r"(gpu),
+ [cpu_stride] "r"(cpu_stride)
: "v0", "v1", "v2", "v3");
} else {
assert(gpu_stride == 16);
+ void *cpu2 = cpu + 8;
__asm__ volatile (
/* Load each 16-byte line in 2 parts from the cpu-side
* source. (Each ld1 below loads one 64-bit lane
* at a time).
*/
- "ld1 {v0.D}[0], [%1], %3\n"
- "ld1 {v0.D}[1], [%2], %3\n"
- "ld1 {v1.D}[0], [%1], %3\n"
- "ld1 {v1.D}[1], [%2], %3\n"
- "ld1 {v2.D}[0], [%1], %3\n"
- "ld1 {v2.D}[1], [%2], %3\n"
- "ld1 {v3.D}[0], [%1]\n"
- "ld1 {v3.D}[1], [%2]\n"
+ "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
+ "ld1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
+ "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
+ "ld1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
+ "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
+ "ld1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
+ "ld1 {v3.D}[0], [%[cpu]]\n"
+ "ld1 {v3.D}[1], [%[cpu2]]\n"
/* Store to the GPU in one shot, no interleave. */
- "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
- :
- : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
+ "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
+ : [cpu] "+r"(cpu),
+ [cpu2] "+r"(cpu2)
+ : [gpu] "r"(gpu),
+ [cpu_stride] "r"(cpu_stride)
: "v0", "v1", "v2", "v3");
}
#else