path: root/lib/mesa/src/gallium/drivers/vc4
author    Jonathan Gray <jsg@cvs.openbsd.org>  2017-08-14 09:45:54 +0000
committer Jonathan Gray <jsg@cvs.openbsd.org>  2017-08-14 09:45:54 +0000
commit    4c58069f5013f0a621503525f7d5193bfe9976b3 (patch)
tree      bd8f8a08b889e9a8b99c9de01ae12459d527ea6d /lib/mesa/src/gallium/drivers/vc4
parent    5caa025e6b62d0456faad86c89f239a14d1eaadb (diff)
Import Mesa 17.1.6
Diffstat (limited to 'lib/mesa/src/gallium/drivers/vc4')
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/Makefile.am                          |   8
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/Makefile.sources                     |   2
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/kernel/README                        |   6
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/kernel/vc4_drv.h                     |   2
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/kernel/vc4_validate.c                |  24
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c        |  81
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_blit.c                           |  14
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c                         | 149
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h                         |   9
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_cl.c                             |   4
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_context.c                        |  11
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_context.h                        |  38
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_draw.c                           |  10
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_emit.c                           |  15
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_formats.c                        |   2
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_blend.c                |  29
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_io.c                   |  16
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_opt_algebraic.c                  |  15
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_opt_coalesce_ff_writes.c         | 111
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_opt_constant_folding.c           |   2
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c           |   4
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_opt_dead_code.c                  |   6
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_opt_peephole_sf.c                |   4
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_opt_small_immediates.c           |  11
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_opt_vpm.c                        |  59
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_program.c                        | 347
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_qir.c                            | 124
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_qir.h                            |  80
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_qir_emit_uniform_stream_resets.c |  18
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_qir_live_variables.c             |  42
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c             |   8
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_qir_schedule.c                   | 173
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_qir_validate.c                   |  26
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_qpu.c                            |   1
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_qpu_disasm.c                     |  12
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_qpu_emit.c                       |  92
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_qpu_schedule.c                   | 136
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_qpu_validate.c                   | 107
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_register_allocate.c              | 137
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_reorder_uniforms.c               |   2
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_resource.c                       |  22
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_screen.c                         |  67
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_screen.h                         |  20
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c                      | 509
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_simulator_validate.h             |   3
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_state.c                          |   6
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_tiling.c                         | 208
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_tiling.h                         |  83
-rw-r--r--  lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c                      | 212
49 files changed, 2271 insertions(+), 796 deletions(-)
diff --git a/lib/mesa/src/gallium/drivers/vc4/Makefile.am b/lib/mesa/src/gallium/drivers/vc4/Makefile.am
index 19fc38759..b361a0c58 100644
--- a/lib/mesa/src/gallium/drivers/vc4/Makefile.am
+++ b/lib/mesa/src/gallium/drivers/vc4/Makefile.am
@@ -40,3 +40,11 @@ noinst_LTLIBRARIES = libvc4.la
libvc4_la_SOURCES = $(C_SOURCES)
libvc4_la_LIBADD = $(SIM_LIB) $(VC4_LIBS)
libvc4_la_LDFLAGS = $(SIM_LDFLAGS)
+
+noinst_LTLIBRARIES += libvc4_neon.la
+libvc4_la_LIBADD += libvc4_neon.la
+
+libvc4_neon_la_SOURCES = vc4_tiling_lt.c
+libvc4_neon_la_CFLAGS = $(AM_CFLAGS) -DVC4_BUILD_NEON
+
+EXTRA_DIST = kernel/README
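The extra automake rules above build vc4_tiling_lt.c a second time with -DVC4_BUILD_NEON, the usual trick for producing a NEON-optimized object next to the portable one from a single C source. A minimal sketch of that pattern, with hypothetical macro and helper names (the driver's real code lives in vc4_tiling_lt.c / vc4_tiling.c, further down in this diff):

/* vc4_tiling_lt.c side: built twice; a token-pasting macro renames
 * every function per build. */
#ifdef VC4_BUILD_NEON
#define TAG(x) x ## _neon
#else
#define TAG(x) x ## _base
#endif

void TAG(load_lt_image)(void *dst, const void *src, int cpp)
{
        /* detiling copy loop; the NEON build compiles to vector code */
}

/* Dispatcher side: built once, declares both variants and picks one
 * at runtime (cpu_has_neon() is a stand-in for the real check). */
void load_lt_image_base(void *dst, const void *src, int cpp);
void load_lt_image_neon(void *dst, const void *src, int cpp);

static void load_lt_image(void *dst, const void *src, int cpp)
{
        if (cpu_has_neon())
                load_lt_image_neon(dst, src, cpp);
        else
                load_lt_image_base(dst, src, cpp);
}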
diff --git a/lib/mesa/src/gallium/drivers/vc4/Makefile.sources b/lib/mesa/src/gallium/drivers/vc4/Makefile.sources
index e1496d101..10de34361 100644
--- a/lib/mesa/src/gallium/drivers/vc4/Makefile.sources
+++ b/lib/mesa/src/gallium/drivers/vc4/Makefile.sources
@@ -28,6 +28,7 @@ C_SOURCES := \
vc4_opt_peephole_sf.c \
vc4_opt_small_immediates.c \
vc4_opt_vpm.c \
+ vc4_opt_coalesce_ff_writes.c \
vc4_program.c \
vc4_qir.c \
vc4_qir_emit_uniform_stream_resets.c \
@@ -54,6 +55,7 @@ C_SOURCES := \
vc4_simulator_validate.h \
vc4_state.c \
vc4_tiling.c \
+ vc4_tiling_lt.c \
vc4_tiling.h \
vc4_uniforms.c \
$()
diff --git a/lib/mesa/src/gallium/drivers/vc4/kernel/README b/lib/mesa/src/gallium/drivers/vc4/kernel/README
new file mode 100644
index 000000000..89e4442b4
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/vc4/kernel/README
@@ -0,0 +1,6 @@
+This is a mirror of the kernel validation code into the userspace GL library.
+It is only built when USE_VC4_SIMULATOR is defined, for compiling the driver
+on an x86 system with the simpenrose simulator. It allows testing of changes
+across the kernel and userspace with exposure to most of the software stack,
+in a higher-performance and more easily debugged environment than the native
+hardware.
diff --git a/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_drv.h b/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_drv.h
index 90f45397d..8f5ed00d9 100644
--- a/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_drv.h
+++ b/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_drv.h
@@ -150,6 +150,8 @@ struct vc4_validated_shader_info
uint32_t num_uniform_addr_offsets;
uint32_t *uniform_addr_offsets;
+
+ bool is_threaded;
};
/* vc4_validate.c */
diff --git a/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_validate.c b/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_validate.c
index 4ef01108b..bd193b993 100644
--- a/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_validate.c
+++ b/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_validate.c
@@ -640,6 +640,13 @@ reloc_tex(struct vc4_exec_info *exec,
cpp = 1;
break;
case VC4_TEXTURE_TYPE_ETC1:
+ /* ETC1 is arranged as 64-bit blocks, where each block is 4x4
+ * pixels.
+ */
+ cpp = 8;
+ width = (width + 3) >> 2;
+ height = (height + 3) >> 2;
+ break;
case VC4_TEXTURE_TYPE_BW1:
case VC4_TEXTURE_TYPE_A4:
case VC4_TEXTURE_TYPE_A1:
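The new ETC1 case converts pixel dimensions to compressed-block dimensions before the relocation bounds check. A worked example of that arithmetic, with hypothetical mip level sizes:

/* A 130x66 ETC1 level, at cpp = 8 (one 64-bit block per 4x4 pixels):
 *   width  = (130 + 3) >> 2 = 33 blocks
 *   height = ( 66 + 3) >> 2 = 17 blocks
 *   level size checked = 33 * 17 * 8 = 4488 bytes
 */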
@@ -773,11 +780,6 @@ validate_gl_shader_rec(struct drm_device *dev,
exec->shader_rec_v += roundup(packet_size, 16);
exec->shader_rec_size -= packet_size;
- if (!(*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD)) {
- DRM_ERROR("Multi-threaded fragment shaders not supported.\n");
- return -EINVAL;
- }
-
for (i = 0; i < shader_reloc_count; i++) {
if (src_handles[i] > exec->bo_count) {
DRM_ERROR("Shader handle %d too big\n", src_handles[i]);
@@ -794,6 +796,18 @@ validate_gl_shader_rec(struct drm_device *dev,
return -EINVAL;
}
+ if (((*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD) == 0) !=
+ to_vc4_bo(&bo[0]->base)->validated_shader->is_threaded) {
+ DRM_ERROR("Thread mode of CL and FS do not match\n");
+ return -EINVAL;
+ }
+
+ if (to_vc4_bo(&bo[1]->base)->validated_shader->is_threaded ||
+ to_vc4_bo(&bo[2]->base)->validated_shader->is_threaded) {
+ DRM_ERROR("cs and vs cannot be threaded\n");
+ return -EINVAL;
+ }
+
for (i = 0; i < shader_reloc_count; i++) {
struct vc4_validated_shader_info *validated_shader;
uint32_t o = shader_reloc_offsets[i];
diff --git a/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c b/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c
index 82717ca55..d93f5239d 100644
--- a/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c
+++ b/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c
@@ -84,6 +84,14 @@ struct vc4_shader_validation_state {
* basic blocks.
*/
bool needs_uniform_address_for_loop;
+
+ /* Set when we find an instruction that violates the criteria for a
+ * threaded shader, which are:
+ * - only write the lower half of the register space
+ * - signal the last thread switch only at the end
+ * So we track thread-switch usage and register usage.
+ */
+ bool all_registers_used;
};
static uint32_t
@@ -119,6 +127,12 @@ raddr_add_a_to_live_reg_index(uint64_t inst)
return ~0;
}
+static bool live_reg_is_upper_half(uint32_t lri)
+{
+ return (lri >= 16 && lri < 32) ||
+ (lri >= 32 + 16 && lri < 32 + 32);
+}
+
static bool
is_tmu_submit(uint32_t waddr)
{
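live_reg_is_upper_half() encodes the live-register numbering this validator uses: indices 0-31 are the A register file, 32-63 the B file (64-67 are the accumulators r0-r3, seen later), and registers 16-31 of each file are the upper half that the other thread owns when a shader runs threaded. Worked cases, consistent with the ranges above:

/* lri = 5       -> ra5,  lower half: fine in a threaded shader
 * lri = 20      -> ra20, upper half: sets all_registers_used
 * lri = 32 + 3  -> rb3,  lower half: fine
 * lri = 32 + 16 -> rb16, upper half: sets all_registers_used
 */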
@@ -385,6 +399,9 @@ check_reg_write(struct vc4_validated_shader_info *validated_shader,
} else {
validation_state->live_immediates[lri] = ~0;
}
+
+ if (live_reg_is_upper_half(lri))
+ validation_state->all_registers_used = true;
}
switch (waddr) {
@@ -593,6 +610,11 @@ check_instruction_reads(struct vc4_validated_shader_info *validated_shader,
}
}
+ if ((raddr_a >= 16 && raddr_a < 32) ||
+ (raddr_b >= 16 && raddr_b < 32 && sig != QPU_SIG_SMALL_IMM)) {
+ validation_state->all_registers_used = true;
+ }
+
return true;
}
@@ -603,9 +625,7 @@ static bool
vc4_validate_branches(struct vc4_shader_validation_state *validation_state)
{
uint32_t max_branch_target = 0;
- bool found_shader_end = false;
int ip;
- int shader_end_ip = 0;
int last_branch = -2;
for (ip = 0; ip < validation_state->max_ip; ip++) {
@@ -616,8 +636,13 @@ vc4_validate_branches(struct vc4_shader_validation_state *validation_state)
uint32_t branch_target_ip;
if (sig == QPU_SIG_PROG_END) {
- shader_end_ip = ip;
- found_shader_end = true;
+ /* There are two delay slots after program end is
+ * signaled that are still executed, then we're
+ * finished. validation_state->max_ip is the
+ * instruction after the last valid instruction in the
+ * program.
+ */
+ validation_state->max_ip = ip + 3;
continue;
}
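Concretely, with the two delay slots the new comment describes (instruction addresses hypothetical):

/* ip 40: QPU_SIG_PROG_END signaled
 * ip 41: delay slot, still executed
 * ip 42: delay slot, still executed
 * max_ip = 40 + 3 = 43, one past the last executed instruction, so a
 * branch target above max_ip - 3 lands after PROG_END and is rejected
 * by the check below.
 */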
@@ -671,15 +696,9 @@ vc4_validate_branches(struct vc4_shader_validation_state *validation_state)
}
set_bit(after_delay_ip, validation_state->branch_targets);
max_branch_target = max(max_branch_target, after_delay_ip);
-
- /* There are two delay slots after program end is signaled
- * that are still executed, then we're finished.
- */
- if (found_shader_end && ip == shader_end_ip + 2)
- break;
}
- if (max_branch_target > shader_end_ip) {
+ if (max_branch_target > validation_state->max_ip - 3) {
DRM_ERROR("Branch landed after QPU_SIG_PROG_END");
return false;
}
@@ -751,6 +770,7 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
{
bool found_shader_end = false;
int shader_end_ip = 0;
+ uint32_t last_thread_switch_ip = -3;
uint32_t ip;
struct vc4_validated_shader_info *validated_shader = NULL;
struct vc4_shader_validation_state validation_state;
@@ -783,6 +803,16 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
if (!vc4_handle_branch_target(&validation_state))
goto fail;
+ if (ip == last_thread_switch_ip + 3) {
+ /* Reset r0-r3 live clamp data */
+ int i;
+ for (i = 64; i < LIVE_REG_COUNT; i++) {
+ validation_state.live_min_clamp_offsets[i] = ~0;
+ validation_state.live_max_clamp_regs[i] = false;
+ validation_state.live_immediates[i] = ~0;
+ }
+ }
+
switch (sig) {
case QPU_SIG_NONE:
case QPU_SIG_WAIT_FOR_SCOREBOARD:
@@ -792,6 +822,8 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
case QPU_SIG_LOAD_TMU1:
case QPU_SIG_PROG_END:
case QPU_SIG_SMALL_IMM:
+ case QPU_SIG_THREAD_SWITCH:
+ case QPU_SIG_LAST_THREAD_SWITCH:
if (!check_instruction_writes(validated_shader,
&validation_state)) {
DRM_ERROR("Bad write at ip %d\n", ip);
@@ -807,6 +839,18 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
shader_end_ip = ip;
}
+ if (sig == QPU_SIG_THREAD_SWITCH ||
+ sig == QPU_SIG_LAST_THREAD_SWITCH) {
+ validated_shader->is_threaded = true;
+
+ if (ip < last_thread_switch_ip + 3) {
+ DRM_ERROR("Thread switch too soon after "
+ "last switch at ip %d\n", ip);
+ goto fail;
+ }
+ last_thread_switch_ip = ip;
+ }
+
break;
case QPU_SIG_LOAD_IMM:
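Together, the hunks above enforce the thread-switch timing rules: a switch takes effect after two delay slots, so the accumulator tracking (live-register indices 64-67, i.e. r0-r3) is reset three instructions after each switch, and a second switch inside that window is an error. Timeline with hypothetical addresses:

/* ip 20: QPU_SIG_THREAD_SWITCH   (last_thread_switch_ip = 20)
 * ip 21: delay slot              (a switch here is "too soon")
 * ip 22: delay slot; the other thread runs after this
 * ip 23: last_thread_switch_ip + 3; r0-r3 clamp/immediate state is
 *        reset, since the other thread may have clobbered the
 *        accumulators
 */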
@@ -821,6 +865,13 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
if (!check_branch(inst, validated_shader,
&validation_state, ip))
goto fail;
+
+ if (ip < last_thread_switch_ip + 3) {
+ DRM_ERROR("Branch in thread switch at ip %d",
+ ip);
+ goto fail;
+ }
+
break;
default:
DRM_ERROR("Unsupported QPU signal %d at "
@@ -842,6 +893,14 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
goto fail;
}
+ /* Might corrupt other thread */
+ if (validated_shader->is_threaded &&
+ validation_state.all_registers_used) {
+ DRM_ERROR("Shader uses threading, but uses the upper "
+ "half of the registers, too\n");
+ goto fail;
+ }
+
/* If we did a backwards branch and we haven't emitted a uniforms
* reset since then, we still need the uniforms stream to have the
* uniforms address available so that the backwards branch can do its
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_blit.c b/lib/mesa/src/gallium/drivers/vc4/vc4_blit.c
index 1e056568a..0e4ab5bfa 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_blit.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_blit.c
@@ -212,14 +212,16 @@ vc4_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info)
if (vc4_tile_blit(pctx, blit_info))
return;
- if (util_try_blit_via_copy_region(pctx, &info)) {
- return; /* done */
- }
-
if (info.mask & PIPE_MASK_S) {
- fprintf(stderr, "cannot blit stencil, skipping\n");
+ if (util_try_blit_via_copy_region(pctx, &info))
+ return;
+
info.mask &= ~PIPE_MASK_S;
+ fprintf(stderr, "cannot blit stencil, skipping\n");
}
- vc4_render_blit(pctx, &info);
+ if (vc4_render_blit(pctx, &info))
+ return;
+
+ fprintf(stderr, "Unsupported blit\n");
}
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c b/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c
index cf6a5114b..12af7f8a9 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c
@@ -97,7 +97,7 @@ vc4_bo_from_cache(struct vc4_screen *screen, uint32_t size, const char *name)
return NULL;
struct vc4_bo *bo = NULL;
- pipe_mutex_lock(cache->lock);
+ mtx_lock(&cache->lock);
if (!list_empty(&cache->size_list[page_index])) {
bo = LIST_ENTRY(struct vc4_bo, cache->size_list[page_index].next,
size_list);
@@ -107,7 +107,7 @@ vc4_bo_from_cache(struct vc4_screen *screen, uint32_t size, const char *name)
* user will proceed to CPU map it and fill it with stuff.
*/
if (!vc4_bo_wait(bo, 0, NULL)) {
- pipe_mutex_unlock(cache->lock);
+ mtx_unlock(&cache->lock);
return NULL;
}
@@ -116,7 +116,7 @@ vc4_bo_from_cache(struct vc4_screen *screen, uint32_t size, const char *name)
bo->name = name;
}
- pipe_mutex_unlock(cache->lock);
+ mtx_unlock(&cache->lock);
return bo;
}
@@ -148,28 +148,17 @@ vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name)
bo->name = name;
bo->private = true;
+ retry:
+ ;
+
bool cleared_and_retried = false;
-retry:
- if (!using_vc4_simulator) {
- struct drm_vc4_create_bo create;
- memset(&create, 0, sizeof(create));
-
- create.size = size;
-
- ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_CREATE_BO, &create);
- bo->handle = create.handle;
- } else {
- struct drm_mode_create_dumb create;
- memset(&create, 0, sizeof(create));
-
- create.width = 128;
- create.bpp = 8;
- create.height = (size + 127) / 128;
-
- ret = drmIoctl(screen->fd, DRM_IOCTL_MODE_CREATE_DUMB, &create);
- bo->handle = create.handle;
- assert(create.size >= size);
- }
+ struct drm_vc4_create_bo create = {
+ .size = size
+ };
+
+ ret = vc4_ioctl(screen->fd, DRM_IOCTL_VC4_CREATE_BO, &create);
+ bo->handle = create.handle;
+
if (ret != 0) {
if (!list_empty(&screen->bo_cache.time_list) &&
!cleared_and_retried) {
@@ -199,9 +188,9 @@ vc4_bo_last_unreference(struct vc4_bo *bo)
struct timespec time;
clock_gettime(CLOCK_MONOTONIC, &time);
- pipe_mutex_lock(screen->bo_cache.lock);
+ mtx_lock(&screen->bo_cache.lock);
vc4_bo_last_unreference_locked_timed(bo, time.tv_sec);
- pipe_mutex_unlock(screen->bo_cache.lock);
+ mtx_unlock(&screen->bo_cache.lock);
}
static void
@@ -210,20 +199,19 @@ vc4_bo_free(struct vc4_bo *bo)
struct vc4_screen *screen = bo->screen;
if (bo->map) {
-#ifdef USE_VC4_SIMULATOR
- if (bo->simulator_winsys_map) {
+ if (using_vc4_simulator && bo->name &&
+ strcmp(bo->name, "winsys") == 0) {
free(bo->map);
- bo->map = bo->simulator_winsys_map;
+ } else {
+ munmap(bo->map, bo->size);
+ VG(VALGRIND_FREELIKE_BLOCK(bo->map, 0));
}
-#endif
- munmap(bo->map, bo->size);
- VG(VALGRIND_FREELIKE_BLOCK(bo->map, 0));
}
struct drm_gem_close c;
memset(&c, 0, sizeof(c));
c.handle = bo->handle;
- int ret = drmIoctl(screen->fd, DRM_IOCTL_GEM_CLOSE, &c);
+ int ret = vc4_ioctl(screen->fd, DRM_IOCTL_GEM_CLOSE, &c);
if (ret != 0)
fprintf(stderr, "close object %d: %s\n", bo->handle, strerror(errno));
@@ -273,13 +261,13 @@ free_stale_bos(struct vc4_screen *screen, time_t time)
static void
vc4_bo_cache_free_all(struct vc4_bo_cache *cache)
{
- pipe_mutex_lock(cache->lock);
+ mtx_lock(&cache->lock);
list_for_each_entry_safe(struct vc4_bo, bo, &cache->time_list,
time_list) {
vc4_bo_remove_from_cache(cache, bo);
vc4_bo_free(bo);
}
- pipe_mutex_unlock(cache->lock);
+ mtx_unlock(&cache->lock);
}
void
@@ -301,17 +289,8 @@ vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time)
/* Move old list contents over (since the array has moved, and
* therefore the pointers to the list heads have to change).
*/
- for (int i = 0; i < cache->size_list_size; i++) {
- struct list_head *old_head = &cache->size_list[i];
- if (list_empty(old_head))
- list_inithead(&new_list[i]);
- else {
- new_list[i].next = old_head->next;
- new_list[i].prev = old_head->prev;
- new_list[i].next->prev = &new_list[i];
- new_list[i].prev->next = &new_list[i];
- }
- }
+ for (int i = 0; i < cache->size_list_size; i++)
+ list_replace(&cache->size_list[i], &new_list[i]);
for (int i = cache->size_list_size; i < page_index + 1; i++)
list_inithead(&new_list[i]);
@@ -343,7 +322,7 @@ vc4_bo_open_handle(struct vc4_screen *screen,
assert(size);
- pipe_mutex_lock(screen->bo_handles_mutex);
+ mtx_lock(&screen->bo_handles_mutex);
bo = util_hash_table_get(screen->bo_handles, (void*)(uintptr_t)handle);
if (bo) {
@@ -360,16 +339,15 @@ vc4_bo_open_handle(struct vc4_screen *screen,
bo->private = false;
#ifdef USE_VC4_SIMULATOR
- vc4_bo_map(bo);
- bo->simulator_winsys_map = bo->map;
- bo->simulator_winsys_stride = winsys_stride;
+ vc4_simulator_open_from_handle(screen->fd, winsys_stride,
+ bo->handle, bo->size);
bo->map = malloc(bo->size);
#endif
util_hash_table_set(screen->bo_handles, (void *)(uintptr_t)handle, bo);
done:
- pipe_mutex_unlock(screen->bo_handles_mutex);
+ mtx_unlock(&screen->bo_handles_mutex);
return bo;
}
@@ -380,7 +358,7 @@ vc4_bo_open_name(struct vc4_screen *screen, uint32_t name,
struct drm_gem_open o = {
.name = name
};
- int ret = drmIoctl(screen->fd, DRM_IOCTL_GEM_OPEN, &o);
+ int ret = vc4_ioctl(screen->fd, DRM_IOCTL_GEM_OPEN, &o);
if (ret) {
fprintf(stderr, "Failed to open bo %d: %s\n",
name, strerror(errno));
@@ -423,10 +401,10 @@ vc4_bo_get_dmabuf(struct vc4_bo *bo)
return -1;
}
- pipe_mutex_lock(bo->screen->bo_handles_mutex);
+ mtx_lock(&bo->screen->bo_handles_mutex);
bo->private = false;
util_hash_table_set(bo->screen->bo_handles, (void *)(uintptr_t)bo->handle, bo);
- pipe_mutex_unlock(bo->screen->bo_handles_mutex);
+ mtx_unlock(&bo->screen->bo_handles_mutex);
return fd;
}
@@ -447,30 +425,15 @@ vc4_bo_alloc_shader(struct vc4_screen *screen, const void *data, uint32_t size)
bo->name = "code";
bo->private = false; /* Make sure it doesn't go back to the cache. */
- if (!using_vc4_simulator) {
- struct drm_vc4_create_shader_bo create = {
- .size = size,
- .data = (uintptr_t)data,
- };
-
- ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_CREATE_SHADER_BO,
- &create);
- bo->handle = create.handle;
- } else {
- struct drm_mode_create_dumb create;
- memset(&create, 0, sizeof(create));
-
- create.width = 128;
- create.bpp = 8;
- create.height = (size + 127) / 128;
-
- ret = drmIoctl(screen->fd, DRM_IOCTL_MODE_CREATE_DUMB, &create);
- bo->handle = create.handle;
- assert(create.size >= size);
-
- vc4_bo_map(bo);
- memcpy(bo->map, data, size);
- }
+ struct drm_vc4_create_shader_bo create = {
+ .size = size,
+ .data = (uintptr_t)data,
+ };
+
+ ret = vc4_ioctl(screen->fd, DRM_IOCTL_VC4_CREATE_SHADER_BO,
+ &create);
+ bo->handle = create.handle;
+
if (ret != 0) {
fprintf(stderr, "create shader ioctl failure\n");
abort();
@@ -492,7 +455,7 @@ vc4_bo_flink(struct vc4_bo *bo, uint32_t *name)
struct drm_gem_flink flink = {
.handle = bo->handle,
};
- int ret = drmIoctl(bo->screen->fd, DRM_IOCTL_GEM_FLINK, &flink);
+ int ret = vc4_ioctl(bo->screen->fd, DRM_IOCTL_GEM_FLINK, &flink);
if (ret) {
fprintf(stderr, "Failed to flink bo %d: %s\n",
bo->handle, strerror(errno));
@@ -508,14 +471,11 @@ vc4_bo_flink(struct vc4_bo *bo, uint32_t *name)
static int vc4_wait_seqno_ioctl(int fd, uint64_t seqno, uint64_t timeout_ns)
{
- if (using_vc4_simulator)
- return 0;
-
struct drm_vc4_wait_seqno wait = {
.seqno = seqno,
.timeout_ns = timeout_ns,
};
- int ret = drmIoctl(fd, DRM_IOCTL_VC4_WAIT_SEQNO, &wait);
+ int ret = vc4_ioctl(fd, DRM_IOCTL_VC4_WAIT_SEQNO, &wait);
if (ret == -1)
return -errno;
else
@@ -553,14 +513,11 @@ vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns,
static int vc4_wait_bo_ioctl(int fd, uint32_t handle, uint64_t timeout_ns)
{
- if (using_vc4_simulator)
- return 0;
-
struct drm_vc4_wait_bo wait = {
.handle = handle,
.timeout_ns = timeout_ns,
};
- int ret = drmIoctl(fd, DRM_IOCTL_VC4_WAIT_BO, &wait);
+ int ret = vc4_ioctl(fd, DRM_IOCTL_VC4_WAIT_BO, &wait);
if (ret == -1)
return -errno;
else
@@ -602,19 +559,11 @@ vc4_bo_map_unsynchronized(struct vc4_bo *bo)
if (bo->map)
return bo->map;
- if (!using_vc4_simulator) {
- struct drm_vc4_mmap_bo map;
- memset(&map, 0, sizeof(map));
- map.handle = bo->handle;
- ret = drmIoctl(bo->screen->fd, DRM_IOCTL_VC4_MMAP_BO, &map);
- offset = map.offset;
- } else {
- struct drm_mode_map_dumb map;
- memset(&map, 0, sizeof(map));
- map.handle = bo->handle;
- ret = drmIoctl(bo->screen->fd, DRM_IOCTL_MODE_MAP_DUMB, &map);
- offset = map.offset;
- }
+ struct drm_vc4_mmap_bo map;
+ memset(&map, 0, sizeof(map));
+ map.handle = bo->handle;
+ ret = vc4_ioctl(bo->screen->fd, DRM_IOCTL_VC4_MMAP_BO, &map);
+ offset = map.offset;
if (ret != 0) {
fprintf(stderr, "map ioctl failure\n");
abort();
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h b/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h
index 71a442648..838314f43 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h
@@ -39,11 +39,6 @@ struct vc4_bo {
uint32_t handle;
uint32_t size;
-#ifdef USE_VC4_SIMULATOR
- void *simulator_winsys_map;
- uint32_t simulator_winsys_stride;
-#endif
-
/** Entry in the linked list of buffers freed, by age. */
struct list_head time_list;
/** Entry in the per-page-count linked list of buffers freed (by age). */
@@ -98,7 +93,7 @@ vc4_bo_unreference(struct vc4_bo **bo)
vc4_bo_last_unreference(*bo);
} else {
screen = (*bo)->screen;
- pipe_mutex_lock(screen->bo_handles_mutex);
+ mtx_lock(&screen->bo_handles_mutex);
if (pipe_reference(&(*bo)->reference, NULL)) {
util_hash_table_remove(screen->bo_handles,
@@ -106,7 +101,7 @@ vc4_bo_unreference(struct vc4_bo **bo)
vc4_bo_last_unreference(*bo);
}
- pipe_mutex_unlock(screen->bo_handles_mutex);
+ mtx_unlock(&screen->bo_handles_mutex);
}
*bo = NULL;
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_cl.c b/lib/mesa/src/gallium/drivers/vc4/vc4_cl.c
index afb9987f4..35578370e 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_cl.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_cl.c
@@ -28,7 +28,7 @@
void
vc4_init_cl(void *mem_ctx, struct vc4_cl *cl)
{
- cl->base = ralloc_size(mem_ctx, 1);
+ cl->base = rzalloc_size(mem_ctx, 1); /* TODO: don't use rzalloc */
cl->next = cl->base;
cl->size = 0;
}
@@ -76,5 +76,7 @@ vc4_gem_hindex(struct vc4_job *job, struct vc4_bo *bo)
cl_ptr(&out, vc4_bo_reference(bo));
cl_end(&job->bo_pointers, out);
+ job->bo_space += bo->size;
+
return hindex;
}
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_context.c b/lib/mesa/src/gallium/drivers/vc4/vc4_context.c
index 974df8a1d..401c160fc 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_context.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_context.c
@@ -144,7 +144,12 @@ vc4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
vc4->fd = screen->fd;
slab_create_child(&vc4->transfer_pool, &screen->transfer_pool);
- vc4->blitter = util_blitter_create(pctx);
+
+ vc4->uploader = u_upload_create_default(&vc4->base);
+ vc4->base.stream_uploader = vc4->uploader;
+ vc4->base.const_uploader = vc4->uploader;
+
+ vc4->blitter = util_blitter_create(pctx);
if (!vc4->blitter)
goto fail;
@@ -153,10 +158,6 @@ vc4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
if (!vc4->primconvert)
goto fail;
- vc4->uploader = u_upload_create(pctx, 16 * 1024,
- PIPE_BIND_INDEX_BUFFER,
- PIPE_USAGE_STREAM);
-
vc4_debug |= saved_shaderdb_flag;
vc4->sample_mask = (1 << VC4_MAX_SAMPLES) - 1;
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_context.h b/lib/mesa/src/gallium/drivers/vc4/vc4_context.h
index c164eba80..6bd2424ec 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_context.h
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_context.h
@@ -30,6 +30,7 @@
#include "pipe/p_context.h"
#include "pipe/p_state.h"
#include "util/slab.h"
+#include "xf86drm.h"
#define __user
#include "vc4_drm.h"
@@ -38,6 +39,13 @@
#include "vc4_cl.h"
#include "vc4_qir.h"
+#ifndef DRM_VC4_PARAM_SUPPORTS_ETC1
+#define DRM_VC4_PARAM_SUPPORTS_ETC1 4
+#endif
+#ifndef DRM_VC4_PARAM_SUPPORTS_THREADED_FS
+#define DRM_VC4_PARAM_SUPPORTS_THREADED_FS 5
+#endif
+
#ifdef USE_VC4_SIMULATOR
#define using_vc4_simulator true
#else
@@ -162,6 +170,8 @@ struct vc4_compiled_shader {
*/
bool failed;
+ bool fs_threaded;
+
uint8_t num_inputs;
/* Byte offsets for the start of the vertex attributes 0-7, and the
@@ -218,6 +228,13 @@ struct vc4_job {
struct vc4_cl bo_handles;
struct vc4_cl bo_pointers;
uint32_t shader_rec_count;
+ /**
+ * Amount of memory used by the BOs in bo_pointers.
+ *
+ * Used for checking when we should flush the job early so we don't
+ * OOM.
+ */
+ uint32_t bo_space;
/** @{ Surfaces to submit rendering for. */
struct pipe_surface *color_read;
@@ -317,11 +334,12 @@ struct vc4_context {
uint64_t next_compiled_program_id;
struct ra_regs *regs;
- unsigned int reg_class_any;
- unsigned int reg_class_a_or_b_or_acc;
+ unsigned int reg_class_any[2];
+ unsigned int reg_class_a_or_b[2];
+ unsigned int reg_class_a_or_b_or_acc[2];
unsigned int reg_class_r0_r3;
- unsigned int reg_class_r4_or_a;
- unsigned int reg_class_a;
+ unsigned int reg_class_r4_or_a[2];
+ unsigned int reg_class_a[2];
uint8_t prim_mode;
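Each register class becomes a two-entry array, matching the threaded-FS work elsewhere in this import: a threaded fragment shader may only use the lower half of each register file, so it needs a second, smaller class set. A hedged sketch of how the index would be applied at allocation time (the real logic is in vc4_register_allocate.c, whose hunks are not shown here):

/* Hypothetical use: index 0 = single-threaded (full register file),
 * index 1 = threaded (lower half only). */
ra_set_node_class(g, temp_node, vc4->reg_class_any[c->fs_threaded]);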
@@ -433,6 +451,18 @@ void vc4_simulator_destroy(struct vc4_screen *screen);
int vc4_simulator_flush(struct vc4_context *vc4,
struct drm_vc4_submit_cl *args,
struct vc4_job *job);
+int vc4_simulator_ioctl(int fd, unsigned long request, void *arg);
+void vc4_simulator_open_from_handle(int fd, uint32_t winsys_stride,
+ int handle, uint32_t size);
+
+static inline int
+vc4_ioctl(int fd, unsigned long request, void *arg)
+{
+ if (using_vc4_simulator)
+ return vc4_simulator_ioctl(fd, request, arg);
+ else
+ return drmIoctl(fd, request, arg);
+}
void vc4_set_shader_uniform_dirty_flags(struct vc4_compiled_shader *shader);
void vc4_write_uniforms(struct vc4_context *vc4,
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_draw.c b/lib/mesa/src/gallium/drivers/vc4/vc4_draw.c
index c5afc0cda..ebd080298 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_draw.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_draw.c
@@ -155,7 +155,8 @@ vc4_emit_gl_shader_state(struct vc4_context *vc4,
/* VC4_DIRTY_PRIM_MODE | VC4_DIRTY_RASTERIZER */
cl_u16(&shader_rec,
VC4_SHADER_FLAG_ENABLE_CLIPPING |
- VC4_SHADER_FLAG_FS_SINGLE_THREAD |
+ (vc4->prog.fs->fs_threaded ?
+ 0 : VC4_SHADER_FLAG_FS_SINGLE_THREAD) |
((info->mode == PIPE_PRIM_POINTS &&
vc4->rasterizer->base.point_size_per_vertex) ?
VC4_SHADER_FLAG_VS_POINT_SIZE : 0));
@@ -465,6 +466,13 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
job->resolve |= PIPE_CLEAR_COLOR0;
+ /* If we've used half of the presumably 256MB CMA area, flush the job
+ * so that we don't accumulate a job that will end up not being
+ * executable.
+ */
+ if (job->bo_space > 128 * 1024 * 1024)
+ vc4_flush(pctx);
+
if (vc4_debug & VC4_DEBUG_ALWAYS_FLUSH)
vc4_flush(pctx);
}
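job->bo_space is the counter grown in vc4_gem_hindex() (vc4_cl.c hunk above) each time a BO is first attached to the job, so this threshold caps a job's total BO footprint at half of the presumed 256MB CMA pool. A worked example with hypothetical buffer sizes:

/* Draws that each pull in a fresh 1 MB BO grow bo_space by 1 MB per
 * draw; after draw 129, bo_space exceeds 128 * 1024 * 1024 bytes and
 * the job is flushed early rather than submitted as an unexecutable
 * whole.
 */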
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_emit.c b/lib/mesa/src/gallium/drivers/vc4/vc4_emit.c
index 9258ceebe..b48d89a06 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_emit.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_emit.c
@@ -76,6 +76,7 @@ vc4_emit_state(struct pipe_context *pctx)
VC4_DIRTY_ZSA |
VC4_DIRTY_COMPILED_FS)) {
uint8_t ez_enable_mask_out = ~0;
+ uint8_t rasosm_mask_out = ~0;
/* HW-2905: If the RCL ends up doing a full-res load when
* multisampling, then early Z tracking may end up with values
@@ -89,10 +90,20 @@ vc4_emit_state(struct pipe_context *pctx)
if (job->msaa || vc4->prog.fs->disable_early_z)
ez_enable_mask_out &= ~VC4_CONFIG_BITS_EARLY_Z;
+ /* Don't set the rasterizer to oversample if we're doing our
+ * binning and load/stores in single-sample mode. This is for
+ * the samples == 1 case, where vc4 doesn't do any
+ * multisampling behavior.
+ */
+ if (!job->msaa) {
+ rasosm_mask_out &=
+ ~VC4_CONFIG_BITS_RASTERIZER_OVERSAMPLE_4X;
+ }
+
cl_u8(&bcl, VC4_PACKET_CONFIGURATION_BITS);
cl_u8(&bcl,
- vc4->rasterizer->config_bits[0] |
- vc4->zsa->config_bits[0]);
+ (vc4->rasterizer->config_bits[0] |
+ vc4->zsa->config_bits[0]) & rasosm_mask_out);
cl_u8(&bcl,
vc4->rasterizer->config_bits[1] |
vc4->zsa->config_bits[1]);
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_formats.c b/lib/mesa/src/gallium/drivers/vc4/vc4_formats.c
index dd700cdec..42cdad115 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_formats.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_formats.c
@@ -83,6 +83,8 @@ static const struct vc4_format vc4_format_table[] = {
FORMAT(B5G6R5_UNORM, RGB565, RGB565, SWIZ(X, Y, Z, 1)),
+ FORMAT(ETC1_RGB8, NO, ETC1, SWIZ(X, Y, Z, 1)),
+
/* Depth sampling will be handled by doing nearest filtering and not
* unpacking the RGBA value.
*/
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_blend.c b/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
index b7e31b80c..2ed89ead5 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
@@ -494,7 +494,7 @@ vc4_nir_emit_alpha_test_discard(struct vc4_compile *c, nir_builder *b,
discard->num_components = 1;
discard->src[0] = nir_src_for_ssa(nir_inot(b, condition));
nir_builder_instr_insert(b, &discard->instr);
- c->s->info.fs.uses_discard = true;
+ c->s->info->fs.uses_discard = true;
}
static nir_ssa_def *
@@ -630,25 +630,14 @@ vc4_nir_lower_blend_instr(struct vc4_compile *c, nir_builder *b,
{
nir_ssa_def *frag_color = intr->src[0].ssa;
- if (c->fs_key->sample_coverage) {
- nir_intrinsic_instr *load =
- nir_intrinsic_instr_create(b->shader,
- nir_intrinsic_load_sample_mask_in);
- load->num_components = 1;
- nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
- nir_builder_instr_insert(b, &load->instr);
-
- nir_ssa_def *bitmask = &load->dest.ssa;
-
- vc4_nir_store_sample_mask(c, b, bitmask);
- } else if (c->fs_key->sample_alpha_to_coverage) {
+ if (c->fs_key->sample_alpha_to_coverage) {
nir_ssa_def *a = nir_channel(b, frag_color, 3);
/* XXX: We should do a nice dither based on the fragment
* coordinate, instead.
*/
nir_ssa_def *num_samples = nir_imm_float(b, VC4_MAX_SAMPLES);
- nir_ssa_def *num_bits = nir_f2i(b, nir_fmul(b, a, num_samples));
+ nir_ssa_def *num_bits = nir_f2i32(b, nir_fmul(b, a, num_samples));
nir_ssa_def *bitmask = nir_isub(b,
nir_ishl(b,
nir_imm_int(b, 1),
@@ -730,4 +719,16 @@ vc4_nir_lower_blend(nir_shader *s, struct vc4_compile *c)
nir_metadata_dominance);
}
}
+
+ /* If we didn't do alpha-to-coverage on the output color, we still
+ * need to pass glSampleMask() through.
+ */
+ if (c->fs_key->sample_coverage && !c->fs_key->sample_alpha_to_coverage) {
+ nir_function_impl *impl = nir_shader_get_entrypoint(s);
+ nir_builder b;
+ nir_builder_init(&b, impl);
+ b.cursor = nir_after_block(nir_impl_last_block(impl));
+
+ vc4_nir_store_sample_mask(c, &b, nir_load_sample_mask_in(&b));
+ }
}
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_io.c
index 4a795f8da..b7969a562 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_io.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_io.c
@@ -106,11 +106,11 @@ vc4_nir_get_vattr_channel_vpm(struct vc4_compile *c,
} else if (chan->size == 32 && chan->type == UTIL_FORMAT_TYPE_SIGNED) {
if (chan->normalized) {
return nir_fmul(b,
- nir_i2f(b, vpm_reads[swiz]),
+ nir_i2f32(b, vpm_reads[swiz]),
nir_imm_float(b,
1.0 / 0x7fffffff));
} else {
- return nir_i2f(b, vpm_reads[swiz]);
+ return nir_i2f32(b, vpm_reads[swiz]);
}
} else if (chan->size == 8 &&
(chan->type == UTIL_FORMAT_TYPE_UNSIGNED ||
@@ -125,16 +125,16 @@ vc4_nir_get_vattr_channel_vpm(struct vc4_compile *c,
nir_imm_float(b, 1.0));
} else {
return nir_fadd(b,
- nir_i2f(b,
- vc4_nir_unpack_8i(b, temp,
- swiz)),
+ nir_i2f32(b,
+ vc4_nir_unpack_8i(b, temp,
+ swiz)),
nir_imm_float(b, -128.0));
}
} else {
if (chan->normalized) {
return vc4_nir_unpack_8f(b, vpm, swiz);
} else {
- return nir_i2f(b, vc4_nir_unpack_8i(b, vpm, swiz));
+ return nir_i2f32(b, vc4_nir_unpack_8i(b, vpm, swiz));
}
}
} else if (chan->size == 16 &&
@@ -146,7 +146,7 @@ vc4_nir_get_vattr_channel_vpm(struct vc4_compile *c,
* UNPACK_16_I for all of these.
*/
if (chan->type == UTIL_FORMAT_TYPE_SIGNED) {
- temp = nir_i2f(b, vc4_nir_unpack_16i(b, vpm, swiz & 1));
+ temp = nir_i2f32(b, vc4_nir_unpack_16i(b, vpm, swiz & 1));
if (chan->normalized) {
return nir_fmul(b, temp,
nir_imm_float(b, 1/32768.0f));
@@ -154,7 +154,7 @@ vc4_nir_get_vattr_channel_vpm(struct vc4_compile *c,
return temp;
}
} else {
- temp = nir_i2f(b, vc4_nir_unpack_16u(b, vpm, swiz & 1));
+ temp = nir_i2f32(b, vc4_nir_unpack_16u(b, vpm, swiz & 1));
if (chan->normalized) {
return nir_fmul(b, temp,
nir_imm_float(b, 1 / 65535.0));
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_algebraic.c b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_algebraic.c
index 01ad05d27..5e7d26923 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_algebraic.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_algebraic.c
@@ -94,14 +94,17 @@ static void
replace_with_mov(struct vc4_compile *c, struct qinst *inst, struct qreg arg)
{
dump_from(c, inst);
+
+ inst->src[0] = arg;
+ if (qir_has_implicit_tex_uniform(inst))
+ inst->src[1] = inst->src[qir_get_tex_uniform_src(inst)];
+
if (qir_is_mul(inst))
inst->op = QOP_MMOV;
else if (qir_is_float_input(inst))
inst->op = QOP_FMOV;
else
inst->op = QOP_MOV;
- inst->src[0] = arg;
- inst->src[1] = c->undef;
dump_to(c, inst);
}
@@ -172,8 +175,12 @@ qir_opt_algebraic(struct vc4_compile *c)
break;
case QOP_ADD:
- if (replace_x_0_with_x(c, inst, 0) ||
- replace_x_0_with_x(c, inst, 1)) {
+ /* Kernel validation requires that we use an actual
+ * add instruction.
+ */
+ if (inst->dst.file != QFILE_TEX_S_DIRECT &&
+ (replace_x_0_with_x(c, inst, 0) ||
+ replace_x_0_with_x(c, inst, 1))) {
progress = true;
break;
}
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_coalesce_ff_writes.c b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_coalesce_ff_writes.c
new file mode 100644
index 000000000..e4f8e57fc
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_coalesce_ff_writes.c
@@ -0,0 +1,111 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file vc4_opt_coalesce_ff_writes.c
+ *
+ * This modifies instructions that generate the value consumed by a VPM or TMU
+ * coordinate write to write directly into the VPM or TMU.
+ */
+
+#include "vc4_qir.h"
+
+bool
+qir_opt_coalesce_ff_writes(struct vc4_compile *c)
+{
+ /* For now, only do this pass when we don't have control flow. */
+ struct qblock *block = qir_entry_block(c);
+ if (block != qir_exit_block(c))
+ return false;
+
+ bool progress = false;
+ uint32_t use_count[c->num_temps];
+ memset(&use_count, 0, sizeof(use_count));
+
+ qir_for_each_inst_inorder(inst, c) {
+ for (int i = 0; i < qir_get_nsrc(inst); i++) {
+ if (inst->src[i].file == QFILE_TEMP) {
+ uint32_t temp = inst->src[i].index;
+ use_count[temp]++;
+ }
+ }
+ }
+
+ qir_for_each_inst_inorder(mov_inst, c) {
+ if (!qir_is_raw_mov(mov_inst) || mov_inst->sf)
+ continue;
+ if (mov_inst->src[0].file != QFILE_TEMP)
+ continue;
+
+ if (!(mov_inst->dst.file == QFILE_VPM ||
+ mov_inst->dst.file == QFILE_TLB_COLOR_WRITE ||
+ mov_inst->dst.file == QFILE_TLB_COLOR_WRITE_MS ||
+ qir_is_tex(mov_inst)))
+ continue;
+
+ uint32_t temp = mov_inst->src[0].index;
+ if (use_count[temp] != 1)
+ continue;
+
+ struct qinst *inst = c->defs[temp];
+ if (!inst)
+ continue;
+
+ /* Don't bother trying to fold in an ALU op using a uniform to
+ * a texture op, as we'll just have to lower the uniform back
+ * out.
+ */
+ if (qir_is_tex(mov_inst) && qir_has_uniform_read(inst))
+ continue;
+
+ if (qir_depends_on_flags(inst) || inst->sf)
+ continue;
+
+ if (qir_has_side_effects(c, inst) ||
+ qir_has_side_effect_reads(c, inst) ||
+ inst->op == QOP_TLB_COLOR_READ ||
+ inst->op == QOP_VARY_ADD_C) {
+ continue;
+ }
+
+ /* Move the generating instruction into the position of the FF
+ * write.
+ */
+ c->defs[inst->dst.index] = NULL;
+ inst->dst.file = mov_inst->dst.file;
+ inst->dst.index = mov_inst->dst.index;
+ if (qir_has_implicit_tex_uniform(mov_inst)) {
+ inst->src[qir_get_tex_uniform_src(inst)] =
+ mov_inst->src[qir_get_tex_uniform_src(mov_inst)];
+ }
+
+ list_del(&inst->link);
+ list_addtail(&inst->link, &mov_inst->link);
+
+ qir_remove_instruction(c, mov_inst);
+
+ progress = true;
+ }
+
+ return progress;
+}
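In QIR terms, the pass folds a single-use temp that only feeds a fixed-function write back into its generating instruction. A hypothetical before/after (temp names illustrative, not real compiler output):

/* before:                        after:
 *   t3 = fmul t1, t2              tlb_color_write = fmul t1, t2
 *   tlb_color_write = mov t3
 *
 * For a texture write, the mov's implicit uniform source is carried
 * over onto the folded instruction (the qir_get_tex_uniform_src()
 * handling above).
 */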
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_constant_folding.c b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_constant_folding.c
index 7ff916155..de642d465 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_constant_folding.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_constant_folding.c
@@ -58,7 +58,7 @@ dump_to(struct vc4_compile *c, struct qinst *inst)
static bool
constant_fold(struct vc4_compile *c, struct qinst *inst)
{
- int nsrc = qir_get_op_nsrc(inst->op);
+ int nsrc = qir_get_nsrc(inst);
uint32_t ui[nsrc];
for (int i = 0; i < nsrc; i++) {
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
index d20ee5e22..9a6320a9a 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
@@ -67,7 +67,7 @@ try_copy_prop(struct vc4_compile *c, struct qinst *inst, struct qinst **movs)
bool debug = false;
bool progress = false;
- for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+ for (int i = 0; i < qir_get_nsrc(inst); i++) {
if (inst->src[i].file != QFILE_TEMP)
continue;
@@ -113,7 +113,7 @@ try_copy_prop(struct vc4_compile *c, struct qinst *inst, struct qinst **movs)
* this instruction doesn't already use it.
*/
bool already_has_unpack = false;
- for (int j = 0; j < qir_get_op_nsrc(inst->op); j++) {
+ for (int j = 0; j < qir_get_nsrc(inst); j++) {
if (inst->src[j].pack)
already_has_unpack = true;
}
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_dead_code.c b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_dead_code.c
index 1838c394f..f04d0ff97 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_dead_code.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_dead_code.c
@@ -54,7 +54,7 @@ dce(struct vc4_compile *c, struct qinst *inst)
static bool
has_nonremovable_reads(struct vc4_compile *c, struct qinst *inst)
{
- for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+ for (int i = 0; i < qir_get_nsrc(inst); i++) {
if (inst->src[i].file == QFILE_VPM) {
uint32_t attr = inst->src[i].index / 4;
uint32_t offset = (inst->src[i].index % 4) * 4;
@@ -88,7 +88,7 @@ qir_opt_dead_code(struct vc4_compile *c)
bool *used = calloc(c->num_temps, sizeof(bool));
qir_for_each_inst_inorder(inst, c) {
- for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+ for (int i = 0; i < qir_get_nsrc(inst); i++) {
if (inst->src[i].file == QFILE_TEMP)
used[inst->src[i].index] = true;
}
@@ -129,7 +129,7 @@ qir_opt_dead_code(struct vc4_compile *c)
continue;
}
- for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+ for (int i = 0; i < qir_get_nsrc(inst); i++) {
if (inst->src[i].file != QFILE_VPM)
continue;
uint32_t attr = inst->src[i].index / 4;
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_peephole_sf.c b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_peephole_sf.c
index f4856673b..577290b1f 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_peephole_sf.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_peephole_sf.c
@@ -62,7 +62,7 @@ inst_srcs_updated(struct qinst *inst, struct qinst *writer)
*/
switch (writer->dst.file) {
case QFILE_TEMP:
- for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+ for (int i = 0; i < qir_get_nsrc(inst); i++) {
if (inst->src[i].file == QFILE_TEMP &&
inst->src[i].index == writer->dst.index) {
return true;
@@ -95,7 +95,7 @@ inst_result_equals(struct qinst *a, struct qinst *b)
return false;
}
- for (int i = 0; i < qir_get_op_nsrc(a->op); i++) {
+ for (int i = 0; i < qir_get_nsrc(a); i++) {
if (!qir_reg_equals(a->src[i], b->src[i]) ||
src_file_varies_on_reread(a->src[i]) ||
src_file_varies_on_reread(b->src[i])) {
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_small_immediates.c b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_small_immediates.c
index e97cb63ae..07eca71f2 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_small_immediates.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_small_immediates.c
@@ -45,7 +45,7 @@ qir_opt_small_immediates(struct vc4_compile *c)
* elsewhere).
*/
bool uses_small_imm = false;
- for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+ for (int i = 0; i < qir_get_nsrc(inst); i++) {
if (inst->src[i].file == QFILE_SMALL_IMM)
uses_small_imm = true;
}
@@ -63,7 +63,7 @@ qir_opt_small_immediates(struct vc4_compile *c)
if (inst->op == QOP_MIN_NOIMM)
continue;
- for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+ for (int i = 0; i < qir_get_nsrc(inst); i++) {
struct qreg src = qir_follow_movs(c, inst->src[i]);
if (src.file != QFILE_UNIF ||
@@ -73,11 +73,8 @@ qir_opt_small_immediates(struct vc4_compile *c)
continue;
}
- if (i == 1 &&
- (inst->op == QOP_TEX_S ||
- inst->op == QOP_TEX_T ||
- inst->op == QOP_TEX_R ||
- inst->op == QOP_TEX_B)) {
+ if (qir_is_tex(inst) &&
+ i == qir_get_tex_uniform_src(inst)) {
/* No turning the implicit uniform read into
* an immediate.
*/
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_vpm.c b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_vpm.c
index 83ba11b81..6f196e7d1 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_vpm.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_vpm.c
@@ -24,10 +24,8 @@
/**
* @file vc4_opt_vpm.c
*
- * This modifies instructions that:
- * 1. exclusively consume a value read from the VPM to directly read the VPM if
- * other operands allow it.
- * 2. generate the value consumed by a VPM write to write directly into the VPM.
+ * This modifies instructions that exclusively consume a value read from the
+ * VPM to directly read the VPM if other operands allow it.
*/
#include "vc4_qir.h"
@@ -44,21 +42,11 @@ qir_opt_vpm(struct vc4_compile *c)
return false;
bool progress = false;
- struct qinst *vpm_writes[64] = { 0 };
uint32_t use_count[c->num_temps];
- uint32_t vpm_write_count = 0;
memset(&use_count, 0, sizeof(use_count));
qir_for_each_inst_inorder(inst, c) {
- switch (inst->dst.file) {
- case QFILE_VPM:
- vpm_writes[vpm_write_count++] = inst;
- break;
- default:
- break;
- }
-
- for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+ for (int i = 0; i < qir_get_nsrc(inst); i++) {
if (inst->src[i].file == QFILE_TEMP) {
uint32_t temp = inst->src[i].index;
use_count[temp]++;
@@ -81,7 +69,7 @@ qir_opt_vpm(struct vc4_compile *c)
qir_is_tex(inst))
continue;
- for (int j = 0; j < qir_get_op_nsrc(inst->op); j++) {
+ for (int j = 0; j < qir_get_nsrc(inst); j++) {
if (inst->src[j].file != QFILE_TEMP ||
inst->src[j].pack)
continue;
@@ -106,7 +94,7 @@ qir_opt_vpm(struct vc4_compile *c)
}
uint32_t temps = 0;
- for (int k = 0; k < qir_get_op_nsrc(inst->op); k++) {
+ for (int k = 0; k < qir_get_nsrc(inst); k++) {
if (inst->src[k].file == QFILE_TEMP)
temps++;
}
@@ -127,42 +115,5 @@ qir_opt_vpm(struct vc4_compile *c)
}
}
- for (int i = 0; i < vpm_write_count; i++) {
- if (!qir_is_raw_mov(vpm_writes[i]) ||
- vpm_writes[i]->src[0].file != QFILE_TEMP) {
- continue;
- }
-
- uint32_t temp = vpm_writes[i]->src[0].index;
- if (use_count[temp] != 1)
- continue;
-
- struct qinst *inst = c->defs[temp];
- if (!inst)
- continue;
-
- if (qir_depends_on_flags(inst) || inst->sf)
- continue;
-
- if (qir_has_side_effects(c, inst) ||
- qir_has_side_effect_reads(c, inst)) {
- continue;
- }
-
- /* Move the generating instruction to the end of the program
- * to maintain the order of the VPM writes.
- */
- assert(!vpm_writes[i]->sf);
- list_del(&inst->link);
- list_addtail(&inst->link, &vpm_writes[i]->link);
- qir_remove_instruction(c, vpm_writes[i]);
-
- c->defs[inst->dst.index] = NULL;
- inst->dst.file = QFILE_VPM;
- inst->dst.index = 0;
-
- progress = true;
- }
-
return progress;
}
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_program.c b/lib/mesa/src/gallium/drivers/vc4/vc4_program.c
index 00e16e3db..59368734d 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_program.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_program.c
@@ -24,7 +24,7 @@
#include <inttypes.h>
#include "util/u_format.h"
-#include "util/u_hash.h"
+#include "util/crc32.h"
#include "util/u_math.h"
#include "util/u_memory.h"
#include "util/ralloc.h"
@@ -38,9 +38,6 @@
#include "vc4_qpu.h"
#include "vc4_qir.h"
#include "mesa/state_tracker/st_glsl_types.h"
-#ifdef USE_VC4_SIMULATOR
-#include "simpenrose/simpenrose.h"
-#endif
static struct qreg
ntq_get_src(struct vc4_compile *c, nir_src src, int i);
@@ -68,6 +65,23 @@ resize_qreg_array(struct vc4_compile *c,
(*regs)[i] = c->undef;
}
+static void
+ntq_emit_thrsw(struct vc4_compile *c)
+{
+ if (!c->fs_threaded)
+ return;
+
+ /* Always thread switch after each texture operation for now.
+ *
+ * We could do better by batching a bunch of texture fetches up and
+ * then doing one thread switch and collecting all their results
+ * afterward.
+ */
+ qir_emit_nondef(c, qir_inst(QOP_THRSW, c->undef,
+ c->undef, c->undef));
+ c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL);
+}
+
static struct qreg
indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
{
@@ -106,8 +120,14 @@ indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
qir_uniform_ui(c, (range->dst_offset +
range->size - 4)));
- qir_TEX_DIRECT(c, indirect_offset, qir_uniform(c, QUNIFORM_UBO_ADDR, 0));
+ qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0),
+ indirect_offset,
+ qir_uniform(c, QUNIFORM_UBO_ADDR, 0));
+
c->num_texture_samples++;
+
+ ntq_emit_thrsw(c);
+
return qir_TEX_RESULT(c);
}
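ntq_emit_thrsw(), added above, drops a QOP_THRSW after every TMU request so the hardware can run the paired fragment thread while the fetch is outstanding. The resulting pattern, sketched in illustrative QIR:

/* tex_s_direct = add addr, ubo_addr  ; coordinate write starts fetch
 * thrsw                              ; yield to the other thread
 * r4 = tex_result                    ; fetch has landed by the time
 *                                    ; this thread runs again
 */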
@@ -140,10 +160,33 @@ ntq_init_ssa_def(struct vc4_compile *c, nir_ssa_def *def)
return qregs;
}
+/**
+ * This function is responsible for getting QIR results into the associated
+ * storage for a NIR instruction.
+ *
+ * If it's a NIR SSA def, then we just set the associated hash table entry to
+ * the new result.
+ *
+ * If it's a NIR reg, then we need to update the existing qreg assigned to the
+ * NIR destination with the incoming value. To do that without introducing
+ * new MOVs, we require that the incoming qreg either be a uniform, or be
+ * SSA-defined by the previous QIR instruction in the block and rewritable by
+ * this function. That lets us sneak ahead and insert the SF flag beforehand
+ * (knowing that the previous instruction doesn't depend on flags) and rewrite
+ * its destination to be the NIR reg's destination.
+ */
static void
ntq_store_dest(struct vc4_compile *c, nir_dest *dest, int chan,
struct qreg result)
{
+ struct qinst *last_inst = NULL;
+ if (!list_empty(&c->cur_block->instructions))
+ last_inst = (struct qinst *)c->cur_block->instructions.prev;
+
+ assert(result.file == QFILE_UNIF ||
+ (result.file == QFILE_TEMP &&
+ last_inst && last_inst == c->defs[result.index]));
+
if (dest->is_ssa) {
assert(chan < dest->ssa.num_components);
@@ -165,17 +208,34 @@ ntq_store_dest(struct vc4_compile *c, nir_dest *dest, int chan,
_mesa_hash_table_search(c->def_ht, reg);
struct qreg *qregs = entry->data;
- /* Conditionally move the result to the destination if the
- * channel is active.
+ /* Insert a MOV if the source wasn't an SSA def in the
+ * previous instruction.
+ */
+ if (result.file == QFILE_UNIF) {
+ result = qir_MOV(c, result);
+ last_inst = c->defs[result.index];
+ }
+
+ /* We know they're both temps, so just rewrite index. */
+ c->defs[last_inst->dst.index] = NULL;
+ last_inst->dst.index = qregs[chan].index;
+
+ /* If we're in control flow, then make this update of the reg
+ * conditional on the execution mask.
*/
if (c->execute.file != QFILE_NULL) {
- struct qinst *mov;
+ last_inst->dst.index = qregs[chan].index;
+ /* Set the flags to the current exec mask. To insert
+ * the SF, we temporarily remove our SSA instruction.
+ */
+ list_del(&last_inst->link);
qir_SF(c, c->execute);
- mov = qir_MOV_cond(c, QPU_COND_ZS, qregs[chan], result);
- mov->cond_is_exec_mask = true;
- } else {
- qir_MOV_dest(c, qregs[chan], result);
+ list_addtail(&last_inst->link,
+ &c->cur_block->instructions);
+
+ last_inst->cond = QPU_COND_ZS;
+ last_inst->cond_is_exec_mask = true;
}
}
}
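The rewrite path above, traced on a hypothetical instruction pair (temp names illustrative):

/* incoming: last_inst is the SSA def of result,
 *     t9 = fadd t1, t2
 * the NIR reg's channel lives in qregs[chan] = t4, so the dst is
 * rewritten in place:
 *     t4 = fadd t1, t2
 * under control flow, an SF on the exec mask is inserted first and
 * the write is made conditional:
 *     sf exec_mask
 *     t4 = fadd.zs t1, t2        (cond_is_exec_mask)
 */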
@@ -324,24 +384,24 @@ ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr)
addr = qir_MAX(c, addr, qir_uniform_ui(c, 0));
addr = qir_MIN_NOIMM(c, addr, qir_uniform_ui(c, size - 4));
- qir_TEX_DIRECT(c, addr, qir_uniform(c, QUNIFORM_TEXTURE_MSAA_ADDR, unit));
+ qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0),
+ addr, qir_uniform(c, QUNIFORM_TEXTURE_MSAA_ADDR, unit));
+
+ ntq_emit_thrsw(c);
struct qreg tex = qir_TEX_RESULT(c);
c->num_texture_samples++;
- struct qreg dest[4];
enum pipe_format format = c->key->tex[unit].format;
if (util_format_is_depth_or_stencil(format)) {
struct qreg scaled = ntq_scale_depth_texture(c, tex);
for (int i = 0; i < 4; i++)
- dest[i] = scaled;
+ ntq_store_dest(c, &instr->dest, i, qir_MOV(c, scaled));
} else {
for (int i = 0; i < 4; i++)
- dest[i] = qir_UNPACK_8_F(c, tex, i);
+ ntq_store_dest(c, &instr->dest, i,
+ qir_UNPACK_8_F(c, tex, i));
}
-
- for (int i = 0; i < 4; i++)
- ntq_store_dest(c, &instr->dest, i, dest[i]);
}
static void
@@ -375,7 +435,7 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
lod = ntq_get_src(c, instr->src[i].src, 0);
is_txl = true;
break;
- case nir_tex_src_comparitor:
+ case nir_tex_src_comparator:
compare = ntq_get_src(c, instr->src[i].src, 0);
break;
default:
@@ -383,6 +443,16 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
}
}
+ if (c->stage != QSTAGE_FRAG && !is_txl) {
+ /* From the GLSL 1.20 spec:
+ *
+ * "If it is mip-mapped and running on the vertex shader,
+ * then the base texture is used."
+ */
+ is_txl = true;
+ lod = qir_uniform_ui(c, 0);
+ }
+
if (c->key->tex[unit].force_first_level) {
lod = qir_uniform(c, QUNIFORM_TEXTURE_FIRST_LEVEL, unit);
is_txl = true;
@@ -413,14 +483,20 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
unit | (is_txl << 16));
}
+ struct qinst *tmu;
if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
- qir_TEX_R(c, r, texture_u[next_texture_u++]);
+ tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0), r);
+ tmu->src[qir_get_tex_uniform_src(tmu)] =
+ texture_u[next_texture_u++];
} else if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||
c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP ||
c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||
c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP) {
- qir_TEX_R(c, qir_uniform(c, QUNIFORM_TEXTURE_BORDER_COLOR, unit),
- texture_u[next_texture_u++]);
+ tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0),
+ qir_uniform(c, QUNIFORM_TEXTURE_BORDER_COLOR,
+ unit));
+ tmu->src[qir_get_tex_uniform_src(tmu)] =
+ texture_u[next_texture_u++];
}
if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP) {
@@ -431,14 +507,23 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
t = qir_SAT(c, t);
}
- qir_TEX_T(c, t, texture_u[next_texture_u++]);
+ tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_T, 0), t);
+ tmu->src[qir_get_tex_uniform_src(tmu)] =
+ texture_u[next_texture_u++];
- if (is_txl || is_txb)
- qir_TEX_B(c, lod, texture_u[next_texture_u++]);
+ if (is_txl || is_txb) {
+ tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_B, 0), lod);
+ tmu->src[qir_get_tex_uniform_src(tmu)] =
+ texture_u[next_texture_u++];
+ }
- qir_TEX_S(c, s, texture_u[next_texture_u++]);
+ tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_S, 0), s);
+ tmu->src[qir_get_tex_uniform_src(tmu)] = texture_u[next_texture_u++];
c->num_texture_samples++;
+
+ ntq_emit_thrsw(c);
+
struct qreg tex = qir_TEX_RESULT(c);
enum pipe_format format = c->key->tex[unit].format;
@@ -514,8 +599,11 @@ ntq_ffract(struct vc4_compile *c, struct qreg src)
struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src));
struct qreg diff = qir_FSUB(c, src, trunc);
qir_SF(c, diff);
- return qir_SEL(c, QPU_COND_NS,
- qir_FADD(c, diff, qir_uniform_f(c, 1.0)), diff);
+
+ qir_FADD_dest(c, diff,
+ diff, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS;
+
+ return qir_MOV(c, diff);
}
/**
@@ -525,15 +613,18 @@ ntq_ffract(struct vc4_compile *c, struct qreg src)
static struct qreg
ntq_ffloor(struct vc4_compile *c, struct qreg src)
{
- struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src));
+ struct qreg result = qir_ITOF(c, qir_FTOI(c, src));
/* This will be < 0 if we truncated and the truncation was of a value
* that was < 0 in the first place.
*/
- qir_SF(c, qir_FSUB(c, src, trunc));
+ qir_SF(c, qir_FSUB(c, src, result));
+
+ struct qinst *sub = qir_FSUB_dest(c, result,
+ result, qir_uniform_f(c, 1.0));
+ sub->cond = QPU_COND_NS;
- return qir_SEL(c, QPU_COND_NS,
- qir_FSUB(c, trunc, qir_uniform_f(c, 1.0)), trunc);
+ return qir_MOV(c, result);
}
/**
@@ -543,15 +634,17 @@ ntq_ffloor(struct vc4_compile *c, struct qreg src)
static struct qreg
ntq_fceil(struct vc4_compile *c, struct qreg src)
{
- struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src));
+ struct qreg result = qir_ITOF(c, qir_FTOI(c, src));
/* This will be < 0 if we truncated and the truncation was of a value
* that was > 0 in the first place.
*/
- qir_SF(c, qir_FSUB(c, trunc, src));
+ qir_SF(c, qir_FSUB(c, result, src));
+
+ qir_FADD_dest(c, result,
+ result, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS;
- return qir_SEL(c, QPU_COND_NS,
- qir_FADD(c, trunc, qir_uniform_f(c, 1.0)), trunc);
+ return qir_MOV(c, result);
}
static struct qreg
@@ -632,7 +725,7 @@ ntq_fsign(struct vc4_compile *c, struct qreg src)
qir_MOV_dest(c, t, qir_uniform_f(c, 0.0));
qir_MOV_dest(c, t, qir_uniform_f(c, 1.0))->cond = QPU_COND_ZC;
qir_MOV_dest(c, t, qir_uniform_f(c, -1.0))->cond = QPU_COND_NS;
- return t;
+ return qir_MOV(c, t);
}
static void
@@ -811,7 +904,7 @@ ntq_emit_pack_unorm_4x8(struct vc4_compile *c, nir_alu_instr *instr)
qir_PACK_8_F(c, result, src, i);
}
- ntq_store_dest(c, &instr->dest.dest, 0, result);
+ ntq_store_dest(c, &instr->dest.dest, 0, qir_MOV(c, result));
}
/** Handles sign-extended bitfield extracts for 16 bits. */
@@ -917,6 +1010,9 @@ ntq_emit_comparison(struct vc4_compile *c, struct qreg *dest,
break;
}
+ /* Make the temporary for ntq_store_dest(). */
+ *dest = qir_MOV(c, *dest);
+
return true;
}
@@ -943,7 +1039,7 @@ static struct qreg ntq_emit_bcsel(struct vc4_compile *c, nir_alu_instr *instr,
out:
qir_SF(c, src[0]);
- return qir_SEL(c, QPU_COND_NS, src[1], src[2]);
+ return qir_MOV(c, qir_SEL(c, QPU_COND_NS, src[1], src[2]));
}
static struct qreg
@@ -962,9 +1058,9 @@ ntq_fddx(struct vc4_compile *c, struct qreg src)
qir_SF(c, qir_AND(c, qir_reg(QFILE_QPU_ELEMENT, 0),
qir_uniform_ui(c, 1)));
- return qir_SEL(c, QPU_COND_ZS,
- qir_FSUB(c, from_right, src),
- qir_FSUB(c, src, from_left));
+ return qir_MOV(c, qir_SEL(c, QPU_COND_ZS,
+ qir_FSUB(c, from_right, src),
+ qir_FSUB(c, src, from_left)));
}
static struct qreg
@@ -981,9 +1077,9 @@ ntq_fddy(struct vc4_compile *c, struct qreg src)
qir_reg(QFILE_QPU_ELEMENT, 0),
qir_uniform_ui(c, 2)));
- return qir_SEL(c, QPU_COND_ZS,
- qir_FSUB(c, from_top, src),
- qir_FSUB(c, src, from_bottom));
+ return qir_MOV(c, qir_SEL(c, QPU_COND_ZS,
+ qir_FSUB(c, from_top, src),
+ qir_FSUB(c, src, from_bottom)));
}
static void
@@ -1004,7 +1100,8 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
srcs[i] = ntq_get_src(c, instr->src[i].src,
instr->src[i].swizzle[0]);
for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
- ntq_store_dest(c, &instr->dest.dest, i, srcs[i]);
+ ntq_store_dest(c, &instr->dest.dest, i,
+ qir_MOV(c, srcs[i]));
return;
}
@@ -1053,12 +1150,12 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
result = qir_FMAX(c, src[0], src[1]);
break;
- case nir_op_f2i:
- case nir_op_f2u:
+ case nir_op_f2i32:
+ case nir_op_f2u32:
result = qir_FTOI(c, src[0]);
break;
- case nir_op_i2f:
- case nir_op_u2f:
+ case nir_op_i2f32:
+ case nir_op_u2f32:
result = qir_ITOF(c, src[0]);
break;
case nir_op_b2f:
@@ -1070,9 +1167,9 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
case nir_op_i2b:
case nir_op_f2b:
qir_SF(c, src[0]);
- result = qir_SEL(c, QPU_COND_ZC,
- qir_uniform_ui(c, ~0),
- qir_uniform_ui(c, 0));
+ result = qir_MOV(c, qir_SEL(c, QPU_COND_ZC,
+ qir_uniform_ui(c, ~0),
+ qir_uniform_ui(c, 0)));
break;
case nir_op_iadd:
@@ -1136,7 +1233,7 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
break;
case nir_op_fcsel:
qir_SF(c, src[0]);
- result = qir_SEL(c, QPU_COND_ZC, src[1], src[2]);
+ result = qir_MOV(c, qir_SEL(c, QPU_COND_ZC, src[1], src[2]));
break;
case nir_op_frcp:
@@ -1250,7 +1347,7 @@ emit_frag_end(struct vc4_compile *c)
}
uint32_t discard_cond = QPU_COND_ALWAYS;
- if (c->s->info.fs.uses_discard) {
+ if (c->s->info->fs.uses_discard) {
qir_SF(c, c->discard);
discard_cond = QPU_COND_ZS;
}
@@ -1414,7 +1511,7 @@ emit_vert_end(struct vc4_compile *c,
static void
emit_coord_end(struct vc4_compile *c)
{
- struct qreg rcp_w = qir_RCP(c, c->outputs[c->output_position_index + 3]);
+ struct qreg rcp_w = ntq_rcp(c, c->outputs[c->output_position_index + 3]);
emit_stub_vpm_read(c);
@@ -1448,6 +1545,10 @@ vc4_optimize_nir(struct nir_shader *s)
NIR_PASS(progress, s, nir_opt_algebraic);
NIR_PASS(progress, s, nir_opt_constant_folding);
NIR_PASS(progress, s, nir_opt_undef);
+ NIR_PASS(progress, s, nir_opt_loop_unroll,
+ nir_var_shader_in |
+ nir_var_shader_out |
+ nir_var_local);
} while (progress);
}
@@ -1605,6 +1706,47 @@ ntq_emit_ssa_undef(struct vc4_compile *c, nir_ssa_undef_instr *instr)
}
static void
+ntq_emit_color_read(struct vc4_compile *c, nir_intrinsic_instr *instr)
+{
+ nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
+ assert(const_offset->u32[0] == 0);
+
+ /* Reads of the per-sample color need to be done in
+ * order.
+ */
+ int sample_index = (nir_intrinsic_base(instr) -
+ VC4_NIR_TLB_COLOR_READ_INPUT);
+ for (int i = 0; i <= sample_index; i++) {
+ if (c->color_reads[i].file == QFILE_NULL) {
+ c->color_reads[i] =
+ qir_TLB_COLOR_READ(c);
+ }
+ }
+ ntq_store_dest(c, &instr->dest, 0,
+ qir_MOV(c, c->color_reads[sample_index]));
+}
+
+static void
+ntq_emit_load_input(struct vc4_compile *c, nir_intrinsic_instr *instr)
+{
+ assert(instr->num_components == 1);
+
+ nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
+ assert(const_offset && "vc4 doesn't support indirect inputs");
+
+ if (c->stage == QSTAGE_FRAG &&
+ nir_intrinsic_base(instr) >= VC4_NIR_TLB_COLOR_READ_INPUT) {
+ ntq_emit_color_read(c, instr);
+ return;
+ }
+
+ uint32_t offset = nir_intrinsic_base(instr) + const_offset->u32[0];
+ int comp = nir_intrinsic_component(instr);
+ ntq_store_dest(c, &instr->dest, 0,
+ qir_MOV(c, c->inputs[offset * 4 + comp]));
+}
+
+static void
ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
{
nir_const_value *const_offset;
@@ -1681,31 +1823,7 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
break;
case nir_intrinsic_load_input:
- assert(instr->num_components == 1);
- const_offset = nir_src_as_const_value(instr->src[0]);
- assert(const_offset && "vc4 doesn't support indirect inputs");
- if (c->stage == QSTAGE_FRAG &&
- nir_intrinsic_base(instr) >= VC4_NIR_TLB_COLOR_READ_INPUT) {
- assert(const_offset->u32[0] == 0);
- /* Reads of the per-sample color need to be done in
- * order.
- */
- int sample_index = (nir_intrinsic_base(instr) -
- VC4_NIR_TLB_COLOR_READ_INPUT);
- for (int i = 0; i <= sample_index; i++) {
- if (c->color_reads[i].file == QFILE_NULL) {
- c->color_reads[i] =
- qir_TLB_COLOR_READ(c);
- }
- }
- ntq_store_dest(c, &instr->dest, 0,
- c->color_reads[sample_index]);
- } else {
- offset = nir_intrinsic_base(instr) + const_offset->u32[0];
- int comp = nir_intrinsic_component(instr);
- ntq_store_dest(c, &instr->dest, 0,
- c->inputs[offset * 4 + comp]);
- }
+ ntq_emit_load_input(c, instr);
break;
case nir_intrinsic_store_output:
@@ -1855,11 +1973,12 @@ ntq_emit_if(struct vc4_compile *c, nir_if *if_stmt)
qir_link_blocks(c->cur_block, after_block);
qir_set_emit_block(c, after_block);
- if (was_top_level)
+ if (was_top_level) {
c->execute = c->undef;
- else
+ c->last_top_block = c->cur_block;
+ } else {
ntq_activate_execute_for_block(c);
-
+ }
}
static void
@@ -1983,10 +2102,12 @@ ntq_emit_loop(struct vc4_compile *c, nir_loop *loop)
qir_link_blocks(c->cur_block, c->loop_break_block);
qir_set_emit_block(c, c->loop_break_block);
- if (was_top_level)
+ if (was_top_level) {
c->execute = c->undef;
- else
+ c->last_top_block = c->cur_block;
+ } else {
ntq_activate_execute_for_block(c);
+ }
c->loop_break_block = save_loop_break_block;
c->loop_cont_block = save_loop_cont_block;
@@ -2037,7 +2158,7 @@ ntq_emit_impl(struct vc4_compile *c, nir_function_impl *impl)
static void
nir_to_qir(struct vc4_compile *c)
{
- if (c->stage == QSTAGE_FRAG && c->s->info.fs.uses_discard)
+ if (c->stage == QSTAGE_FRAG && c->s->info->fs.uses_discard)
c->discard = qir_MOV(c, qir_uniform_ui(c, 0));
ntq_setup_inputs(c);
@@ -2063,11 +2184,13 @@ static const nir_shader_compiler_options nir_options = {
.lower_fsqrt = true,
.lower_negate = true,
.native_integers = true,
+ .max_unroll_iterations = 32,
};
const void *
vc4_screen_get_compiler_options(struct pipe_screen *pscreen,
- enum pipe_shader_ir ir, unsigned shader)
+ enum pipe_shader_ir ir,
+ enum pipe_shader_type shader)
{
return &nir_options;
}
@@ -2089,7 +2212,7 @@ count_nir_instrs(nir_shader *nir)
static struct vc4_compile *
vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
- struct vc4_key *key)
+ struct vc4_key *key, bool fs_threaded)
{
struct vc4_compile *c = qir_compile_init();
@@ -2099,6 +2222,7 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
c->program_id = key->shader_state->program_id;
c->variant_id =
p_atomic_inc_return(&key->shader_state->compiled_variant_count);
+ c->fs_threaded = fs_threaded;
c->key = key;
switch (stage) {
@@ -2216,6 +2340,17 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
switch (stage) {
case QSTAGE_FRAG:
+ /* FS threading requires that the thread execute
+ * QPU_SIG_LAST_THREAD_SWITCH exactly once before terminating
+ * (with no other THRSW afterwards, obviously). If we didn't
+ * fetch a texture at a top level block, this wouldn't be
+ * true.
+ */
+ if (c->fs_threaded && !c->last_thrsw_at_top_level) {
+ c->failed = true;
+ return c;
+ }
+
emit_frag_end(c);
break;
case QSTAGE_VERT:
@@ -2300,7 +2435,7 @@ vc4_shader_state_create(struct pipe_context *pctx,
}
NIR_PASS_V(s, nir_opt_global_to_local);
- NIR_PASS_V(s, nir_convert_to_ssa);
+ NIR_PASS_V(s, nir_lower_regs_to_ssa);
NIR_PASS_V(s, nir_normalize_cubemap_coords);
NIR_PASS_V(s, nir_lower_load_const_to_scalar);
@@ -2360,7 +2495,7 @@ vc4_setup_compiled_fs_inputs(struct vc4_context *vc4, struct vc4_compile *c,
memset(input_live, 0, sizeof(input_live));
qir_for_each_inst_inorder(inst, c) {
- for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+ for (int i = 0; i < qir_get_nsrc(inst); i++) {
if (inst->src[i].file == QFILE_VARY)
input_live[inst->src[i].index] = true;
}
@@ -2416,12 +2551,16 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,
{
struct hash_table *ht;
uint32_t key_size;
+ bool try_threading;
+
if (stage == QSTAGE_FRAG) {
ht = vc4->fs_cache;
key_size = sizeof(struct vc4_fs_key);
+ try_threading = vc4->screen->has_threaded_fs;
} else {
ht = vc4->vs_cache;
key_size = sizeof(struct vc4_vs_key);
+ try_threading = false;
}
struct vc4_compiled_shader *shader;
@@ -2429,7 +2568,13 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,
if (entry)
return entry->data;
- struct vc4_compile *c = vc4_shader_ntq(vc4, stage, key);
+ struct vc4_compile *c = vc4_shader_ntq(vc4, stage, key, try_threading);
+ /* If the FS failed to compile threaded, fall back to single-threaded. */
+ if (try_threading && c->failed) {
+ qir_compile_destroy(c);
+ c = vc4_shader_ntq(vc4, stage, key, false);
+ }
+
shader = rzalloc(NULL, struct vc4_compiled_shader);
shader->program_id = vc4->next_compiled_program_id++;
@@ -2438,7 +2583,7 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,
/* Note: the temporary clone in c->s has been freed. */
nir_shader *orig_shader = key->shader_state->base.ir.nir;
- if (orig_shader->info.outputs_written & (1 << FRAG_RESULT_DEPTH))
+ if (orig_shader->info->outputs_written & (1 << FRAG_RESULT_DEPTH))
shader->disable_early_z = true;
} else {
shader->num_inputs = c->num_inputs;
@@ -2463,6 +2608,8 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,
sizeof(uint64_t));
}
+ shader->fs_threaded = c->fs_threaded;
+
/* Copy the compiler UBO range state to the compiled shader, dropping
* out arrays that were never referenced by an indirect load.
*
@@ -2496,10 +2643,17 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,
}
}
+ if ((vc4_debug & VC4_DEBUG_SHADERDB) && stage == QSTAGE_FRAG) {
+ fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d FS threads\n",
+ qir_get_stage_name(c->stage),
+ c->program_id, c->variant_id,
+ 1 + shader->fs_threaded);
+ }
+
qir_compile_destroy(c);
struct vc4_key *dup_key;
- dup_key = ralloc_size(shader, key_size);
+ dup_key = rzalloc_size(shader, key_size); /* TODO: don't use rzalloc */
memcpy(dup_key, key, key_size);
_mesa_hash_table_insert(ht, dup_key, shader);
@@ -2573,8 +2727,7 @@ vc4_update_compiled_fs(struct vc4_context *vc4, uint8_t prim_mode)
}
if (job->msaa) {
key->msaa = vc4->rasterizer->base.multisample;
- key->sample_coverage = (vc4->rasterizer->base.multisample &&
- vc4->sample_mask != (1 << VC4_MAX_SAMPLES) - 1);
+ key->sample_coverage = (vc4->sample_mask != (1 << VC4_MAX_SAMPLES) - 1);
key->sample_alpha_to_coverage = vc4->blend->alpha_to_coverage;
key->sample_alpha_to_one = vc4->blend->alpha_to_one;
}
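A minimal sketch of the try-threaded-then-fall-back flow above, assuming a compile fails in threaded mode when no texture fetch lands in a top-level block; struct compile and compile_shader() below are invented stand-ins for the driver's vc4_compile/vc4_shader_ntq(), not its real API:

#include <stdbool.h>
#include <stdio.h>

struct compile {
        bool fs_threaded;
        bool failed;
};

/* Hypothetical stand-in for vc4_shader_ntq(): threaded compiles are
 * assumed to fail when no top-level texture fetch can host the
 * mandatory QPU_SIG_LAST_THREAD_SWITCH. */
static struct compile
compile_shader(bool try_threading, bool tex_at_top_level)
{
        struct compile c = {
                .fs_threaded = try_threading,
                .failed = try_threading && !tex_at_top_level,
        };
        return c;
}

int main(void)
{
        struct compile c = compile_shader(true, false);
        if (c.failed)
                c = compile_shader(false, false); /* single-threaded retry */
        printf("fs_threaded = %d\n", c.fs_threaded);
        return 0;
}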
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qir.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qir.c
index 4b94fcfb9..c829e7f93 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_qir.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qir.c
@@ -76,13 +76,10 @@ static const struct qir_op_info qir_op_info[] = {
[QOP_FRAG_Z] = { "frag_z", 1, 0 },
[QOP_FRAG_W] = { "frag_w", 1, 0 },
- [QOP_TEX_S] = { "tex_s", 0, 2, true },
- [QOP_TEX_T] = { "tex_t", 0, 2, true },
- [QOP_TEX_R] = { "tex_r", 0, 2, true },
- [QOP_TEX_B] = { "tex_b", 0, 2, true },
- [QOP_TEX_DIRECT] = { "tex_direct", 0, 2, true },
[QOP_TEX_RESULT] = { "tex_result", 1, 0, true },
+ [QOP_THRSW] = { "thrsw", 0, 0, true },
+
[QOP_LOAD_IMM] = { "load_imm", 0, 1 },
[QOP_LOAD_IMM_U2] = { "load_imm_u2", 0, 1 },
[QOP_LOAD_IMM_I2] = { "load_imm_i2", 0, 1 },
@@ -103,12 +100,35 @@ qir_get_op_name(enum qop qop)
}
int
-qir_get_op_nsrc(enum qop qop)
+qir_get_non_sideband_nsrc(struct qinst *inst)
{
- if (qop < ARRAY_SIZE(qir_op_info) && qir_op_info[qop].name)
- return qir_op_info[qop].nsrc;
- else
- abort();
+ assert(qir_op_info[inst->op].name);
+ return qir_op_info[inst->op].nsrc;
+}
+
+int
+qir_get_nsrc(struct qinst *inst)
+{
+ assert(qir_op_info[inst->op].name);
+
+ int nsrc = qir_get_non_sideband_nsrc(inst);
+
+ /* Normal (non-direct) texture coordinate writes also implicitly load
+ * a uniform for the texture parameters.
+ */
+ if (qir_is_tex(inst) && inst->dst.file != QFILE_TEX_S_DIRECT)
+ nsrc++;
+
+ return nsrc;
+}
+
+/* The sideband uniform for textures gets stored after the normal ALU
+ * arguments.
+ */
+int
+qir_get_tex_uniform_src(struct qinst *inst)
+{
+ return qir_get_nsrc(inst) - 1;
}
/**
@@ -123,6 +143,11 @@ qir_has_side_effects(struct vc4_compile *c, struct qinst *inst)
case QFILE_TLB_COLOR_WRITE:
case QFILE_TLB_COLOR_WRITE_MS:
case QFILE_TLB_STENCIL_SETUP:
+ case QFILE_TEX_S_DIRECT:
+ case QFILE_TEX_S:
+ case QFILE_TEX_T:
+ case QFILE_TEX_R:
+ case QFILE_TEX_B:
return true;
default:
break;
@@ -139,7 +164,7 @@ qir_has_side_effect_reads(struct vc4_compile *c, struct qinst *inst)
* point/line coordinates reads, because they're generated by
* fixed-function hardware.
*/
- for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+ for (int i = 0; i < qir_get_nsrc(inst); i++) {
if (inst->src[i].file == QFILE_VARY &&
c->input_slots[inst->src[i].index].slot == 0xff) {
return true;
@@ -156,6 +181,17 @@ qir_has_side_effect_reads(struct vc4_compile *c, struct qinst *inst)
}
bool
+qir_has_uniform_read(struct qinst *inst)
+{
+ for (int i = 0; i < qir_get_nsrc(inst); i++) {
+ if (inst->src[i].file == QFILE_UNIF)
+ return true;
+ }
+
+ return false;
+}
+
+bool
qir_is_mul(struct qinst *inst)
{
switch (inst->op) {
@@ -207,7 +243,30 @@ qir_is_raw_mov(struct qinst *inst)
bool
qir_is_tex(struct qinst *inst)
{
- return inst->op >= QOP_TEX_S && inst->op <= QOP_TEX_DIRECT;
+ switch (inst->dst.file) {
+ case QFILE_TEX_S_DIRECT:
+ case QFILE_TEX_S:
+ case QFILE_TEX_T:
+ case QFILE_TEX_R:
+ case QFILE_TEX_B:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool
+qir_has_implicit_tex_uniform(struct qinst *inst)
+{
+ switch (inst->dst.file) {
+ case QFILE_TEX_S:
+ case QFILE_TEX_T:
+ case QFILE_TEX_R:
+ case QFILE_TEX_B:
+ return true;
+ default:
+ return false;
+ }
}
bool
@@ -299,6 +358,11 @@ qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write)
[QFILE_FRAG_Y] = "frag_y",
[QFILE_FRAG_REV_FLAG] = "frag_rev_flag",
[QFILE_QPU_ELEMENT] = "elem",
+ [QFILE_TEX_S_DIRECT] = "tex_s_direct",
+ [QFILE_TEX_S] = "tex_s",
+ [QFILE_TEX_T] = "tex_t",
+ [QFILE_TEX_R] = "tex_r",
+ [QFILE_TEX_B] = "tex_b",
};
switch (reg.file) {
@@ -331,6 +395,11 @@ qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write)
case QFILE_TLB_COLOR_WRITE_MS:
case QFILE_TLB_Z_WRITE:
case QFILE_TLB_STENCIL_SETUP:
+ case QFILE_TEX_S_DIRECT:
+ case QFILE_TEX_S:
+ case QFILE_TEX_T:
+ case QFILE_TEX_R:
+ case QFILE_TEX_B:
fprintf(stderr, "%s", files[reg.file]);
break;
@@ -371,7 +440,7 @@ qir_dump_inst(struct vc4_compile *c, struct qinst *inst)
}
}
- for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+ for (int i = 0; i < qir_get_nsrc(inst); i++) {
fprintf(stderr, ", ");
qir_print_reg(c, inst->src[i], false);
vc4_qpu_disasm_unpack(stderr, inst->src[i].pack);
@@ -382,6 +451,7 @@ void
qir_dump(struct vc4_compile *c)
{
int ip = 0;
+ int pressure = 0;
qir_for_each_block(block, c) {
fprintf(stderr, "BLOCK %d:\n", block->index);
@@ -389,6 +459,8 @@ qir_dump(struct vc4_compile *c)
if (c->temp_start) {
bool first = true;
+ fprintf(stderr, "%3d ", pressure);
+
for (int i = 0; i < c->num_temps; i++) {
if (c->temp_start[i] != ip)
continue;
@@ -399,6 +471,7 @@ qir_dump(struct vc4_compile *c)
fprintf(stderr, ", ");
}
fprintf(stderr, "S%4d", i);
+ pressure++;
}
if (first)
@@ -420,6 +493,7 @@ qir_dump(struct vc4_compile *c)
fprintf(stderr, ", ");
}
fprintf(stderr, "E%4d", i);
+ pressure--;
}
if (first)
@@ -471,7 +545,6 @@ qir_inst(enum qop op, struct qreg dst, struct qreg src0, struct qreg src1)
inst->op = op;
inst->dst = dst;
- inst->src = calloc(2, sizeof(inst->src[0]));
inst->src[0] = src0;
inst->src[1] = src1;
inst->cond = QPU_COND_ALWAYS;
@@ -479,26 +552,6 @@ qir_inst(enum qop op, struct qreg dst, struct qreg src0, struct qreg src1)
return inst;
}
-struct qinst *
-qir_inst4(enum qop op, struct qreg dst,
- struct qreg a,
- struct qreg b,
- struct qreg c,
- struct qreg d)
-{
- struct qinst *inst = CALLOC_STRUCT(qinst);
-
- inst->op = op;
- inst->dst = dst;
- inst->src = calloc(4, sizeof(*inst->src));
- inst->src[0] = a;
- inst->src[1] = b;
- inst->src[2] = c;
- inst->src[3] = d;
-
- return inst;
-}
-
static void
qir_emit(struct vc4_compile *c, struct qinst *inst)
{
@@ -593,6 +646,7 @@ qir_compile_init(void)
list_inithead(&c->blocks);
qir_set_emit_block(c, qir_new_block(c));
+ c->last_top_block = c->cur_block;
c->output_position_index = -1;
c->output_color_index = -1;
@@ -612,7 +666,6 @@ qir_remove_instruction(struct vc4_compile *c, struct qinst *qinst)
c->defs[qinst->dst.index] = NULL;
list_del(&qinst->link);
- free(qinst->src);
free(qinst);
}
@@ -744,6 +797,7 @@ qir_optimize(struct vc4_compile *c)
OPTPASS(qir_opt_dead_code);
OPTPASS(qir_opt_small_immediates);
OPTPASS(qir_opt_vpm);
+ OPTPASS(qir_opt_coalesce_ff_writes);
if (!progress)
break;
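A small model of the sideband-uniform convention added in this file: the implicit texture-parameter uniform is appended after the ALU sources, so qir_get_tex_uniform_src() is always nsrc - 1. The helper names below are illustrative, not the QIR API:

#include <assert.h>
#include <stdbool.h>

/* Simplified: a tex write carries its ALU sources plus, unless it is a
 * direct lookup, one implicit uniform stored last. */
static int get_nsrc(int alu_nsrc, bool has_implicit_uniform)
{
        return alu_nsrc + (has_implicit_uniform ? 1 : 0);
}

static int get_tex_uniform_src(int alu_nsrc)
{
        return get_nsrc(alu_nsrc, true) - 1;
}

int main(void)
{
        /* e.g. a QFILE_TEX_S write with one coordinate source */
        assert(get_nsrc(1, true) == 2);
        assert(get_tex_uniform_src(1) == 1); /* uniform sits in src[1] */
        return 0;
}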
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qir.h b/lib/mesa/src/gallium/drivers/vc4/vc4_qir.h
index b3cac6bf2..6469e51b0 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_qir.h
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qir.h
@@ -55,6 +55,18 @@ enum qfile {
QFILE_TLB_Z_WRITE,
QFILE_TLB_STENCIL_SETUP,
+ /* If tex_s is written on its own without preceding t/r/b setup, it's
+ * a direct memory access using the input value, without the sideband
+ * uniform load. We represent these in QIR as a separate write
+ * destination so we can tell if the sideband uniform is present.
+ */
+ QFILE_TEX_S_DIRECT,
+
+ QFILE_TEX_S,
+ QFILE_TEX_T,
+ QFILE_TEX_R,
+ QFILE_TEX_B,
+
/* Payload registers that aren't in the physical register file, so we
* can just use the corresponding qpu_reg at qpu_emit time.
*/
@@ -133,30 +145,22 @@ enum qop {
QOP_FRAG_Z,
QOP_FRAG_W,
- /** Texture x coordinate parameter write */
- QOP_TEX_S,
- /** Texture y coordinate parameter write */
- QOP_TEX_T,
- /** Texture border color parameter or cube map z coordinate write */
- QOP_TEX_R,
- /** Texture LOD bias parameter write */
- QOP_TEX_B,
-
- /**
- * Texture-unit 4-byte read with address provided direct in S
- * cooordinate.
- *
- * The first operand is the offset from the start of the UBO, and the
- * second is the uniform that has the UBO's base pointer.
- */
- QOP_TEX_DIRECT,
-
/**
* Signal of texture read being necessary and then reading r4 into
* the destination
*/
QOP_TEX_RESULT,
+ /**
+ * Insert the signal for switching threads in a threaded fragment
+ * shader. No value can be live in an accumulator across a thrsw.
+ *
+ * At the QPU level, this will have several delay slots before the
+ * switch happens. Those slots are the responsibility of the
+ * scheduler.
+ */
+ QOP_THRSW,
+
/* 32-bit immediate loaded to each SIMD channel */
QOP_LOAD_IMM,
@@ -194,7 +198,7 @@ struct qinst {
enum qop op;
struct qreg dst;
- struct qreg *src;
+ struct qreg src[3];
bool sf;
bool cond_is_exec_mask;
uint8_t cond;
@@ -502,9 +506,13 @@ struct vc4_compile {
struct qblock *cur_block;
struct qblock *loop_cont_block;
struct qblock *loop_break_block;
+ struct qblock *last_top_block;
struct list_head qpu_inst_list;
+ /* Pre-QPU-scheduled instruction containing the last THRSW */
+ uint64_t *last_thrsw;
+
uint64_t *qpu_insts;
uint32_t qpu_inst_count;
uint32_t qpu_inst_size;
@@ -524,6 +532,15 @@ struct vc4_compile {
uint32_t program_id;
uint32_t variant_id;
+
+ /* Set to compile program in threaded FS mode, where SIG_THREAD_SWITCH
+ * is used to hide texturing latency at the cost of limiting ourselves
+ * to the bottom half of physical reg space.
+ */
+ bool fs_threaded;
+
+ bool last_thrsw_at_top_level;
+
bool failed;
};
@@ -543,11 +560,6 @@ struct qblock *qir_entry_block(struct vc4_compile *c);
struct qblock *qir_exit_block(struct vc4_compile *c);
struct qinst *qir_inst(enum qop op, struct qreg dst,
struct qreg src0, struct qreg src1);
-struct qinst *qir_inst4(enum qop op, struct qreg dst,
- struct qreg a,
- struct qreg b,
- struct qreg c,
- struct qreg d);
void qir_remove_instruction(struct vc4_compile *c, struct qinst *qinst);
struct qreg qir_uniform(struct vc4_compile *c,
enum quniform_contents contents,
@@ -561,13 +573,17 @@ struct qinst *qir_emit_nondef(struct vc4_compile *c, struct qinst *inst);
struct qreg qir_get_temp(struct vc4_compile *c);
void qir_calculate_live_intervals(struct vc4_compile *c);
-int qir_get_op_nsrc(enum qop qop);
+int qir_get_nsrc(struct qinst *inst);
+int qir_get_non_sideband_nsrc(struct qinst *inst);
+int qir_get_tex_uniform_src(struct qinst *inst);
bool qir_reg_equals(struct qreg a, struct qreg b);
bool qir_has_side_effects(struct vc4_compile *c, struct qinst *inst);
bool qir_has_side_effect_reads(struct vc4_compile *c, struct qinst *inst);
+bool qir_has_uniform_read(struct qinst *inst);
bool qir_is_mul(struct qinst *inst);
bool qir_is_raw_mov(struct qinst *inst);
bool qir_is_tex(struct qinst *inst);
+bool qir_has_implicit_tex_uniform(struct qinst *inst);
bool qir_is_float_input(struct qinst *inst);
bool qir_depends_on_flags(struct qinst *inst);
bool qir_writes_r4(struct qinst *inst);
@@ -582,6 +598,7 @@ void qir_validate(struct vc4_compile *c);
void qir_optimize(struct vc4_compile *c);
bool qir_opt_algebraic(struct vc4_compile *c);
+bool qir_opt_coalesce_ff_writes(struct vc4_compile *c);
bool qir_opt_constant_folding(struct vc4_compile *c);
bool qir_opt_copy_propagation(struct vc4_compile *c);
bool qir_opt_dead_code(struct vc4_compile *c);
@@ -722,11 +739,6 @@ QIR_ALU1(RSQ)
QIR_ALU1(EXP2)
QIR_ALU1(LOG2)
QIR_ALU1(VARY_ADD_C)
-QIR_NODST_2(TEX_S)
-QIR_NODST_2(TEX_T)
-QIR_NODST_2(TEX_R)
-QIR_NODST_2(TEX_B)
-QIR_NODST_2(TEX_DIRECT)
QIR_PAYLOAD(FRAG_Z)
QIR_PAYLOAD(FRAG_W)
QIR_ALU0(TEX_RESULT)
@@ -737,10 +749,8 @@ static inline struct qreg
qir_SEL(struct vc4_compile *c, uint8_t cond, struct qreg src0, struct qreg src1)
{
struct qreg t = qir_get_temp(c);
- struct qinst *a = qir_MOV_dest(c, t, src0);
- struct qinst *b = qir_MOV_dest(c, t, src1);
- a->cond = cond;
- b->cond = qpu_cond_complement(cond);
+ qir_MOV_dest(c, t, src1);
+ qir_MOV_dest(c, t, src0)->cond = cond;
return t;
}
@@ -881,6 +891,6 @@ qir_BRANCH(struct vc4_compile *c, uint8_t cond)
#define qir_for_each_inst_inorder(inst, c) \
qir_for_each_block(_block, c) \
- qir_for_each_inst(inst, _block)
+ qir_for_each_inst_safe(inst, _block)
#endif /* VC4_QIR_H */
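The rewritten qir_SEL() drops the complement condition: it writes the else-value unconditionally, then conditionally overwrites it. A scalar model of that idiom, with all names invented for illustration:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Per-channel view of select-by-conditional-write: the second MOV only
 * lands when cond holds, so no qpu_cond_complement() is needed. */
static uint32_t sel(bool cond, uint32_t if_true, uint32_t if_false)
{
        uint32_t t = if_false;  /* unconditional MOV of src1 */
        if (cond)
                t = if_true;    /* MOV with ->cond = cond */
        return t;
}

int main(void)
{
        assert(sel(true, 0xffffffff, 0) == 0xffffffff);
        assert(sel(false, 0xffffffff, 0) == 0);
        return 0;
}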
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_emit_uniform_stream_resets.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_emit_uniform_stream_resets.c
index 3fd6358e3..443682a46 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_emit_uniform_stream_resets.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_emit_uniform_stream_resets.c
@@ -36,24 +36,10 @@
#include "util/u_math.h"
static bool
-inst_reads_a_uniform(struct qinst *inst)
-{
- if (qir_is_tex(inst))
- return true;
-
- for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
- if (inst->src[i].file == QFILE_UNIF)
- return true;
- }
-
- return false;
-}
-
-static bool
block_reads_any_uniform(struct qblock *block)
{
qir_for_each_inst(inst, block) {
- if (inst_reads_a_uniform(inst))
+ if (qir_has_uniform_read(inst))
return true;
}
@@ -94,7 +80,7 @@ qir_emit_uniform_stream_resets(struct vc4_compile *c)
}
qir_for_each_inst(inst, block) {
- if (inst_reads_a_uniform(inst))
+ if (qir_has_uniform_read(inst))
uniform_count++;
}
}
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_live_variables.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_live_variables.c
index beefb0d7f..7108b3ee9 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_live_variables.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_live_variables.c
@@ -205,7 +205,7 @@ qir_setup_def_use(struct vc4_compile *c)
_mesa_hash_table_clear(partial_update_ht, NULL);
qir_for_each_inst(inst, block) {
- for (int i = 0; i < qir_get_op_nsrc(inst->op); i++)
+ for (int i = 0; i < qir_get_nsrc(inst); i++)
qir_setup_use(c, block, ip, inst->src[i]);
qir_setup_def(c, block, ip, partial_update_ht, inst);
@@ -301,8 +301,13 @@ qir_calculate_live_intervals(struct vc4_compile *c)
{
int bitset_words = BITSET_WORDS(c->num_temps);
- c->temp_start = reralloc(c, c->temp_start, int, c->num_temps);
- c->temp_end = reralloc(c, c->temp_end, int, c->num_temps);
+ /* If we called this function more than once, then we should be
+ * freeing the previous arrays.
+ */
+ assert(!c->temp_start);
+
+ c->temp_start = rzalloc_array(c, int, c->num_temps);
+ c->temp_end = rzalloc_array(c, int, c->num_temps);
for (int i = 0; i < c->num_temps; i++) {
c->temp_start[i] = MAX_INSTRUCTION;
@@ -310,10 +315,10 @@ qir_calculate_live_intervals(struct vc4_compile *c)
}
qir_for_each_block(block, c) {
- block->def = reralloc(c, block->def, BITSET_WORD, bitset_words);
- block->use = reralloc(c, block->use, BITSET_WORD, bitset_words);
- block->live_in = reralloc(c, block->live_in, BITSET_WORD, bitset_words);
- block->live_out = reralloc(c, block->live_out, BITSET_WORD, bitset_words);
+ block->def = rzalloc_array(c, BITSET_WORD, bitset_words);
+ block->use = rzalloc_array(c, BITSET_WORD, bitset_words);
+ block->live_in = rzalloc_array(c, BITSET_WORD, bitset_words);
+ block->live_out = rzalloc_array(c, BITSET_WORD, bitset_words);
}
qir_setup_def_use(c);
@@ -322,4 +327,27 @@ qir_calculate_live_intervals(struct vc4_compile *c)
;
qir_compute_start_end(c, c->num_temps);
+
+ if (vc4_debug & VC4_DEBUG_SHADERDB) {
+ int last_ip = 0;
+ for (int i = 0; i < c->num_temps; i++)
+ last_ip = MAX2(last_ip, c->temp_end[i]);
+
+ int reg_pressure = 0;
+ int max_reg_pressure = 0;
+ for (int i = 0; i < last_ip; i++) {
+ for (int j = 0; j < c->num_temps; j++) {
+ if (c->temp_start[j] == i)
+ reg_pressure++;
+ if (c->temp_end[j] == i)
+ reg_pressure--;
+ }
+ max_reg_pressure = MAX2(max_reg_pressure, reg_pressure);
+ }
+
+ fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d max temps\n",
+ qir_get_stage_name(c->stage),
+ c->program_id, c->variant_id,
+ max_reg_pressure);
+ }
}
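The SHADER-DB statistic added above is a straight sweep over the live intervals. The same computation, self-contained and with invented temp_start/temp_end data (the real arrays come from qir_compute_start_end()):

#include <stdio.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
        /* Example intervals only; four temps with overlapping lifetimes. */
        int temp_start[] = { 0, 1, 1, 4 };
        int temp_end[]   = { 3, 2, 5, 6 };
        int num_temps = 4;

        int last_ip = 0;
        for (int i = 0; i < num_temps; i++)
                last_ip = MAX2(last_ip, temp_end[i]);

        int reg_pressure = 0, max_reg_pressure = 0;
        for (int ip = 0; ip < last_ip; ip++) {
                for (int j = 0; j < num_temps; j++) {
                        if (temp_start[j] == ip)
                                reg_pressure++;
                        if (temp_end[j] == ip)
                                reg_pressure--;
                }
                max_reg_pressure = MAX2(max_reg_pressure, reg_pressure);
        }
        printf("max temps: %d\n", max_reg_pressure); /* prints 3 */
        return 0;
}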
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c
index 8ec6c7973..9ecfe6521 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c
@@ -77,7 +77,7 @@ is_lowerable_uniform(struct qinst *inst, int i)
if (inst->src[i].file != QFILE_UNIF)
return false;
if (qir_is_tex(inst))
- return i != 1;
+ return i != qir_get_tex_uniform_src(inst);
return true;
}
@@ -89,7 +89,7 @@ qir_get_instruction_uniform_count(struct qinst *inst)
{
uint32_t count = 0;
- for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+ for (int i = 0; i < qir_get_nsrc(inst); i++) {
if (inst->src[i].file != QFILE_UNIF)
continue;
@@ -119,7 +119,7 @@ qir_lower_uniforms(struct vc4_compile *c)
* ht.
*/
qir_for_each_inst_inorder(inst, c) {
- uint32_t nsrc = qir_get_op_nsrc(inst->op);
+ uint32_t nsrc = qir_get_nsrc(inst);
if (qir_get_instruction_uniform_count(inst) <= 1)
continue;
@@ -155,7 +155,7 @@ qir_lower_uniforms(struct vc4_compile *c)
struct qinst *mov = NULL;
qir_for_each_inst(inst, block) {
- uint32_t nsrc = qir_get_op_nsrc(inst->op);
+ uint32_t nsrc = qir_get_nsrc(inst);
uint32_t count = qir_get_instruction_uniform_count(inst);
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_schedule.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_schedule.c
index 69bd0dd62..5118caf31 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_schedule.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_schedule.c
@@ -187,7 +187,7 @@ calculate_deps(struct schedule_setup_state *state, struct schedule_node *n)
* ignore uniforms accesses, because qir_reorder_uniforms() happens
* after this.
*/
- for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+ for (int i = 0; i < qir_get_nsrc(inst); i++) {
switch (inst->src[i].file) {
case QFILE_TEMP:
add_dep(dir,
@@ -212,23 +212,35 @@ calculate_deps(struct schedule_setup_state *state, struct schedule_node *n)
add_dep(dir, state->last_vary_read, n);
break;
- case QOP_TEX_S:
- case QOP_TEX_T:
- case QOP_TEX_R:
- case QOP_TEX_B:
- case QOP_TEX_DIRECT:
- /* Texturing setup gets scheduled in order, because
- * the uniforms referenced by them have to land in a
- * specific order.
- */
- add_write_dep(dir, &state->last_tex_coord, n);
- break;
-
case QOP_TEX_RESULT:
/* Results have to be fetched in order. */
add_write_dep(dir, &state->last_tex_result, n);
break;
+ case QOP_THRSW:
+ /* After a new THRSW, one must collect all texture samples
+ * queued since the previous THRSW/program start. For now, we
+ * have one THRSW in between each texture setup and its
+ * results collection as our input, and we just make sure that
+ * that ordering is maintained.
+ */
+ add_write_dep(dir, &state->last_tex_coord, n);
+ add_write_dep(dir, &state->last_tex_result, n);
+
+ /* accumulators and flags are lost across thread switches. */
+ add_write_dep(dir, &state->last_sf, n);
+
+ /* Setup, like the varyings, will need to be drained before we
+ * thread switch.
+ */
+ add_write_dep(dir, &state->last_vary_read, n);
+
+ /* The TLB-locking operations have to stay after the last
+ * thread switch.
+ */
+ add_write_dep(dir, &state->last_tlb, n);
+ break;
+
case QOP_TLB_COLOR_READ:
case QOP_MS_MASK:
add_write_dep(dir, &state->last_tlb, n);
@@ -254,6 +266,18 @@ calculate_deps(struct schedule_setup_state *state, struct schedule_node *n)
add_write_dep(dir, &state->last_tlb, n);
break;
+ case QFILE_TEX_S_DIRECT:
+ case QFILE_TEX_S:
+ case QFILE_TEX_T:
+ case QFILE_TEX_R:
+ case QFILE_TEX_B:
+ /* Texturing setup gets scheduled in order, because
+ * the uniforms referenced by them have to land in a
+ * specific order.
+ */
+ add_write_dep(dir, &state->last_tex_coord, n);
+ break;
+
default:
break;
}
@@ -281,7 +305,7 @@ calculate_forward_deps(struct vc4_compile *c, void *mem_ctx,
calculate_deps(&state, n);
- for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+ for (int i = 0; i < qir_get_nsrc(inst); i++) {
switch (inst->src[i].file) {
case QFILE_UNIF:
add_dep(state.dir, state.last_uniforms_reset, n);
@@ -291,26 +315,59 @@ calculate_forward_deps(struct vc4_compile *c, void *mem_ctx,
}
}
- switch (inst->op) {
- case QOP_TEX_S:
- case QOP_TEX_T:
- case QOP_TEX_R:
- case QOP_TEX_B:
- case QOP_TEX_DIRECT:
- /* If the texture coordinate fifo is full,
- * block this on the last QOP_TEX_RESULT.
+ switch (inst->dst.file) {
+ case QFILE_TEX_S_DIRECT:
+ case QFILE_TEX_S:
+ case QFILE_TEX_T:
+ case QFILE_TEX_R:
+ case QFILE_TEX_B:
+ /* From the VC4 spec:
+ *
+ * "The TFREQ input FIFO holds two full lots of s,
+ * t, r, b data, plus associated setup data, per
+ * QPU, that is, there are eight data slots. For
+ * each texture request, slots are only consumed
+ * for the components of s, t, r, and b actually
+ * written. Thus the FIFO can hold four requests
+ * of just (s, t) data, or eight requests of just
+ * s data (for direct addressed data lookups).
+ *
+ * Note that there is one FIFO per QPU, and the
+ * FIFO has no concept of threads - that is,
+ * multi-threaded shaders must be careful to use
+ * only 1/2 the FIFO depth before reading
+ * back. Multi-threaded programs must also
+ * therefore always thread switch on texture
+ * fetch as the other thread may have data
+ * waiting in the FIFO."
+ *
+ * If the texture coordinate fifo is full, block this
+ * on the last QOP_TEX_RESULT.
*/
- if (state.tfreq_count == 8) {
+ if (state.tfreq_count == (c->fs_threaded ? 4 : 8)) {
block_until_tex_result(&state, n);
}
- /* If the texture result fifo is full, block
- * adding any more to it until the last
- * QOP_TEX_RESULT.
+ /* From the VC4 spec:
+ *
+ * "Since the maximum number of texture requests
+ * in the input (TFREQ) FIFO is four lots of (s,
+ * t) data, the output (TFRCV) FIFO is sized to
+ * hold four lots of max-size color data per
+ * QPU. For non-float color, reads are packed
+ * RGBA8888 data (one read per pixel). For 16-bit
+ * float color, two reads are necessary per
+ * pixel, with reads packed as RG1616 then
+ * BA1616. So per QPU there are eight color slots
+ * in the TFRCV FIFO."
+ *
+ * If the texture result fifo is full, block adding
+ * any more to it until the last QOP_TEX_RESULT.
*/
- if (inst->op == QOP_TEX_S ||
- inst->op == QOP_TEX_DIRECT) {
- if (state.tfrcv_count == 4)
+ if (inst->dst.file == QFILE_TEX_S ||
+ inst->dst.file == QFILE_TEX_S_DIRECT) {
+ if (state.tfrcv_count ==
+ (c->fs_threaded ? 2 : 4))
block_until_tex_result(&state, n);
state.tfrcv_count++;
}
@@ -319,6 +376,11 @@ calculate_forward_deps(struct vc4_compile *c, void *mem_ctx,
state.tfreq_count++;
break;
+ default:
+ break;
+ }
+
+ switch (inst->op) {
case QOP_TEX_RESULT:
/* Results have to be fetched after the
* coordinate setup. Note that we're assuming
@@ -341,7 +403,6 @@ calculate_forward_deps(struct vc4_compile *c, void *mem_ctx,
break;
default:
- assert(!qir_is_tex(inst));
break;
}
}
@@ -372,11 +433,21 @@ get_register_pressure_cost(struct schedule_state *state, struct qinst *inst)
state->temp_writes[inst->dst.index] == 1)
cost--;
- for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
- if (inst->src[i].file == QFILE_TEMP &&
- !BITSET_TEST(state->temp_live, inst->src[i].index)) {
- cost++;
+ for (int i = 0; i < qir_get_nsrc(inst); i++) {
+ if (inst->src[i].file != QFILE_TEMP ||
+ BITSET_TEST(state->temp_live, inst->src[i].index)) {
+ continue;
}
+
+ bool already_counted = false;
+ for (int j = 0; j < i; j++) {
+ if (inst->src[i].file == inst->src[j].file &&
+ inst->src[i].index == inst->src[j].index) {
+ already_counted = true;
+ }
+ }
+ if (!already_counted)
+ cost++;
}
return cost;
@@ -503,11 +574,33 @@ dump_state(struct vc4_compile *c, struct schedule_state *state)
static uint32_t
latency_between(struct schedule_node *before, struct schedule_node *after)
{
- if ((before->inst->op == QOP_TEX_S ||
- before->inst->op == QOP_TEX_DIRECT) &&
+ if ((before->inst->dst.file == QFILE_TEX_S ||
+ before->inst->dst.file == QFILE_TEX_S_DIRECT) &&
after->inst->op == QOP_TEX_RESULT)
return 100;
+ switch (before->inst->op) {
+ case QOP_RCP:
+ case QOP_RSQ:
+ case QOP_EXP2:
+ case QOP_LOG2:
+ for (int i = 0; i < qir_get_nsrc(after->inst); i++) {
+ if (after->inst->src[i].file ==
+ before->inst->dst.file &&
+ after->inst->src[i].index ==
+ before->inst->dst.index) {
+ /* There are two QPU delay slots before we can
+ * read a math result, which could be up to 4
+ * QIR instructions if they packed well.
+ */
+ return 4;
+ }
+ }
+ break;
+ default:
+ break;
+ }
+
return 1;
}
@@ -532,7 +625,7 @@ compute_delay(struct schedule_node *n)
compute_delay(n->children[i]);
n->delay = MAX2(n->delay,
n->children[i]->delay +
- latency_between(n, n->children[i]));
+ latency_between(n->children[i], n));
}
}
}
@@ -583,15 +676,15 @@ schedule_instructions(struct vc4_compile *c,
child->unblocked_time = MAX2(child->unblocked_time,
state->time +
- latency_between(chosen,
- child));
+ latency_between(child,
+ chosen));
child->parent_count--;
if (child->parent_count == 0)
list_add(&child->link, &state->worklist);
}
/* Update our tracking of register pressure. */
- for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+ for (int i = 0; i < qir_get_nsrc(inst); i++) {
if (inst->src[i].file == QFILE_TEMP)
BITSET_SET(state->temp_live, inst->src[i].index);
}
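The hunk above halves both texture FIFO budgets when the fragment shader is threaded, per the quoted spec text. A tiny check of those limits; the constants mirror the diff, the function names are ours:

#include <assert.h>
#include <stdbool.h>

/* TFREQ holds 8 single-coordinate requests and TFRCV 4 max-size
 * results per QPU; a threaded shader may only use half of each before
 * reading results back. */
static int tfreq_limit(bool fs_threaded) { return fs_threaded ? 4 : 8; }
static int tfrcv_limit(bool fs_threaded) { return fs_threaded ? 2 : 4; }

int main(void)
{
        assert(tfreq_limit(false) == 8 && tfrcv_limit(false) == 4);
        assert(tfreq_limit(true)  == 4 && tfrcv_limit(true)  == 2);
        return 0;
}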
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_validate.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_validate.c
index e7cfe5ad2..302eb4826 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_validate.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_validate.c
@@ -84,9 +84,28 @@ void qir_validate(struct vc4_compile *c)
case QFILE_LOAD_IMM:
fail_instr(c, inst, "Bad dest file");
break;
+
+ case QFILE_TEX_S:
+ case QFILE_TEX_T:
+ case QFILE_TEX_R:
+ case QFILE_TEX_B:
+ if (inst->src[qir_get_tex_uniform_src(inst)].file !=
+ QFILE_UNIF) {
+ fail_instr(c, inst,
+ "tex op missing implicit uniform");
+ }
+ break;
+
+ case QFILE_TEX_S_DIRECT:
+ if (inst->op != QOP_ADD) {
+ fail_instr(c, inst,
+ "kernel validation requires that "
+ "direct texture lookups use an ADD");
+ }
+ break;
}
- for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+ for (int i = 0; i < qir_get_nsrc(inst); i++) {
struct qreg src = inst->src[i];
switch (src.file) {
@@ -119,6 +138,11 @@ void qir_validate(struct vc4_compile *c)
case QFILE_TLB_COLOR_WRITE_MS:
case QFILE_TLB_Z_WRITE:
case QFILE_TLB_STENCIL_SETUP:
+ case QFILE_TEX_S_DIRECT:
+ case QFILE_TEX_S:
+ case QFILE_TEX_T:
+ case QFILE_TEX_R:
+ case QFILE_TEX_B:
fail_instr(c, inst, "Bad src file");
break;
}
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu.c
index 67850a811..380b9f43c 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu.c
@@ -323,6 +323,7 @@ qpu_waddr_ignores_ws(uint32_t waddr)
case QPU_W_ACC1:
case QPU_W_ACC2:
case QPU_W_ACC3:
+ case QPU_W_NOP:
case QPU_W_TLB_Z:
case QPU_W_TLB_COLOR_MS:
case QPU_W_TLB_COLOR_ALL:
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_disasm.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_disasm.c
index 529472272..9ea26455b 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_disasm.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_disasm.c
@@ -86,11 +86,11 @@ static const char *qpu_sig[] = {
static const char *qpu_pack_mul[] = {
[QPU_PACK_MUL_NOP] = "",
- [QPU_PACK_MUL_8888] = "8888",
- [QPU_PACK_MUL_8A] = "8a",
- [QPU_PACK_MUL_8B] = "8b",
- [QPU_PACK_MUL_8C] = "8c",
- [QPU_PACK_MUL_8D] = "8d",
+ [QPU_PACK_MUL_8888] = ".8888",
+ [QPU_PACK_MUL_8A] = ".8a",
+ [QPU_PACK_MUL_8B] = ".8b",
+ [QPU_PACK_MUL_8C] = ".8c",
+ [QPU_PACK_MUL_8D] = ".8d",
};
/* The QPU unpack for A and R4 files can be described the same, it's just that
@@ -264,7 +264,7 @@ get_special_write_desc(int reg, bool is_a)
void
vc4_qpu_disasm_pack_mul(FILE *out, uint32_t pack)
{
- fprintf(out, ".%s", DESC(qpu_pack_mul, pack));
+ fprintf(out, "%s", DESC(qpu_pack_mul, pack));
}
void
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_emit.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_emit.c
index 2ee52a497..aaa3a0412 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -157,7 +157,7 @@ setup_for_vpm_read(struct vc4_compile *c, struct qblock *block)
* address.
*
* In that case, we need to move one to a temporary that can be used in the
- * instruction, instead. We reserve ra31/rb31 for this purpose.
+ * instruction, instead. We reserve ra14/rb14 for this purpose.
*/
static void
fixup_raddr_conflict(struct qblock *block,
@@ -183,9 +183,9 @@ fixup_raddr_conflict(struct qblock *block,
* in case of unpacks.
*/
if (qir_is_float_input(inst))
- queue(block, qpu_a_FMAX(qpu_rb(31), *src0, *src0));
+ queue(block, qpu_a_FMAX(qpu_rb(14), *src0, *src0));
else
- queue(block, qpu_a_MOV(qpu_rb(31), *src0));
+ queue(block, qpu_a_MOV(qpu_rb(14), *src0));
/* If we had an unpack on this A-file source, we need to put
* it into this MOV, not into the later move from regfile B.
@@ -194,10 +194,10 @@ fixup_raddr_conflict(struct qblock *block,
*last_inst(block) |= *unpack;
*unpack = 0;
}
- *src0 = qpu_rb(31);
+ *src0 = qpu_rb(14);
} else {
- queue(block, qpu_a_MOV(qpu_ra(31), *src0));
- *src0 = qpu_ra(31);
+ queue(block, qpu_a_MOV(qpu_ra(14), *src0));
+ *src0 = qpu_ra(14);
}
}
@@ -226,10 +226,14 @@ static void
handle_r4_qpu_write(struct qblock *block, struct qinst *qinst,
struct qpu_reg dst)
{
- if (dst.mux != QPU_MUX_R4)
+ if (dst.mux != QPU_MUX_R4) {
queue(block, qpu_a_MOV(dst, qpu_r4()));
- else if (qinst->sf)
- queue(block, qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_r4()));
+ set_last_cond_add(block, qinst->cond);
+ } else {
+ assert(qinst->cond == QPU_COND_ALWAYS);
+ if (qinst->sf)
+ queue(block, qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_r4()));
+ }
}
static void
@@ -290,8 +294,8 @@ vc4_generate_code_block(struct vc4_compile *c,
};
uint64_t unpack = 0;
- struct qpu_reg src[4];
- for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
+ struct qpu_reg src[ARRAY_SIZE(qinst->src)];
+ for (int i = 0; i < qir_get_nsrc(qinst); i++) {
int index = qinst->src[i].index;
switch (qinst->src[i].file) {
case QFILE_NULL:
@@ -349,6 +353,11 @@ vc4_generate_code_block(struct vc4_compile *c,
case QFILE_TLB_COLOR_WRITE_MS:
case QFILE_TLB_Z_WRITE:
case QFILE_TLB_STENCIL_SETUP:
+ case QFILE_TEX_S:
+ case QFILE_TEX_S_DIRECT:
+ case QFILE_TEX_T:
+ case QFILE_TEX_R:
+ case QFILE_TEX_B:
unreachable("bad qir src file");
}
}
@@ -381,6 +390,23 @@ vc4_generate_code_block(struct vc4_compile *c,
dst = qpu_ra(QPU_W_TLB_STENCIL_SETUP);
break;
+ case QFILE_TEX_S:
+ case QFILE_TEX_S_DIRECT:
+ dst = qpu_rb(QPU_W_TMU0_S);
+ break;
+
+ case QFILE_TEX_T:
+ dst = qpu_rb(QPU_W_TMU0_T);
+ break;
+
+ case QFILE_TEX_R:
+ dst = qpu_rb(QPU_W_TMU0_R);
+ break;
+
+ case QFILE_TEX_B:
+ dst = qpu_rb(QPU_W_TMU0_B);
+ break;
+
case QFILE_VARY:
case QFILE_UNIF:
case QFILE_SMALL_IMM:
@@ -422,6 +448,7 @@ vc4_generate_code_block(struct vc4_compile *c,
}
handle_r4_qpu_write(block, qinst, dst);
+ handled_qinst_cond = true;
break;
@@ -473,33 +500,27 @@ vc4_generate_code_block(struct vc4_compile *c,
*last_inst(block) = qpu_set_sig(*last_inst(block),
QPU_SIG_COLOR_LOAD);
handle_r4_qpu_write(block, qinst, dst);
+ handled_qinst_cond = true;
break;
case QOP_VARY_ADD_C:
queue(block, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
break;
- case QOP_TEX_S:
- case QOP_TEX_T:
- case QOP_TEX_R:
- case QOP_TEX_B:
- queue(block, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
- (qinst->op - QOP_TEX_S)),
- src[0]) | unpack);
- break;
-
- case QOP_TEX_DIRECT:
- fixup_raddr_conflict(block, dst, &src[0], &src[1],
- qinst, &unpack);
- queue(block, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S),
- src[0], src[1]) | unpack);
- break;
case QOP_TEX_RESULT:
queue(block, qpu_NOP());
*last_inst(block) = qpu_set_sig(*last_inst(block),
QPU_SIG_LOAD_TMU0);
handle_r4_qpu_write(block, qinst, dst);
+ handled_qinst_cond = true;
+ break;
+
+ case QOP_THRSW:
+ queue(block, qpu_NOP());
+ *last_inst(block) = qpu_set_sig(*last_inst(block),
+ QPU_SIG_THREAD_SWITCH);
+ c->last_thrsw = last_inst(block);
break;
case QOP_BRANCH:
@@ -533,7 +554,7 @@ vc4_generate_code_block(struct vc4_compile *c,
* argument slot as well so that we don't take up
* another raddr just to get unused data.
*/
- if (qir_get_op_nsrc(qinst->op) == 1)
+ if (qir_get_non_sideband_nsrc(qinst) == 1)
src[1] = src[0];
fixup_raddr_conflict(block, dst, &src[0], &src[1],
@@ -587,6 +608,23 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
qir_for_each_block(block, c)
vc4_generate_code_block(c, block, temp_registers);
+ /* Switch the last SIG_THRSW instruction to SIG_LAST_THRSW.
+ *
+ * LAST_THRSW is a new signal in BCM2708B0 (including Raspberry Pi)
+ * that ensures that a later thread doesn't try to lock the scoreboard
+ * and terminate before an earlier-spawned thread on the same QPU, by
+ * delaying switching back to the later shader until the earlier one has
+ * finished. Otherwise, if the earlier thread was hitting the same
+ * quad, the scoreboard would deadlock.
+ */
+ if (c->last_thrsw) {
+ assert(QPU_GET_FIELD(*c->last_thrsw, QPU_SIG) ==
+ QPU_SIG_THREAD_SWITCH);
+ *c->last_thrsw = ((*c->last_thrsw & ~QPU_SIG_MASK) |
+ QPU_SET_FIELD(QPU_SIG_LAST_THREAD_SWITCH,
+ QPU_SIG));
+ }
+
uint32_t cycles = qpu_schedule_instructions(c);
uint32_t inst_count_at_schedule_time = c->qpu_inst_count;
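Promoting the final THRSW to LAST_THRSW is a masked update of the 64-bit instruction's signal field. A generic sketch of that pattern with an invented field layout (the real QPU_SIG encoding is defined in the QPU headers, not here):

#include <assert.h>
#include <stdint.h>

/* Invented 4-bit signal field in bits 60..63 of the instruction word. */
#define SIG_SHIFT 60
#define SIG_MASK  ((uint64_t)0xf << SIG_SHIFT)
#define SET_SIG(v) (((uint64_t)(v) << SIG_SHIFT) & SIG_MASK)
#define GET_SIG(x) ((uint32_t)(((x) & SIG_MASK) >> SIG_SHIFT))

enum { SIG_THREAD_SWITCH = 2, SIG_LAST_THREAD_SWITCH = 3 };

int main(void)
{
        uint64_t inst = SET_SIG(SIG_THREAD_SWITCH) | 0x123;

        /* Same shape as the update in vc4_generate_code(): clear the
         * signal bits, then OR in the new value. */
        inst = (inst & ~SIG_MASK) | SET_SIG(SIG_LAST_THREAD_SWITCH);

        assert(GET_SIG(inst) == SIG_LAST_THREAD_SWITCH);
        assert((inst & ~SIG_MASK) == 0x123); /* rest of the word intact */
        return 0;
}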
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_schedule.c
index 25adbe671..9141396c8 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_schedule.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_schedule.c
@@ -385,12 +385,27 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
switch (sig) {
case QPU_SIG_SW_BREAKPOINT:
case QPU_SIG_NONE:
- case QPU_SIG_THREAD_SWITCH:
- case QPU_SIG_LAST_THREAD_SWITCH:
case QPU_SIG_SMALL_IMM:
case QPU_SIG_LOAD_IMM:
break;
+ case QPU_SIG_THREAD_SWITCH:
+ case QPU_SIG_LAST_THREAD_SWITCH:
+ /* All accumulator contents and flags are undefined after the
+ * switch.
+ */
+ for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
+ add_write_dep(state, &state->last_r[i], n);
+ add_write_dep(state, &state->last_sf, n);
+
+ /* Scoreboard-locking operations have to stay after the last
+ * thread switch.
+ */
+ add_write_dep(state, &state->last_tlb, n);
+
+ add_write_dep(state, &state->last_tmu_write, n);
+ break;
+
case QPU_SIG_LOAD_TMU0:
case QPU_SIG_LOAD_TMU1:
/* TMU loads are coming from a FIFO, so ordering is important.
@@ -453,6 +468,7 @@ struct choose_scoreboard {
int last_sfu_write_tick;
int last_uniforms_reset_tick;
uint32_t last_waddr_a, last_waddr_b;
+ bool tlb_locked;
};
static bool
@@ -461,6 +477,11 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard, uint64_t inst)
uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
+
+ /* Full immediate loads don't read any registers. */
+ if (sig == QPU_SIG_LOAD_IMM)
+ return false;
+
uint32_t src_muxes[] = {
QPU_GET_FIELD(inst, QPU_ADD_A),
QPU_GET_FIELD(inst, QPU_ADD_B),
@@ -554,15 +575,28 @@ choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
struct schedule_node *chosen = NULL;
int chosen_prio = 0;
+ /* Don't pair up anything with a thread switch signal -- emit_thrsw()
+ * will handle pairing it along with filling the delay slots.
+ */
+ if (prev_inst) {
+ uint32_t prev_sig = QPU_GET_FIELD(prev_inst->inst->inst,
+ QPU_SIG);
+ if (prev_sig == QPU_SIG_THREAD_SWITCH ||
+ prev_sig == QPU_SIG_LAST_THREAD_SWITCH) {
+ return NULL;
+ }
+ }
+
list_for_each_entry(struct schedule_node, n, schedule_list, link) {
uint64_t inst = n->inst->inst;
+ uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
/* Don't choose the branch instruction until it's the last one
* left. XXX: We could potentially choose it before it's the
* last one, if the remaining instructions fit in the delay
* slots.
*/
- if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH &&
+ if (sig == QPU_SIG_BRANCH &&
!list_is_singular(schedule_list)) {
continue;
}
@@ -586,9 +620,25 @@ choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
* that they're compatible.
*/
if (prev_inst) {
+ /* Don't pair up a thread switch signal -- we'll
+ * handle pairing it when we pick it on its own.
+ */
+ if (sig == QPU_SIG_THREAD_SWITCH ||
+ sig == QPU_SIG_LAST_THREAD_SWITCH) {
+ continue;
+ }
+
if (prev_inst->uniform != -1 && n->uniform != -1)
continue;
+ /* Don't merge in something that will lock the TLB.
+ * Hopefully what we have in inst will release some
+ * other instructions, allowing us to delay the
+ * TLB-locking instruction until later.
+ */
+ if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
+ continue;
+
inst = qpu_merge_inst(prev_inst->inst->inst, inst);
if (!inst)
continue;
@@ -647,6 +697,9 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
waddr_mul == QPU_W_UNIFORMS_ADDRESS) {
scoreboard->last_uniforms_reset_tick = scoreboard->tick;
}
+
+ if (qpu_inst_is_tlb(inst))
+ scoreboard->tlb_locked = true;
}
static void
@@ -678,6 +731,26 @@ static uint32_t waddr_latency(uint32_t waddr, uint64_t after)
/* Apply some huge latency between texture fetch requests and getting
* their results back.
+ *
+ * FIXME: This is actually pretty bogus. If we do:
+ *
+ * mov tmu0_s, a
+ * <a bit of math>
+ * mov tmu0_s, b
+ * load_tmu0
+ * <more math>
+ * load_tmu0
+ *
+ * we count that as worse than
+ *
+ * mov tmu0_s, a
+ * mov tmu0_s, b
+ * <lots of math>
+ * load_tmu0
+ * <more math>
+ * load_tmu0
+ *
+ * because we associate the first load_tmu0 with the *second* tmu0_s.
*/
if (waddr == QPU_W_TMU0_S) {
if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU0)
@@ -768,6 +841,51 @@ mark_instruction_scheduled(struct list_head *schedule_list,
}
}
+/**
+ * Emits a THRSW/LTHRSW signal in the stream, trying to move it up to pair
+ * with another instruction.
+ */
+static void
+emit_thrsw(struct vc4_compile *c,
+ struct choose_scoreboard *scoreboard,
+ uint64_t inst)
+{
+ uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
+
+ /* There should be nothing in a thrsw inst being scheduled other than
+ * the signal bits.
+ */
+ assert(QPU_GET_FIELD(inst, QPU_OP_ADD) == QPU_A_NOP);
+ assert(QPU_GET_FIELD(inst, QPU_OP_MUL) == QPU_M_NOP);
+
+ /* Try to find an earlier scheduled instruction that we can merge the
+ * thrsw into.
+ */
+ int thrsw_ip = c->qpu_inst_count;
+ for (int i = 1; i <= MIN2(c->qpu_inst_count, 3); i++) {
+ uint64_t prev_instr = c->qpu_insts[c->qpu_inst_count - i];
+ uint32_t prev_sig = QPU_GET_FIELD(prev_instr, QPU_SIG);
+
+ if (prev_sig == QPU_SIG_NONE)
+ thrsw_ip = c->qpu_inst_count - i;
+ }
+
+ if (thrsw_ip != c->qpu_inst_count) {
+ /* Merge the thrsw into the existing instruction. */
+ c->qpu_insts[thrsw_ip] =
+ QPU_UPDATE_FIELD(c->qpu_insts[thrsw_ip], sig, QPU_SIG);
+ } else {
+ qpu_serialize_one_inst(c, inst);
+ update_scoreboard_for_chosen(scoreboard, inst);
+ }
+
+ /* Fill the delay slots. */
+ while (c->qpu_inst_count < thrsw_ip + 3) {
+ update_scoreboard_for_chosen(scoreboard, qpu_NOP());
+ qpu_serialize_one_inst(c, qpu_NOP());
+ }
+}
+
static uint32_t
schedule_instructions(struct vc4_compile *c,
struct choose_scoreboard *scoreboard,
@@ -860,10 +978,6 @@ schedule_instructions(struct vc4_compile *c,
fprintf(stderr, "\n");
}
- qpu_serialize_one_inst(c, inst);
-
- update_scoreboard_for_chosen(scoreboard, inst);
-
/* Now that we've scheduled a new instruction, some of its
* children can be promoted to the list of instructions ready to
* be scheduled. Update the children's unblocked time for this
@@ -872,6 +986,14 @@ schedule_instructions(struct vc4_compile *c,
mark_instruction_scheduled(schedule_list, time, chosen, false);
mark_instruction_scheduled(schedule_list, time, merge, false);
+ if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_THREAD_SWITCH ||
+ QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LAST_THREAD_SWITCH) {
+ emit_thrsw(c, scoreboard, inst);
+ } else {
+ qpu_serialize_one_inst(c, inst);
+ update_scoreboard_for_chosen(scoreboard, inst);
+ }
+
scoreboard->tick++;
time++;
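emit_thrsw() either folds the signal into one of the last three signal-free instructions or emits it standalone, then pads with NOPs so three delay slots follow it. A simplified, runnable model of that placement (the stream here tracks only a per-instruction signal; everything else is invented):

#include <stdio.h>

enum { SIG_NONE = 0, SIG_THRSW = 1 };

static int insts[16]; /* signal per emitted instruction */
static int count;

static void emit_thrsw(void)
{
        /* Prefer merging into the earliest of the last three
         * signal-free instructions. */
        int thrsw_ip = count;
        int window = count < 3 ? count : 3;
        for (int i = 1; i <= window; i++)
                if (insts[count - i] == SIG_NONE)
                        thrsw_ip = count - i;

        if (thrsw_ip != count)
                insts[thrsw_ip] = SIG_THRSW;    /* merged */
        else
                insts[count++] = SIG_THRSW;     /* standalone */

        while (count < thrsw_ip + 3)            /* fill delay slots */
                insts[count++] = SIG_NONE;      /* NOP */
}

int main(void)
{
        insts[count++] = SIG_NONE;
        insts[count++] = SIG_NONE;
        emit_thrsw();                           /* merges into inst 0 */
        for (int i = 0; i < count; i++)
                printf("inst %d: sig %d\n", i, insts[i]);
        return 0;
}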
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_validate.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_validate.c
index 02fadaf61..08dd6e5df 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_validate.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_validate.c
@@ -58,6 +58,10 @@ _reads_reg(uint64_t inst, uint32_t r, bool ignore_a, bool ignore_b)
if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH)
return false;
+ /* Load immediates don't read any registers. */
+ if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LOAD_IMM)
+ return false;
+
for (int i = 0; i < ARRAY_SIZE(src_regs); i++) {
if (!ignore_a &&
src_regs[i].mux == QPU_MUX_A &&
@@ -109,6 +113,7 @@ void
vc4_qpu_validate(uint64_t *insts, uint32_t num_inst)
{
bool scoreboard_locked = false;
+ bool threaded = false;
/* We don't want to do validation in release builds, but we want to
* keep compiling the validation code to make sure it doesn't get
@@ -120,11 +125,17 @@ vc4_qpu_validate(uint64_t *insts, uint32_t num_inst)
for (int i = 0; i < num_inst; i++) {
uint64_t inst = insts[i];
+ uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
- if (QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_PROG_END) {
+ if (sig != QPU_SIG_PROG_END) {
if (qpu_inst_is_tlb(inst))
scoreboard_locked = true;
+ if (sig == QPU_SIG_THREAD_SWITCH ||
+ sig == QPU_SIG_LAST_THREAD_SWITCH) {
+ threaded = true;
+ }
+
continue;
}
@@ -359,4 +370,98 @@ vc4_qpu_validate(uint64_t *insts, uint32_t num_inst)
waddr_mul == QPU_W_UNIFORMS_ADDRESS)
last_unif_pointer_update = i;
}
+
+ if (threaded) {
+ bool last_thrsw_found = false;
+ bool scoreboard_locked = false;
+ int tex_samples_outstanding = 0;
+ int last_tex_samples_outstanding = 0;
+ int thrsw_ip = -1;
+
+ for (int i = 0; i < num_inst; i++) {
+ uint64_t inst = insts[i];
+ uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
+
+ if (i == thrsw_ip) {
+ /* In order to get texture results back in the
+ * correct order, before a new thrsw we have
+ * to read all the texture results from before
+ * the previous thrsw.
+ *
+ * FIXME: Is collecting the remaining results
+ * during the delay slots OK, or should we do
+ * this at THRSW signal time?
+ */
+ if (last_tex_samples_outstanding != 0) {
+ fail_instr(inst, "THRSW with texture "
+ "results from the previous "
+ "THRSW still in the FIFO.");
+ }
+
+ last_tex_samples_outstanding =
+ tex_samples_outstanding;
+ tex_samples_outstanding = 0;
+ }
+
+ if (qpu_inst_is_tlb(inst))
+ scoreboard_locked = true;
+
+ switch (sig) {
+ case QPU_SIG_THREAD_SWITCH:
+ case QPU_SIG_LAST_THREAD_SWITCH:
+ /* No thread switching with the scoreboard
+ * locked. Doing so means we may deadlock
+ * when the other thread tries to lock
+ * scoreboard.
+ */
+ if (scoreboard_locked) {
+ fail_instr(inst, "THRSW with the "
+ "scoreboard locked.");
+ }
+
+ /* No thread switching after lthrsw, since
+ * lthrsw means that we get delayed until the
+ * other shader is ready for us to terminate.
+ */
+ if (last_thrsw_found) {
+ fail_instr(inst, "THRSW after a "
+ "previous LTHRSW");
+ }
+
+ if (sig == QPU_SIG_LAST_THREAD_SWITCH)
+ last_thrsw_found = true;
+
+ /* No THRSW while we already have a THRSW
+ * queued.
+ */
+ if (i < thrsw_ip) {
+ fail_instr(inst,
+ "THRSW with a THRSW queued.");
+ }
+
+ thrsw_ip = i + 3;
+ break;
+
+ case QPU_SIG_LOAD_TMU0:
+ case QPU_SIG_LOAD_TMU1:
+ if (last_tex_samples_outstanding == 0) {
+ fail_instr(inst, "TMU load with nothing "
+ "in the results fifo from "
+ "the previous THRSW.");
+ }
+
+ last_tex_samples_outstanding--;
+ break;
+ }
+
+ uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
+ uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
+ if (waddr_add == QPU_W_TMU0_S ||
+ waddr_add == QPU_W_TMU1_S ||
+ waddr_mul == QPU_W_TMU0_S ||
+ waddr_mul == QPU_W_TMU1_S) {
+ tex_samples_outstanding++;
+ }
+ }
+ }
}
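The new threaded-shader validation counts texture requests per THRSW window and insists the previous window's results are all collected before switching again. A compact model of that bookkeeping; it checks at signal time rather than after the delay slots, a simplification the diff's own FIXME also contemplates:

#include <stdbool.h>
#include <stdio.h>

enum ev { TMU_WRITE, TMU_LOAD, THRSW };

/* false on a load with nothing outstanding from the previous window,
 * or a switch while results are still uncollected. */
static bool validate(const enum ev *stream, int n)
{
        int outstanding = 0, prev_outstanding = 0;

        for (int i = 0; i < n; i++) {
                switch (stream[i]) {
                case TMU_WRITE:
                        outstanding++;
                        break;
                case TMU_LOAD:
                        if (prev_outstanding == 0)
                                return false;
                        prev_outstanding--;
                        break;
                case THRSW:
                        if (prev_outstanding != 0)
                                return false;
                        prev_outstanding = outstanding;
                        outstanding = 0;
                        break;
                }
        }
        return true;
}

int main(void)
{
        enum ev good[] = { TMU_WRITE, THRSW, TMU_LOAD };
        enum ev bad[]  = { TMU_WRITE, THRSW, THRSW };
        printf("good: %d, bad: %d\n", validate(good, 3), validate(bad, 3));
        return 0;
}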
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_register_allocate.c b/lib/mesa/src/gallium/drivers/vc4/vc4_register_allocate.c
index ab343ee31..506fdb593 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_register_allocate.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_register_allocate.c
@@ -115,37 +115,67 @@ vc4_alloc_reg_set(struct vc4_context *vc4)
vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs), true);
- vc4->reg_class_any = ra_alloc_reg_class(vc4->regs);
- vc4->reg_class_a_or_b_or_acc = ra_alloc_reg_class(vc4->regs);
- vc4->reg_class_r4_or_a = ra_alloc_reg_class(vc4->regs);
- vc4->reg_class_a = ra_alloc_reg_class(vc4->regs);
+ /* The physical regfiles split us into two classes, with [0] being the
+ * whole space and [1] being the bottom half (for threaded fragment
+ * shaders).
+ */
+ for (int i = 0; i < 2; i++) {
+ vc4->reg_class_any[i] = ra_alloc_reg_class(vc4->regs);
+ vc4->reg_class_a_or_b[i] = ra_alloc_reg_class(vc4->regs);
+ vc4->reg_class_a_or_b_or_acc[i] = ra_alloc_reg_class(vc4->regs);
+ vc4->reg_class_r4_or_a[i] = ra_alloc_reg_class(vc4->regs);
+ vc4->reg_class_a[i] = ra_alloc_reg_class(vc4->regs);
+ }
vc4->reg_class_r0_r3 = ra_alloc_reg_class(vc4->regs);
- for (uint32_t i = 0; i < ARRAY_SIZE(vc4_regs); i++) {
- /* Reserve ra31/rb31 for spilling fixup_raddr_conflict() in
+
+ /* r0-r3 */
+ for (uint32_t i = ACC_INDEX; i < ACC_INDEX + 4; i++) {
+ ra_class_add_reg(vc4->regs, vc4->reg_class_r0_r3, i);
+ ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[0], i);
+ ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[1], i);
+ }
+
+ /* R4 gets a special class because it can't be written as a general
+ * purpose register. (it's TMU_NOSWAP as a write address).
+ */
+ for (int i = 0; i < 2; i++) {
+ ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a[i],
+ ACC_INDEX + 4);
+ ra_class_add_reg(vc4->regs, vc4->reg_class_any[i],
+ ACC_INDEX + 4);
+ }
+
+ /* A/B */
+ for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i ++) {
+ /* Reserve ra14/rb14 for spilling fixup_raddr_conflict() in
* vc4_qpu_emit.c
*/
- if (vc4_regs[i].addr == 31)
+ if (vc4_regs[i].addr == 14)
continue;
- /* R4 can't be written as a general purpose register. (it's
- * TMU_NOSWAP as a write address).
- */
- if (vc4_regs[i].mux == QPU_MUX_R4) {
- ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i);
- ra_class_add_reg(vc4->regs, vc4->reg_class_any, i);
- continue;
+ ra_class_add_reg(vc4->regs, vc4->reg_class_any[0], i);
+ ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b[0], i);
+ ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[0], i);
+
+ if (vc4_regs[i].addr < 16) {
+ ra_class_add_reg(vc4->regs, vc4->reg_class_any[1], i);
+ ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b[1], i);
+ ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[1], i);
}
- if (vc4_regs[i].mux <= QPU_MUX_R3)
- ra_class_add_reg(vc4->regs, vc4->reg_class_r0_r3, i);
- ra_class_add_reg(vc4->regs, vc4->reg_class_any, i);
- ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc, i);
- }
+ /* A only */
+ if (((i - AB_INDEX) & 1) == 0) {
+ ra_class_add_reg(vc4->regs, vc4->reg_class_a[0], i);
+ ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a[0], i);
- for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i += 2) {
- ra_class_add_reg(vc4->regs, vc4->reg_class_a, i);
- ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i);
+ if (vc4_regs[i].addr < 16) {
+ ra_class_add_reg(vc4->regs,
+ vc4->reg_class_a[1], i);
+ ra_class_add_reg(vc4->regs,
+ vc4->reg_class_r4_or_a[1], i);
+ }
+ }
}
ra_set_finalize(vc4->regs, NULL);
@@ -166,7 +196,7 @@ node_to_temp_priority(const void *in_a, const void *in_b)
}
#define CLASS_BIT_A (1 << 0)
-#define CLASS_BIT_B_OR_ACC (1 << 1)
+#define CLASS_BIT_B (1 << 1)
#define CLASS_BIT_R4 (1 << 2)
#define CLASS_BIT_R0_R3 (1 << 4)
@@ -212,7 +242,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
* incrementally remove bits that the temp definitely can't be in.
*/
memset(class_bits,
- CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4,
+ CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R4 | CLASS_BIT_R0_R3,
sizeof(class_bits));
int ip = 0;
@@ -226,6 +256,14 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
if (c->temp_start[i] < ip && c->temp_end[i] > ip)
class_bits[i] &= ~CLASS_BIT_R4;
}
+
+			/* If we're doing a conditional write of something
+			 * writing R4 (math, tex results), then make sure the
+			 * destination ends up somewhere other than R4, so
+			 * that the result really is moved conditionally.
+			 */
+ if (inst->cond != QPU_COND_ALWAYS)
+ class_bits[inst->dst.index] &= ~CLASS_BIT_R4;
} else {
/* R4 can't be written as a general purpose
* register. (it's TMU_NOSWAP as a write address).
@@ -250,6 +288,17 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
class_bits[inst->src[0].index] &= CLASS_BIT_R0_R3;
break;
+ case QOP_THRSW:
+ /* All accumulators are invalidated across a thread
+ * switch.
+ */
+ for (int i = 0; i < c->num_temps; i++) {
+ if (c->temp_start[i] < ip && c->temp_end[i] > ip)
+ class_bits[i] &= ~(CLASS_BIT_R0_R3 |
+ CLASS_BIT_R4);
+ }
+ break;
+
default:
break;
}
@@ -265,7 +314,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
* can only be done from regfile A, while float unpacks can be
* either A or R4.
*/
- for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+ for (int i = 0; i < qir_get_nsrc(inst); i++) {
if (inst->src[i].file == QFILE_TEMP &&
inst->src[i].pack) {
if (qir_is_float_input(inst)) {
@@ -285,22 +334,40 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
int node = temp_to_node[i];
switch (class_bits[i]) {
- case CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4:
- ra_set_node_class(g, node, vc4->reg_class_any);
+ case CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R4 | CLASS_BIT_R0_R3:
+ ra_set_node_class(g, node,
+ vc4->reg_class_any[c->fs_threaded]);
break;
- case CLASS_BIT_A | CLASS_BIT_B_OR_ACC:
- ra_set_node_class(g, node, vc4->reg_class_a_or_b_or_acc);
+ case CLASS_BIT_A | CLASS_BIT_B:
+ ra_set_node_class(g, node,
+ vc4->reg_class_a_or_b[c->fs_threaded]);
+ break;
+ case CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R0_R3:
+ ra_set_node_class(g, node,
+ vc4->reg_class_a_or_b_or_acc[c->fs_threaded]);
break;
case CLASS_BIT_A | CLASS_BIT_R4:
- ra_set_node_class(g, node, vc4->reg_class_r4_or_a);
+ ra_set_node_class(g, node,
+ vc4->reg_class_r4_or_a[c->fs_threaded]);
break;
case CLASS_BIT_A:
- ra_set_node_class(g, node, vc4->reg_class_a);
+ ra_set_node_class(g, node,
+ vc4->reg_class_a[c->fs_threaded]);
break;
case CLASS_BIT_R0_R3:
ra_set_node_class(g, node, vc4->reg_class_r0_r3);
break;
+
default:
+			/* DDX/DDY used across a thread switch might get us
+ * here.
+ */
+ if (c->fs_threaded) {
+ c->failed = true;
+ free(temp_registers);
+ return NULL;
+ }
+
fprintf(stderr, "temp %d: bad class bits: 0x%x\n",
i, class_bits[i]);
abort();
@@ -321,9 +388,13 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
bool ok = ra_allocate(g);
if (!ok) {
- fprintf(stderr, "Failed to register allocate:\n");
- qir_dump(c);
+ if (!c->fs_threaded) {
+ fprintf(stderr, "Failed to register allocate:\n");
+ qir_dump(c);
+ }
+
c->failed = true;
+ free(temp_registers);
return NULL;
}
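
The register-allocate hunks above share one pattern: walk the instructions
and, for every temp live across a hazard, strip the class bits that hazard
invalidates (R4 when something else writes R4 while the temp is live, and all
accumulators across a QOP_THRSW).  A minimal standalone sketch of that
narrowing step, under the same CLASS_BIT_* definitions:

#include <stdint.h>

#define CLASS_BIT_A      (1 << 0)
#define CLASS_BIT_B      (1 << 1)
#define CLASS_BIT_R4     (1 << 2)
#define CLASS_BIT_R0_R3  (1 << 4)

/* Strip "lost" class bits from every temp whose live range spans ip. */
static void
narrow_live_temps(uint8_t *class_bits,
                  const int *temp_start, const int *temp_end,
                  int num_temps, int ip, uint8_t lost)
{
        for (int i = 0; i < num_temps; i++) {
                if (temp_start[i] < ip && temp_end[i] > ip)
                        class_bits[i] &= ~lost;
        }
}

/* An R4 write at ip:  narrow_live_temps(..., ip, CLASS_BIT_R4);
 * A QOP_THRSW at ip:  narrow_live_temps(..., ip,
 *                                       CLASS_BIT_R0_R3 | CLASS_BIT_R4);
 */
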
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_reorder_uniforms.c b/lib/mesa/src/gallium/drivers/vc4/vc4_reorder_uniforms.c
index 7d5076f42..37acefdc0 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_reorder_uniforms.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_reorder_uniforms.c
@@ -46,7 +46,7 @@ qir_reorder_uniforms(struct vc4_compile *c)
qir_for_each_inst_inorder(inst, c) {
uint32_t new = ~0;
- for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+ for (int i = 0; i < qir_get_nsrc(inst); i++) {
if (inst->src[i].file != QFILE_UNIF)
continue;
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_resource.c b/lib/mesa/src/gallium/drivers/vc4/vc4_resource.c
index 704cd71ea..596f73dfb 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_resource.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_resource.c
@@ -165,7 +165,8 @@ vc4_resource_transfer_map(struct pipe_context *pctx,
prsc->width0 == box->width &&
prsc->height0 == box->height &&
prsc->depth0 == box->depth &&
- prsc->array_size == 1) {
+ prsc->array_size == 1 &&
+ rsc->bo->private) {
usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE;
}
@@ -283,6 +284,20 @@ vc4_resource_transfer_map(struct pipe_context *pctx,
if (usage & PIPE_TRANSFER_MAP_DIRECTLY)
return NULL;
+ if (format == PIPE_FORMAT_ETC1_RGB8) {
+ /* ETC1 is arranged as 64-bit blocks, where each block
+ * is 4x4 pixels. Texture tiling operates on the
+ * 64-bit block the way it would an uncompressed
+		 * pixel.
+ */
+ assert(!(ptrans->box.x & 3));
+ assert(!(ptrans->box.y & 3));
+ ptrans->box.x >>= 2;
+ ptrans->box.y >>= 2;
+ ptrans->box.width = (ptrans->box.width + 3) >> 2;
+ ptrans->box.height = (ptrans->box.height + 3) >> 2;
+ }
+
/* We need to align the box to utile boundaries, since that's
* what load/store operates on. This may cause us to need to
* read out the original contents in that border area. Right
@@ -387,6 +402,11 @@ vc4_setup_slices(struct vc4_resource *rsc)
struct pipe_resource *prsc = &rsc->base.b;
uint32_t width = prsc->width0;
uint32_t height = prsc->height0;
+ if (prsc->format == PIPE_FORMAT_ETC1_RGB8) {
+ width = (width + 3) >> 2;
+ height = (height + 3) >> 2;
+ }
+
uint32_t pot_width = util_next_power_of_two(width);
uint32_t pot_height = util_next_power_of_two(height);
uint32_t offset = 0;
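
Both ETC1 hunks above make the same coordinate change: the format stores
4x4-pixel, 64-bit blocks, and the tiling code treats each block as if it were
a single 8-byte pixel, so pixel coordinates are converted to block units
first.  A 13x5-pixel region at a 4-aligned origin, for example, becomes a
4x2-block region.  A minimal sketch of the conversion (assuming, as the
asserts above do, block-aligned origins):

#include <assert.h>

/* Sketch: convert a pixel box to ETC1 block units in place. */
static void
etc1_pixels_to_blocks(int *x, int *y, int *width, int *height)
{
        assert(!(*x & 3) && !(*y & 3)); /* origins must be 4-aligned */
        *x >>= 2;
        *y >>= 2;
        *width = (*width + 3) >> 2;     /* round partial blocks up */
        *height = (*height + 3) >> 2;
}
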
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_screen.c b/lib/mesa/src/gallium/drivers/vc4/vc4_screen.c
index 72fd09aee..27d23dc96 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_screen.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_screen.c
@@ -123,9 +123,9 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_TEXTURE_SHADOW_MAP:
case PIPE_CAP_BLEND_EQUATION_SEPARATE:
case PIPE_CAP_TWO_SIDED_STENCIL:
- case PIPE_CAP_USER_INDEX_BUFFERS:
case PIPE_CAP_TEXTURE_MULTISAMPLE:
case PIPE_CAP_TEXTURE_SWIZZLE:
+ case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY:
return 1;
/* lying for GL 2.0 */
@@ -225,8 +225,8 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_STRING_MARKER:
case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
case PIPE_CAP_QUERY_BUFFER_OBJECT:
- case PIPE_CAP_QUERY_MEMORY_INFO:
- case PIPE_CAP_PCI_GROUP:
+ case PIPE_CAP_QUERY_MEMORY_INFO:
+ case PIPE_CAP_PCI_GROUP:
case PIPE_CAP_PCI_BUS:
case PIPE_CAP_PCI_DEVICE:
case PIPE_CAP_PCI_FUNCTION:
@@ -239,11 +239,25 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_POLYGON_OFFSET_UNITS_UNSCALED:
case PIPE_CAP_VIEWPORT_SUBPIXEL_BITS:
case PIPE_CAP_TGSI_ARRAY_COMPONENTS:
+ case PIPE_CAP_TGSI_CAN_READ_OUTPUTS:
+ case PIPE_CAP_NATIVE_FENCE_FD:
+ case PIPE_CAP_TGSI_FS_FBFETCH:
+ case PIPE_CAP_TGSI_MUL_ZERO_WINS:
+ case PIPE_CAP_DOUBLES:
+ case PIPE_CAP_INT64:
+ case PIPE_CAP_INT64_DIVMOD:
+ case PIPE_CAP_TGSI_TEX_TXF_LZ:
+ case PIPE_CAP_TGSI_CLOCK:
+ case PIPE_CAP_POLYGON_MODE_FILL_RECTANGLE:
+ case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE:
+ case PIPE_CAP_TGSI_BALLOT:
+ case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT:
return 0;
/* Stream output. */
case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
+ case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS:
case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
return 0;
@@ -336,8 +350,9 @@ vc4_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param)
}
static int
-vc4_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
- enum pipe_shader_cap param)
+vc4_screen_get_shader_param(struct pipe_screen *pscreen,
+ enum pipe_shader_type shader,
+ enum pipe_shader_cap param)
{
if (shader != PIPE_SHADER_VERTEX &&
shader != PIPE_SHADER_FRAGMENT) {
@@ -356,10 +371,7 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
return vc4_screen(pscreen)->has_control_flow;
case PIPE_SHADER_CAP_MAX_INPUTS:
- if (shader == PIPE_SHADER_FRAGMENT)
- return 8;
- else
- return 16;
+ return 8;
case PIPE_SHADER_CAP_MAX_OUTPUTS:
return shader == PIPE_SHADER_FRAGMENT ? 1 : 8;
case PIPE_SHADER_CAP_MAX_TEMPS:
@@ -368,8 +380,6 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
return 16 * 1024 * sizeof(float);
case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
return 1;
- case PIPE_SHADER_CAP_MAX_PREDS:
- return 0; /* nothing uses this */
case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
return 0;
case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
@@ -384,7 +394,6 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
return 0;
case PIPE_SHADER_CAP_INTEGERS:
return 1;
- case PIPE_SHADER_CAP_DOUBLES:
case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
@@ -401,6 +410,7 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
return 32;
case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
+ case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
return 0;
default:
fprintf(stderr, "unknown shader param %d\n", param);
@@ -416,6 +426,7 @@ vc4_screen_is_format_supported(struct pipe_screen *pscreen,
unsigned sample_count,
unsigned usage)
{
+ struct vc4_screen *screen = vc4_screen(pscreen);
unsigned retval = 0;
if (sample_count > 1 && sample_count != VC4_MAX_SAMPLES)
@@ -485,7 +496,8 @@ vc4_screen_is_format_supported(struct pipe_screen *pscreen,
}
if ((usage & PIPE_BIND_SAMPLER_VIEW) &&
- vc4_tex_format_supported(format)) {
+ vc4_tex_format_supported(format) &&
+ (format != PIPE_FORMAT_ETC1_RGB8 || screen->has_etc1)) {
retval |= PIPE_BIND_SAMPLER_VIEW;
}
@@ -526,16 +538,12 @@ static int handle_compare(void *key1, void *key2)
}
static bool
-vc4_supports_branches(struct vc4_screen *screen)
+vc4_has_feature(struct vc4_screen *screen, uint32_t feature)
{
-#if USE_VC4_SIMULATOR
- return true;
-#endif
-
struct drm_vc4_get_param p = {
- .param = DRM_VC4_PARAM_SUPPORTS_BRANCHES,
+ .param = feature,
};
- int ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_GET_PARAM, &p);
+ int ret = vc4_ioctl(screen->fd, DRM_IOCTL_VC4_GET_PARAM, &p);
if (ret != 0)
return false;
@@ -546,11 +554,6 @@ vc4_supports_branches(struct vc4_screen *screen)
static bool
vc4_get_chip_info(struct vc4_screen *screen)
{
-#if USE_VC4_SIMULATOR
- screen->v3d_ver = 21;
- return true;
-#endif
-
struct drm_vc4_get_param ident0 = {
.param = DRM_VC4_PARAM_V3D_IDENT0,
};
@@ -559,7 +562,7 @@ vc4_get_chip_info(struct vc4_screen *screen)
};
int ret;
- ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_GET_PARAM, &ident0);
+ ret = vc4_ioctl(screen->fd, DRM_IOCTL_VC4_GET_PARAM, &ident0);
if (ret != 0) {
if (errno == EINVAL) {
/* Backwards compatibility with 2835 kernels which
@@ -573,7 +576,7 @@ vc4_get_chip_info(struct vc4_screen *screen)
return false;
}
}
- ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_GET_PARAM, &ident1);
+ ret = vc4_ioctl(screen->fd, DRM_IOCTL_VC4_GET_PARAM, &ident1);
if (ret != 0) {
fprintf(stderr, "Couldn't get V3D IDENT1: %s\n",
strerror(errno));
@@ -612,11 +615,15 @@ vc4_screen_create(int fd)
screen->fd = fd;
list_inithead(&screen->bo_cache.time_list);
- pipe_mutex_init(screen->bo_handles_mutex);
+ (void) mtx_init(&screen->bo_handles_mutex, mtx_plain);
screen->bo_handles = util_hash_table_create(handle_hash, handle_compare);
- if (vc4_supports_branches(screen))
- screen->has_control_flow = true;
+ screen->has_control_flow =
+ vc4_has_feature(screen, DRM_VC4_PARAM_SUPPORTS_BRANCHES);
+ screen->has_etc1 =
+ vc4_has_feature(screen, DRM_VC4_PARAM_SUPPORTS_ETC1);
+ screen->has_threaded_fs =
+ vc4_has_feature(screen, DRM_VC4_PARAM_SUPPORTS_THREADED_FS);
if (!vc4_get_chip_info(screen))
goto fail;
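
The drmIoctl() -> vc4_ioctl() switch above is what lets the explicit
USE_VC4_SIMULATOR special cases disappear from this file: the wrapper routes
each call either to the kernel or to vc4_simulator_ioctl() (added in the
simulator diff below).  The wrapper itself is not part of the hunks shown
here; presumably it is a thin dispatch along these lines:

#include "xf86drm.h"

int vc4_simulator_ioctl(int fd, unsigned long request, void *args);

/* Assumed sketch of the dispatch; the real helper lives elsewhere in the
 * driver.
 */
static inline int
vc4_ioctl(int fd, unsigned long request, void *args)
{
#if USE_VC4_SIMULATOR
        return vc4_simulator_ioctl(fd, request, args);
#else
        return drmIoctl(fd, request, args);
#endif
}
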
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_screen.h b/lib/mesa/src/gallium/drivers/vc4/vc4_screen.h
index 16003cfcc..34d15381a 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_screen.h
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_screen.h
@@ -30,6 +30,10 @@
#include "util/list.h"
#include "util/slab.h"
+#ifndef DRM_VC4_PARAM_SUPPORTS_ETC1
+#define DRM_VC4_PARAM_SUPPORTS_ETC1 4
+#endif
+
struct vc4_bo;
#define VC4_DEBUG_CL 0x0001
@@ -47,6 +51,8 @@ struct vc4_bo;
#define VC4_MAX_MIP_LEVELS 12
#define VC4_MAX_TEXTURE_SAMPLERS 16
+struct vc4_simulator_file;
+
struct vc4_screen {
struct pipe_screen base;
int fd;
@@ -55,9 +61,6 @@ struct vc4_screen {
const char *name;
- void *simulator_mem_base;
- uint32_t simulator_mem_size;
-
/** The last seqno we've completed a wait for.
*
* This lets us slightly optimize our waits by skipping wait syscalls
@@ -74,18 +77,22 @@ struct vc4_screen {
struct list_head *size_list;
uint32_t size_list_size;
- pipe_mutex lock;
+ mtx_t lock;
uint32_t bo_size;
uint32_t bo_count;
} bo_cache;
struct util_hash_table *bo_handles;
- pipe_mutex bo_handles_mutex;
+ mtx_t bo_handles_mutex;
uint32_t bo_size;
uint32_t bo_count;
bool has_control_flow;
+ bool has_etc1;
+ bool has_threaded_fs;
+
+ struct vc4_simulator_file *sim_file;
};
static inline struct vc4_screen *
@@ -105,7 +112,8 @@ vc4_screen_bo_from_handle(struct pipe_screen *pscreen,
const void *
vc4_screen_get_compiler_options(struct pipe_screen *pscreen,
- enum pipe_shader_ir ir, unsigned shader);
+ enum pipe_shader_ir ir,
+ enum pipe_shader_type shader);
extern uint32_t vc4_debug;
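
The #ifndef fallback above keeps the build working against kernel headers
that predate the ETC1 param.  The same guard pattern would apply to any later
param; for instance (the value below matches what the kernel's vc4_drm.h
assigns, shown here only to illustrate the pattern):

/* Illustrative only: fallback define for building against old headers. */
#ifndef DRM_VC4_PARAM_SUPPORTS_THREADED_FS
#define DRM_VC4_PARAM_SUPPORTS_THREADED_FS 5
#endif
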
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c b/lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c
index 0291a4e14..9565c49ef 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c
@@ -21,9 +21,37 @@
* IN THE SOFTWARE.
*/
+/**
+ * @file vc4_simulator.c
+ *
+ * Implements VC4 simulation on top of a non-VC4 GEM fd.
+ *
+ * This file's goal is to emulate the VC4 ioctls' behavior in the kernel on
+ * top of the simpenrose software simulator. Generally, VC4 driver BOs have a
+ * GEM-side copy of their contents and a simulator-side memory area that the
+ * GEM contents get copied into during simulation. Once simulation is done,
+ * the simulator's data is copied back out to the GEM BOs, so that rendering
+ * appears on the screen as if actual hardware rendering had been done.
+ *
+ * One of the limitations of this code is that we shouldn't really need a
+ * One of the limitations of this code is that we create a GEM-side BO even
+ * for non-window-system BOs, which shouldn't really be needed.  However, we
+ * do need unique BO
+ * from the handle passed in at submit ioctl time (also, a couple of places
+ * outside of this file still call ioctls directly on the fd).
+ *
+ * Another limitation is that BO import doesn't work unless the underlying
+ * window system's BO size matches what VC4 is going to use, which of course
+ * doesn't work out in practice. This means that for now, only DRI3 (VC4
+ * makes the winsys BOs) is supported, not DRI2 (window system makes the winys
+ * makes the winsys BOs) is supported, not DRI2 (window system makes the winsys
+ */
+
#ifdef USE_VC4_SIMULATOR
+#include <sys/mman.h>
+#include "xf86drm.h"
#include "util/u_memory.h"
+#include "util/u_mm.h"
#include "util/ralloc.h"
#include "vc4_screen.h"
@@ -32,53 +60,160 @@
#include "vc4_simulator_validate.h"
#include "simpenrose/simpenrose.h"
-static mtx_t exec_mutex = _MTX_INITIALIZER_NP;
+/** Global (across GEM fds) state for the simulator */
+static struct vc4_simulator_state {
+ mtx_t mutex;
+
+ void *mem;
+ ssize_t mem_size;
+ struct mem_block *heap;
+ struct mem_block *overflow;
+
+ /** Mapping from GEM handle to struct vc4_simulator_bo * */
+ struct hash_table *fd_map;
+
+ int refcount;
+} sim_state = {
+ .mutex = _MTX_INITIALIZER_NP,
+};
+
+/** Per-GEM-fd state for the simulator. */
+struct vc4_simulator_file {
+ int fd;
+
+ /* This is weird -- we make a "vc4_device" per file, even though on
+ * the kernel side this is a global. We do this so that kernel code
+ * calling us for BO allocation can get to our screen.
+ */
+ struct drm_device dev;
+
+ /** Mapping from GEM handle to struct vc4_simulator_bo * */
+ struct hash_table *bo_map;
+};
+
+/** Wrapper for drm_vc4_bo tracking the simulator-specific state. */
+struct vc4_simulator_bo {
+ struct drm_vc4_bo base;
+ struct vc4_simulator_file *file;
+
+ /** Area for this BO within sim_state->mem */
+ struct mem_block *block;
+ void *winsys_map;
+ uint32_t winsys_stride;
+
+ int handle;
+};
+
+static void *
+int_to_key(int key)
+{
+ return (void *)(uintptr_t)key;
+}
+
+static struct vc4_simulator_file *
+vc4_get_simulator_file_for_fd(int fd)
+{
+ struct hash_entry *entry = _mesa_hash_table_search(sim_state.fd_map,
+ int_to_key(fd + 1));
+ return entry ? entry->data : NULL;
+}
/* A marker placed just after each BO, then checked after rendering to make
* sure it's still there.
*/
#define BO_SENTINEL 0xfedcba98
-#define OVERFLOW_SIZE (32 * 1024 * 1024)
+#define PAGE_ALIGN2 12
-static struct drm_gem_cma_object *
-vc4_wrap_bo_with_cma(struct drm_device *dev, struct vc4_bo *bo)
+/**
+ * Allocates space in simulator memory and returns a tracking struct for it
+ * that also contains the drm_gem_cma_object struct.
+ */
+static struct vc4_simulator_bo *
+vc4_create_simulator_bo(int fd, int handle, unsigned size)
{
- struct vc4_context *vc4 = dev->vc4;
- struct vc4_screen *screen = vc4->screen;
- struct drm_vc4_bo *drm_bo = CALLOC_STRUCT(drm_vc4_bo);
- struct drm_gem_cma_object *obj = &drm_bo->base;
- uint32_t size = align(bo->size, 4096);
+ struct vc4_simulator_file *file = vc4_get_simulator_file_for_fd(fd);
+ struct vc4_simulator_bo *sim_bo = rzalloc(file,
+ struct vc4_simulator_bo);
+ struct drm_vc4_bo *bo = &sim_bo->base;
+ struct drm_gem_cma_object *obj = &bo->base;
+ size = align(size, 4096);
+
+ sim_bo->file = file;
+ sim_bo->handle = handle;
+
+ mtx_lock(&sim_state.mutex);
+ sim_bo->block = u_mmAllocMem(sim_state.heap, size + 4, PAGE_ALIGN2, 0);
+ mtx_unlock(&sim_state.mutex);
+ assert(sim_bo->block);
- drm_bo->bo = bo;
obj->base.size = size;
- obj->base.dev = dev;
- obj->vaddr = screen->simulator_mem_base + dev->simulator_mem_next;
+ obj->base.dev = &file->dev;
+ obj->vaddr = sim_state.mem + sim_bo->block->ofs;
obj->paddr = simpenrose_hw_addr(obj->vaddr);
- dev->simulator_mem_next += size + sizeof(uint32_t);
- dev->simulator_mem_next = align(dev->simulator_mem_next, 4096);
- assert(dev->simulator_mem_next <= screen->simulator_mem_size);
+ *(uint32_t *)(obj->vaddr + size) = BO_SENTINEL;
- *(uint32_t *)(obj->vaddr + bo->size) = BO_SENTINEL;
+ /* A handle of 0 is used for vc4_gem.c internal allocations that
+ * don't need to go in the lookup table.
+ */
+ if (handle != 0) {
+ mtx_lock(&sim_state.mutex);
+ _mesa_hash_table_insert(file->bo_map, int_to_key(handle), bo);
+ mtx_unlock(&sim_state.mutex);
+ }
+
+ return sim_bo;
+}
- return obj;
+static void
+vc4_free_simulator_bo(struct vc4_simulator_bo *sim_bo)
+{
+ struct vc4_simulator_file *sim_file = sim_bo->file;
+ struct drm_vc4_bo *bo = &sim_bo->base;
+ struct drm_gem_cma_object *obj = &bo->base;
+
+ if (sim_bo->winsys_map)
+ munmap(sim_bo->winsys_map, obj->base.size);
+
+ mtx_lock(&sim_state.mutex);
+ u_mmFreeMem(sim_bo->block);
+ if (sim_bo->handle) {
+ struct hash_entry *entry =
+ _mesa_hash_table_search(sim_file->bo_map,
+ int_to_key(sim_bo->handle));
+ _mesa_hash_table_remove(sim_file->bo_map, entry);
+ }
+ mtx_unlock(&sim_state.mutex);
+ ralloc_free(sim_bo);
+}
+
+static struct vc4_simulator_bo *
+vc4_get_simulator_bo(struct vc4_simulator_file *file, int gem_handle)
+{
+ mtx_lock(&sim_state.mutex);
+ struct hash_entry *entry =
+ _mesa_hash_table_search(file->bo_map, int_to_key(gem_handle));
+ mtx_unlock(&sim_state.mutex);
+
+ return entry ? entry->data : NULL;
}
struct drm_gem_cma_object *
drm_gem_cma_create(struct drm_device *dev, size_t size)
{
- struct vc4_context *vc4 = dev->vc4;
- struct vc4_screen *screen = vc4->screen;
-
- struct vc4_bo *bo = vc4_bo_alloc(screen, size, "simulator validate");
- return vc4_wrap_bo_with_cma(dev, bo);
+ struct vc4_screen *screen = dev->screen;
+ struct vc4_simulator_bo *sim_bo = vc4_create_simulator_bo(screen->fd,
+ 0, size);
+ return &sim_bo->base.base;
}
static int
vc4_simulator_pin_bos(struct drm_device *dev, struct vc4_job *job,
struct vc4_exec_info *exec)
{
+ int fd = dev->screen->fd;
+ struct vc4_simulator_file *file = vc4_get_simulator_file_for_fd(fd);
struct drm_vc4_submit_cl *args = exec->args;
struct vc4_bo **bos = job->bo_pointers.base;
@@ -86,9 +221,12 @@ vc4_simulator_pin_bos(struct drm_device *dev, struct vc4_job *job,
exec->bo = calloc(exec->bo_count, sizeof(void *));
for (int i = 0; i < exec->bo_count; i++) {
struct vc4_bo *bo = bos[i];
- struct drm_gem_cma_object *obj = vc4_wrap_bo_with_cma(dev, bo);
+ struct vc4_simulator_bo *sim_bo =
+ vc4_get_simulator_bo(file, bo->handle);
+ struct drm_vc4_bo *drm_bo = &sim_bo->base;
+ struct drm_gem_cma_object *obj = &drm_bo->base;
- struct drm_vc4_bo *drm_bo = to_vc4_bo(&obj->base);
+ drm_bo->bo = bo;
#if 0
fprintf(stderr, "bo hindex %d: %s\n", i, bo->name);
#endif
@@ -118,14 +256,14 @@ vc4_simulator_unpin_bos(struct vc4_exec_info *exec)
struct drm_vc4_bo *drm_bo = to_vc4_bo(&obj->base);
struct vc4_bo *bo = drm_bo->bo;
- assert(*(uint32_t *)(obj->vaddr + bo->size) == BO_SENTINEL);
+ assert(*(uint32_t *)(obj->vaddr +
+ obj->base.size) == BO_SENTINEL);
memcpy(bo->map, obj->vaddr, bo->size);
if (drm_bo->validated_shader) {
free(drm_bo->validated_shader->texture_samples);
free(drm_bo->validated_shader);
}
- free(obj);
}
free(exec->bo);
@@ -194,8 +332,8 @@ vc4_dump_to_file(struct vc4_exec_info *exec)
/* Add the static overflow memory area. */
bo_state[i].handle = exec->bo_count;
- bo_state[i].paddr = 0;
- bo_state[i].size = OVERFLOW_SIZE;
+ bo_state[i].paddr = sim_state.overflow->ofs;
+ bo_state[i].size = sim_state.overflow->size;
i++;
fwrite(bo_state, sizeof(*bo_state), state->bo_count, f);
@@ -211,8 +349,8 @@ vc4_dump_to_file(struct vc4_exec_info *exec)
fwrite(cma_bo->vaddr, cma_bo->base.size, 1, f);
}
- void *overflow = calloc(1, OVERFLOW_SIZE);
- fwrite(overflow, 1, OVERFLOW_SIZE, f);
+ void *overflow = calloc(1, sim_state.overflow->size);
+ fwrite(overflow, 1, sim_state.overflow->size, f);
free(overflow);
free(state);
@@ -225,23 +363,22 @@ vc4_simulator_flush(struct vc4_context *vc4,
struct drm_vc4_submit_cl *args, struct vc4_job *job)
{
struct vc4_screen *screen = vc4->screen;
+ int fd = screen->fd;
+ struct vc4_simulator_file *file = vc4_get_simulator_file_for_fd(fd);
struct vc4_surface *csurf = vc4_surface(vc4->framebuffer.cbufs[0]);
struct vc4_resource *ctex = csurf ? vc4_resource(csurf->base.texture) : NULL;
- uint32_t winsys_stride = ctex ? ctex->bo->simulator_winsys_stride : 0;
+ struct vc4_simulator_bo *csim_bo = ctex ? vc4_get_simulator_bo(file, ctex->bo->handle) : NULL;
+ uint32_t winsys_stride = ctex ? csim_bo->winsys_stride : 0;
uint32_t sim_stride = ctex ? ctex->slices[0].stride : 0;
uint32_t row_len = MIN2(sim_stride, winsys_stride);
struct vc4_exec_info exec;
- struct drm_device local_dev = {
- .vc4 = vc4,
- .simulator_mem_next = OVERFLOW_SIZE,
- };
- struct drm_device *dev = &local_dev;
+ struct drm_device *dev = &file->dev;
int ret;
memset(&exec, 0, sizeof(exec));
list_inithead(&exec.unref_list);
- if (ctex && ctex->bo->simulator_winsys_map) {
+ if (ctex && csim_bo->winsys_map) {
#if 0
fprintf(stderr, "%dx%d %d %d %d\n",
ctex->base.b.width0, ctex->base.b.height0,
@@ -252,7 +389,7 @@ vc4_simulator_flush(struct vc4_context *vc4,
for (int y = 0; y < ctex->base.b.height0; y++) {
memcpy(ctex->bo->map + y * sim_stride,
- ctex->bo->simulator_winsys_map + y * winsys_stride,
+ csim_bo->winsys_map + y * winsys_stride,
row_len);
}
}
@@ -269,7 +406,7 @@ vc4_simulator_flush(struct vc4_context *vc4,
if (vc4_debug & VC4_DEBUG_CL) {
fprintf(stderr, "RCL:\n");
- vc4_dump_cl(screen->simulator_mem_base + exec.ct1ca,
+ vc4_dump_cl(sim_state.mem + exec.ct1ca,
exec.ct1ea - exec.ct1ca, true);
}
@@ -281,7 +418,7 @@ vc4_simulator_flush(struct vc4_context *vc4,
fprintf(stderr, "Binning returned %d flushes, should be 1.\n",
bfc);
fprintf(stderr, "Relocated binning command list:\n");
- vc4_dump_cl(screen->simulator_mem_base + exec.ct0ca,
+ vc4_dump_cl(sim_state.mem + exec.ct0ca,
exec.ct0ea - exec.ct0ca, false);
abort();
}
@@ -291,7 +428,7 @@ vc4_simulator_flush(struct vc4_context *vc4,
fprintf(stderr, "Rendering returned %d frames, should be 1.\n",
rfc);
fprintf(stderr, "Relocated render command list:\n");
- vc4_dump_cl(screen->simulator_mem_base + exec.ct1ca,
+ vc4_dump_cl(sim_state.mem + exec.ct1ca,
exec.ct1ea - exec.ct1ca, true);
abort();
}
@@ -302,16 +439,17 @@ vc4_simulator_flush(struct vc4_context *vc4,
list_for_each_entry_safe(struct drm_vc4_bo, bo, &exec.unref_list,
unref_head) {
+ struct vc4_simulator_bo *sim_bo = (struct vc4_simulator_bo *)bo;
+ struct drm_gem_cma_object *obj = &sim_bo->base.base;
list_del(&bo->unref_head);
- assert(*(uint32_t *)(bo->base.vaddr + bo->bo->size) ==
+ assert(*(uint32_t *)(obj->vaddr + obj->base.size) ==
BO_SENTINEL);
- vc4_bo_unreference(&bo->bo);
- free(bo);
+ vc4_free_simulator_bo(sim_bo);
}
- if (ctex && ctex->bo->simulator_winsys_map) {
+ if (ctex && csim_bo->winsys_map) {
for (int y = 0; y < ctex->base.b.height0; y++) {
- memcpy(ctex->bo->simulator_winsys_map + y * winsys_stride,
+ memcpy(csim_bo->winsys_map + y * winsys_stride,
ctex->bo->map + y * sim_stride,
row_len);
}
@@ -320,33 +458,234 @@ vc4_simulator_flush(struct vc4_context *vc4,
return 0;
}
-static void *sim_mem_base = NULL;
-static int sim_mem_refcount = 0;
-static ssize_t sim_mem_size = 256 * 1024 * 1024;
+/**
+ * Map the underlying GEM object from the real hardware GEM handle.
+ */
+static void *
+vc4_simulator_map_winsys_bo(int fd, struct vc4_simulator_bo *sim_bo)
+{
+ struct drm_vc4_bo *bo = &sim_bo->base;
+ struct drm_gem_cma_object *obj = &bo->base;
+ int ret;
+ void *map;
-void
-vc4_simulator_init(struct vc4_screen *screen)
+ struct drm_mode_map_dumb map_dumb = {
+ .handle = sim_bo->handle,
+ };
+ ret = drmIoctl(fd, DRM_IOCTL_MODE_MAP_DUMB, &map_dumb);
+ if (ret != 0) {
+ fprintf(stderr, "map ioctl failure\n");
+ abort();
+ }
+
+ map = mmap(NULL, obj->base.size, PROT_READ | PROT_WRITE, MAP_SHARED,
+ fd, map_dumb.offset);
+ if (map == MAP_FAILED) {
+ fprintf(stderr,
+ "mmap of bo %d (offset 0x%016llx, size %d) failed\n",
+ sim_bo->handle, (long long)map_dumb.offset,
+ (int)obj->base.size);
+ abort();
+ }
+
+ return map;
+}
+
+/**
+ * Do fixups after a BO has been opened from a handle.
+ *
+ * This could be done at DRM_IOCTL_GEM_OPEN/DRM_IOCTL_GEM_PRIME_FD_TO_HANDLE
+ * time, but we're still using drmPrimeFDToHandle() so we have this helper to
+ * be called afterward instead.
+ */
+void vc4_simulator_open_from_handle(int fd, uint32_t winsys_stride,
+ int handle, uint32_t size)
{
- mtx_lock(&exec_mutex);
- if (sim_mem_refcount++) {
- screen->simulator_mem_size = sim_mem_size;
- screen->simulator_mem_base = sim_mem_base;
- mtx_unlock(&exec_mutex);
+ struct vc4_simulator_bo *sim_bo =
+ vc4_create_simulator_bo(fd, handle, size);
+
+ sim_bo->winsys_stride = winsys_stride;
+ sim_bo->winsys_map = vc4_simulator_map_winsys_bo(fd, sim_bo);
+}
+
+/**
+ * Simulated ioctl(fd, DRM_VC4_CREATE_BO) implementation.
+ *
+ * Making a VC4 BO is just a matter of making a corresponding BO on the host.
+ */
+static int
+vc4_simulator_create_bo_ioctl(int fd, struct drm_vc4_create_bo *args)
+{
+ int ret;
+ struct drm_mode_create_dumb create = {
+ .width = 128,
+ .bpp = 8,
+ .height = (args->size + 127) / 128,
+ };
+
+ ret = drmIoctl(fd, DRM_IOCTL_MODE_CREATE_DUMB, &create);
+	if (ret)
+		return ret;
+	assert(create.size >= args->size);
+
+ args->handle = create.handle;
+
+ vc4_create_simulator_bo(fd, create.handle, args->size);
+
+ return ret;
+}
+
+/**
+ * Simulated ioctl(fd, DRM_VC4_CREATE_SHADER_BO) implementation.
+ *
+ * In simulation we defer shader validation until exec time. Just make a host
+ * BO and memcpy the contents in.
+ */
+static int
+vc4_simulator_create_shader_bo_ioctl(int fd,
+ struct drm_vc4_create_shader_bo *args)
+{
+ int ret;
+ struct drm_mode_create_dumb create = {
+ .width = 128,
+ .bpp = 8,
+ .height = (args->size + 127) / 128,
+ };
+
+ ret = drmIoctl(fd, DRM_IOCTL_MODE_CREATE_DUMB, &create);
+ if (ret)
+ return ret;
+ assert(create.size >= args->size);
+
+ args->handle = create.handle;
+
+ vc4_create_simulator_bo(fd, create.handle, args->size);
+
+ struct drm_mode_map_dumb map = {
+ .handle = create.handle
+ };
+ ret = drmIoctl(fd, DRM_IOCTL_MODE_MAP_DUMB, &map);
+ if (ret)
+ return ret;
+
+ void *shader = mmap(NULL, args->size, PROT_READ | PROT_WRITE, MAP_SHARED,
+ fd, map.offset);
+ memcpy(shader, (void *)(uintptr_t)args->data, args->size);
+ munmap(shader, args->size);
+
+ return 0;
+}
+
+/**
+ * Simulated ioctl(fd, DRM_VC4_MMAP_BO) implementation.
+ *
+ * We just pass this straight through to dumb mmap.
+ */
+static int
+vc4_simulator_mmap_bo_ioctl(int fd, struct drm_vc4_mmap_bo *args)
+{
+ int ret;
+ struct drm_mode_map_dumb map = {
+ .handle = args->handle,
+ };
+
+ ret = drmIoctl(fd, DRM_IOCTL_MODE_MAP_DUMB, &map);
+ args->offset = map.offset;
+
+ return ret;
+}
+
+static int
+vc4_simulator_gem_close_ioctl(int fd, struct drm_gem_close *args)
+{
+ /* Free the simulator's internal tracking. */
+ struct vc4_simulator_file *file = vc4_get_simulator_file_for_fd(fd);
+ struct vc4_simulator_bo *sim_bo = vc4_get_simulator_bo(file,
+ args->handle);
+
+ vc4_free_simulator_bo(sim_bo);
+
+ /* Pass the call on down. */
+ return drmIoctl(fd, DRM_IOCTL_GEM_CLOSE, args);
+}
+
+static int
+vc4_simulator_get_param_ioctl(int fd, struct drm_vc4_get_param *args)
+{
+ switch (args->param) {
+ case DRM_VC4_PARAM_SUPPORTS_BRANCHES:
+ case DRM_VC4_PARAM_SUPPORTS_ETC1:
+ case DRM_VC4_PARAM_SUPPORTS_THREADED_FS:
+ args->value = true;
+ return 0;
+
+ case DRM_VC4_PARAM_V3D_IDENT0:
+ args->value = 0x02000000;
+ return 0;
+
+ case DRM_VC4_PARAM_V3D_IDENT1:
+ args->value = 0x00000001;
+ return 0;
+
+ default:
+ fprintf(stderr, "Unknown DRM_IOCTL_VC4_GET_PARAM(%lld)\n",
+ (long long)args->param);
+ abort();
+ };
+}
+
+int
+vc4_simulator_ioctl(int fd, unsigned long request, void *args)
+{
+ switch (request) {
+ case DRM_IOCTL_VC4_CREATE_BO:
+ return vc4_simulator_create_bo_ioctl(fd, args);
+ case DRM_IOCTL_VC4_CREATE_SHADER_BO:
+ return vc4_simulator_create_shader_bo_ioctl(fd, args);
+ case DRM_IOCTL_VC4_MMAP_BO:
+ return vc4_simulator_mmap_bo_ioctl(fd, args);
+
+ case DRM_IOCTL_VC4_WAIT_BO:
+ case DRM_IOCTL_VC4_WAIT_SEQNO:
+ /* We do all of the vc4 rendering synchronously, so we just
+ * return immediately on the wait ioctls. This ignores any
+ * native rendering to the host BO, so it does mean we race on
+ * front buffer rendering.
+ */
+ return 0;
+
+ case DRM_IOCTL_VC4_GET_PARAM:
+ return vc4_simulator_get_param_ioctl(fd, args);
+
+ case DRM_IOCTL_GEM_CLOSE:
+ return vc4_simulator_gem_close_ioctl(fd, args);
+
+ case DRM_IOCTL_GEM_OPEN:
+ case DRM_IOCTL_GEM_FLINK:
+ return drmIoctl(fd, request, args);
+ default:
+ fprintf(stderr, "Unknown ioctl 0x%08x\n", (int)request);
+ abort();
+ }
+}
+
+static void
+vc4_simulator_init_global(void)
+{
+ mtx_lock(&sim_state.mutex);
+ if (sim_state.refcount++) {
+ mtx_unlock(&sim_state.mutex);
return;
}
- sim_mem_base = calloc(sim_mem_size, 1);
- if (!sim_mem_base)
+ sim_state.mem_size = 256 * 1024 * 1024;
+ sim_state.mem = calloc(sim_state.mem_size, 1);
+ if (!sim_state.mem)
abort();
-
- screen->simulator_mem_size = sim_mem_size;
- screen->simulator_mem_base = sim_mem_base;
+ sim_state.heap = u_mmInit(0, sim_state.mem_size);
/* We supply our own memory so that we can have more aperture
* available (256MB instead of simpenrose's default 64MB).
*/
- simpenrose_init_hardware_supply_mem(screen->simulator_mem_base,
- screen->simulator_mem_size);
+ simpenrose_init_hardware_supply_mem(sim_state.mem, sim_state.mem_size);
/* Carve out low memory for tile allocation overflow. The kernel
* should be automatically handling overflow memory setup on real
@@ -355,20 +694,50 @@ vc4_simulator_init(struct vc4_screen *screen)
* up over the whole lifetime of simpenrose (not reused on each
* flush), so it had better be big.
*/
- simpenrose_supply_overflow_mem(0, OVERFLOW_SIZE);
+ sim_state.overflow = u_mmAllocMem(sim_state.heap, 32 * 1024 * 1024,
+ PAGE_ALIGN2, 0);
+ simpenrose_supply_overflow_mem(sim_state.overflow->ofs,
+ sim_state.overflow->size);
+
+	sim_state.fd_map =
+		_mesa_hash_table_create(NULL,
+					_mesa_hash_pointer,
+					_mesa_key_pointer_equal);
+
+	mtx_unlock(&sim_state.mutex);
+}
+
+void
+vc4_simulator_init(struct vc4_screen *screen)
+{
+ vc4_simulator_init_global();
+
+ screen->sim_file = rzalloc(screen, struct vc4_simulator_file);
+
+ screen->sim_file->bo_map =
+ _mesa_hash_table_create(screen->sim_file,
+ _mesa_hash_pointer,
+ _mesa_key_pointer_equal);
+
+ mtx_lock(&sim_state.mutex);
+ _mesa_hash_table_insert(sim_state.fd_map, int_to_key(screen->fd + 1),
+ screen->sim_file);
+ mtx_unlock(&sim_state.mutex);
- mtx_unlock(&exec_mutex);
+ screen->sim_file->dev.screen = screen;
}
void
vc4_simulator_destroy(struct vc4_screen *screen)
{
- mtx_lock(&exec_mutex);
- if (!--sim_mem_refcount) {
- free(sim_mem_base);
- sim_mem_base = NULL;
+ mtx_lock(&sim_state.mutex);
+ if (!--sim_state.refcount) {
+ _mesa_hash_table_destroy(sim_state.fd_map, NULL);
+ u_mmDestroy(sim_state.heap);
+ free(sim_state.mem);
+ /* No memsetting it, because it contains the mutex. */
}
- mtx_unlock(&exec_mutex);
+ mtx_unlock(&sim_state.mutex);
}
#endif /* USE_VC4_SIMULATOR */
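
Tying the new ioctl implementations together, the life of one BO through the
simulator follows the flow the file comment describes: a host dumb BO backs
the GEM side, a u_mmAllocMem() block backs the simulator side, and submit
copies between the two.  A condensed map of that flow, in terms of the
functions above (the copy-in step is per the file comment; it happens in the
elided body of vc4_simulator_pin_bos()):

/*
 * DRM_IOCTL_VC4_CREATE_BO
 *   -> vc4_simulator_create_bo_ioctl(): host dumb BO for the GEM side
 *      -> vc4_create_simulator_bo(): u_mmAllocMem() block plus BO_SENTINEL,
 *         recorded in file->bo_map under the GEM handle
 *
 * submit (vc4_simulator_flush())
 *   -> vc4_simulator_pin_bos(): look each handle up in bo_map, copy the
 *      GEM contents into simulator memory
 *   ... simpenrose renders ...
 *   -> vc4_simulator_unpin_bos(): verify BO_SENTINEL, copy results back out
 *
 * DRM_IOCTL_GEM_CLOSE
 *   -> vc4_simulator_gem_close_ioctl(): vc4_free_simulator_bo(), then pass
 *      the close on to the host kernel
 */
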
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_simulator_validate.h b/lib/mesa/src/gallium/drivers/vc4/vc4_simulator_validate.h
index 1352c9baf..d507b5fb6 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_simulator_validate.h
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_simulator_validate.h
@@ -78,8 +78,7 @@ typedef uint16_t u16;
typedef uint32_t u32;
struct drm_device {
- struct vc4_context *vc4;
- uint32_t simulator_mem_next;
+ struct vc4_screen *screen;
};
struct drm_gem_object {
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_state.c b/lib/mesa/src/gallium/drivers/vc4/vc4_state.c
index 124715895..2e00104e4 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_state.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_state.c
@@ -374,7 +374,8 @@ vc4_vertex_state_bind(struct pipe_context *pctx, void *hwcso)
}
static void
-vc4_set_constant_buffer(struct pipe_context *pctx, uint shader, uint index,
+vc4_set_constant_buffer(struct pipe_context *pctx,
+ enum pipe_shader_type shader, uint index,
const struct pipe_constant_buffer *cb)
{
struct vc4_context *vc4 = vc4_context(pctx);
@@ -615,6 +616,9 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc,
VC4_SET_FIELD(prsc->height0 & 2047, VC4_TEX_P1_HEIGHT) |
VC4_SET_FIELD(prsc->width0 & 2047, VC4_TEX_P1_WIDTH));
+ if (prsc->format == PIPE_FORMAT_ETC1_RGB8)
+ so->texture_p1 |= VC4_TEX_P1_ETCFLIP_MASK;
+
return &so->base;
}
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.c b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.c
index 4bcb85b16..07e1c9c5f 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.c
@@ -52,41 +52,6 @@
#include "vc4_context.h"
#include "vc4_tiling.h"
-/** Return the width in pixels of a 64-byte microtile. */
-uint32_t
-vc4_utile_width(int cpp)
-{
- switch (cpp) {
- case 1:
- case 2:
- return 8;
- case 4:
- return 4;
- case 8:
- return 2;
- default:
- fprintf(stderr, "unknown cpp: %d\n", cpp);
- abort();
- }
-}
-
-/** Return the height in pixels of a 64-byte microtile. */
-uint32_t
-vc4_utile_height(int cpp)
-{
- switch (cpp) {
- case 1:
- return 8;
- case 2:
- case 4:
- case 8:
- return 4;
- default:
- fprintf(stderr, "unknown cpp: %d\n", cpp);
- abort();
- }
-}
-
/**
* The texture unit decides what tiling format a particular miplevel is using
* this function, so we lay out our miptrees accordingly.
@@ -98,32 +63,6 @@ vc4_size_is_lt(uint32_t width, uint32_t height, int cpp)
height <= 4 * vc4_utile_height(cpp));
}
-void
-vc4_load_utile(void *dst, void *src, uint32_t dst_stride, uint32_t cpp)
-{
- uint32_t utile_h = vc4_utile_height(cpp);
- uint32_t row_size = 64 / utile_h;
-
- for (int y = 0; y < utile_h; y++) {
- memcpy(dst, src, row_size);
- dst += dst_stride;
- src += row_size;
- }
-}
-
-void
-vc4_store_utile(void *dst, void *src, uint32_t src_stride, uint32_t cpp)
-{
- uint32_t utile_h = vc4_utile_height(cpp);
- uint32_t row_size = 64 / utile_h;
-
- for (int y = 0; y < utile_h; y++) {
- memcpy(dst, src, row_size);
- dst += row_size;
- src += src_stride;
- }
-}
-
static void
check_box_utile_alignment(const struct pipe_box *box, int cpp)
{
@@ -133,48 +72,6 @@ check_box_utile_alignment(const struct pipe_box *box, int cpp)
assert(!(box->height & (vc4_utile_height(cpp) - 1)));
}
-static void
-vc4_load_lt_image(void *dst, uint32_t dst_stride,
- void *src, uint32_t src_stride,
- int cpp, const struct pipe_box *box)
-{
- uint32_t utile_w = vc4_utile_width(cpp);
- uint32_t utile_h = vc4_utile_height(cpp);
- uint32_t xstart = box->x;
- uint32_t ystart = box->y;
-
- for (uint32_t y = 0; y < box->height; y += utile_h) {
- for (int x = 0; x < box->width; x += utile_w) {
- vc4_load_utile(dst + (dst_stride * y +
- x * cpp),
- src + ((ystart + y) * src_stride +
- (xstart + x) * 64 / utile_w),
- dst_stride, cpp);
- }
- }
-}
-
-static void
-vc4_store_lt_image(void *dst, uint32_t dst_stride,
- void *src, uint32_t src_stride,
- int cpp, const struct pipe_box *box)
-{
- uint32_t utile_w = vc4_utile_width(cpp);
- uint32_t utile_h = vc4_utile_height(cpp);
- uint32_t xstart = box->x;
- uint32_t ystart = box->y;
-
- for (uint32_t y = 0; y < box->height; y += utile_h) {
- for (int x = 0; x < box->width; x += utile_w) {
- vc4_store_utile(dst + ((ystart + y) * dst_stride +
- (xstart + x) * 64 / utile_w),
- src + (src_stride * y +
- x * cpp),
- src_stride, cpp);
- }
- }
-}
-
/**
* Takes a utile x and y (and the number of utiles of width of the image) and
* returns the offset to the utile within a VC4_TILING_FORMAT_TF image.
@@ -209,7 +106,10 @@ t_utile_address(uint32_t utile_x, uint32_t utile_y,
odd_stile_map[stile_index] :
even_stile_map[stile_index]);
- uint32_t utile_offset = 64 * ((utile_y & 3) * 4 + (utile_x & 3));
+ /* This function no longer handles the utile offset within a subtile.
+ * Walking subtiles is the job of the LT image handler.
+ */
+ assert(!(utile_x & 3) && !(utile_y & 3));
#if 0
fprintf(stderr, "utile %d,%d -> %d + %d + %d (stride %d,%d) = %d\n",
@@ -219,29 +119,70 @@ t_utile_address(uint32_t utile_x, uint32_t utile_y,
tile_offset + stile_offset + utile_offset);
#endif
- return tile_offset + stile_offset + utile_offset;
+ return tile_offset + stile_offset;
}
-static void
-vc4_load_t_image(void *dst, uint32_t dst_stride,
- void *src, uint32_t src_stride,
- int cpp, const struct pipe_box *box)
+/**
+ * Loads or stores a T texture image by breaking it down into subtiles
+ * (1024-byte, 4x4-utile) sub-images that we can use the LT tiling functions
+ * on.
+ */
+static inline void
+vc4_t_image_helper(void *gpu, uint32_t gpu_stride,
+ void *cpu, uint32_t cpu_stride,
+ int cpp, const struct pipe_box *box,
+ bool to_cpu)
{
uint32_t utile_w = vc4_utile_width(cpp);
uint32_t utile_h = vc4_utile_height(cpp);
- uint32_t utile_stride = src_stride / cpp / utile_w;
- uint32_t xstart = box->x / utile_w;
- uint32_t ystart = box->y / utile_h;
+ uint32_t utile_w_shift = ffs(utile_w) - 1;
+ uint32_t utile_h_shift = ffs(utile_h) - 1;
+ uint32_t stile_w = 4 * utile_w;
+ uint32_t stile_h = 4 * utile_h;
+ assert(stile_w * stile_h * cpp == 1024);
+ uint32_t utile_stride = gpu_stride / cpp / utile_w;
+ uint32_t x1 = box->x;
+ uint32_t y1 = box->y;
+ uint32_t x2 = box->x + box->width;
+ uint32_t y2 = box->y + box->height;
+ struct pipe_box partial_box;
+ uint32_t gpu_lt_stride = stile_w * cpp;
+
+ for (uint32_t y = y1; y < y2; y = align(y + 1, stile_h)) {
+ partial_box.y = y & (stile_h - 1);
+ partial_box.height = MIN2(y2 - y, stile_h - partial_box.y);
+
+ uint32_t cpu_offset = 0;
+ for (uint32_t x = x1; x < x2; x = align(x + 1, stile_w)) {
+ partial_box.x = x & (stile_w - 1);
+ partial_box.width = MIN2(x2 - x,
+ stile_w - partial_box.x);
+
+			/* The GPU-side offset we want is the start of
+			 * this subtile.
+ */
+ uint32_t gpu_offset =
+ t_utile_address((x >> utile_w_shift) & ~0x3,
+ (y >> utile_h_shift) & ~0x3,
+ utile_stride);
- for (uint32_t y = 0; y < box->height / utile_h; y++) {
- for (int x = 0; x < box->width / utile_w; x++) {
- vc4_load_utile(dst + (y * utile_h * dst_stride +
- x * utile_w * cpp),
- src + t_utile_address(xstart + x,
- ystart + y,
- utile_stride),
- dst_stride, cpp);
+ if (to_cpu) {
+ vc4_load_lt_image(cpu + cpu_offset,
+ cpu_stride,
+ gpu + gpu_offset,
+ gpu_lt_stride,
+ cpp, &partial_box);
+ } else {
+ vc4_store_lt_image(gpu + gpu_offset,
+ gpu_lt_stride,
+ cpu + cpu_offset,
+ cpu_stride,
+ cpp, &partial_box);
+ }
+
+ cpu_offset += partial_box.width * cpp;
}
+ cpu += cpu_stride * partial_box.height;
}
}
@@ -250,22 +191,19 @@ vc4_store_t_image(void *dst, uint32_t dst_stride,
void *src, uint32_t src_stride,
int cpp, const struct pipe_box *box)
{
- uint32_t utile_w = vc4_utile_width(cpp);
- uint32_t utile_h = vc4_utile_height(cpp);
- uint32_t utile_stride = dst_stride / cpp / utile_w;
- uint32_t xstart = box->x / utile_w;
- uint32_t ystart = box->y / utile_h;
+ vc4_t_image_helper(dst, dst_stride,
+ src, src_stride,
+ cpp, box, false);
+}
- for (uint32_t y = 0; y < box->height / utile_h; y++) {
- for (int x = 0; x < box->width / utile_w; x++) {
- vc4_store_utile(dst + t_utile_address(xstart + x,
- ystart + y,
- utile_stride),
- src + (y * utile_h * src_stride +
- x * utile_w * cpp),
- src_stride, cpp);
- }
- }
+static void
+vc4_load_t_image(void *dst, uint32_t dst_stride,
+ void *src, uint32_t src_stride,
+ int cpp, const struct pipe_box *box)
+{
+ vc4_t_image_helper(src, src_stride,
+ dst, dst_stride,
+ cpp, box, true);
}
/**
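
The subtile walk in vc4_t_image_helper() advances y (and x) to the next
subtile boundary rather than by a fixed step, so a box that starts or ends
mid-subtile gets partial first and last iterations.  A standalone
demonstration of the idiom for cpp == 4, where stile_h == 4 *
vc4_utile_height(4) == 16:

#include <stdio.h>

/* Same rounding as Mesa's align(): round v up to a multiple of a. */
#define ALIGN_UP(v, a) (((v) + (a) - 1) & ~((a) - 1))

int
main(void)
{
        unsigned stile_h = 16;     /* 4 * utile_height for cpp == 4 */
        unsigned y1 = 10, y2 = 40; /* box->y .. box->y + box->height */

        for (unsigned y = y1; y < y2; y = ALIGN_UP(y + 1, stile_h)) {
                unsigned sub_y = y & (stile_h - 1);
                unsigned h = y2 - y < stile_h - sub_y ?
                             y2 - y : stile_h - sub_y;
                printf("y=%2u: subtile row %u, partial_box.y=%u, height=%u\n",
                       y, y / stile_h, sub_y, h);
        }
        /* Visits y = 10 (height 6), y = 16 (height 16), y = 32 (height 8). */
        return 0;
}
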
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.h b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.h
index b90bba702..ba1ad6fb3 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.h
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.h
@@ -24,11 +24,56 @@
#ifndef VC4_TILING_H
#define VC4_TILING_H
-uint32_t vc4_utile_width(int cpp) ATTRIBUTE_CONST;
-uint32_t vc4_utile_height(int cpp) ATTRIBUTE_CONST;
+#include <stdbool.h>
+#include <stdint.h>
+#include "util/macros.h"
+
+/** Return the width in pixels of a 64-byte microtile. */
+static inline uint32_t
+vc4_utile_width(int cpp)
+{
+ switch (cpp) {
+ case 1:
+ case 2:
+ return 8;
+ case 4:
+ return 4;
+ case 8:
+ return 2;
+ default:
+ unreachable("unknown cpp");
+ }
+}
+
+/** Return the height in pixels of a 64-byte microtile. */
+static inline uint32_t
+vc4_utile_height(int cpp)
+{
+ switch (cpp) {
+ case 1:
+ return 8;
+ case 2:
+ case 4:
+ case 8:
+ return 4;
+ default:
+ unreachable("unknown cpp");
+ }
+}
+
bool vc4_size_is_lt(uint32_t width, uint32_t height, int cpp) ATTRIBUTE_CONST;
-void vc4_load_utile(void *dst, void *src, uint32_t dst_stride, uint32_t cpp);
-void vc4_store_utile(void *dst, void *src, uint32_t src_stride, uint32_t cpp);
+void vc4_load_lt_image_base(void *dst, uint32_t dst_stride,
+ void *src, uint32_t src_stride,
+ int cpp, const struct pipe_box *box);
+void vc4_store_lt_image_base(void *dst, uint32_t dst_stride,
+ void *src, uint32_t src_stride,
+ int cpp, const struct pipe_box *box);
+void vc4_load_lt_image_neon(void *dst, uint32_t dst_stride,
+ void *src, uint32_t src_stride,
+ int cpp, const struct pipe_box *box);
+void vc4_store_lt_image_neon(void *dst, uint32_t dst_stride,
+ void *src, uint32_t src_stride,
+ int cpp, const struct pipe_box *box);
void vc4_load_tiled_image(void *dst, uint32_t dst_stride,
void *src, uint32_t src_stride,
uint8_t tiling_format, int cpp,
@@ -38,4 +83,34 @@ void vc4_store_tiled_image(void *dst, uint32_t dst_stride,
uint8_t tiling_format, int cpp,
const struct pipe_box *box);
+/* If we're building for ARMv7 (Pi 2+), assume it has NEON.  For Raspbian,
+ * which builds for ARMv6, we should extend this with runtime NEON detection
+ * when actually running on a Pi 2+.
+ */
+#if defined(__ARM_ARCH) && __ARM_ARCH == 7
+#define NEON_SUFFIX(x) x ## _neon
+#else
+#define NEON_SUFFIX(x) x ## _base
+#endif
+
+static inline void
+vc4_load_lt_image(void *dst, uint32_t dst_stride,
+ void *src, uint32_t src_stride,
+ int cpp, const struct pipe_box *box)
+{
+ NEON_SUFFIX(vc4_load_lt_image)(dst, dst_stride, src, src_stride,
+ cpp, box);
+}
+
+static inline void
+vc4_store_lt_image(void *dst, uint32_t dst_stride,
+ void *src, uint32_t src_stride,
+ int cpp, const struct pipe_box *box)
+{
+ NEON_SUFFIX(vc4_store_lt_image)(dst, dst_stride, src, src_stride,
+ cpp, box);
+}
+
+#undef NEON_SUFFIX
+
#endif /* VC4_TILING_H */
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c
new file mode 100644
index 000000000..f37a92e93
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c
@@ -0,0 +1,212 @@
+/*
+ * Copyright © 2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file vc4_tiling_lt.c
+ *
+ * Helper functions from vc4_tiling.c that are built both with and without
+ * NEON assembly support.
+ *
+ * If VC4_BUILD_NEON is set, then the functions will be suffixed with _neon.
+ * They will only use NEON assembly if PIPE_ARCH_ARM is also defined, to keep
+ * the x86 sim build working.
+ */
+
+#include <string.h>
+#include "pipe/p_state.h"
+#include "vc4_tiling.h"
+
+#ifdef VC4_BUILD_NEON
+#define NEON_TAG(x) x ## _neon
+#else
+#define NEON_TAG(x) x ## _base
+#endif
+
+/** Returns the stride in bytes of a 64-byte microtile. */
+static uint32_t
+vc4_utile_stride(int cpp)
+{
+ switch (cpp) {
+ case 1:
+ return 8;
+ case 2:
+ case 4:
+ case 8:
+ return 16;
+ default:
+ unreachable("bad cpp");
+ }
+}
+
+static void
+vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
+{
+ uint32_t gpu_stride = vc4_utile_stride(cpp);
+#if defined(VC4_BUILD_NEON) && defined(PIPE_ARCH_ARM)
+ if (gpu_stride == 8) {
+ __asm__ volatile (
+ /* Load from the GPU in one shot, no interleave, to
+ * d0-d7.
+ */
+ "vldm %0, {q0, q1, q2, q3}\n"
+ /* Store each 8-byte line to cpu-side destination,
+ * incrementing it by the stride each time.
+ */
+ "vst1.8 d0, [%1], %2\n"
+ "vst1.8 d1, [%1], %2\n"
+ "vst1.8 d2, [%1], %2\n"
+ "vst1.8 d3, [%1], %2\n"
+ "vst1.8 d4, [%1], %2\n"
+ "vst1.8 d5, [%1], %2\n"
+ "vst1.8 d6, [%1], %2\n"
+ "vst1.8 d7, [%1]\n"
+ :
+ : "r"(gpu), "r"(cpu), "r"(cpu_stride)
+ : "q0", "q1", "q2", "q3");
+ } else {
+ assert(gpu_stride == 16);
+ __asm__ volatile (
+ /* Load from the GPU in one shot, no interleave, to
+ * d0-d7.
+ */
+			"vldm %0, {q0, q1, q2, q3}\n"
+ /* Store each 16-byte line in 2 parts to the cpu-side
+			 * destination.  (vst1 can only store one d-register
+ * at a time).
+ */
+ "vst1.8 d0, [%1], %3\n"
+ "vst1.8 d1, [%2], %3\n"
+ "vst1.8 d2, [%1], %3\n"
+ "vst1.8 d3, [%2], %3\n"
+ "vst1.8 d4, [%1], %3\n"
+ "vst1.8 d5, [%2], %3\n"
+ "vst1.8 d6, [%1]\n"
+ "vst1.8 d7, [%2]\n"
+ :
+ : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
+ : "q0", "q1", "q2", "q3");
+ }
+#else
+ for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
+ memcpy(cpu, gpu + gpu_offset, gpu_stride);
+ cpu += cpu_stride;
+ }
+#endif
+}
+
+static void
+vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
+{
+ uint32_t gpu_stride = vc4_utile_stride(cpp);
+
+#if defined(VC4_BUILD_NEON) && defined(PIPE_ARCH_ARM)
+ if (gpu_stride == 8) {
+ __asm__ volatile (
+ /* Load each 8-byte line from cpu-side source,
+ * incrementing it by the stride each time.
+ */
+ "vld1.8 d0, [%1], %2\n"
+ "vld1.8 d1, [%1], %2\n"
+ "vld1.8 d2, [%1], %2\n"
+ "vld1.8 d3, [%1], %2\n"
+ "vld1.8 d4, [%1], %2\n"
+ "vld1.8 d5, [%1], %2\n"
+ "vld1.8 d6, [%1], %2\n"
+ "vld1.8 d7, [%1]\n"
+			/* Store to the GPU in one shot, no interleave, from
+			 * d0-d7.
+ */
+ "vstm %0, {q0, q1, q2, q3}\n"
+ :
+ : "r"(gpu), "r"(cpu), "r"(cpu_stride)
+ : "q0", "q1", "q2", "q3");
+ } else {
+ assert(gpu_stride == 16);
+ __asm__ volatile (
+ /* Load each 16-byte line in 2 parts from the cpu-side
+			 * source.  (vld1 can only load one d-register
+ * at a time).
+ */
+ "vld1.8 d0, [%1], %3\n"
+ "vld1.8 d1, [%2], %3\n"
+ "vld1.8 d2, [%1], %3\n"
+ "vld1.8 d3, [%2], %3\n"
+ "vld1.8 d4, [%1], %3\n"
+ "vld1.8 d5, [%2], %3\n"
+ "vld1.8 d6, [%1]\n"
+ "vld1.8 d7, [%2]\n"
+ /* Store to the GPU in one shot, no interleave. */
+ "vstm %0, {q0, q1, q2, q3}\n"
+ :
+ : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
+ : "q0", "q1", "q2", "q3");
+ }
+#else
+ for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
+ memcpy(gpu + gpu_offset, cpu, gpu_stride);
+ cpu += cpu_stride;
+ }
+#endif
+
+}
+
+void
+NEON_TAG(vc4_load_lt_image)(void *dst, uint32_t dst_stride,
+ void *src, uint32_t src_stride,
+ int cpp, const struct pipe_box *box)
+{
+ uint32_t utile_w = vc4_utile_width(cpp);
+ uint32_t utile_h = vc4_utile_height(cpp);
+ uint32_t xstart = box->x;
+ uint32_t ystart = box->y;
+
+ for (uint32_t y = 0; y < box->height; y += utile_h) {
+ for (int x = 0; x < box->width; x += utile_w) {
+ vc4_load_utile(dst + (dst_stride * y +
+ x * cpp),
+ src + ((ystart + y) * src_stride +
+ (xstart + x) * 64 / utile_w),
+ dst_stride, cpp);
+ }
+ }
+}
+
+void
+NEON_TAG(vc4_store_lt_image)(void *dst, uint32_t dst_stride,
+ void *src, uint32_t src_stride,
+ int cpp, const struct pipe_box *box)
+{
+ uint32_t utile_w = vc4_utile_width(cpp);
+ uint32_t utile_h = vc4_utile_height(cpp);
+ uint32_t xstart = box->x;
+ uint32_t ystart = box->y;
+
+ for (uint32_t y = 0; y < box->height; y += utile_h) {
+ for (int x = 0; x < box->width; x += utile_w) {
+ vc4_store_utile(dst + ((ystart + y) * dst_stride +
+ (xstart + x) * 64 / utile_w),
+ src + (src_stride * y +
+ x * cpp),
+ src_stride, cpp);
+ }
+ }
+}
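
Finally, a hypothetical caller of the new entry points, showing what the
NEON_SUFFIX() dispatch in vc4_tiling.h buys: callers name only
vc4_load_lt_image()/vc4_store_lt_image() and transparently get the _neon
variant on ARMv7 builds and _base elsewhere.  The box is assumed
utile-aligned, as check_box_utile_alignment() in vc4_tiling.c requires:

#include "pipe/p_state.h"
#include "util/u_box.h"
#include "vc4_tiling.h"

/* Hypothetical helper: detile one whole LT miplevel into a linear staging
 * buffer.  Width and height are assumed utile-aligned.
 */
static void
detile_lt_level(void *linear, uint32_t linear_stride,
                void *tiled, uint32_t tiled_stride,
                int cpp, uint32_t width, uint32_t height)
{
        struct pipe_box box;
        u_box_2d(0, 0, width, height, &box);

        /* Resolves to vc4_load_lt_image_neon() when built for ARMv7,
         * vc4_load_lt_image_base() otherwise.
         */
        vc4_load_lt_image(linear, linear_stride,
                          tiled, tiled_stride, cpp, &box);
}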