author | Jonathan Gray <jsg@cvs.openbsd.org> | 2017-08-14 09:45:54 +0000
---|---|---
committer | Jonathan Gray <jsg@cvs.openbsd.org> | 2017-08-14 09:45:54 +0000
commit | 4c58069f5013f0a621503525f7d5193bfe9976b3 (patch) |
tree | bd8f8a08b889e9a8b99c9de01ae12459d527ea6d /lib/mesa/src/gallium/drivers/vc4 |
parent | 5caa025e6b62d0456faad86c89f239a14d1eaadb (diff) |
Import Mesa 17.1.6
Diffstat (limited to 'lib/mesa/src/gallium/drivers/vc4')
49 files changed, 2271 insertions, 796 deletions
diff --git a/lib/mesa/src/gallium/drivers/vc4/Makefile.am b/lib/mesa/src/gallium/drivers/vc4/Makefile.am index 19fc38759..b361a0c58 100644 --- a/lib/mesa/src/gallium/drivers/vc4/Makefile.am +++ b/lib/mesa/src/gallium/drivers/vc4/Makefile.am @@ -40,3 +40,11 @@ noinst_LTLIBRARIES = libvc4.la libvc4_la_SOURCES = $(C_SOURCES) libvc4_la_LIBADD = $(SIM_LIB) $(VC4_LIBS) libvc4_la_LDFLAGS = $(SIM_LDFLAGS) + +noinst_LTLIBRARIES += libvc4_neon.la +libvc4_la_LIBADD += libvc4_neon.la + +libvc4_neon_la_SOURCES = vc4_tiling_lt.c +libvc4_neon_la_CFLAGS = $(AM_CFLAGS) -DVC4_BUILD_NEON + +EXTRA_DIST = kernel/README diff --git a/lib/mesa/src/gallium/drivers/vc4/Makefile.sources b/lib/mesa/src/gallium/drivers/vc4/Makefile.sources index e1496d101..10de34361 100644 --- a/lib/mesa/src/gallium/drivers/vc4/Makefile.sources +++ b/lib/mesa/src/gallium/drivers/vc4/Makefile.sources @@ -28,6 +28,7 @@ C_SOURCES := \ vc4_opt_peephole_sf.c \ vc4_opt_small_immediates.c \ vc4_opt_vpm.c \ + vc4_opt_coalesce_ff_writes.c \ vc4_program.c \ vc4_qir.c \ vc4_qir_emit_uniform_stream_resets.c \ @@ -54,6 +55,7 @@ C_SOURCES := \ vc4_simulator_validate.h \ vc4_state.c \ vc4_tiling.c \ + vc4_tiling_lt.c \ vc4_tiling.h \ vc4_uniforms.c \ $() diff --git a/lib/mesa/src/gallium/drivers/vc4/kernel/README b/lib/mesa/src/gallium/drivers/vc4/kernel/README new file mode 100644 index 000000000..89e4442b4 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/vc4/kernel/README @@ -0,0 +1,6 @@ +This is a mirror of the kernel validation code into the userspace GL library. +It is only built when USE_VC4_SIMULATOR is defined, for compiling the driver +on an x86 system with the simpenrose simulator. It allows testing of changes +across the kernel and userspace with exposure to most of the software stack, +on a higher-performance and more-debuggable environment than the native +hardware. diff --git a/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_drv.h b/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_drv.h index 90f45397d..8f5ed00d9 100644 --- a/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_drv.h +++ b/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_drv.h @@ -150,6 +150,8 @@ struct vc4_validated_shader_info uint32_t num_uniform_addr_offsets; uint32_t *uniform_addr_offsets; + + bool is_threaded; }; /* vc4_validate.c */ diff --git a/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_validate.c b/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_validate.c index 4ef01108b..bd193b993 100644 --- a/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_validate.c +++ b/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_validate.c @@ -640,6 +640,13 @@ reloc_tex(struct vc4_exec_info *exec, cpp = 1; break; case VC4_TEXTURE_TYPE_ETC1: + /* ETC1 is arranged as 64-bit blocks, where each block is 4x4 + * pixels. 
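The ETC1 handling in this reloc_tex() hunk treats the texture as an array of 64-bit blocks, each covering 4x4 pixels, so the code that follows switches to cpp = 8 and rounds both dimensions up to whole blocks. A minimal sketch of the resulting level-size computation (the helper name is illustrative, not from the diff):

```c
#include <stdint.h>

/* Bytes in one ETC1 mip level: 8 bytes (64 bits) per 4x4-pixel block,
 * with the dimensions rounded up to whole blocks. */
static uint32_t
etc1_level_size(uint32_t width, uint32_t height)
{
        uint32_t blocks_w = (width + 3) >> 2;
        uint32_t blocks_h = (height + 3) >> 2;

        return blocks_w * blocks_h * 8;
}
```

A 13x7 level, for example, rounds up to 4x2 blocks and occupies 64 bytes.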
+ */ + cpp = 8; + width = (width + 3) >> 2; + height = (height + 3) >> 2; + break; case VC4_TEXTURE_TYPE_BW1: case VC4_TEXTURE_TYPE_A4: case VC4_TEXTURE_TYPE_A1: @@ -773,11 +780,6 @@ validate_gl_shader_rec(struct drm_device *dev, exec->shader_rec_v += roundup(packet_size, 16); exec->shader_rec_size -= packet_size; - if (!(*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD)) { - DRM_ERROR("Multi-threaded fragment shaders not supported.\n"); - return -EINVAL; - } - for (i = 0; i < shader_reloc_count; i++) { if (src_handles[i] > exec->bo_count) { DRM_ERROR("Shader handle %d too big\n", src_handles[i]); @@ -794,6 +796,18 @@ validate_gl_shader_rec(struct drm_device *dev, return -EINVAL; } + if (((*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD) == 0) != + to_vc4_bo(&bo[0]->base)->validated_shader->is_threaded) { + DRM_ERROR("Thread mode of CL and FS do not match\n"); + return -EINVAL; + } + + if (to_vc4_bo(&bo[1]->base)->validated_shader->is_threaded || + to_vc4_bo(&bo[2]->base)->validated_shader->is_threaded) { + DRM_ERROR("cs and vs cannot be threaded\n"); + return -EINVAL; + } + for (i = 0; i < shader_reloc_count; i++) { struct vc4_validated_shader_info *validated_shader; uint32_t o = shader_reloc_offsets[i]; diff --git a/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c b/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c index 82717ca55..d93f5239d 100644 --- a/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c +++ b/lib/mesa/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c @@ -84,6 +84,14 @@ struct vc4_shader_validation_state { * basic blocks. */ bool needs_uniform_address_for_loop; + + /* Set when we find an instruction which violates the criterion for a + * threaded shader. These are: + * - only write the lower half of the register space + * - last thread switch signaled at the end + * So track the usage of the thread switches and the register usage. + */ + bool all_registers_used; }; static uint32_t @@ -119,6 +127,12 @@ raddr_add_a_to_live_reg_index(uint64_t inst) return ~0; } +static bool live_reg_is_upper_half(uint32_t lri) +{ + return (lri >=16 && lri < 32) || + (lri >=32 + 16 && lri < 32 + 32); +} + static bool is_tmu_submit(uint32_t waddr) { @@ -385,6 +399,9 @@ check_reg_write(struct vc4_validated_shader_info *validated_shader, } else { validation_state->live_immediates[lri] = ~0; } + + if (live_reg_is_upper_half(lri)) + validation_state->all_registers_used = true; } switch (waddr) { @@ -593,6 +610,11 @@ check_instruction_reads(struct vc4_validated_shader_info *validated_shader, } } + if ((raddr_a >= 16 && raddr_a < 32) || + (raddr_b >= 16 && raddr_b < 32 && sig != QPU_SIG_SMALL_IMM)) { + validation_state->all_registers_used = true; + } + return true; } @@ -603,9 +625,7 @@ static bool vc4_validate_branches(struct vc4_shader_validation_state *validation_state) { uint32_t max_branch_target = 0; - bool found_shader_end = false; int ip; - int shader_end_ip = 0; int last_branch = -2; for (ip = 0; ip < validation_state->max_ip; ip++) { @@ -616,8 +636,13 @@ vc4_validate_branches(struct vc4_shader_validation_state *validation_state) uint32_t branch_target_ip; if (sig == QPU_SIG_PROG_END) { - shader_end_ip = ip; - found_shader_end = true; + /* There are two delay slots after program end is + * signaled that are still executed, then we're + * finished. validation_state->max_ip is the + * instruction after the last valid instruction in the + * program. 
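The delay-slot bookkeeping here replaces the old found_shader_end/shader_end_ip pair: once max_ip is pinned to three past QPU_SIG_PROG_END, the branch-target bound falls out arithmetically. A hedged restatement:

```c
#include <stdbool.h>
#include <stdint.h>

/* Two delay-slot instructions still execute after QPU_SIG_PROG_END, so
 * max_ip (one past the last instruction that runs) is prog_end_ip + 3,
 * and a branch target is legal only if it does not land after
 * prog_end_ip itself. */
static bool
branch_target_ok(uint32_t max_branch_target, uint32_t max_ip)
{
        uint32_t prog_end_ip = max_ip - 3;

        return max_branch_target <= prog_end_ip;
}
```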
+ */ + validation_state->max_ip = ip + 3; continue; } @@ -671,15 +696,9 @@ vc4_validate_branches(struct vc4_shader_validation_state *validation_state) } set_bit(after_delay_ip, validation_state->branch_targets); max_branch_target = max(max_branch_target, after_delay_ip); - - /* There are two delay slots after program end is signaled - * that are still executed, then we're finished. - */ - if (found_shader_end && ip == shader_end_ip + 2) - break; } - if (max_branch_target > shader_end_ip) { + if (max_branch_target > validation_state->max_ip - 3) { DRM_ERROR("Branch landed after QPU_SIG_PROG_END"); return false; } @@ -751,6 +770,7 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj) { bool found_shader_end = false; int shader_end_ip = 0; + uint32_t last_thread_switch_ip = -3; uint32_t ip; struct vc4_validated_shader_info *validated_shader = NULL; struct vc4_shader_validation_state validation_state; @@ -783,6 +803,16 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj) if (!vc4_handle_branch_target(&validation_state)) goto fail; + if (ip == last_thread_switch_ip + 3) { + /* Reset r0-r3 live clamp data */ + int i; + for (i = 64; i < LIVE_REG_COUNT; i++) { + validation_state.live_min_clamp_offsets[i] = ~0; + validation_state.live_max_clamp_regs[i] = false; + validation_state.live_immediates[i] = ~0; + } + } + switch (sig) { case QPU_SIG_NONE: case QPU_SIG_WAIT_FOR_SCOREBOARD: @@ -792,6 +822,8 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj) case QPU_SIG_LOAD_TMU1: case QPU_SIG_PROG_END: case QPU_SIG_SMALL_IMM: + case QPU_SIG_THREAD_SWITCH: + case QPU_SIG_LAST_THREAD_SWITCH: if (!check_instruction_writes(validated_shader, &validation_state)) { DRM_ERROR("Bad write at ip %d\n", ip); @@ -807,6 +839,18 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj) shader_end_ip = ip; } + if (sig == QPU_SIG_THREAD_SWITCH || + sig == QPU_SIG_LAST_THREAD_SWITCH) { + validated_shader->is_threaded = true; + + if (ip < last_thread_switch_ip + 3) { + DRM_ERROR("Thread switch too soon after " + "last switch at ip %d\n", ip); + goto fail; + } + last_thread_switch_ip = ip; + } + break; case QPU_SIG_LOAD_IMM: @@ -821,6 +865,13 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj) if (!check_branch(inst, validated_shader, &validation_state, ip)) goto fail; + + if (ip < last_thread_switch_ip + 3) { + DRM_ERROR("Branch in thread switch at ip %d", + ip); + goto fail; + } + break; default: DRM_ERROR("Unsupported QPU signal %d at " @@ -842,6 +893,14 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj) goto fail; } + /* Might corrupt other thread */ + if (validated_shader->is_threaded && + validation_state.all_registers_used) { + DRM_ERROR("Shader uses threading, but uses the upper " + "half of the registers, too\n"); + goto fail; + } + /* If we did a backwards branch and we haven't emitted a uniforms * reset since then, we still need the uniforms stream to have the * uniforms address available so that the backwards branch can do its diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_blit.c b/lib/mesa/src/gallium/drivers/vc4/vc4_blit.c index 1e056568a..0e4ab5bfa 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_blit.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_blit.c @@ -212,14 +212,16 @@ vc4_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info) if (vc4_tile_blit(pctx, blit_info)) return; - if (util_try_blit_via_copy_region(pctx, &info)) { - return; /* done */ - } - if (info.mask & PIPE_MASK_S) { - fprintf(stderr, "cannot blit stencil, skipping\n"); + 
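The threaded-FS rules the validator now tracks boil down to two predicates over live-register indices and instruction pointers (file A occupies live-reg indices 0-31 and file B 32-63, matching live_reg_is_upper_half() above; helpers below are a standalone restatement):

```c
#include <stdbool.h>
#include <stdint.h>

/* Threaded fragment shaders share the physical register files with the
 * other thread, so only ra0-15 and rb0-15 (the lower half of each
 * file) may be touched. */
static bool
reg_is_upper_half(uint32_t live_reg_index)
{
        return (live_reg_index >= 16 && live_reg_index < 32) ||
               (live_reg_index >= 32 + 16 && live_reg_index < 32 + 32);
}

/* A thread switch has two delay slots, so another switch or a branch
 * within three instructions of the previous switch would land inside
 * them. */
static bool
thrsw_spacing_ok(uint32_t ip, uint32_t last_thread_switch_ip)
{
        return ip >= last_thread_switch_ip + 3;
}
```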
if (util_try_blit_via_copy_region(pctx, &info)) + return; + info.mask &= ~PIPE_MASK_S; + fprintf(stderr, "cannot blit stencil, skipping\n"); } - vc4_render_blit(pctx, &info); + if (vc4_render_blit(pctx, &info)) + return; + + fprintf(stderr, "Unsupported blit\n"); } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c b/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c index cf6a5114b..12af7f8a9 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.c @@ -97,7 +97,7 @@ vc4_bo_from_cache(struct vc4_screen *screen, uint32_t size, const char *name) return NULL; struct vc4_bo *bo = NULL; - pipe_mutex_lock(cache->lock); + mtx_lock(&cache->lock); if (!list_empty(&cache->size_list[page_index])) { bo = LIST_ENTRY(struct vc4_bo, cache->size_list[page_index].next, size_list); @@ -107,7 +107,7 @@ vc4_bo_from_cache(struct vc4_screen *screen, uint32_t size, const char *name) * user will proceed to CPU map it and fill it with stuff. */ if (!vc4_bo_wait(bo, 0, NULL)) { - pipe_mutex_unlock(cache->lock); + mtx_unlock(&cache->lock); return NULL; } @@ -116,7 +116,7 @@ vc4_bo_from_cache(struct vc4_screen *screen, uint32_t size, const char *name) bo->name = name; } - pipe_mutex_unlock(cache->lock); + mtx_unlock(&cache->lock); return bo; } @@ -148,28 +148,17 @@ vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name) bo->name = name; bo->private = true; + retry: + ; + bool cleared_and_retried = false; -retry: - if (!using_vc4_simulator) { - struct drm_vc4_create_bo create; - memset(&create, 0, sizeof(create)); - - create.size = size; - - ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_CREATE_BO, &create); - bo->handle = create.handle; - } else { - struct drm_mode_create_dumb create; - memset(&create, 0, sizeof(create)); - - create.width = 128; - create.bpp = 8; - create.height = (size + 127) / 128; - - ret = drmIoctl(screen->fd, DRM_IOCTL_MODE_CREATE_DUMB, &create); - bo->handle = create.handle; - assert(create.size >= size); - } + struct drm_vc4_create_bo create = { + .size = size + }; + + ret = vc4_ioctl(screen->fd, DRM_IOCTL_VC4_CREATE_BO, &create); + bo->handle = create.handle; + if (ret != 0) { if (!list_empty(&screen->bo_cache.time_list) && !cleared_and_retried) { @@ -199,9 +188,9 @@ vc4_bo_last_unreference(struct vc4_bo *bo) struct timespec time; clock_gettime(CLOCK_MONOTONIC, &time); - pipe_mutex_lock(screen->bo_cache.lock); + mtx_lock(&screen->bo_cache.lock); vc4_bo_last_unreference_locked_timed(bo, time.tv_sec); - pipe_mutex_unlock(screen->bo_cache.lock); + mtx_unlock(&screen->bo_cache.lock); } static void @@ -210,20 +199,19 @@ vc4_bo_free(struct vc4_bo *bo) struct vc4_screen *screen = bo->screen; if (bo->map) { -#ifdef USE_VC4_SIMULATOR - if (bo->simulator_winsys_map) { + if (using_vc4_simulator && bo->name && + strcmp(bo->name, "winsys") == 0) { free(bo->map); - bo->map = bo->simulator_winsys_map; + } else { + munmap(bo->map, bo->size); + VG(VALGRIND_FREELIKE_BLOCK(bo->map, 0)); } -#endif - munmap(bo->map, bo->size); - VG(VALGRIND_FREELIKE_BLOCK(bo->map, 0)); } struct drm_gem_close c; memset(&c, 0, sizeof(c)); c.handle = bo->handle; - int ret = drmIoctl(screen->fd, DRM_IOCTL_GEM_CLOSE, &c); + int ret = vc4_ioctl(screen->fd, DRM_IOCTL_GEM_CLOSE, &c); if (ret != 0) fprintf(stderr, "close object %d: %s\n", bo->handle, strerror(errno)); @@ -273,13 +261,13 @@ free_stale_bos(struct vc4_screen *screen, time_t time) static void vc4_bo_cache_free_all(struct vc4_bo_cache *cache) { - pipe_mutex_lock(cache->lock); + mtx_lock(&cache->lock); 
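With the dumb-buffer fallback gone, both the hardware and simulator paths above funnel through one CREATE_BO ioctl; the retry logic surrounding it (only partially visible in this hunk) evicts the userspace BO cache once before giving up. A sketch, assuming the libdrm drmIoctl() entry point and the vc4_drm.h UAPI:

```c
#include <stdbool.h>
#include <stdint.h>
#include <xf86drm.h>
#include "vc4_drm.h"

/* Try DRM_IOCTL_VC4_CREATE_BO; on failure, give the caller one chance
 * to evict cached BOs and retry before reporting the error. */
static int
vc4_create_bo_with_retry(int fd, uint32_t size, uint32_t *handle,
                         void (*evict_cache)(void))
{
        bool retried = false;

        for (;;) {
                struct drm_vc4_create_bo create = {
                        .size = size,
                };
                int ret = drmIoctl(fd, DRM_IOCTL_VC4_CREATE_BO, &create);

                if (ret == 0) {
                        *handle = create.handle;
                        return 0;
                }
                if (retried)
                        return ret;
                retried = true;
                evict_cache();
        }
}
```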
list_for_each_entry_safe(struct vc4_bo, bo, &cache->time_list, time_list) { vc4_bo_remove_from_cache(cache, bo); vc4_bo_free(bo); } - pipe_mutex_unlock(cache->lock); + mtx_unlock(&cache->lock); } void @@ -301,17 +289,8 @@ vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time) /* Move old list contents over (since the array has moved, and * therefore the pointers to the list heads have to change). */ - for (int i = 0; i < cache->size_list_size; i++) { - struct list_head *old_head = &cache->size_list[i]; - if (list_empty(old_head)) - list_inithead(&new_list[i]); - else { - new_list[i].next = old_head->next; - new_list[i].prev = old_head->prev; - new_list[i].next->prev = &new_list[i]; - new_list[i].prev->next = &new_list[i]; - } - } + for (int i = 0; i < cache->size_list_size; i++) + list_replace(&cache->size_list[i], &new_list[i]); for (int i = cache->size_list_size; i < page_index + 1; i++) list_inithead(&new_list[i]); @@ -343,7 +322,7 @@ vc4_bo_open_handle(struct vc4_screen *screen, assert(size); - pipe_mutex_lock(screen->bo_handles_mutex); + mtx_lock(&screen->bo_handles_mutex); bo = util_hash_table_get(screen->bo_handles, (void*)(uintptr_t)handle); if (bo) { @@ -360,16 +339,15 @@ vc4_bo_open_handle(struct vc4_screen *screen, bo->private = false; #ifdef USE_VC4_SIMULATOR - vc4_bo_map(bo); - bo->simulator_winsys_map = bo->map; - bo->simulator_winsys_stride = winsys_stride; + vc4_simulator_open_from_handle(screen->fd, winsys_stride, + bo->handle, bo->size); bo->map = malloc(bo->size); #endif util_hash_table_set(screen->bo_handles, (void *)(uintptr_t)handle, bo); done: - pipe_mutex_unlock(screen->bo_handles_mutex); + mtx_unlock(&screen->bo_handles_mutex); return bo; } @@ -380,7 +358,7 @@ vc4_bo_open_name(struct vc4_screen *screen, uint32_t name, struct drm_gem_open o = { .name = name }; - int ret = drmIoctl(screen->fd, DRM_IOCTL_GEM_OPEN, &o); + int ret = vc4_ioctl(screen->fd, DRM_IOCTL_GEM_OPEN, &o); if (ret) { fprintf(stderr, "Failed to open bo %d: %s\n", name, strerror(errno)); @@ -423,10 +401,10 @@ vc4_bo_get_dmabuf(struct vc4_bo *bo) return -1; } - pipe_mutex_lock(bo->screen->bo_handles_mutex); + mtx_lock(&bo->screen->bo_handles_mutex); bo->private = false; util_hash_table_set(bo->screen->bo_handles, (void *)(uintptr_t)bo->handle, bo); - pipe_mutex_unlock(bo->screen->bo_handles_mutex); + mtx_unlock(&bo->screen->bo_handles_mutex); return fd; } @@ -447,30 +425,15 @@ vc4_bo_alloc_shader(struct vc4_screen *screen, const void *data, uint32_t size) bo->name = "code"; bo->private = false; /* Make sure it doesn't go back to the cache. 
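list_replace() above is the util/list.h helper that the removed open-coded pointer surgery reimplemented; a minimal standalone equivalent of what the loop body now does per size bucket:

```c
struct list_head {
        struct list_head *prev;
        struct list_head *next;
};

/* Splice `to` into the list position held by `from`, covering the
 * empty-list case the old code tested with list_empty(). */
static void
list_replace(struct list_head *from, struct list_head *to)
{
        if (from->next == from) {
                /* Empty list: make `to` an empty head of its own. */
                to->prev = to;
                to->next = to;
        } else {
                to->prev = from->prev;
                to->next = from->next;
                to->next->prev = to;
                to->prev->next = to;
        }
}
```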
*/ - if (!using_vc4_simulator) { - struct drm_vc4_create_shader_bo create = { - .size = size, - .data = (uintptr_t)data, - }; - - ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_CREATE_SHADER_BO, - &create); - bo->handle = create.handle; - } else { - struct drm_mode_create_dumb create; - memset(&create, 0, sizeof(create)); - - create.width = 128; - create.bpp = 8; - create.height = (size + 127) / 128; - - ret = drmIoctl(screen->fd, DRM_IOCTL_MODE_CREATE_DUMB, &create); - bo->handle = create.handle; - assert(create.size >= size); - - vc4_bo_map(bo); - memcpy(bo->map, data, size); - } + struct drm_vc4_create_shader_bo create = { + .size = size, + .data = (uintptr_t)data, + }; + + ret = vc4_ioctl(screen->fd, DRM_IOCTL_VC4_CREATE_SHADER_BO, + &create); + bo->handle = create.handle; + if (ret != 0) { fprintf(stderr, "create shader ioctl failure\n"); abort(); @@ -492,7 +455,7 @@ vc4_bo_flink(struct vc4_bo *bo, uint32_t *name) struct drm_gem_flink flink = { .handle = bo->handle, }; - int ret = drmIoctl(bo->screen->fd, DRM_IOCTL_GEM_FLINK, &flink); + int ret = vc4_ioctl(bo->screen->fd, DRM_IOCTL_GEM_FLINK, &flink); if (ret) { fprintf(stderr, "Failed to flink bo %d: %s\n", bo->handle, strerror(errno)); @@ -508,14 +471,11 @@ vc4_bo_flink(struct vc4_bo *bo, uint32_t *name) static int vc4_wait_seqno_ioctl(int fd, uint64_t seqno, uint64_t timeout_ns) { - if (using_vc4_simulator) - return 0; - struct drm_vc4_wait_seqno wait = { .seqno = seqno, .timeout_ns = timeout_ns, }; - int ret = drmIoctl(fd, DRM_IOCTL_VC4_WAIT_SEQNO, &wait); + int ret = vc4_ioctl(fd, DRM_IOCTL_VC4_WAIT_SEQNO, &wait); if (ret == -1) return -errno; else @@ -553,14 +513,11 @@ vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns, static int vc4_wait_bo_ioctl(int fd, uint32_t handle, uint64_t timeout_ns) { - if (using_vc4_simulator) - return 0; - struct drm_vc4_wait_bo wait = { .handle = handle, .timeout_ns = timeout_ns, }; - int ret = drmIoctl(fd, DRM_IOCTL_VC4_WAIT_BO, &wait); + int ret = vc4_ioctl(fd, DRM_IOCTL_VC4_WAIT_BO, &wait); if (ret == -1) return -errno; else @@ -602,19 +559,11 @@ vc4_bo_map_unsynchronized(struct vc4_bo *bo) if (bo->map) return bo->map; - if (!using_vc4_simulator) { - struct drm_vc4_mmap_bo map; - memset(&map, 0, sizeof(map)); - map.handle = bo->handle; - ret = drmIoctl(bo->screen->fd, DRM_IOCTL_VC4_MMAP_BO, &map); - offset = map.offset; - } else { - struct drm_mode_map_dumb map; - memset(&map, 0, sizeof(map)); - map.handle = bo->handle; - ret = drmIoctl(bo->screen->fd, DRM_IOCTL_MODE_MAP_DUMB, &map); - offset = map.offset; - } + struct drm_vc4_mmap_bo map; + memset(&map, 0, sizeof(map)); + map.handle = bo->handle; + ret = vc4_ioctl(bo->screen->fd, DRM_IOCTL_VC4_MMAP_BO, &map); + offset = map.offset; if (ret != 0) { fprintf(stderr, "map ioctl failure\n"); abort(); diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h b/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h index 71a442648..838314f43 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_bufmgr.h @@ -39,11 +39,6 @@ struct vc4_bo { uint32_t handle; uint32_t size; -#ifdef USE_VC4_SIMULATOR - void *simulator_winsys_map; - uint32_t simulator_winsys_stride; -#endif - /** Entry in the linked list of buffers freed, by age. */ struct list_head time_list; /** Entry in the per-page-count linked list of buffers freed (by age). 
*/ @@ -98,7 +93,7 @@ vc4_bo_unreference(struct vc4_bo **bo) vc4_bo_last_unreference(*bo); } else { screen = (*bo)->screen; - pipe_mutex_lock(screen->bo_handles_mutex); + mtx_lock(&screen->bo_handles_mutex); if (pipe_reference(&(*bo)->reference, NULL)) { util_hash_table_remove(screen->bo_handles, @@ -106,7 +101,7 @@ vc4_bo_unreference(struct vc4_bo **bo) vc4_bo_last_unreference(*bo); } - pipe_mutex_unlock(screen->bo_handles_mutex); + mtx_unlock(&screen->bo_handles_mutex); } *bo = NULL; diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_cl.c b/lib/mesa/src/gallium/drivers/vc4/vc4_cl.c index afb9987f4..35578370e 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_cl.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_cl.c @@ -28,7 +28,7 @@ void vc4_init_cl(void *mem_ctx, struct vc4_cl *cl) { - cl->base = ralloc_size(mem_ctx, 1); + cl->base = rzalloc_size(mem_ctx, 1); /* TODO: don't use rzalloc */ cl->next = cl->base; cl->size = 0; } @@ -76,5 +76,7 @@ vc4_gem_hindex(struct vc4_job *job, struct vc4_bo *bo) cl_ptr(&out, vc4_bo_reference(bo)); cl_end(&job->bo_pointers, out); + job->bo_space += bo->size; + return hindex; } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_context.c b/lib/mesa/src/gallium/drivers/vc4/vc4_context.c index 974df8a1d..401c160fc 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_context.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_context.c @@ -144,7 +144,12 @@ vc4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) vc4->fd = screen->fd; slab_create_child(&vc4->transfer_pool, &screen->transfer_pool); - vc4->blitter = util_blitter_create(pctx); + + vc4->uploader = u_upload_create_default(&vc4->base); + vc4->base.stream_uploader = vc4->uploader; + vc4->base.const_uploader = vc4->uploader; + + vc4->blitter = util_blitter_create(pctx); if (!vc4->blitter) goto fail; @@ -153,10 +158,6 @@ vc4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) if (!vc4->primconvert) goto fail; - vc4->uploader = u_upload_create(pctx, 16 * 1024, - PIPE_BIND_INDEX_BUFFER, - PIPE_USAGE_STREAM); - vc4_debug |= saved_shaderdb_flag; vc4->sample_mask = (1 << VC4_MAX_SAMPLES) - 1; diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_context.h b/lib/mesa/src/gallium/drivers/vc4/vc4_context.h index c164eba80..6bd2424ec 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_context.h +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_context.h @@ -30,6 +30,7 @@ #include "pipe/p_context.h" #include "pipe/p_state.h" #include "util/slab.h" +#include "xf86drm.h" #define __user #include "vc4_drm.h" @@ -38,6 +39,13 @@ #include "vc4_cl.h" #include "vc4_qir.h" +#ifndef DRM_VC4_PARAM_SUPPORTS_ETC1 +#define DRM_VC4_PARAM_SUPPORTS_ETC1 4 +#endif +#ifndef DRM_VC4_PARAM_SUPPORTS_THREADED_FS +#define DRM_VC4_PARAM_SUPPORTS_THREADED_FS 5 +#endif + #ifdef USE_VC4_SIMULATOR #define using_vc4_simulator true #else @@ -162,6 +170,8 @@ struct vc4_compiled_shader { */ bool failed; + bool fs_threaded; + uint8_t num_inputs; /* Byte offsets for the start of the vertex attributes 0-7, and the @@ -218,6 +228,13 @@ struct vc4_job { struct vc4_cl bo_handles; struct vc4_cl bo_pointers; uint32_t shader_rec_count; + /** + * Amount of memory used by the BOs in bo_pointers. + * + * Used for checking when we should flush the job early so we don't + * OOM. + */ + uint32_t bo_space; /** @{ Surfaces to submit rendering for. 
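The bo_space counter documented above is incremented in vc4_gem_hindex() as each BO is attached to the job, and it drives the early flush added to vc4_draw_vbo() further down. Condensed (threshold from that hunk, the helper itself hypothetical):

```c
#include <stdbool.h>
#include <stdint.h>

/* Flush the job once it references more than half of the presumed
 * 256MB CMA pool, so the kernel can still pin every BO at submit. */
static bool
job_should_flush_early(uint32_t bo_space)
{
        return bo_space > 128u * 1024 * 1024;
}
```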
*/ struct pipe_surface *color_read; @@ -317,11 +334,12 @@ struct vc4_context { uint64_t next_compiled_program_id; struct ra_regs *regs; - unsigned int reg_class_any; - unsigned int reg_class_a_or_b_or_acc; + unsigned int reg_class_any[2]; + unsigned int reg_class_a_or_b[2]; + unsigned int reg_class_a_or_b_or_acc[2]; unsigned int reg_class_r0_r3; - unsigned int reg_class_r4_or_a; - unsigned int reg_class_a; + unsigned int reg_class_r4_or_a[2]; + unsigned int reg_class_a[2]; uint8_t prim_mode; @@ -433,6 +451,18 @@ void vc4_simulator_destroy(struct vc4_screen *screen); int vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args, struct vc4_job *job); +int vc4_simulator_ioctl(int fd, unsigned long request, void *arg); +void vc4_simulator_open_from_handle(int fd, uint32_t winsys_stride, + int handle, uint32_t size); + +static inline int +vc4_ioctl(int fd, unsigned long request, void *arg) +{ + if (using_vc4_simulator) + return vc4_simulator_ioctl(fd, request, arg); + else + return drmIoctl(fd, request, arg); +} void vc4_set_shader_uniform_dirty_flags(struct vc4_compiled_shader *shader); void vc4_write_uniforms(struct vc4_context *vc4, diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_draw.c b/lib/mesa/src/gallium/drivers/vc4/vc4_draw.c index c5afc0cda..ebd080298 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_draw.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_draw.c @@ -155,7 +155,8 @@ vc4_emit_gl_shader_state(struct vc4_context *vc4, /* VC4_DIRTY_PRIM_MODE | VC4_DIRTY_RASTERIZER */ cl_u16(&shader_rec, VC4_SHADER_FLAG_ENABLE_CLIPPING | - VC4_SHADER_FLAG_FS_SINGLE_THREAD | + (vc4->prog.fs->fs_threaded ? + 0 : VC4_SHADER_FLAG_FS_SINGLE_THREAD) | ((info->mode == PIPE_PRIM_POINTS && vc4->rasterizer->base.point_size_per_vertex) ? VC4_SHADER_FLAG_VS_POINT_SIZE : 0)); @@ -465,6 +466,13 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) job->resolve |= PIPE_CLEAR_COLOR0; + /* If we've used half of the presumably 256MB CMA area, flush the job + * so that we don't accumulate a job that will end up not being + * executable. + */ + if (job->bo_space > 128 * 1024 * 1024) + vc4_flush(pctx); + if (vc4_debug & VC4_DEBUG_ALWAYS_FLUSH) vc4_flush(pctx); } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_emit.c b/lib/mesa/src/gallium/drivers/vc4/vc4_emit.c index 9258ceebe..b48d89a06 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_emit.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_emit.c @@ -76,6 +76,7 @@ vc4_emit_state(struct pipe_context *pctx) VC4_DIRTY_ZSA | VC4_DIRTY_COMPILED_FS)) { uint8_t ez_enable_mask_out = ~0; + uint8_t rasosm_mask_out = ~0; /* HW-2905: If the RCL ends up doing a full-res load when * multisampling, then early Z tracking may end up with values @@ -89,10 +90,20 @@ vc4_emit_state(struct pipe_context *pctx) if (job->msaa || vc4->prog.fs->disable_early_z) ez_enable_mask_out &= ~VC4_CONFIG_BITS_EARLY_Z; + /* Don't set the rasterizer to oversample if we're doing our + * binning and load/stores in single-sample mode. This is for + * the samples == 1 case, where vc4 doesn't do any + * multisampling behavior. 
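The oversample masking described here can be read as one pure function over the packed config bytes (the VC4_CONFIG_BITS_RASTERIZER_OVERSAMPLE_4X bit comes from the driver's packet definitions; the helper itself is illustrative):

```c
#include <stdbool.h>
#include <stdint.h>

/* When the job is not MSAA, strip the 4x oversample bit from the
 * combined rasterizer/ZSA byte before emitting CONFIGURATION_BITS. */
static uint8_t
config_bits_byte0(uint8_t rast_bits0, uint8_t zsa_bits0, bool msaa)
{
        uint8_t mask_out = 0xff;

        if (!msaa)
                mask_out &= (uint8_t)~VC4_CONFIG_BITS_RASTERIZER_OVERSAMPLE_4X;

        return (rast_bits0 | zsa_bits0) & mask_out;
}
```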
+ */ + if (!job->msaa) { + rasosm_mask_out &= + ~VC4_CONFIG_BITS_RASTERIZER_OVERSAMPLE_4X; + } + cl_u8(&bcl, VC4_PACKET_CONFIGURATION_BITS); cl_u8(&bcl, - vc4->rasterizer->config_bits[0] | - vc4->zsa->config_bits[0]); + (vc4->rasterizer->config_bits[0] | + vc4->zsa->config_bits[0]) & rasosm_mask_out); cl_u8(&bcl, vc4->rasterizer->config_bits[1] | vc4->zsa->config_bits[1]); diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_formats.c b/lib/mesa/src/gallium/drivers/vc4/vc4_formats.c index dd700cdec..42cdad115 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_formats.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_formats.c @@ -83,6 +83,8 @@ static const struct vc4_format vc4_format_table[] = { FORMAT(B5G6R5_UNORM, RGB565, RGB565, SWIZ(X, Y, Z, 1)), + FORMAT(ETC1_RGB8, NO, ETC1, SWIZ(X, Y, Z, 1)), + /* Depth sampling will be handled by doing nearest filtering and not * unpacking the RGBA value. */ diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_blend.c b/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_blend.c index b7e31b80c..2ed89ead5 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_blend.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_blend.c @@ -494,7 +494,7 @@ vc4_nir_emit_alpha_test_discard(struct vc4_compile *c, nir_builder *b, discard->num_components = 1; discard->src[0] = nir_src_for_ssa(nir_inot(b, condition)); nir_builder_instr_insert(b, &discard->instr); - c->s->info.fs.uses_discard = true; + c->s->info->fs.uses_discard = true; } static nir_ssa_def * @@ -630,25 +630,14 @@ vc4_nir_lower_blend_instr(struct vc4_compile *c, nir_builder *b, { nir_ssa_def *frag_color = intr->src[0].ssa; - if (c->fs_key->sample_coverage) { - nir_intrinsic_instr *load = - nir_intrinsic_instr_create(b->shader, - nir_intrinsic_load_sample_mask_in); - load->num_components = 1; - nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL); - nir_builder_instr_insert(b, &load->instr); - - nir_ssa_def *bitmask = &load->dest.ssa; - - vc4_nir_store_sample_mask(c, b, bitmask); - } else if (c->fs_key->sample_alpha_to_coverage) { + if (c->fs_key->sample_alpha_to_coverage) { nir_ssa_def *a = nir_channel(b, frag_color, 3); /* XXX: We should do a nice dither based on the fragment * coordinate, instead. */ nir_ssa_def *num_samples = nir_imm_float(b, VC4_MAX_SAMPLES); - nir_ssa_def *num_bits = nir_f2i(b, nir_fmul(b, a, num_samples)); + nir_ssa_def *num_bits = nir_f2i32(b, nir_fmul(b, a, num_samples)); nir_ssa_def *bitmask = nir_isub(b, nir_ishl(b, nir_imm_int(b, 1), @@ -730,4 +719,16 @@ vc4_nir_lower_blend(nir_shader *s, struct vc4_compile *c) nir_metadata_dominance); } } + + /* If we didn't do alpha-to-coverage on the output color, we still + * need to pass glSampleMask() through. 
+ */ + if (c->fs_key->sample_coverage && !c->fs_key->sample_alpha_to_coverage) { + nir_function_impl *impl = nir_shader_get_entrypoint(s); + nir_builder b; + nir_builder_init(&b, impl); + b.cursor = nir_after_block(nir_impl_last_block(impl)); + + vc4_nir_store_sample_mask(c, &b, nir_load_sample_mask_in(&b)); + } } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_io.c index 4a795f8da..b7969a562 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_io.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_nir_lower_io.c @@ -106,11 +106,11 @@ vc4_nir_get_vattr_channel_vpm(struct vc4_compile *c, } else if (chan->size == 32 && chan->type == UTIL_FORMAT_TYPE_SIGNED) { if (chan->normalized) { return nir_fmul(b, - nir_i2f(b, vpm_reads[swiz]), + nir_i2f32(b, vpm_reads[swiz]), nir_imm_float(b, 1.0 / 0x7fffffff)); } else { - return nir_i2f(b, vpm_reads[swiz]); + return nir_i2f32(b, vpm_reads[swiz]); } } else if (chan->size == 8 && (chan->type == UTIL_FORMAT_TYPE_UNSIGNED || @@ -125,16 +125,16 @@ vc4_nir_get_vattr_channel_vpm(struct vc4_compile *c, nir_imm_float(b, 1.0)); } else { return nir_fadd(b, - nir_i2f(b, - vc4_nir_unpack_8i(b, temp, - swiz)), + nir_i2f32(b, + vc4_nir_unpack_8i(b, temp, + swiz)), nir_imm_float(b, -128.0)); } } else { if (chan->normalized) { return vc4_nir_unpack_8f(b, vpm, swiz); } else { - return nir_i2f(b, vc4_nir_unpack_8i(b, vpm, swiz)); + return nir_i2f32(b, vc4_nir_unpack_8i(b, vpm, swiz)); } } } else if (chan->size == 16 && @@ -146,7 +146,7 @@ vc4_nir_get_vattr_channel_vpm(struct vc4_compile *c, * UNPACK_16_I for all of these. */ if (chan->type == UTIL_FORMAT_TYPE_SIGNED) { - temp = nir_i2f(b, vc4_nir_unpack_16i(b, vpm, swiz & 1)); + temp = nir_i2f32(b, vc4_nir_unpack_16i(b, vpm, swiz & 1)); if (chan->normalized) { return nir_fmul(b, temp, nir_imm_float(b, 1/32768.0f)); @@ -154,7 +154,7 @@ vc4_nir_get_vattr_channel_vpm(struct vc4_compile *c, return temp; } } else { - temp = nir_i2f(b, vc4_nir_unpack_16u(b, vpm, swiz & 1)); + temp = nir_i2f32(b, vc4_nir_unpack_16u(b, vpm, swiz & 1)); if (chan->normalized) { return nir_fmul(b, temp, nir_imm_float(b, 1 / 65535.0)); diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_algebraic.c b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_algebraic.c index 01ad05d27..5e7d26923 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_algebraic.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_algebraic.c @@ -94,14 +94,17 @@ static void replace_with_mov(struct vc4_compile *c, struct qinst *inst, struct qreg arg) { dump_from(c, inst); + + inst->src[0] = arg; + if (qir_has_implicit_tex_uniform(inst)) + inst->src[1] = inst->src[qir_get_tex_uniform_src(inst)]; + if (qir_is_mul(inst)) inst->op = QOP_MMOV; else if (qir_is_float_input(inst)) inst->op = QOP_FMOV; else inst->op = QOP_MOV; - inst->src[0] = arg; - inst->src[1] = c->undef; dump_to(c, inst); } @@ -172,8 +175,12 @@ qir_opt_algebraic(struct vc4_compile *c) break; case QOP_ADD: - if (replace_x_0_with_x(c, inst, 0) || - replace_x_0_with_x(c, inst, 1)) { + /* Kernel validation requires that we use an actual + * add instruction. 
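The guard that follows exists because the kernel validates direct TMU writes by expecting a real ADD of offset and base uniform at the QFILE_TEX_S_DIRECT write. Restated as a predicate (hypothetical helper, types from vc4_qir.h):

```c
#include <stdbool.h>
#include "vc4_qir.h"

/* x + 0 must not be folded to a MOV when the destination is the direct
 * TMU address write: kernel validation requires an actual ADD there. */
static bool
may_fold_add_identity(const struct qinst *inst)
{
        return inst->dst.file != QFILE_TEX_S_DIRECT;
}
```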
+ */ + if (inst->dst.file != QFILE_TEX_S_DIRECT && + (replace_x_0_with_x(c, inst, 0) || + replace_x_0_with_x(c, inst, 1))) { progress = true; break; } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_coalesce_ff_writes.c b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_coalesce_ff_writes.c new file mode 100644 index 000000000..e4f8e57fc --- /dev/null +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_coalesce_ff_writes.c @@ -0,0 +1,111 @@ +/* + * Copyright © 2014 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * @file vc4_opt_coalesce_ff_writes.c + * + * This modifies instructions that generate the value consumed by a VPM or TMU + * coordinate write to write directly into the VPM or TMU. + */ + +#include "vc4_qir.h" + +bool +qir_opt_coalesce_ff_writes(struct vc4_compile *c) +{ + /* For now, only do this pass when we don't have control flow. */ + struct qblock *block = qir_entry_block(c); + if (block != qir_exit_block(c)) + return false; + + bool progress = false; + uint32_t use_count[c->num_temps]; + memset(&use_count, 0, sizeof(use_count)); + + qir_for_each_inst_inorder(inst, c) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { + if (inst->src[i].file == QFILE_TEMP) { + uint32_t temp = inst->src[i].index; + use_count[temp]++; + } + } + } + + qir_for_each_inst_inorder(mov_inst, c) { + if (!qir_is_raw_mov(mov_inst) || mov_inst->sf) + continue; + if (mov_inst->src[0].file != QFILE_TEMP) + continue; + + if (!(mov_inst->dst.file == QFILE_VPM || + mov_inst->dst.file == QFILE_TLB_COLOR_WRITE || + mov_inst->dst.file == QFILE_TLB_COLOR_WRITE_MS || + qir_is_tex(mov_inst))) + continue; + + uint32_t temp = mov_inst->src[0].index; + if (use_count[temp] != 1) + continue; + + struct qinst *inst = c->defs[temp]; + if (!inst) + continue; + + /* Don't bother trying to fold in an ALU op using a uniform to + * a texture op, as we'll just have to lower the uniform back + * out. + */ + if (qir_is_tex(mov_inst) && qir_has_uniform_read(inst)) + continue; + + if (qir_depends_on_flags(inst) || inst->sf) + continue; + + if (qir_has_side_effects(c, inst) || + qir_has_side_effect_reads(c, inst) || + inst->op == QOP_TLB_COLOR_READ || + inst->op == QOP_VARY_ADD_C) { + continue; + } + + /* Move the generating instruction into the position of the FF + * write. 
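On a toy QIR sequence, the rewrite this pass performs looks like the following (register names illustrative):

```c
/* Before:                         After:
 *
 *   t3  = FADD t1, t2               (t3 eliminated)
 *   vpm = MOV t3                    vpm = FADD t1, t2
 *
 * The defining instruction is relocated into the MOV's position so
 * that VPM/TMU write ordering is preserved, and any implicit texture
 * uniform is carried over from the MOV. */
```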
+ */ + c->defs[inst->dst.index] = NULL; + inst->dst.file = mov_inst->dst.file; + inst->dst.index = mov_inst->dst.index; + if (qir_has_implicit_tex_uniform(mov_inst)) { + inst->src[qir_get_tex_uniform_src(inst)] = + mov_inst->src[qir_get_tex_uniform_src(mov_inst)]; + } + + list_del(&inst->link); + list_addtail(&inst->link, &mov_inst->link); + + qir_remove_instruction(c, mov_inst); + + progress = true; + } + + return progress; +} diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_constant_folding.c b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_constant_folding.c index 7ff916155..de642d465 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_constant_folding.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_constant_folding.c @@ -58,7 +58,7 @@ dump_to(struct vc4_compile *c, struct qinst *inst) static bool constant_fold(struct vc4_compile *c, struct qinst *inst) { - int nsrc = qir_get_op_nsrc(inst->op); + int nsrc = qir_get_nsrc(inst); uint32_t ui[nsrc]; for (int i = 0; i < nsrc; i++) { diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c index d20ee5e22..9a6320a9a 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c @@ -67,7 +67,7 @@ try_copy_prop(struct vc4_compile *c, struct qinst *inst, struct qinst **movs) bool debug = false; bool progress = false; - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { if (inst->src[i].file != QFILE_TEMP) continue; @@ -113,7 +113,7 @@ try_copy_prop(struct vc4_compile *c, struct qinst *inst, struct qinst **movs) * this instruction doesn't already use it. */ bool already_has_unpack = false; - for (int j = 0; j < qir_get_op_nsrc(inst->op); j++) { + for (int j = 0; j < qir_get_nsrc(inst); j++) { if (inst->src[j].pack) already_has_unpack = true; } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_dead_code.c b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_dead_code.c index 1838c394f..f04d0ff97 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_dead_code.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_dead_code.c @@ -54,7 +54,7 @@ dce(struct vc4_compile *c, struct qinst *inst) static bool has_nonremovable_reads(struct vc4_compile *c, struct qinst *inst) { - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { if (inst->src[i].file == QFILE_VPM) { uint32_t attr = inst->src[i].index / 4; uint32_t offset = (inst->src[i].index % 4) * 4; @@ -88,7 +88,7 @@ qir_opt_dead_code(struct vc4_compile *c) bool *used = calloc(c->num_temps, sizeof(bool)); qir_for_each_inst_inorder(inst, c) { - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { if (inst->src[i].file == QFILE_TEMP) used[inst->src[i].index] = true; } @@ -129,7 +129,7 @@ qir_opt_dead_code(struct vc4_compile *c) continue; } - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { if (inst->src[i].file != QFILE_VPM) continue; uint32_t attr = inst->src[i].index / 4; diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_peephole_sf.c b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_peephole_sf.c index f4856673b..577290b1f 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_peephole_sf.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_peephole_sf.c @@ -62,7 +62,7 @@ inst_srcs_updated(struct qinst *inst, struct qinst *writer) */ switch (writer->dst.file) { case 
QFILE_TEMP: - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { if (inst->src[i].file == QFILE_TEMP && inst->src[i].index == writer->dst.index) { return true; @@ -95,7 +95,7 @@ inst_result_equals(struct qinst *a, struct qinst *b) return false; } - for (int i = 0; i < qir_get_op_nsrc(a->op); i++) { + for (int i = 0; i < qir_get_nsrc(a); i++) { if (!qir_reg_equals(a->src[i], b->src[i]) || src_file_varies_on_reread(a->src[i]) || src_file_varies_on_reread(b->src[i])) { diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_small_immediates.c b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_small_immediates.c index e97cb63ae..07eca71f2 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_small_immediates.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_small_immediates.c @@ -45,7 +45,7 @@ qir_opt_small_immediates(struct vc4_compile *c) * elsewhere). */ bool uses_small_imm = false; - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { if (inst->src[i].file == QFILE_SMALL_IMM) uses_small_imm = true; } @@ -63,7 +63,7 @@ qir_opt_small_immediates(struct vc4_compile *c) if (inst->op == QOP_MIN_NOIMM) continue; - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { struct qreg src = qir_follow_movs(c, inst->src[i]); if (src.file != QFILE_UNIF || @@ -73,11 +73,8 @@ qir_opt_small_immediates(struct vc4_compile *c) continue; } - if (i == 1 && - (inst->op == QOP_TEX_S || - inst->op == QOP_TEX_T || - inst->op == QOP_TEX_R || - inst->op == QOP_TEX_B)) { + if (qir_is_tex(inst) && + i == qir_get_tex_uniform_src(inst)) { /* No turning the implicit uniform read into * an immediate. */ diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_vpm.c b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_vpm.c index 83ba11b81..6f196e7d1 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_opt_vpm.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_opt_vpm.c @@ -24,10 +24,8 @@ /** * @file vc4_opt_vpm.c * - * This modifies instructions that: - * 1. exclusively consume a value read from the VPM to directly read the VPM if - * other operands allow it. - * 2. generate the value consumed by a VPM write to write directly into the VPM. + * This modifies instructions that exclusively consume a value read from the + * VPM to directly read the VPM if other operands allow it. 
*/ #include "vc4_qir.h" @@ -44,21 +42,11 @@ qir_opt_vpm(struct vc4_compile *c) return false; bool progress = false; - struct qinst *vpm_writes[64] = { 0 }; uint32_t use_count[c->num_temps]; - uint32_t vpm_write_count = 0; memset(&use_count, 0, sizeof(use_count)); qir_for_each_inst_inorder(inst, c) { - switch (inst->dst.file) { - case QFILE_VPM: - vpm_writes[vpm_write_count++] = inst; - break; - default: - break; - } - - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { if (inst->src[i].file == QFILE_TEMP) { uint32_t temp = inst->src[i].index; use_count[temp]++; @@ -81,7 +69,7 @@ qir_opt_vpm(struct vc4_compile *c) qir_is_tex(inst)) continue; - for (int j = 0; j < qir_get_op_nsrc(inst->op); j++) { + for (int j = 0; j < qir_get_nsrc(inst); j++) { if (inst->src[j].file != QFILE_TEMP || inst->src[j].pack) continue; @@ -106,7 +94,7 @@ qir_opt_vpm(struct vc4_compile *c) } uint32_t temps = 0; - for (int k = 0; k < qir_get_op_nsrc(inst->op); k++) { + for (int k = 0; k < qir_get_nsrc(inst); k++) { if (inst->src[k].file == QFILE_TEMP) temps++; } @@ -127,42 +115,5 @@ qir_opt_vpm(struct vc4_compile *c) } } - for (int i = 0; i < vpm_write_count; i++) { - if (!qir_is_raw_mov(vpm_writes[i]) || - vpm_writes[i]->src[0].file != QFILE_TEMP) { - continue; - } - - uint32_t temp = vpm_writes[i]->src[0].index; - if (use_count[temp] != 1) - continue; - - struct qinst *inst = c->defs[temp]; - if (!inst) - continue; - - if (qir_depends_on_flags(inst) || inst->sf) - continue; - - if (qir_has_side_effects(c, inst) || - qir_has_side_effect_reads(c, inst)) { - continue; - } - - /* Move the generating instruction to the end of the program - * to maintain the order of the VPM writes. - */ - assert(!vpm_writes[i]->sf); - list_del(&inst->link); - list_addtail(&inst->link, &vpm_writes[i]->link); - qir_remove_instruction(c, vpm_writes[i]); - - c->defs[inst->dst.index] = NULL; - inst->dst.file = QFILE_VPM; - inst->dst.index = 0; - - progress = true; - } - return progress; } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_program.c b/lib/mesa/src/gallium/drivers/vc4/vc4_program.c index 00e16e3db..59368734d 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_program.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_program.c @@ -24,7 +24,7 @@ #include <inttypes.h> #include "util/u_format.h" -#include "util/u_hash.h" +#include "util/crc32.h" #include "util/u_math.h" #include "util/u_memory.h" #include "util/ralloc.h" @@ -38,9 +38,6 @@ #include "vc4_qpu.h" #include "vc4_qir.h" #include "mesa/state_tracker/st_glsl_types.h" -#ifdef USE_VC4_SIMULATOR -#include "simpenrose/simpenrose.h" -#endif static struct qreg ntq_get_src(struct vc4_compile *c, nir_src src, int i); @@ -68,6 +65,23 @@ resize_qreg_array(struct vc4_compile *c, (*regs)[i] = c->undef; } +static void +ntq_emit_thrsw(struct vc4_compile *c) +{ + if (!c->fs_threaded) + return; + + /* Always thread switch after each texture operation for now. + * + * We could do better by batching a bunch of texture fetches up and + * then doing one thread switch and collecting all their results + * afterward. 
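With threading enabled, every texture fetch vc4_program.c emits now follows a request/switch/collect shape around the helper above. A hedged wrapper showing the pattern (QIR helper names are from this file; the wrapper itself is hypothetical):

```c
/* Request the texel, switch to the other thread while the TMU works,
 * then collect the result.  ntq_emit_thrsw() is a no-op when the
 * shader is not compiled threaded. */
static struct qreg
fetch_texel_threaded(struct vc4_compile *c, struct qreg s,
                     struct qreg tex_uniform)
{
        qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0), s, tex_uniform);
        ntq_emit_thrsw(c);
        return qir_TEX_RESULT(c);
}
```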
+ */ + qir_emit_nondef(c, qir_inst(QOP_THRSW, c->undef, + c->undef, c->undef)); + c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL); +} + static struct qreg indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr) { @@ -106,8 +120,14 @@ indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr) qir_uniform_ui(c, (range->dst_offset + range->size - 4))); - qir_TEX_DIRECT(c, indirect_offset, qir_uniform(c, QUNIFORM_UBO_ADDR, 0)); + qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0), + indirect_offset, + qir_uniform(c, QUNIFORM_UBO_ADDR, 0)); + c->num_texture_samples++; + + ntq_emit_thrsw(c); + return qir_TEX_RESULT(c); } @@ -140,10 +160,33 @@ ntq_init_ssa_def(struct vc4_compile *c, nir_ssa_def *def) return qregs; } +/** + * This function is responsible for getting QIR results into the associated + * storage for a NIR instruction. + * + * If it's a NIR SSA def, then we just set the associated hash table entry to + * the new result. + * + * If it's a NIR reg, then we need to update the existing qreg assigned to the + * NIR destination with the incoming value. To do that without introducing + * new MOVs, we require that the incoming qreg either be a uniform, or be + * SSA-defined by the previous QIR instruction in the block and rewritable by + * this function. That lets us sneak ahead and insert the SF flag beforehand + * (knowing that the previous instruction doesn't depend on flags) and rewrite + * its destination to be the NIR reg's destination + */ static void ntq_store_dest(struct vc4_compile *c, nir_dest *dest, int chan, struct qreg result) { + struct qinst *last_inst = NULL; + if (!list_empty(&c->cur_block->instructions)) + last_inst = (struct qinst *)c->cur_block->instructions.prev; + + assert(result.file == QFILE_UNIF || + (result.file == QFILE_TEMP && + last_inst && last_inst == c->defs[result.index])); + if (dest->is_ssa) { assert(chan < dest->ssa.num_components); @@ -165,17 +208,34 @@ ntq_store_dest(struct vc4_compile *c, nir_dest *dest, int chan, _mesa_hash_table_search(c->def_ht, reg); struct qreg *qregs = entry->data; - /* Conditionally move the result to the destination if the - * channel is active. + /* Insert a MOV if the source wasn't an SSA def in the + * previous instruction. + */ + if (result.file == QFILE_UNIF) { + result = qir_MOV(c, result); + last_inst = c->defs[result.index]; + } + + /* We know they're both temps, so just rewrite index. */ + c->defs[last_inst->dst.index] = NULL; + last_inst->dst.index = qregs[chan].index; + + /* If we're in control flow, then make this update of the reg + * conditional on the execution mask. */ if (c->execute.file != QFILE_NULL) { - struct qinst *mov; + last_inst->dst.index = qregs[chan].index; + /* Set the flags to the current exec mask. To insert + * the SF, we temporarily remove our SSA instruction. 
+ */ + list_del(&last_inst->link); qir_SF(c, c->execute); - mov = qir_MOV_cond(c, QPU_COND_ZS, qregs[chan], result); - mov->cond_is_exec_mask = true; - } else { - qir_MOV_dest(c, qregs[chan], result); + list_addtail(&last_inst->link, + &c->cur_block->instructions); + + last_inst->cond = QPU_COND_ZS; + last_inst->cond_is_exec_mask = true; } } } @@ -324,24 +384,24 @@ ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr) addr = qir_MAX(c, addr, qir_uniform_ui(c, 0)); addr = qir_MIN_NOIMM(c, addr, qir_uniform_ui(c, size - 4)); - qir_TEX_DIRECT(c, addr, qir_uniform(c, QUNIFORM_TEXTURE_MSAA_ADDR, unit)); + qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0), + addr, qir_uniform(c, QUNIFORM_TEXTURE_MSAA_ADDR, unit)); + + ntq_emit_thrsw(c); struct qreg tex = qir_TEX_RESULT(c); c->num_texture_samples++; - struct qreg dest[4]; enum pipe_format format = c->key->tex[unit].format; if (util_format_is_depth_or_stencil(format)) { struct qreg scaled = ntq_scale_depth_texture(c, tex); for (int i = 0; i < 4; i++) - dest[i] = scaled; + ntq_store_dest(c, &instr->dest, i, qir_MOV(c, scaled)); } else { for (int i = 0; i < 4; i++) - dest[i] = qir_UNPACK_8_F(c, tex, i); + ntq_store_dest(c, &instr->dest, i, + qir_UNPACK_8_F(c, tex, i)); } - - for (int i = 0; i < 4; i++) - ntq_store_dest(c, &instr->dest, i, dest[i]); } static void @@ -375,7 +435,7 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr) lod = ntq_get_src(c, instr->src[i].src, 0); is_txl = true; break; - case nir_tex_src_comparitor: + case nir_tex_src_comparator: compare = ntq_get_src(c, instr->src[i].src, 0); break; default: @@ -383,6 +443,16 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr) } } + if (c->stage != QSTAGE_FRAG && !is_txl) { + /* From the GLSL 1.20 spec: + * + * "If it is mip-mapped and running on the vertex shader, + * then the base texture is used." 
+ */ + is_txl = true; + lod = qir_uniform_ui(c, 0); + } + if (c->key->tex[unit].force_first_level) { lod = qir_uniform(c, QUNIFORM_TEXTURE_FIRST_LEVEL, unit); is_txl = true; @@ -413,14 +483,20 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr) unit | (is_txl << 16)); } + struct qinst *tmu; if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { - qir_TEX_R(c, r, texture_u[next_texture_u++]); + tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0), r); + tmu->src[qir_get_tex_uniform_src(tmu)] = + texture_u[next_texture_u++]; } else if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP_TO_BORDER || c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP || c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP_TO_BORDER || c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP) { - qir_TEX_R(c, qir_uniform(c, QUNIFORM_TEXTURE_BORDER_COLOR, unit), - texture_u[next_texture_u++]); + tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0), + qir_uniform(c, QUNIFORM_TEXTURE_BORDER_COLOR, + unit)); + tmu->src[qir_get_tex_uniform_src(tmu)] = + texture_u[next_texture_u++]; } if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP) { @@ -431,14 +507,23 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr) t = qir_SAT(c, t); } - qir_TEX_T(c, t, texture_u[next_texture_u++]); + tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_T, 0), t); + tmu->src[qir_get_tex_uniform_src(tmu)] = + texture_u[next_texture_u++]; - if (is_txl || is_txb) - qir_TEX_B(c, lod, texture_u[next_texture_u++]); + if (is_txl || is_txb) { + tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_B, 0), lod); + tmu->src[qir_get_tex_uniform_src(tmu)] = + texture_u[next_texture_u++]; + } - qir_TEX_S(c, s, texture_u[next_texture_u++]); + tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_S, 0), s); + tmu->src[qir_get_tex_uniform_src(tmu)] = texture_u[next_texture_u++]; c->num_texture_samples++; + + ntq_emit_thrsw(c); + struct qreg tex = qir_TEX_RESULT(c); enum pipe_format format = c->key->tex[unit].format; @@ -514,8 +599,11 @@ ntq_ffract(struct vc4_compile *c, struct qreg src) struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src)); struct qreg diff = qir_FSUB(c, src, trunc); qir_SF(c, diff); - return qir_SEL(c, QPU_COND_NS, - qir_FADD(c, diff, qir_uniform_f(c, 1.0)), diff); + + qir_FADD_dest(c, diff, + diff, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS; + + return qir_MOV(c, diff); } /** @@ -525,15 +613,18 @@ ntq_ffract(struct vc4_compile *c, struct qreg src) static struct qreg ntq_ffloor(struct vc4_compile *c, struct qreg src) { - struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src)); + struct qreg result = qir_ITOF(c, qir_FTOI(c, src)); /* This will be < 0 if we truncated and the truncation was of a value * that was < 0 in the first place. */ - qir_SF(c, qir_FSUB(c, src, trunc)); + qir_SF(c, qir_FSUB(c, src, result)); + + struct qinst *sub = qir_FSUB_dest(c, result, + result, qir_uniform_f(c, 1.0)); + sub->cond = QPU_COND_NS; - return qir_SEL(c, QPU_COND_NS, - qir_FSUB(c, trunc, qir_uniform_f(c, 1.0)), trunc); + return qir_MOV(c, result); } /** @@ -543,15 +634,17 @@ ntq_ffloor(struct vc4_compile *c, struct qreg src) static struct qreg ntq_fceil(struct vc4_compile *c, struct qreg src) { - struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src)); + struct qreg result = qir_ITOF(c, qir_FTOI(c, src)); /* This will be < 0 if we truncated and the truncation was of a value * that was > 0 in the first place. 
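The fract/floor/ceil lowerings in these hunks all share one trick: truncate with FTOI/ITOF, then conditionally step by 1.0 on the sign flag of the truncation error, now written straight into the result register instead of through a SEL. A plain-C model of the arithmetic:

```c
#include <math.h>

static float
model_ffloor(float x)
{
        float t = truncf(x);

        /* x - t < 0 only if x was negative and truncation rounded up. */
        return (x - t) < 0.0f ? t - 1.0f : t;
}

static float
model_fceil(float x)
{
        float t = truncf(x);

        /* t - x < 0 only if x was positive and truncation rounded down. */
        return (t - x) < 0.0f ? t + 1.0f : t;
}
```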
*/ - qir_SF(c, qir_FSUB(c, trunc, src)); + qir_SF(c, qir_FSUB(c, result, src)); + + qir_FADD_dest(c, result, + result, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS; - return qir_SEL(c, QPU_COND_NS, - qir_FADD(c, trunc, qir_uniform_f(c, 1.0)), trunc); + return qir_MOV(c, result); } static struct qreg @@ -632,7 +725,7 @@ ntq_fsign(struct vc4_compile *c, struct qreg src) qir_MOV_dest(c, t, qir_uniform_f(c, 0.0)); qir_MOV_dest(c, t, qir_uniform_f(c, 1.0))->cond = QPU_COND_ZC; qir_MOV_dest(c, t, qir_uniform_f(c, -1.0))->cond = QPU_COND_NS; - return t; + return qir_MOV(c, t); } static void @@ -811,7 +904,7 @@ ntq_emit_pack_unorm_4x8(struct vc4_compile *c, nir_alu_instr *instr) qir_PACK_8_F(c, result, src, i); } - ntq_store_dest(c, &instr->dest.dest, 0, result); + ntq_store_dest(c, &instr->dest.dest, 0, qir_MOV(c, result)); } /** Handles sign-extended bitfield extracts for 16 bits. */ @@ -917,6 +1010,9 @@ ntq_emit_comparison(struct vc4_compile *c, struct qreg *dest, break; } + /* Make the temporary for nir_store_dest(). */ + *dest = qir_MOV(c, *dest); + return true; } @@ -943,7 +1039,7 @@ static struct qreg ntq_emit_bcsel(struct vc4_compile *c, nir_alu_instr *instr, out: qir_SF(c, src[0]); - return qir_SEL(c, QPU_COND_NS, src[1], src[2]); + return qir_MOV(c, qir_SEL(c, QPU_COND_NS, src[1], src[2])); } static struct qreg @@ -962,9 +1058,9 @@ ntq_fddx(struct vc4_compile *c, struct qreg src) qir_SF(c, qir_AND(c, qir_reg(QFILE_QPU_ELEMENT, 0), qir_uniform_ui(c, 1))); - return qir_SEL(c, QPU_COND_ZS, - qir_FSUB(c, from_right, src), - qir_FSUB(c, src, from_left)); + return qir_MOV(c, qir_SEL(c, QPU_COND_ZS, + qir_FSUB(c, from_right, src), + qir_FSUB(c, src, from_left))); } static struct qreg @@ -981,9 +1077,9 @@ ntq_fddy(struct vc4_compile *c, struct qreg src) qir_reg(QFILE_QPU_ELEMENT, 0), qir_uniform_ui(c, 2))); - return qir_SEL(c, QPU_COND_ZS, - qir_FSUB(c, from_top, src), - qir_FSUB(c, src, from_bottom)); + return qir_MOV(c, qir_SEL(c, QPU_COND_ZS, + qir_FSUB(c, from_top, src), + qir_FSUB(c, src, from_bottom))); } static void @@ -1004,7 +1100,8 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) srcs[i] = ntq_get_src(c, instr->src[i].src, instr->src[i].swizzle[0]); for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) - ntq_store_dest(c, &instr->dest.dest, i, srcs[i]); + ntq_store_dest(c, &instr->dest.dest, i, + qir_MOV(c, srcs[i])); return; } @@ -1053,12 +1150,12 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) result = qir_FMAX(c, src[0], src[1]); break; - case nir_op_f2i: - case nir_op_f2u: + case nir_op_f2i32: + case nir_op_f2u32: result = qir_FTOI(c, src[0]); break; - case nir_op_i2f: - case nir_op_u2f: + case nir_op_i2f32: + case nir_op_u2f32: result = qir_ITOF(c, src[0]); break; case nir_op_b2f: @@ -1070,9 +1167,9 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) case nir_op_i2b: case nir_op_f2b: qir_SF(c, src[0]); - result = qir_SEL(c, QPU_COND_ZC, - qir_uniform_ui(c, ~0), - qir_uniform_ui(c, 0)); + result = qir_MOV(c, qir_SEL(c, QPU_COND_ZC, + qir_uniform_ui(c, ~0), + qir_uniform_ui(c, 0))); break; case nir_op_iadd: @@ -1136,7 +1233,7 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) break; case nir_op_fcsel: qir_SF(c, src[0]); - result = qir_SEL(c, QPU_COND_ZC, src[1], src[2]); + result = qir_MOV(c, qir_SEL(c, QPU_COND_ZC, src[1], src[2])); break; case nir_op_frcp: @@ -1250,7 +1347,7 @@ emit_frag_end(struct vc4_compile *c) } uint32_t discard_cond = QPU_COND_ALWAYS; - if (c->s->info.fs.uses_discard) { + if (c->s->info->fs.uses_discard) { 
qir_SF(c, c->discard); discard_cond = QPU_COND_ZS; } @@ -1414,7 +1511,7 @@ emit_vert_end(struct vc4_compile *c, static void emit_coord_end(struct vc4_compile *c) { - struct qreg rcp_w = qir_RCP(c, c->outputs[c->output_position_index + 3]); + struct qreg rcp_w = ntq_rcp(c, c->outputs[c->output_position_index + 3]); emit_stub_vpm_read(c); @@ -1448,6 +1545,10 @@ vc4_optimize_nir(struct nir_shader *s) NIR_PASS(progress, s, nir_opt_algebraic); NIR_PASS(progress, s, nir_opt_constant_folding); NIR_PASS(progress, s, nir_opt_undef); + NIR_PASS(progress, s, nir_opt_loop_unroll, + nir_var_shader_in | + nir_var_shader_out | + nir_var_local); } while (progress); } @@ -1605,6 +1706,47 @@ ntq_emit_ssa_undef(struct vc4_compile *c, nir_ssa_undef_instr *instr) } static void +ntq_emit_color_read(struct vc4_compile *c, nir_intrinsic_instr *instr) +{ + nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); + assert(const_offset->u32[0] == 0); + + /* Reads of the per-sample color need to be done in + * order. + */ + int sample_index = (nir_intrinsic_base(instr) - + VC4_NIR_TLB_COLOR_READ_INPUT); + for (int i = 0; i <= sample_index; i++) { + if (c->color_reads[i].file == QFILE_NULL) { + c->color_reads[i] = + qir_TLB_COLOR_READ(c); + } + } + ntq_store_dest(c, &instr->dest, 0, + qir_MOV(c, c->color_reads[sample_index])); +} + +static void +ntq_emit_load_input(struct vc4_compile *c, nir_intrinsic_instr *instr) +{ + assert(instr->num_components == 1); + + nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); + assert(const_offset && "vc4 doesn't support indirect inputs"); + + if (c->stage == QSTAGE_FRAG && + nir_intrinsic_base(instr) >= VC4_NIR_TLB_COLOR_READ_INPUT) { + ntq_emit_color_read(c, instr); + return; + } + + uint32_t offset = nir_intrinsic_base(instr) + const_offset->u32[0]; + int comp = nir_intrinsic_component(instr); + ntq_store_dest(c, &instr->dest, 0, + qir_MOV(c, c->inputs[offset * 4 + comp])); +} + +static void ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr) { nir_const_value *const_offset; @@ -1681,31 +1823,7 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr) break; case nir_intrinsic_load_input: - assert(instr->num_components == 1); - const_offset = nir_src_as_const_value(instr->src[0]); - assert(const_offset && "vc4 doesn't support indirect inputs"); - if (c->stage == QSTAGE_FRAG && - nir_intrinsic_base(instr) >= VC4_NIR_TLB_COLOR_READ_INPUT) { - assert(const_offset->u32[0] == 0); - /* Reads of the per-sample color need to be done in - * order. 
- */ - int sample_index = (nir_intrinsic_base(instr) - - VC4_NIR_TLB_COLOR_READ_INPUT); - for (int i = 0; i <= sample_index; i++) { - if (c->color_reads[i].file == QFILE_NULL) { - c->color_reads[i] = - qir_TLB_COLOR_READ(c); - } - } - ntq_store_dest(c, &instr->dest, 0, - c->color_reads[sample_index]); - } else { - offset = nir_intrinsic_base(instr) + const_offset->u32[0]; - int comp = nir_intrinsic_component(instr); - ntq_store_dest(c, &instr->dest, 0, - c->inputs[offset * 4 + comp]); - } + ntq_emit_load_input(c, instr); break; case nir_intrinsic_store_output: @@ -1855,11 +1973,12 @@ ntq_emit_if(struct vc4_compile *c, nir_if *if_stmt) qir_link_blocks(c->cur_block, after_block); qir_set_emit_block(c, after_block); - if (was_top_level) + if (was_top_level) { c->execute = c->undef; - else + c->last_top_block = c->cur_block; + } else { ntq_activate_execute_for_block(c); - + } } static void @@ -1983,10 +2102,12 @@ ntq_emit_loop(struct vc4_compile *c, nir_loop *loop) qir_link_blocks(c->cur_block, c->loop_break_block); qir_set_emit_block(c, c->loop_break_block); - if (was_top_level) + if (was_top_level) { c->execute = c->undef; - else + c->last_top_block = c->cur_block; + } else { ntq_activate_execute_for_block(c); + } c->loop_break_block = save_loop_break_block; c->loop_cont_block = save_loop_cont_block; @@ -2037,7 +2158,7 @@ ntq_emit_impl(struct vc4_compile *c, nir_function_impl *impl) static void nir_to_qir(struct vc4_compile *c) { - if (c->stage == QSTAGE_FRAG && c->s->info.fs.uses_discard) + if (c->stage == QSTAGE_FRAG && c->s->info->fs.uses_discard) c->discard = qir_MOV(c, qir_uniform_ui(c, 0)); ntq_setup_inputs(c); @@ -2063,11 +2184,13 @@ static const nir_shader_compiler_options nir_options = { .lower_fsqrt = true, .lower_negate = true, .native_integers = true, + .max_unroll_iterations = 32, }; const void * vc4_screen_get_compiler_options(struct pipe_screen *pscreen, - enum pipe_shader_ir ir, unsigned shader) + enum pipe_shader_ir ir, + enum pipe_shader_type shader) { return &nir_options; } @@ -2089,7 +2212,7 @@ count_nir_instrs(nir_shader *nir) static struct vc4_compile * vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage, - struct vc4_key *key) + struct vc4_key *key, bool fs_threaded) { struct vc4_compile *c = qir_compile_init(); @@ -2099,6 +2222,7 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage, c->program_id = key->shader_state->program_id; c->variant_id = p_atomic_inc_return(&key->shader_state->compiled_variant_count); + c->fs_threaded = fs_threaded; c->key = key; switch (stage) { @@ -2216,6 +2340,17 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage, switch (stage) { case QSTAGE_FRAG: + /* FS threading requires that the thread execute + * QPU_SIG_LAST_THREAD_SWITCH exactly once before terminating + * (with no other THRSW afterwards, obviously). If we didn't + * fetch a texture at a top level block, this wouldn't be + * true. 
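
[Editor's note: a hypothetical summary, with invented struct/function names.] The comment this hunk adds states one half of the threaded-FS contract; the kernel validator hunks earlier in this diff enforce the other half. Taken together, the conditions reduce to a predicate like this:

#include <stdbool.h>

/* Hypothetical summary of the threaded-FS contract: the last
 * thread switch must be signaled exactly once, unconditionally
 * (i.e. from a top-level block), and the shader must stay in the
 * bottom half of the physical register file.
 */
struct fs_thread_state {
        bool last_thrsw_at_top_level;
        bool uses_upper_reg_half;
};

static bool fs_may_stay_threaded(const struct fs_thread_state *s)
{
        return s->last_thrsw_at_top_level && !s->uses_upper_reg_half;
}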
+ */ + if (c->fs_threaded && !c->last_thrsw_at_top_level) { + c->failed = true; + return c; + } + emit_frag_end(c); break; case QSTAGE_VERT: @@ -2300,7 +2435,7 @@ vc4_shader_state_create(struct pipe_context *pctx, } NIR_PASS_V(s, nir_opt_global_to_local); - NIR_PASS_V(s, nir_convert_to_ssa); + NIR_PASS_V(s, nir_lower_regs_to_ssa); NIR_PASS_V(s, nir_normalize_cubemap_coords); NIR_PASS_V(s, nir_lower_load_const_to_scalar); @@ -2360,7 +2495,7 @@ vc4_setup_compiled_fs_inputs(struct vc4_context *vc4, struct vc4_compile *c, memset(input_live, 0, sizeof(input_live)); qir_for_each_inst_inorder(inst, c) { - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { if (inst->src[i].file == QFILE_VARY) input_live[inst->src[i].index] = true; } @@ -2416,12 +2551,16 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage, { struct hash_table *ht; uint32_t key_size; + bool try_threading; + if (stage == QSTAGE_FRAG) { ht = vc4->fs_cache; key_size = sizeof(struct vc4_fs_key); + try_threading = vc4->screen->has_threaded_fs; } else { ht = vc4->vs_cache; key_size = sizeof(struct vc4_vs_key); + try_threading = false; } struct vc4_compiled_shader *shader; @@ -2429,7 +2568,13 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage, if (entry) return entry->data; - struct vc4_compile *c = vc4_shader_ntq(vc4, stage, key); + struct vc4_compile *c = vc4_shader_ntq(vc4, stage, key, try_threading); + /* If the FS failed to compile threaded, fall back to single threaded. */ + if (try_threading && c->failed) { + qir_compile_destroy(c); + c = vc4_shader_ntq(vc4, stage, key, false); + } + shader = rzalloc(NULL, struct vc4_compiled_shader); shader->program_id = vc4->next_compiled_program_id++; @@ -2438,7 +2583,7 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage, /* Note: the temporary clone in c->s has been freed. */ nir_shader *orig_shader = key->shader_state->base.ir.nir; - if (orig_shader->info.outputs_written & (1 << FRAG_RESULT_DEPTH)) + if (orig_shader->info->outputs_written & (1 << FRAG_RESULT_DEPTH)) shader->disable_early_z = true; } else { shader->num_inputs = c->num_inputs; @@ -2463,6 +2608,8 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage, sizeof(uint64_t)); } + shader->fs_threaded = c->fs_threaded; + /* Copy the compiler UBO range state to the compiled shader, dropping * out arrays that were never referenced by an indirect load. 
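
[Editor's note: speculative sketch of code the diff elides; all names below are invented.] The comment closing this hunk describes copying the compiler's UBO range state while dropping arrays no indirect load ever touched. The elided body is not shown, but the compaction it describes amounts to a filter of roughly this shape:

#include <stdbool.h>
#include <stdint.h>

struct ubo_range {
        uint32_t dst_offset;
        uint32_t src_offset;
        uint32_t size;
        bool used;        /* referenced by an indirect load? */
};

/* Hypothetical compaction: keep only the ranges an indirect load
 * referenced, packing them densely into the compiled shader's copy.
 */
static uint32_t
compact_ubo_ranges(struct ubo_range *dst, const struct ubo_range *src,
                   uint32_t count)
{
        uint32_t n = 0;
        for (uint32_t i = 0; i < count; i++) {
                if (src[i].used)
                        dst[n++] = src[i];
        }
        return n;
}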
* @@ -2496,10 +2643,17 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage, } } + if ((vc4_debug & VC4_DEBUG_SHADERDB) && stage == QSTAGE_FRAG) { + fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d FS threads\n", + qir_get_stage_name(c->stage), + c->program_id, c->variant_id, + 1 + shader->fs_threaded); + } + qir_compile_destroy(c); struct vc4_key *dup_key; - dup_key = ralloc_size(shader, key_size); + dup_key = rzalloc_size(shader, key_size); /* TODO: don't use rzalloc */ memcpy(dup_key, key, key_size); _mesa_hash_table_insert(ht, dup_key, shader); @@ -2573,8 +2727,7 @@ vc4_update_compiled_fs(struct vc4_context *vc4, uint8_t prim_mode) } if (job->msaa) { key->msaa = vc4->rasterizer->base.multisample; - key->sample_coverage = (vc4->rasterizer->base.multisample && - vc4->sample_mask != (1 << VC4_MAX_SAMPLES) - 1); + key->sample_coverage = (vc4->sample_mask != (1 << VC4_MAX_SAMPLES) - 1); key->sample_alpha_to_coverage = vc4->blend->alpha_to_coverage; key->sample_alpha_to_one = vc4->blend->alpha_to_one; } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qir.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qir.c index 4b94fcfb9..c829e7f93 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qir.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qir.c @@ -76,13 +76,10 @@ static const struct qir_op_info qir_op_info[] = { [QOP_FRAG_Z] = { "frag_z", 1, 0 }, [QOP_FRAG_W] = { "frag_w", 1, 0 }, - [QOP_TEX_S] = { "tex_s", 0, 2, true }, - [QOP_TEX_T] = { "tex_t", 0, 2, true }, - [QOP_TEX_R] = { "tex_r", 0, 2, true }, - [QOP_TEX_B] = { "tex_b", 0, 2, true }, - [QOP_TEX_DIRECT] = { "tex_direct", 0, 2, true }, [QOP_TEX_RESULT] = { "tex_result", 1, 0, true }, + [QOP_THRSW] = { "thrsw", 0, 0, true }, + [QOP_LOAD_IMM] = { "load_imm", 0, 1 }, [QOP_LOAD_IMM_U2] = { "load_imm_u2", 0, 1 }, [QOP_LOAD_IMM_I2] = { "load_imm_i2", 0, 1 }, @@ -103,12 +100,35 @@ qir_get_op_name(enum qop qop) } int -qir_get_op_nsrc(enum qop qop) +qir_get_non_sideband_nsrc(struct qinst *inst) { - if (qop < ARRAY_SIZE(qir_op_info) && qir_op_info[qop].name) - return qir_op_info[qop].nsrc; - else - abort(); + assert(qir_op_info[inst->op].name); + return qir_op_info[inst->op].nsrc; +} + +int +qir_get_nsrc(struct qinst *inst) +{ + assert(qir_op_info[inst->op].name); + + int nsrc = qir_get_non_sideband_nsrc(inst); + + /* Normal (non-direct) texture coordinate writes also implicitly load + * a uniform for the texture parameters. + */ + if (qir_is_tex(inst) && inst->dst.file != QFILE_TEX_S_DIRECT) + nsrc++; + + return nsrc; +} + +/* The sideband uniform for textures gets stored after the normal ALU + * arguments. + */ +int +qir_get_tex_uniform_src(struct qinst *inst) +{ + return qir_get_nsrc(inst) - 1; } /** @@ -123,6 +143,11 @@ qir_has_side_effects(struct vc4_compile *c, struct qinst *inst) case QFILE_TLB_COLOR_WRITE: case QFILE_TLB_COLOR_WRITE_MS: case QFILE_TLB_STENCIL_SETUP: + case QFILE_TEX_S_DIRECT: + case QFILE_TEX_S: + case QFILE_TEX_T: + case QFILE_TEX_R: + case QFILE_TEX_B: return true; default: break; @@ -139,7 +164,7 @@ qir_has_side_effect_reads(struct vc4_compile *c, struct qinst *inst) * point/line coordinates reads, because they're generated by * fixed-function hardware. 
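
[Editor's note: a usage sketch that presumes the driver's own declarations from this diff, plus <assert.h>.] The new qir_get_nsrc()/qir_get_tex_uniform_src() pair introduced above encodes a convention several later hunks rely on: a non-direct texture setup write carries its texture-parameter uniform in one extra, trailing source slot. The assert mirrors what the vc4_qir_validate.c hunk further down checks:

/* The implicit sideband uniform of a tex setup write is always the
 * last source, after the ALU operands.
 */
if (qir_is_tex(inst) && qir_has_implicit_tex_uniform(inst)) {
        struct qreg unif = inst->src[qir_get_tex_uniform_src(inst)];
        assert(unif.file == QFILE_UNIF);
}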
*/ - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { if (inst->src[i].file == QFILE_VARY && c->input_slots[inst->src[i].index].slot == 0xff) { return true; @@ -156,6 +181,17 @@ qir_has_side_effect_reads(struct vc4_compile *c, struct qinst *inst) } bool +qir_has_uniform_read(struct qinst *inst) +{ + for (int i = 0; i < qir_get_nsrc(inst); i++) { + if (inst->src[i].file == QFILE_UNIF) + return true; + } + + return false; +} + +bool qir_is_mul(struct qinst *inst) { switch (inst->op) { @@ -207,7 +243,30 @@ qir_is_raw_mov(struct qinst *inst) bool qir_is_tex(struct qinst *inst) { - return inst->op >= QOP_TEX_S && inst->op <= QOP_TEX_DIRECT; + switch (inst->dst.file) { + case QFILE_TEX_S_DIRECT: + case QFILE_TEX_S: + case QFILE_TEX_T: + case QFILE_TEX_R: + case QFILE_TEX_B: + return true; + default: + return false; + } +} + +bool +qir_has_implicit_tex_uniform(struct qinst *inst) +{ + switch (inst->dst.file) { + case QFILE_TEX_S: + case QFILE_TEX_T: + case QFILE_TEX_R: + case QFILE_TEX_B: + return true; + default: + return false; + } } bool @@ -299,6 +358,11 @@ qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write) [QFILE_FRAG_Y] = "frag_y", [QFILE_FRAG_REV_FLAG] = "frag_rev_flag", [QFILE_QPU_ELEMENT] = "elem", + [QFILE_TEX_S_DIRECT] = "tex_s_direct", + [QFILE_TEX_S] = "tex_s", + [QFILE_TEX_T] = "tex_t", + [QFILE_TEX_R] = "tex_r", + [QFILE_TEX_B] = "tex_b", }; switch (reg.file) { @@ -331,6 +395,11 @@ qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write) case QFILE_TLB_COLOR_WRITE_MS: case QFILE_TLB_Z_WRITE: case QFILE_TLB_STENCIL_SETUP: + case QFILE_TEX_S_DIRECT: + case QFILE_TEX_S: + case QFILE_TEX_T: + case QFILE_TEX_R: + case QFILE_TEX_B: fprintf(stderr, "%s", files[reg.file]); break; @@ -371,7 +440,7 @@ qir_dump_inst(struct vc4_compile *c, struct qinst *inst) } } - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { fprintf(stderr, ", "); qir_print_reg(c, inst->src[i], false); vc4_qpu_disasm_unpack(stderr, inst->src[i].pack); @@ -382,6 +451,7 @@ void qir_dump(struct vc4_compile *c) { int ip = 0; + int pressure = 0; qir_for_each_block(block, c) { fprintf(stderr, "BLOCK %d:\n", block->index); @@ -389,6 +459,8 @@ qir_dump(struct vc4_compile *c) if (c->temp_start) { bool first = true; + fprintf(stderr, "%3d ", pressure); + for (int i = 0; i < c->num_temps; i++) { if (c->temp_start[i] != ip) continue; @@ -399,6 +471,7 @@ qir_dump(struct vc4_compile *c) fprintf(stderr, ", "); } fprintf(stderr, "S%4d", i); + pressure++; } if (first) @@ -420,6 +493,7 @@ qir_dump(struct vc4_compile *c) fprintf(stderr, ", "); } fprintf(stderr, "E%4d", i); + pressure--; } if (first) @@ -471,7 +545,6 @@ qir_inst(enum qop op, struct qreg dst, struct qreg src0, struct qreg src1) inst->op = op; inst->dst = dst; - inst->src = calloc(2, sizeof(inst->src[0])); inst->src[0] = src0; inst->src[1] = src1; inst->cond = QPU_COND_ALWAYS; @@ -479,26 +552,6 @@ qir_inst(enum qop op, struct qreg dst, struct qreg src0, struct qreg src1) return inst; } -struct qinst * -qir_inst4(enum qop op, struct qreg dst, - struct qreg a, - struct qreg b, - struct qreg c, - struct qreg d) -{ - struct qinst *inst = CALLOC_STRUCT(qinst); - - inst->op = op; - inst->dst = dst; - inst->src = calloc(4, sizeof(*inst->src)); - inst->src[0] = a; - inst->src[1] = b; - inst->src[2] = c; - inst->src[3] = d; - - return inst; -} - static void qir_emit(struct vc4_compile *c, struct qinst *inst) { @@ -593,6 +646,7 @@ qir_compile_init(void) 
list_inithead(&c->blocks); qir_set_emit_block(c, qir_new_block(c)); + c->last_top_block = c->cur_block; c->output_position_index = -1; c->output_color_index = -1; @@ -612,7 +666,6 @@ qir_remove_instruction(struct vc4_compile *c, struct qinst *qinst) c->defs[qinst->dst.index] = NULL; list_del(&qinst->link); - free(qinst->src); free(qinst); } @@ -744,6 +797,7 @@ qir_optimize(struct vc4_compile *c) OPTPASS(qir_opt_dead_code); OPTPASS(qir_opt_small_immediates); OPTPASS(qir_opt_vpm); + OPTPASS(qir_opt_coalesce_ff_writes); if (!progress) break; diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qir.h b/lib/mesa/src/gallium/drivers/vc4/vc4_qir.h index b3cac6bf2..6469e51b0 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qir.h +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qir.h @@ -55,6 +55,18 @@ enum qfile { QFILE_TLB_Z_WRITE, QFILE_TLB_STENCIL_SETUP, + /* If tex_s is written on its own without preceding t/r/b setup, it's + * a direct memory access using the input value, without the sideband + * uniform load. We represent these in QIR as a separate write + * destination so we can tell if the sideband uniform is present. + */ + QFILE_TEX_S_DIRECT, + + QFILE_TEX_S, + QFILE_TEX_T, + QFILE_TEX_R, + QFILE_TEX_B, + /* Payload registers that aren't in the physical register file, so we * can just use the corresponding qpu_reg at qpu_emit time. */ @@ -133,30 +145,22 @@ enum qop { QOP_FRAG_Z, QOP_FRAG_W, - /** Texture x coordinate parameter write */ - QOP_TEX_S, - /** Texture y coordinate parameter write */ - QOP_TEX_T, - /** Texture border color parameter or cube map z coordinate write */ - QOP_TEX_R, - /** Texture LOD bias parameter write */ - QOP_TEX_B, - - /** - * Texture-unit 4-byte read with address provided direct in S - * cooordinate. - * - * The first operand is the offset from the start of the UBO, and the - * second is the uniform that has the UBO's base pointer. - */ - QOP_TEX_DIRECT, - /** * Signal of texture read being necessary and then reading r4 into * the destination */ QOP_TEX_RESULT, + /** + * Insert the signal for switching threads in a threaded fragment + * shader. No value can be live in an accumulator across a thrsw. + * + * At the QPU level, this will have several delay slots before the + * switch happens. Those slots are the responsibility of the + * scheduler. + */ + QOP_THRSW, + /* 32-bit immediate loaded to each SIMD channel */ QOP_LOAD_IMM, @@ -194,7 +198,7 @@ struct qinst { enum qop op; struct qreg dst; - struct qreg *src; + struct qreg src[3]; bool sf; bool cond_is_exec_mask; uint8_t cond; @@ -502,9 +506,13 @@ struct vc4_compile { struct qblock *cur_block; struct qblock *loop_cont_block; struct qblock *loop_break_block; + struct qblock *last_top_block; struct list_head qpu_inst_list; + /* Pre-QPU-scheduled instruction containing the last THRSW */ + uint64_t *last_thrsw; + uint64_t *qpu_insts; uint32_t qpu_inst_count; uint32_t qpu_inst_size; @@ -524,6 +532,15 @@ struct vc4_compile { uint32_t program_id; uint32_t variant_id; + + /* Set to compile program in threaded FS mode, where SIG_THREAD_SWITCH + * is used to hide texturing latency at the cost of limiting ourselves + * to the bottom half of physical reg space. 
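
[Editor's note: forward illustration, quoting the vc4_register_allocate.c hunk that appears later in this diff.] The QOP_THRSW comment above says no value can be live in an accumulator across a thread switch; concretely, that rule lands on the register allocator, which strips the accumulator classes from any temp whose live range crosses the switch:

/* Temps live across a THRSW lose the r0-r3 and r4 classes and must
 * sit in the physical A/B files, since accumulators and flags do
 * not survive the switch.
 */
for (int i = 0; i < c->num_temps; i++) {
        if (c->temp_start[i] < ip && c->temp_end[i] > ip)
                class_bits[i] &= ~(CLASS_BIT_R0_R3 | CLASS_BIT_R4);
}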
+ */ + bool fs_threaded; + + bool last_thrsw_at_top_level; + bool failed; }; @@ -543,11 +560,6 @@ struct qblock *qir_entry_block(struct vc4_compile *c); struct qblock *qir_exit_block(struct vc4_compile *c); struct qinst *qir_inst(enum qop op, struct qreg dst, struct qreg src0, struct qreg src1); -struct qinst *qir_inst4(enum qop op, struct qreg dst, - struct qreg a, - struct qreg b, - struct qreg c, - struct qreg d); void qir_remove_instruction(struct vc4_compile *c, struct qinst *qinst); struct qreg qir_uniform(struct vc4_compile *c, enum quniform_contents contents, @@ -561,13 +573,17 @@ struct qinst *qir_emit_nondef(struct vc4_compile *c, struct qinst *inst); struct qreg qir_get_temp(struct vc4_compile *c); void qir_calculate_live_intervals(struct vc4_compile *c); -int qir_get_op_nsrc(enum qop qop); +int qir_get_nsrc(struct qinst *inst); +int qir_get_non_sideband_nsrc(struct qinst *inst); +int qir_get_tex_uniform_src(struct qinst *inst); bool qir_reg_equals(struct qreg a, struct qreg b); bool qir_has_side_effects(struct vc4_compile *c, struct qinst *inst); bool qir_has_side_effect_reads(struct vc4_compile *c, struct qinst *inst); +bool qir_has_uniform_read(struct qinst *inst); bool qir_is_mul(struct qinst *inst); bool qir_is_raw_mov(struct qinst *inst); bool qir_is_tex(struct qinst *inst); +bool qir_has_implicit_tex_uniform(struct qinst *inst); bool qir_is_float_input(struct qinst *inst); bool qir_depends_on_flags(struct qinst *inst); bool qir_writes_r4(struct qinst *inst); @@ -582,6 +598,7 @@ void qir_validate(struct vc4_compile *c); void qir_optimize(struct vc4_compile *c); bool qir_opt_algebraic(struct vc4_compile *c); +bool qir_opt_coalesce_ff_writes(struct vc4_compile *c); bool qir_opt_constant_folding(struct vc4_compile *c); bool qir_opt_copy_propagation(struct vc4_compile *c); bool qir_opt_dead_code(struct vc4_compile *c); @@ -722,11 +739,6 @@ QIR_ALU1(RSQ) QIR_ALU1(EXP2) QIR_ALU1(LOG2) QIR_ALU1(VARY_ADD_C) -QIR_NODST_2(TEX_S) -QIR_NODST_2(TEX_T) -QIR_NODST_2(TEX_R) -QIR_NODST_2(TEX_B) -QIR_NODST_2(TEX_DIRECT) QIR_PAYLOAD(FRAG_Z) QIR_PAYLOAD(FRAG_W) QIR_ALU0(TEX_RESULT) @@ -737,10 +749,8 @@ static inline struct qreg qir_SEL(struct vc4_compile *c, uint8_t cond, struct qreg src0, struct qreg src1) { struct qreg t = qir_get_temp(c); - struct qinst *a = qir_MOV_dest(c, t, src0); - struct qinst *b = qir_MOV_dest(c, t, src1); - a->cond = cond; - b->cond = qpu_cond_complement(cond); + qir_MOV_dest(c, t, src1); + qir_MOV_dest(c, t, src0)->cond = cond; return t; } @@ -881,6 +891,6 @@ qir_BRANCH(struct vc4_compile *c, uint8_t cond) #define qir_for_each_inst_inorder(inst, c) \ qir_for_each_block(_block, c) \ - qir_for_each_inst(inst, _block) + qir_for_each_inst_safe(inst, _block) #endif /* VC4_QIR_H */ diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_emit_uniform_stream_resets.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_emit_uniform_stream_resets.c index 3fd6358e3..443682a46 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_emit_uniform_stream_resets.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_emit_uniform_stream_resets.c @@ -36,24 +36,10 @@ #include "util/u_math.h" static bool -inst_reads_a_uniform(struct qinst *inst) -{ - if (qir_is_tex(inst)) - return true; - - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { - if (inst->src[i].file == QFILE_UNIF) - return true; - } - - return false; -} - -static bool block_reads_any_uniform(struct qblock *block) { qir_for_each_inst(inst, block) { - if (inst_reads_a_uniform(inst)) + if (qir_has_uniform_read(inst)) return true; } @@ 
-94,7 +80,7 @@ qir_emit_uniform_stream_resets(struct vc4_compile *c) } qir_for_each_inst(inst, block) { - if (inst_reads_a_uniform(inst)) + if (qir_has_uniform_read(inst)) uniform_count++; } } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_live_variables.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_live_variables.c index beefb0d7f..7108b3ee9 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_live_variables.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_live_variables.c @@ -205,7 +205,7 @@ qir_setup_def_use(struct vc4_compile *c) _mesa_hash_table_clear(partial_update_ht, NULL); qir_for_each_inst(inst, block) { - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) + for (int i = 0; i < qir_get_nsrc(inst); i++) qir_setup_use(c, block, ip, inst->src[i]); qir_setup_def(c, block, ip, partial_update_ht, inst); @@ -301,8 +301,13 @@ qir_calculate_live_intervals(struct vc4_compile *c) { int bitset_words = BITSET_WORDS(c->num_temps); - c->temp_start = reralloc(c, c->temp_start, int, c->num_temps); - c->temp_end = reralloc(c, c->temp_end, int, c->num_temps); + /* If we called this function more than once, then we should be + * freeing the previous arrays. + */ + assert(!c->temp_start); + + c->temp_start = rzalloc_array(c, int, c->num_temps); + c->temp_end = rzalloc_array(c, int, c->num_temps); for (int i = 0; i < c->num_temps; i++) { c->temp_start[i] = MAX_INSTRUCTION; @@ -310,10 +315,10 @@ qir_calculate_live_intervals(struct vc4_compile *c) } qir_for_each_block(block, c) { - block->def = reralloc(c, block->def, BITSET_WORD, bitset_words); - block->use = reralloc(c, block->use, BITSET_WORD, bitset_words); - block->live_in = reralloc(c, block->live_in, BITSET_WORD, bitset_words); - block->live_out = reralloc(c, block->live_out, BITSET_WORD, bitset_words); + block->def = rzalloc_array(c, BITSET_WORD, bitset_words); + block->use = rzalloc_array(c, BITSET_WORD, bitset_words); + block->live_in = rzalloc_array(c, BITSET_WORD, bitset_words); + block->live_out = rzalloc_array(c, BITSET_WORD, bitset_words); } qir_setup_def_use(c); @@ -322,4 +327,27 @@ qir_calculate_live_intervals(struct vc4_compile *c) ; qir_compute_start_end(c, c->num_temps); + + if (vc4_debug & VC4_DEBUG_SHADERDB) { + int last_ip = 0; + for (int i = 0; i < c->num_temps; i++) + last_ip = MAX2(last_ip, c->temp_end[i]); + + int reg_pressure = 0; + int max_reg_pressure = 0; + for (int i = 0; i < last_ip; i++) { + for (int j = 0; j < c->num_temps; j++) { + if (c->temp_start[j] == i) + reg_pressure++; + if (c->temp_end[j] == i) + reg_pressure--; + } + max_reg_pressure = MAX2(max_reg_pressure, reg_pressure); + } + + fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d max temps\n", + qir_get_stage_name(c->stage), + c->program_id, c->variant_id, + max_reg_pressure); + } } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c index 8ec6c7973..9ecfe6521 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c @@ -77,7 +77,7 @@ is_lowerable_uniform(struct qinst *inst, int i) if (inst->src[i].file != QFILE_UNIF) return false; if (qir_is_tex(inst)) - return i != 1; + return i != qir_get_tex_uniform_src(inst); return true; } @@ -89,7 +89,7 @@ qir_get_instruction_uniform_count(struct qinst *inst) { uint32_t count = 0; - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { if (inst->src[i].file != QFILE_UNIF) continue; @@ -119,7 +119,7 @@ 
qir_lower_uniforms(struct vc4_compile *c) * ht. */ qir_for_each_inst_inorder(inst, c) { - uint32_t nsrc = qir_get_op_nsrc(inst->op); + uint32_t nsrc = qir_get_nsrc(inst); if (qir_get_instruction_uniform_count(inst) <= 1) continue; @@ -155,7 +155,7 @@ qir_lower_uniforms(struct vc4_compile *c) struct qinst *mov = NULL; qir_for_each_inst(inst, block) { - uint32_t nsrc = qir_get_op_nsrc(inst->op); + uint32_t nsrc = qir_get_nsrc(inst); uint32_t count = qir_get_instruction_uniform_count(inst); diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_schedule.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_schedule.c index 69bd0dd62..5118caf31 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_schedule.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_schedule.c @@ -187,7 +187,7 @@ calculate_deps(struct schedule_setup_state *state, struct schedule_node *n) * ignore uniforms accesses, because qir_reorder_uniforms() happens * after this. */ - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { switch (inst->src[i].file) { case QFILE_TEMP: add_dep(dir, @@ -212,23 +212,35 @@ calculate_deps(struct schedule_setup_state *state, struct schedule_node *n) add_dep(dir, state->last_vary_read, n); break; - case QOP_TEX_S: - case QOP_TEX_T: - case QOP_TEX_R: - case QOP_TEX_B: - case QOP_TEX_DIRECT: - /* Texturing setup gets scheduled in order, because - * the uniforms referenced by them have to land in a - * specific order. - */ - add_write_dep(dir, &state->last_tex_coord, n); - break; - case QOP_TEX_RESULT: /* Results have to be fetched in order. */ add_write_dep(dir, &state->last_tex_result, n); break; + case QOP_THRSW: + /* After a new THRSW, one must collect all texture samples + * queued since the previous THRSW/program start. For now, we + * have one THRSW in between each texture setup and its + * results collection as our input, and we just make sure that + * that ordering is maintained. + */ + add_write_dep(dir, &state->last_tex_coord, n); + add_write_dep(dir, &state->last_tex_result, n); + + /* accumulators and flags are lost across thread switches. */ + add_write_dep(dir, &state->last_sf, n); + + /* Setup, like the varyings, will need to be drained before we + * thread switch. + */ + add_write_dep(dir, &state->last_vary_read, n); + + /* The TLB-locking operations have to stay after the last + * thread switch. + */ + add_write_dep(dir, &state->last_tlb, n); + break; + case QOP_TLB_COLOR_READ: case QOP_MS_MASK: add_write_dep(dir, &state->last_tlb, n); @@ -254,6 +266,18 @@ calculate_deps(struct schedule_setup_state *state, struct schedule_node *n) add_write_dep(dir, &state->last_tlb, n); break; + case QFILE_TEX_S_DIRECT: + case QFILE_TEX_S: + case QFILE_TEX_T: + case QFILE_TEX_R: + case QFILE_TEX_B: + /* Texturing setup gets scheduled in order, because + * the uniforms referenced by them have to land in a + * specific order. 
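
[Editor's note: a toy model with invented names.] The in-order constraint in the comment above comes from how uniforms are delivered: each uniform-reading instruction consumes the next word from a single stream, so instruction order *is* parameter order. Reorder two texture setups and each silently reads the other's parameters:

#include <stdint.h>

/* Toy model: uniforms are a cursor into a flat stream, advanced
 * once per uniform-reading instruction.
 */
struct uniform_stream {
        const uint32_t *data;
        uint32_t next;
};

static uint32_t consume_uniform(struct uniform_stream *s)
{
        return s->data[s->next++];   /* order == emission order */
}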
+ */ + add_write_dep(dir, &state->last_tex_coord, n); + break; + default: break; } @@ -281,7 +305,7 @@ calculate_forward_deps(struct vc4_compile *c, void *mem_ctx, calculate_deps(&state, n); - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { switch (inst->src[i].file) { case QFILE_UNIF: add_dep(state.dir, state.last_uniforms_reset, n); @@ -291,26 +315,59 @@ calculate_forward_deps(struct vc4_compile *c, void *mem_ctx, } } - switch (inst->op) { - case QOP_TEX_S: - case QOP_TEX_T: - case QOP_TEX_R: - case QOP_TEX_B: - case QOP_TEX_DIRECT: - /* If the texture coordinate fifo is full, - * block this on the last QOP_TEX_RESULT. + switch (inst->dst.file) { + case QFILE_TEX_S_DIRECT: + case QFILE_TEX_S: + case QFILE_TEX_T: + case QFILE_TEX_R: + case QFILE_TEX_B: + /* From the VC4 spec: + * + * "The TFREQ input FIFO holds two full lots of s, + * t, r, b data, plus associated setup data, per + * QPU, that is, there are eight data slots. For + * each texture request, slots are only consumed + * for the components of s, t, r, and b actually + * written. Thus the FIFO can hold four requests + * of just (s, t) data, or eight requests of just + * s data (for direct addressed data lookups). + * + * Note that there is one FIFO per QPU, and the + * FIFO has no concept of threads - that is, + * multi-threaded shaders must be careful to use + * only 1/2 the FIFO depth before reading + * back. Multi-threaded programs must also + * therefore always thread switch on texture + * fetch as the other thread may have data + * waiting in the FIFO." + * + * If the texture coordinate fifo is full, block this + * on the last QOP_TEX_RESULT. */ - if (state.tfreq_count == 8) { + if (state.tfreq_count == (c->fs_threaded ? 4 : 8)) { block_until_tex_result(&state, n); } - /* If the texture result fifo is full, block - * adding any more to it until the last - * QOP_TEX_RESULT. + /* From the VC4 spec: + * + * "Since the maximum number of texture requests + * in the input (TFREQ) FIFO is four lots of (s, + * t) data, the output (TFRCV) FIFO is sized to + * holds four lots of max-size color data per + * QPU. For non-float color, reads are packed + * RGBA8888 data (one read per pixel). For 16-bit + * float color, two reads are necessary per + * pixel, with reads packed as RG1616 then + * BA1616. So per QPU there are eight color slots + * in the TFRCV FIFO." + * + * If the texture result fifo is full, block adding + * any more to it until the last QOP_TEX_RESULT. */ - if (inst->op == QOP_TEX_S || - inst->op == QOP_TEX_DIRECT) { - if (state.tfrcv_count == 4) + if (inst->dst.file == QFILE_TEX_S || + inst->dst.file == QFILE_TEX_S_DIRECT) { + if (state.tfrcv_count == + (c->fs_threaded ? 2 : 4)) block_until_tex_result(&state, n); state.tfrcv_count++; } @@ -319,6 +376,11 @@ calculate_forward_deps(struct vc4_compile *c, void *mem_ctx, state.tfreq_count++; break; + default: + break; + } + + switch (inst->op) { case QOP_TEX_RESULT: /* Results have to be fetched after the * coordinate setup. 
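
[Editor's note: a sketch restating constants that appear in the checks above.] The two spec quotations boil down to a budget rule: a threaded fragment shader owns only half of each per-QPU texture FIFO, because the other thread uses the other half:

#include <stdbool.h>

/* Requests the TFREQ FIFO may hold before a result must be
 * collected, and max-size results the TFRCV FIFO may hold.
 */
static int tfreq_budget(bool fs_threaded)
{
        return fs_threaded ? 4 : 8;
}

static int tfrcv_budget(bool fs_threaded)
{
        return fs_threaded ? 2 : 4;
}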
Note that we're assuming @@ -341,7 +403,6 @@ calculate_forward_deps(struct vc4_compile *c, void *mem_ctx, break; default: - assert(!qir_is_tex(inst)); break; } } @@ -372,11 +433,21 @@ get_register_pressure_cost(struct schedule_state *state, struct qinst *inst) state->temp_writes[inst->dst.index] == 1) cost--; - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { - if (inst->src[i].file == QFILE_TEMP && - !BITSET_TEST(state->temp_live, inst->src[i].index)) { - cost++; + for (int i = 0; i < qir_get_nsrc(inst); i++) { + if (inst->src[i].file != QFILE_TEMP || + BITSET_TEST(state->temp_live, inst->src[i].index)) { + continue; } + + bool already_counted = false; + for (int j = 0; j < i; j++) { + if (inst->src[i].file == inst->src[j].file && + inst->src[i].index == inst->src[j].index) { + already_counted = true; + } + } + if (!already_counted) + cost++; } return cost; @@ -503,11 +574,33 @@ dump_state(struct vc4_compile *c, struct schedule_state *state) static uint32_t latency_between(struct schedule_node *before, struct schedule_node *after) { - if ((before->inst->op == QOP_TEX_S || - before->inst->op == QOP_TEX_DIRECT) && + if ((before->inst->dst.file == QFILE_TEX_S || + before->inst->dst.file == QFILE_TEX_S_DIRECT) && after->inst->op == QOP_TEX_RESULT) return 100; + switch (before->inst->op) { + case QOP_RCP: + case QOP_RSQ: + case QOP_EXP2: + case QOP_LOG2: + for (int i = 0; i < qir_get_nsrc(after->inst); i++) { + if (after->inst->src[i].file == + before->inst->dst.file && + after->inst->src[i].index == + before->inst->dst.index) { + /* There are two QPU delay slots before we can + * read a math result, which could be up to 4 + * QIR instructions if they packed well. + */ + return 4; + } + } + break; + default: + break; + } + return 1; } @@ -532,7 +625,7 @@ compute_delay(struct schedule_node *n) compute_delay(n->children[i]); n->delay = MAX2(n->delay, n->children[i]->delay + - latency_between(n, n->children[i])); + latency_between(n->children[i], n)); } } } @@ -583,15 +676,15 @@ schedule_instructions(struct vc4_compile *c, child->unblocked_time = MAX2(child->unblocked_time, state->time + - latency_between(chosen, - child)); + latency_between(child, + chosen)); child->parent_count--; if (child->parent_count == 0) list_add(&child->link, &state->worklist); } /* Update our tracking of register pressure. 
*/ - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { if (inst->src[i].file == QFILE_TEMP) BITSET_SET(state->temp_live, inst->src[i].index); } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_validate.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_validate.c index e7cfe5ad2..302eb4826 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qir_validate.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qir_validate.c @@ -84,9 +84,28 @@ void qir_validate(struct vc4_compile *c) case QFILE_LOAD_IMM: fail_instr(c, inst, "Bad dest file"); break; + + case QFILE_TEX_S: + case QFILE_TEX_T: + case QFILE_TEX_R: + case QFILE_TEX_B: + if (inst->src[qir_get_tex_uniform_src(inst)].file != + QFILE_UNIF) { + fail_instr(c, inst, + "tex op missing implicit uniform"); + } + break; + + case QFILE_TEX_S_DIRECT: + if (inst->op != QOP_ADD) { + fail_instr(c, inst, + "kernel validation requires that " + "direct texture lookups use an ADD"); + } + break; } - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { struct qreg src = inst->src[i]; switch (src.file) { @@ -119,6 +138,11 @@ void qir_validate(struct vc4_compile *c) case QFILE_TLB_COLOR_WRITE_MS: case QFILE_TLB_Z_WRITE: case QFILE_TLB_STENCIL_SETUP: + case QFILE_TEX_S_DIRECT: + case QFILE_TEX_S: + case QFILE_TEX_T: + case QFILE_TEX_R: + case QFILE_TEX_B: fail_instr(c, inst, "Bad src file"); break; } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu.c index 67850a811..380b9f43c 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu.c @@ -323,6 +323,7 @@ qpu_waddr_ignores_ws(uint32_t waddr) case QPU_W_ACC1: case QPU_W_ACC2: case QPU_W_ACC3: + case QPU_W_NOP: case QPU_W_TLB_Z: case QPU_W_TLB_COLOR_MS: case QPU_W_TLB_COLOR_ALL: diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_disasm.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_disasm.c index 529472272..9ea26455b 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_disasm.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_disasm.c @@ -86,11 +86,11 @@ static const char *qpu_sig[] = { static const char *qpu_pack_mul[] = { [QPU_PACK_MUL_NOP] = "", - [QPU_PACK_MUL_8888] = "8888", - [QPU_PACK_MUL_8A] = "8a", - [QPU_PACK_MUL_8B] = "8b", - [QPU_PACK_MUL_8C] = "8c", - [QPU_PACK_MUL_8D] = "8d", + [QPU_PACK_MUL_8888] = ".8888", + [QPU_PACK_MUL_8A] = ".8a", + [QPU_PACK_MUL_8B] = ".8b", + [QPU_PACK_MUL_8C] = ".8c", + [QPU_PACK_MUL_8D] = ".8d", }; /* The QPU unpack for A and R4 files can be described the same, it's just that @@ -264,7 +264,7 @@ get_special_write_desc(int reg, bool is_a) void vc4_qpu_disasm_pack_mul(FILE *out, uint32_t pack) { - fprintf(out, ".%s", DESC(qpu_pack_mul, pack)); + fprintf(out, "%s", DESC(qpu_pack_mul, pack)); } void diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_emit.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_emit.c index 2ee52a497..aaa3a0412 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_emit.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_emit.c @@ -157,7 +157,7 @@ setup_for_vpm_read(struct vc4_compile *c, struct qblock *block) * address. * * In that case, we need to move one to a temporary that can be used in the - * instruction, instead. We reserve ra31/rb31 for this purpose. + * instruction, instead. We reserve ra14/rb14 for this purpose. */ static void fixup_raddr_conflict(struct qblock *block, @@ -183,9 +183,9 @@ fixup_raddr_conflict(struct qblock *block, * in case of unpacks. 
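
[Editor's note: a hedged scalar sketch, not driver code.] The conflict fixup that follows copies a float source with FMAX rather than a raw MOV so that any regfile-A unpack still applies on the way through the reserved ra14/rb14 register: FMAX(x, x) is an identity on the value while running down the add pipe where the unpack hardware lives. In scalar C terms:

#include <math.h>

/* fmaxf(x, x) == x, so the copy preserves the data while taking
 * the pipe that can apply the unpack.
 */
static float copy_preserving_unpack(float x)
{
        return fmaxf(x, x);
}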
*/ if (qir_is_float_input(inst)) - queue(block, qpu_a_FMAX(qpu_rb(31), *src0, *src0)); + queue(block, qpu_a_FMAX(qpu_rb(14), *src0, *src0)); else - queue(block, qpu_a_MOV(qpu_rb(31), *src0)); + queue(block, qpu_a_MOV(qpu_rb(14), *src0)); /* If we had an unpack on this A-file source, we need to put * it into this MOV, not into the later move from regfile B. @@ -194,10 +194,10 @@ fixup_raddr_conflict(struct qblock *block, *last_inst(block) |= *unpack; *unpack = 0; } - *src0 = qpu_rb(31); + *src0 = qpu_rb(14); } else { - queue(block, qpu_a_MOV(qpu_ra(31), *src0)); - *src0 = qpu_ra(31); + queue(block, qpu_a_MOV(qpu_ra(14), *src0)); + *src0 = qpu_ra(14); } } @@ -226,10 +226,14 @@ static void handle_r4_qpu_write(struct qblock *block, struct qinst *qinst, struct qpu_reg dst) { - if (dst.mux != QPU_MUX_R4) + if (dst.mux != QPU_MUX_R4) { queue(block, qpu_a_MOV(dst, qpu_r4())); - else if (qinst->sf) - queue(block, qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_r4())); + set_last_cond_add(block, qinst->cond); + } else { + assert(qinst->cond == QPU_COND_ALWAYS); + if (qinst->sf) + queue(block, qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_r4())); + } } static void @@ -290,8 +294,8 @@ vc4_generate_code_block(struct vc4_compile *c, }; uint64_t unpack = 0; - struct qpu_reg src[4]; - for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) { + struct qpu_reg src[ARRAY_SIZE(qinst->src)]; + for (int i = 0; i < qir_get_nsrc(qinst); i++) { int index = qinst->src[i].index; switch (qinst->src[i].file) { case QFILE_NULL: @@ -349,6 +353,11 @@ vc4_generate_code_block(struct vc4_compile *c, case QFILE_TLB_COLOR_WRITE_MS: case QFILE_TLB_Z_WRITE: case QFILE_TLB_STENCIL_SETUP: + case QFILE_TEX_S: + case QFILE_TEX_S_DIRECT: + case QFILE_TEX_T: + case QFILE_TEX_R: + case QFILE_TEX_B: unreachable("bad qir src file"); } } @@ -381,6 +390,23 @@ vc4_generate_code_block(struct vc4_compile *c, dst = qpu_ra(QPU_W_TLB_STENCIL_SETUP); break; + case QFILE_TEX_S: + case QFILE_TEX_S_DIRECT: + dst = qpu_rb(QPU_W_TMU0_S); + break; + + case QFILE_TEX_T: + dst = qpu_rb(QPU_W_TMU0_T); + break; + + case QFILE_TEX_R: + dst = qpu_rb(QPU_W_TMU0_R); + break; + + case QFILE_TEX_B: + dst = qpu_rb(QPU_W_TMU0_B); + break; + case QFILE_VARY: case QFILE_UNIF: case QFILE_SMALL_IMM: @@ -422,6 +448,7 @@ vc4_generate_code_block(struct vc4_compile *c, } handle_r4_qpu_write(block, qinst, dst); + handled_qinst_cond = true; break; @@ -473,33 +500,27 @@ vc4_generate_code_block(struct vc4_compile *c, *last_inst(block) = qpu_set_sig(*last_inst(block), QPU_SIG_COLOR_LOAD); handle_r4_qpu_write(block, qinst, dst); + handled_qinst_cond = true; break; case QOP_VARY_ADD_C: queue(block, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack); break; - case QOP_TEX_S: - case QOP_TEX_T: - case QOP_TEX_R: - case QOP_TEX_B: - queue(block, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S + - (qinst->op - QOP_TEX_S)), - src[0]) | unpack); - break; - - case QOP_TEX_DIRECT: - fixup_raddr_conflict(block, dst, &src[0], &src[1], - qinst, &unpack); - queue(block, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), - src[0], src[1]) | unpack); - break; case QOP_TEX_RESULT: queue(block, qpu_NOP()); *last_inst(block) = qpu_set_sig(*last_inst(block), QPU_SIG_LOAD_TMU0); handle_r4_qpu_write(block, qinst, dst); + handled_qinst_cond = true; + break; + + case QOP_THRSW: + queue(block, qpu_NOP()); + *last_inst(block) = qpu_set_sig(*last_inst(block), + QPU_SIG_THREAD_SWITCH); + c->last_thrsw = last_inst(block); break; case QOP_BRANCH: @@ -533,7 +554,7 @@ vc4_generate_code_block(struct vc4_compile *c, * argument slot as well so that we don't take up * another raddr 
just to get unused data. */ - if (qir_get_op_nsrc(qinst->op) == 1) + if (qir_get_non_sideband_nsrc(qinst) == 1) src[1] = src[0]; fixup_raddr_conflict(block, dst, &src[0], &src[1], @@ -587,6 +608,23 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) qir_for_each_block(block, c) vc4_generate_code_block(c, block, temp_registers); + /* Switch the last SIG_THRSW instruction to SIG_LAST_THRSW. + * + * LAST_THRSW is a new signal in BCM2708B0 (including Raspberry Pi) + * that ensures that a later thread doesn't try to lock the scoreboard + * and terminate before an earlier-spawned thread on the same QPU, by + * delaying switching back to the later shader until earlier has + * finished. Otherwise, if the earlier thread was hitting the same + * quad, the scoreboard would deadlock. + */ + if (c->last_thrsw) { + assert(QPU_GET_FIELD(*c->last_thrsw, QPU_SIG) == + QPU_SIG_THREAD_SWITCH); + *c->last_thrsw = ((*c->last_thrsw & ~QPU_SIG_MASK) | + QPU_SET_FIELD(QPU_SIG_LAST_THREAD_SWITCH, + QPU_SIG)); + } + uint32_t cycles = qpu_schedule_instructions(c); uint32_t inst_count_at_schedule_time = c->qpu_inst_count; diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_schedule.c index 25adbe671..9141396c8 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_schedule.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_schedule.c @@ -385,12 +385,27 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) switch (sig) { case QPU_SIG_SW_BREAKPOINT: case QPU_SIG_NONE: - case QPU_SIG_THREAD_SWITCH: - case QPU_SIG_LAST_THREAD_SWITCH: case QPU_SIG_SMALL_IMM: case QPU_SIG_LOAD_IMM: break; + case QPU_SIG_THREAD_SWITCH: + case QPU_SIG_LAST_THREAD_SWITCH: + /* All accumulator contents and flags are undefined after the + * switch. + */ + for (int i = 0; i < ARRAY_SIZE(state->last_r); i++) + add_write_dep(state, &state->last_r[i], n); + add_write_dep(state, &state->last_sf, n); + + /* Scoreboard-locking operations have to stay after the last + * thread switch. + */ + add_write_dep(state, &state->last_tlb, n); + + add_write_dep(state, &state->last_tmu_write, n); + break; + case QPU_SIG_LOAD_TMU0: case QPU_SIG_LOAD_TMU1: /* TMU loads are coming from a FIFO, so ordering is important. @@ -453,6 +468,7 @@ struct choose_scoreboard { int last_sfu_write_tick; int last_uniforms_reset_tick; uint32_t last_waddr_a, last_waddr_b; + bool tlb_locked; }; static bool @@ -461,6 +477,11 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard, uint64_t inst) uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A); uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B); uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); + + /* Full immediate loads don't read any registers. */ + if (sig == QPU_SIG_LOAD_IMM) + return false; + uint32_t src_muxes[] = { QPU_GET_FIELD(inst, QPU_ADD_A), QPU_GET_FIELD(inst, QPU_ADD_B), @@ -554,15 +575,28 @@ choose_instruction_to_schedule(struct choose_scoreboard *scoreboard, struct schedule_node *chosen = NULL; int chosen_prio = 0; + /* Don't pair up anything with a thread switch signal -- emit_thrsw() + * will handle pairing it along with filling the delay slots. 
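
[Editor's note: a generic model; the field position below is illustrative, not the authoritative QPU encoding.] The LAST_THRSW patch-up in vc4_generate_code() above is a plain field rewrite on the packed 64-bit instruction: mask out the 4-bit signal field, then OR in QPU_SIG_LAST_THREAD_SWITCH. QPU_UPDATE_FIELD()/QPU_SET_FIELD() reduce to:

#include <stdint.h>

#define SIG_SHIFT 60                          /* illustrative */
#define SIG_MASK  ((uint64_t)0xf << SIG_SHIFT)

/* Clear the old signal, install the new one. */
static uint64_t set_sig(uint64_t inst, uint64_t sig)
{
        return (inst & ~SIG_MASK) | ((sig << SIG_SHIFT) & SIG_MASK);
}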
+ */ + if (prev_inst) { + uint32_t prev_sig = QPU_GET_FIELD(prev_inst->inst->inst, + QPU_SIG); + if (prev_sig == QPU_SIG_THREAD_SWITCH || + prev_sig == QPU_SIG_LAST_THREAD_SWITCH) { + return NULL; + } + } + list_for_each_entry(struct schedule_node, n, schedule_list, link) { uint64_t inst = n->inst->inst; + uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); /* Don't choose the branch instruction until it's the last one * left. XXX: We could potentially choose it before it's the * last one, if the remaining instructions fit in the delay * slots. */ - if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH && + if (sig == QPU_SIG_BRANCH && !list_is_singular(schedule_list)) { continue; } @@ -586,9 +620,25 @@ choose_instruction_to_schedule(struct choose_scoreboard *scoreboard, * that they're compatible. */ if (prev_inst) { + /* Don't pair up a thread switch signal -- we'll + * handle pairing it when we pick it on its own. + */ + if (sig == QPU_SIG_THREAD_SWITCH || + sig == QPU_SIG_LAST_THREAD_SWITCH) { + continue; + } + if (prev_inst->uniform != -1 && n->uniform != -1) continue; + /* Don't merge in something that will lock the TLB. + * Hopwefully what we have in inst will release some + * other instructions, allowing us to delay the + * TLB-locking instruction until later. + */ + if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst)) + continue; + inst = qpu_merge_inst(prev_inst->inst->inst, inst); if (!inst) continue; @@ -647,6 +697,9 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, waddr_mul == QPU_W_UNIFORMS_ADDRESS) { scoreboard->last_uniforms_reset_tick = scoreboard->tick; } + + if (qpu_inst_is_tlb(inst)) + scoreboard->tlb_locked = true; } static void @@ -678,6 +731,26 @@ static uint32_t waddr_latency(uint32_t waddr, uint64_t after) /* Apply some huge latency between texture fetch requests and getting * their results back. + * + * FIXME: This is actually pretty bogus. If we do: + * + * mov tmu0_s, a + * <a bit of math> + * mov tmu0_s, b + * load_tmu0 + * <more math> + * load_tmu0 + * + * we count that as worse than + * + * mov tmu0_s, a + * mov tmu0_s, b + * <lots of math> + * load_tmu0 + * <more math> + * load_tmu0 + * + * because we associate the first load_tmu0 with the *second* tmu0_s. */ if (waddr == QPU_W_TMU0_S) { if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU0) @@ -768,6 +841,51 @@ mark_instruction_scheduled(struct list_head *schedule_list, } } +/** + * Emits a THRSW/LTHRSW signal in the stream, trying to move it up to pair + * with another instruction. + */ +static void +emit_thrsw(struct vc4_compile *c, + struct choose_scoreboard *scoreboard, + uint64_t inst) +{ + uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); + + /* There should be nothing in a thrsw inst being scheduled other than + * the signal bits. + */ + assert(QPU_GET_FIELD(inst, QPU_OP_ADD) == QPU_A_NOP); + assert(QPU_GET_FIELD(inst, QPU_OP_MUL) == QPU_M_NOP); + + /* Try to find an earlier scheduled instruction that we can merge the + * thrsw into. + */ + int thrsw_ip = c->qpu_inst_count; + for (int i = 1; i <= MIN2(c->qpu_inst_count, 3); i++) { + uint64_t prev_instr = c->qpu_insts[c->qpu_inst_count - i]; + uint32_t prev_sig = QPU_GET_FIELD(prev_instr, QPU_SIG); + + if (prev_sig == QPU_SIG_NONE) + thrsw_ip = c->qpu_inst_count - i; + } + + if (thrsw_ip != c->qpu_inst_count) { + /* Merge the thrsw into the existing instruction. 
*/ + c->qpu_insts[thrsw_ip] = + QPU_UPDATE_FIELD(c->qpu_insts[thrsw_ip], sig, QPU_SIG); + } else { + qpu_serialize_one_inst(c, inst); + update_scoreboard_for_chosen(scoreboard, inst); + } + + /* Fill the delay slots. */ + while (c->qpu_inst_count < thrsw_ip + 3) { + update_scoreboard_for_chosen(scoreboard, qpu_NOP()); + qpu_serialize_one_inst(c, qpu_NOP()); + } +} + static uint32_t schedule_instructions(struct vc4_compile *c, struct choose_scoreboard *scoreboard, @@ -860,10 +978,6 @@ schedule_instructions(struct vc4_compile *c, fprintf(stderr, "\n"); } - qpu_serialize_one_inst(c, inst); - - update_scoreboard_for_chosen(scoreboard, inst); - /* Now that we've scheduled a new instruction, some of its * children can be promoted to the list of instructions ready to * be scheduled. Update the children's unblocked time for this @@ -872,6 +986,14 @@ schedule_instructions(struct vc4_compile *c, mark_instruction_scheduled(schedule_list, time, chosen, false); mark_instruction_scheduled(schedule_list, time, merge, false); + if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_THREAD_SWITCH || + QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LAST_THREAD_SWITCH) { + emit_thrsw(c, scoreboard, inst); + } else { + qpu_serialize_one_inst(c, inst); + update_scoreboard_for_chosen(scoreboard, inst); + } + scoreboard->tick++; time++; diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_validate.c b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_validate.c index 02fadaf61..08dd6e5df 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_validate.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_qpu_validate.c @@ -58,6 +58,10 @@ _reads_reg(uint64_t inst, uint32_t r, bool ignore_a, bool ignore_b) if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH) return false; + /* Load immediates don't read any registers. */ + if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LOAD_IMM) + return false; + for (int i = 0; i < ARRAY_SIZE(src_regs); i++) { if (!ignore_a && src_regs[i].mux == QPU_MUX_A && @@ -109,6 +113,7 @@ void vc4_qpu_validate(uint64_t *insts, uint32_t num_inst) { bool scoreboard_locked = false; + bool threaded = false; /* We don't want to do validation in release builds, but we want to * keep compiling the validation code to make sure it doesn't get @@ -120,11 +125,17 @@ vc4_qpu_validate(uint64_t *insts, uint32_t num_inst) for (int i = 0; i < num_inst; i++) { uint64_t inst = insts[i]; + uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); - if (QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_PROG_END) { + if (sig != QPU_SIG_PROG_END) { if (qpu_inst_is_tlb(inst)) scoreboard_locked = true; + if (sig == QPU_SIG_THREAD_SWITCH || + sig == QPU_SIG_LAST_THREAD_SWITCH) { + threaded = true; + } + continue; } @@ -359,4 +370,98 @@ vc4_qpu_validate(uint64_t *insts, uint32_t num_inst) waddr_mul == QPU_W_UNIFORMS_ADDRESS) last_unif_pointer_update = i; } + + if (threaded) { + bool last_thrsw_found = false; + bool scoreboard_locked = false; + int tex_samples_outstanding = 0; + int last_tex_samples_outstanding = 0; + int thrsw_ip = -1; + + for (int i = 0; i < num_inst; i++) { + uint64_t inst = insts[i]; + uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); + + if (i == thrsw_ip) { + /* In order to get texture results back in the + * correct order, before a new thrsw we have + * to read all the texture results from before + * the previous thrsw. + * + * FIXME: Is collecting the remaining results + * during the delay slots OK, or should we do + * this at THRSW signal time? 
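
[Editor's note: a compressed standalone model of the validator loop that follows; the function names are invented.] The accounting behind this FIXME: texture requests issued since the last switch become "outstanding" at the switch point, each TMU load drains one, and a new switch is only legal once the previous batch is fully drained:

#include <stdbool.h>

static int pending;       /* TMU writes since the last thrsw */
static int outstanding;   /* results owed from before that thrsw */

static bool on_thrsw(void)
{
        if (outstanding != 0)
                return false;   /* earlier results still in the FIFO */
        outstanding = pending;
        pending = 0;
        return true;
}

static bool on_tmu_load(void)
{
        if (outstanding == 0)
                return false;   /* nothing queued to collect */
        outstanding--;
        return true;
}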
+ */ + if (last_tex_samples_outstanding != 0) { + fail_instr(inst, "THRSW with texture " + "results from the previous " + "THRSW still in the FIFO."); + } + + last_tex_samples_outstanding = + tex_samples_outstanding; + tex_samples_outstanding = 0; + } + + if (qpu_inst_is_tlb(inst)) + scoreboard_locked = true; + + switch (sig) { + case QPU_SIG_THREAD_SWITCH: + case QPU_SIG_LAST_THREAD_SWITCH: + /* No thread switching with the scoreboard + * locked. Doing so means we may deadlock + * when the other thread tries to lock + * scoreboard. + */ + if (scoreboard_locked) { + fail_instr(inst, "THRSW with the " + "scoreboard locked."); + } + + /* No thread switching after lthrsw, since + * lthrsw means that we get delayed until the + * other shader is ready for us to terminate. + */ + if (last_thrsw_found) { + fail_instr(inst, "THRSW after a " + "previous LTHRSW"); + } + + if (sig == QPU_SIG_LAST_THREAD_SWITCH) + last_thrsw_found = true; + + /* No THRSW while we already have a THRSW + * queued. + */ + if (i < thrsw_ip) { + fail_instr(inst, + "THRSW with a THRSW queued."); + } + + thrsw_ip = i + 3; + break; + + case QPU_SIG_LOAD_TMU0: + case QPU_SIG_LOAD_TMU1: + if (last_tex_samples_outstanding == 0) { + fail_instr(inst, "TMU load with nothing " + "in the results fifo from " + "the previous THRSW."); + } + + last_tex_samples_outstanding--; + break; + } + + uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD); + uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL); + if (waddr_add == QPU_W_TMU0_S || + waddr_add == QPU_W_TMU1_S || + waddr_mul == QPU_W_TMU0_S || + waddr_mul == QPU_W_TMU1_S) { + tex_samples_outstanding++; + } + } + } } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_register_allocate.c b/lib/mesa/src/gallium/drivers/vc4/vc4_register_allocate.c index ab343ee31..506fdb593 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_register_allocate.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_register_allocate.c @@ -115,37 +115,67 @@ vc4_alloc_reg_set(struct vc4_context *vc4) vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs), true); - vc4->reg_class_any = ra_alloc_reg_class(vc4->regs); - vc4->reg_class_a_or_b_or_acc = ra_alloc_reg_class(vc4->regs); - vc4->reg_class_r4_or_a = ra_alloc_reg_class(vc4->regs); - vc4->reg_class_a = ra_alloc_reg_class(vc4->regs); + /* The physical regfiles split us into two classes, with [0] being the + * whole space and [1] being the bottom half (for threaded fragment + * shaders). + */ + for (int i = 0; i < 2; i++) { + vc4->reg_class_any[i] = ra_alloc_reg_class(vc4->regs); + vc4->reg_class_a_or_b[i] = ra_alloc_reg_class(vc4->regs); + vc4->reg_class_a_or_b_or_acc[i] = ra_alloc_reg_class(vc4->regs); + vc4->reg_class_r4_or_a[i] = ra_alloc_reg_class(vc4->regs); + vc4->reg_class_a[i] = ra_alloc_reg_class(vc4->regs); + } vc4->reg_class_r0_r3 = ra_alloc_reg_class(vc4->regs); - for (uint32_t i = 0; i < ARRAY_SIZE(vc4_regs); i++) { - /* Reserve ra31/rb31 for spilling fixup_raddr_conflict() in + + /* r0-r3 */ + for (uint32_t i = ACC_INDEX; i < ACC_INDEX + 4; i++) { + ra_class_add_reg(vc4->regs, vc4->reg_class_r0_r3, i); + ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[0], i); + ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[1], i); + } + + /* R4 gets a special class because it can't be written as a general + * purpose register. (it's TMU_NOSWAP as a write address). 
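
[Editor's note: a predicate summarizing the loops that follow, not a replacement for them.] The class construction below reduces to one membership rule: index [0] classes take every A/B register, index [1] (threaded fragment shaders) only addresses below 16, and ra14/rb14 stay reserved in both for the raddr-conflict spill:

#include <stdbool.h>
#include <stdint.h>

static bool ab_reg_in_class(uint32_t addr, bool threaded)
{
        if (addr == 14)
                return false;   /* reserved: fixup_raddr_conflict() */
        return !threaded || addr < 16;
}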
+ */ + for (int i = 0; i < 2; i++) { + ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a[i], + ACC_INDEX + 4); + ra_class_add_reg(vc4->regs, vc4->reg_class_any[i], + ACC_INDEX + 4); + } + + /* A/B */ + for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i ++) { + /* Reserve ra14/rb14 for spilling fixup_raddr_conflict() in * vc4_qpu_emit.c */ - if (vc4_regs[i].addr == 31) + if (vc4_regs[i].addr == 14) continue; - /* R4 can't be written as a general purpose register. (it's - * TMU_NOSWAP as a write address). - */ - if (vc4_regs[i].mux == QPU_MUX_R4) { - ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i); - ra_class_add_reg(vc4->regs, vc4->reg_class_any, i); - continue; + ra_class_add_reg(vc4->regs, vc4->reg_class_any[0], i); + ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b[0], i); + ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[0], i); + + if (vc4_regs[i].addr < 16) { + ra_class_add_reg(vc4->regs, vc4->reg_class_any[1], i); + ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b[1], i); + ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[1], i); } - if (vc4_regs[i].mux <= QPU_MUX_R3) - ra_class_add_reg(vc4->regs, vc4->reg_class_r0_r3, i); - ra_class_add_reg(vc4->regs, vc4->reg_class_any, i); - ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc, i); - } + /* A only */ + if (((i - AB_INDEX) & 1) == 0) { + ra_class_add_reg(vc4->regs, vc4->reg_class_a[0], i); + ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a[0], i); - for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i += 2) { - ra_class_add_reg(vc4->regs, vc4->reg_class_a, i); - ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i); + if (vc4_regs[i].addr < 16) { + ra_class_add_reg(vc4->regs, + vc4->reg_class_a[1], i); + ra_class_add_reg(vc4->regs, + vc4->reg_class_r4_or_a[1], i); + } + } } ra_set_finalize(vc4->regs, NULL); @@ -166,7 +196,7 @@ node_to_temp_priority(const void *in_a, const void *in_b) } #define CLASS_BIT_A (1 << 0) -#define CLASS_BIT_B_OR_ACC (1 << 1) +#define CLASS_BIT_B (1 << 1) #define CLASS_BIT_R4 (1 << 2) #define CLASS_BIT_R0_R3 (1 << 4) @@ -212,7 +242,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) * incrementally remove bits that the temp definitely can't be in. */ memset(class_bits, - CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4, + CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R4 | CLASS_BIT_R0_R3, sizeof(class_bits)); int ip = 0; @@ -226,6 +256,14 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) if (c->temp_start[i] < ip && c->temp_end[i] > ip) class_bits[i] &= ~CLASS_BIT_R4; } + + /* If we're doing a conditional write of something + * writing R4 (math, tex results), then make sure that + * we store in a temp so that we actually + * conditionally move the result. + */ + if (inst->cond != QPU_COND_ALWAYS) + class_bits[inst->dst.index] &= ~CLASS_BIT_R4; } else { /* R4 can't be written as a general purpose * register. (it's TMU_NOSWAP as a write address). @@ -250,6 +288,17 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) class_bits[inst->src[0].index] &= CLASS_BIT_R0_R3; break; + case QOP_THRSW: + /* All accumulators are invalidated across a thread + * switch. + */ + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] < ip && c->temp_end[i] > ip) + class_bits[i] &= ~(CLASS_BIT_R0_R3 | + CLASS_BIT_R4); + } + break; + default: break; } @@ -265,7 +314,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) * can only be done from regfile A, while float unpacks can be * either A or R4. 
*/ - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { if (inst->src[i].file == QFILE_TEMP && inst->src[i].pack) { if (qir_is_float_input(inst)) { @@ -285,22 +334,40 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) int node = temp_to_node[i]; switch (class_bits[i]) { - case CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4: - ra_set_node_class(g, node, vc4->reg_class_any); + case CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R4 | CLASS_BIT_R0_R3: + ra_set_node_class(g, node, + vc4->reg_class_any[c->fs_threaded]); break; - case CLASS_BIT_A | CLASS_BIT_B_OR_ACC: - ra_set_node_class(g, node, vc4->reg_class_a_or_b_or_acc); + case CLASS_BIT_A | CLASS_BIT_B: + ra_set_node_class(g, node, + vc4->reg_class_a_or_b[c->fs_threaded]); + break; + case CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R0_R3: + ra_set_node_class(g, node, + vc4->reg_class_a_or_b_or_acc[c->fs_threaded]); break; case CLASS_BIT_A | CLASS_BIT_R4: - ra_set_node_class(g, node, vc4->reg_class_r4_or_a); + ra_set_node_class(g, node, + vc4->reg_class_r4_or_a[c->fs_threaded]); break; case CLASS_BIT_A: - ra_set_node_class(g, node, vc4->reg_class_a); + ra_set_node_class(g, node, + vc4->reg_class_a[c->fs_threaded]); break; case CLASS_BIT_R0_R3: ra_set_node_class(g, node, vc4->reg_class_r0_r3); break; + default: + /* DDX/DDY used across thread switched might get us + * here. + */ + if (c->fs_threaded) { + c->failed = true; + free(temp_registers); + return NULL; + } + fprintf(stderr, "temp %d: bad class bits: 0x%x\n", i, class_bits[i]); abort(); @@ -321,9 +388,13 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) bool ok = ra_allocate(g); if (!ok) { - fprintf(stderr, "Failed to register allocate:\n"); - qir_dump(c); + if (!c->fs_threaded) { + fprintf(stderr, "Failed to register allocate:\n"); + qir_dump(c); + } + c->failed = true; + free(temp_registers); return NULL; } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_reorder_uniforms.c b/lib/mesa/src/gallium/drivers/vc4/vc4_reorder_uniforms.c index 7d5076f42..37acefdc0 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_reorder_uniforms.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_reorder_uniforms.c @@ -46,7 +46,7 @@ qir_reorder_uniforms(struct vc4_compile *c) qir_for_each_inst_inorder(inst, c) { uint32_t new = ~0; - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + for (int i = 0; i < qir_get_nsrc(inst); i++) { if (inst->src[i].file != QFILE_UNIF) continue; diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_resource.c b/lib/mesa/src/gallium/drivers/vc4/vc4_resource.c index 704cd71ea..596f73dfb 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_resource.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_resource.c @@ -165,7 +165,8 @@ vc4_resource_transfer_map(struct pipe_context *pctx, prsc->width0 == box->width && prsc->height0 == box->height && prsc->depth0 == box->depth && - prsc->array_size == 1) { + prsc->array_size == 1 && + rsc->bo->private) { usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE; } @@ -283,6 +284,20 @@ vc4_resource_transfer_map(struct pipe_context *pctx, if (usage & PIPE_TRANSFER_MAP_DIRECTLY) return NULL; + if (format == PIPE_FORMAT_ETC1_RGB8) { + /* ETC1 is arranged as 64-bit blocks, where each block + * is 4x4 pixels. Texture tiling operates on the + * 64-bit block the way it would an uncompressed + * pixels. 
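
[Editor's note: a small helper capturing math the driver does inline.] Both ETC1 hunks — this one and vc4_setup_slices() just below — apply the same change of units: each 64-bit block covers 4x4 pixels, so all tiling math runs on a block grid whose "pixel" is 8 bytes:

#include <stdint.h>

/* ETC1: 4x4 pixels per 64-bit block, rounded up on each axis. */
static void
etc1_block_dims(uint32_t width, uint32_t height,
                uint32_t *bw, uint32_t *bh, uint32_t *cpp)
{
        *bw = (width + 3) >> 2;
        *bh = (height + 3) >> 2;
        *cpp = 8;
}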
+ */ + assert(!(ptrans->box.x & 3)); + assert(!(ptrans->box.y & 3)); + ptrans->box.x >>= 2; + ptrans->box.y >>= 2; + ptrans->box.width = (ptrans->box.width + 3) >> 2; + ptrans->box.height = (ptrans->box.height + 3) >> 2; + } + /* We need to align the box to utile boundaries, since that's * what load/store operates on. This may cause us to need to * read out the original contents in that border area. Right @@ -387,6 +402,11 @@ vc4_setup_slices(struct vc4_resource *rsc) struct pipe_resource *prsc = &rsc->base.b; uint32_t width = prsc->width0; uint32_t height = prsc->height0; + if (prsc->format == PIPE_FORMAT_ETC1_RGB8) { + width = (width + 3) >> 2; + height = (height + 3) >> 2; + } + uint32_t pot_width = util_next_power_of_two(width); uint32_t pot_height = util_next_power_of_two(height); uint32_t offset = 0; diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_screen.c b/lib/mesa/src/gallium/drivers/vc4/vc4_screen.c index 72fd09aee..27d23dc96 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_screen.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_screen.c @@ -123,9 +123,9 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TEXTURE_SHADOW_MAP: case PIPE_CAP_BLEND_EQUATION_SEPARATE: case PIPE_CAP_TWO_SIDED_STENCIL: - case PIPE_CAP_USER_INDEX_BUFFERS: case PIPE_CAP_TEXTURE_MULTISAMPLE: case PIPE_CAP_TEXTURE_SWIZZLE: + case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY: return 1; /* lying for GL 2.0 */ @@ -225,8 +225,8 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_STRING_MARKER: case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: case PIPE_CAP_QUERY_BUFFER_OBJECT: - case PIPE_CAP_QUERY_MEMORY_INFO: - case PIPE_CAP_PCI_GROUP: + case PIPE_CAP_QUERY_MEMORY_INFO: + case PIPE_CAP_PCI_GROUP: case PIPE_CAP_PCI_BUS: case PIPE_CAP_PCI_DEVICE: case PIPE_CAP_PCI_FUNCTION: @@ -239,11 +239,25 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_POLYGON_OFFSET_UNITS_UNSCALED: case PIPE_CAP_VIEWPORT_SUBPIXEL_BITS: case PIPE_CAP_TGSI_ARRAY_COMPONENTS: + case PIPE_CAP_TGSI_CAN_READ_OUTPUTS: + case PIPE_CAP_NATIVE_FENCE_FD: + case PIPE_CAP_TGSI_FS_FBFETCH: + case PIPE_CAP_TGSI_MUL_ZERO_WINS: + case PIPE_CAP_DOUBLES: + case PIPE_CAP_INT64: + case PIPE_CAP_INT64_DIVMOD: + case PIPE_CAP_TGSI_TEX_TXF_LZ: + case PIPE_CAP_TGSI_CLOCK: + case PIPE_CAP_POLYGON_MODE_FILL_RECTANGLE: + case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE: + case PIPE_CAP_TGSI_BALLOT: + case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT: return 0; /* Stream output. */ case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: + case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS: case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: return 0; @@ -336,8 +350,9 @@ vc4_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param) } static int -vc4_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, - enum pipe_shader_cap param) +vc4_screen_get_shader_param(struct pipe_screen *pscreen, + enum pipe_shader_type shader, + enum pipe_shader_cap param) { if (shader != PIPE_SHADER_VERTEX && shader != PIPE_SHADER_FRAGMENT) { @@ -356,10 +371,7 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, return vc4_screen(pscreen)->has_control_flow; case PIPE_SHADER_CAP_MAX_INPUTS: - if (shader == PIPE_SHADER_FRAGMENT) - return 8; - else - return 16; + return 8; case PIPE_SHADER_CAP_MAX_OUTPUTS: return shader == PIPE_SHADER_FRAGMENT ? 
1 : 8; case PIPE_SHADER_CAP_MAX_TEMPS: @@ -368,8 +380,6 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, return 16 * 1024 * sizeof(float); case PIPE_SHADER_CAP_MAX_CONST_BUFFERS: return 1; - case PIPE_SHADER_CAP_MAX_PREDS: - return 0; /* nothing uses this */ case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: return 0; case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: @@ -384,7 +394,6 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, return 0; case PIPE_SHADER_CAP_INTEGERS: return 1; - case PIPE_SHADER_CAP_DOUBLES: case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: @@ -401,6 +410,7 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, return 32; case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: + case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD: return 0; default: fprintf(stderr, "unknown shader param %d\n", param); @@ -416,6 +426,7 @@ vc4_screen_is_format_supported(struct pipe_screen *pscreen, unsigned sample_count, unsigned usage) { + struct vc4_screen *screen = vc4_screen(pscreen); unsigned retval = 0; if (sample_count > 1 && sample_count != VC4_MAX_SAMPLES) @@ -485,7 +496,8 @@ vc4_screen_is_format_supported(struct pipe_screen *pscreen, } if ((usage & PIPE_BIND_SAMPLER_VIEW) && - vc4_tex_format_supported(format)) { + vc4_tex_format_supported(format) && + (format != PIPE_FORMAT_ETC1_RGB8 || screen->has_etc1)) { retval |= PIPE_BIND_SAMPLER_VIEW; } @@ -526,16 +538,12 @@ static int handle_compare(void *key1, void *key2) } static bool -vc4_supports_branches(struct vc4_screen *screen) +vc4_has_feature(struct vc4_screen *screen, uint32_t feature) { -#if USE_VC4_SIMULATOR - return true; -#endif - struct drm_vc4_get_param p = { - .param = DRM_VC4_PARAM_SUPPORTS_BRANCHES, + .param = feature, }; - int ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_GET_PARAM, &p); + int ret = vc4_ioctl(screen->fd, DRM_IOCTL_VC4_GET_PARAM, &p); if (ret != 0) return false; @@ -546,11 +554,6 @@ vc4_supports_branches(struct vc4_screen *screen) static bool vc4_get_chip_info(struct vc4_screen *screen) { -#if USE_VC4_SIMULATOR - screen->v3d_ver = 21; - return true; -#endif - struct drm_vc4_get_param ident0 = { .param = DRM_VC4_PARAM_V3D_IDENT0, }; @@ -559,7 +562,7 @@ vc4_get_chip_info(struct vc4_screen *screen) }; int ret; - ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_GET_PARAM, &ident0); + ret = vc4_ioctl(screen->fd, DRM_IOCTL_VC4_GET_PARAM, &ident0); if (ret != 0) { if (errno == EINVAL) { /* Backwards compatibility with 2835 kernels which @@ -573,7 +576,7 @@ vc4_get_chip_info(struct vc4_screen *screen) return false; } } - ret = drmIoctl(screen->fd, DRM_IOCTL_VC4_GET_PARAM, &ident1); + ret = vc4_ioctl(screen->fd, DRM_IOCTL_VC4_GET_PARAM, &ident1); if (ret != 0) { fprintf(stderr, "Couldn't get V3D IDENT1: %s\n", strerror(errno)); @@ -612,11 +615,15 @@ vc4_screen_create(int fd) screen->fd = fd; list_inithead(&screen->bo_cache.time_list); - pipe_mutex_init(screen->bo_handles_mutex); + (void) mtx_init(&screen->bo_handles_mutex, mtx_plain); screen->bo_handles = util_hash_table_create(handle_hash, handle_compare); - if (vc4_supports_branches(screen)) - screen->has_control_flow = true; + screen->has_control_flow = + vc4_has_feature(screen, DRM_VC4_PARAM_SUPPORTS_BRANCHES); + screen->has_etc1 = + vc4_has_feature(screen, DRM_VC4_PARAM_SUPPORTS_ETC1); + screen->has_threaded_fs = + vc4_has_feature(screen, DRM_VC4_PARAM_SUPPORTS_THREADED_FS); if (!vc4_get_chip_info(screen)) 
goto fail;
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_screen.h b/lib/mesa/src/gallium/drivers/vc4/vc4_screen.h
index 16003cfcc..34d15381a 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_screen.h
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_screen.h
@@ -30,6 +30,10 @@
 #include "util/list.h"
 #include "util/slab.h"
 
+#ifndef DRM_VC4_PARAM_SUPPORTS_ETC1
+#define DRM_VC4_PARAM_SUPPORTS_ETC1 4
+#endif
+
 struct vc4_bo;
 
 #define VC4_DEBUG_CL 0x0001
@@ -47,6 +51,8 @@ struct vc4_bo;
 #define VC4_MAX_MIP_LEVELS 12
 #define VC4_MAX_TEXTURE_SAMPLERS 16
 
+struct vc4_simulator_file;
+
 struct vc4_screen {
         struct pipe_screen base;
         int fd;
@@ -55,9 +61,6 @@ struct vc4_screen {
 
         const char *name;
 
-        void *simulator_mem_base;
-        uint32_t simulator_mem_size;
-
         /** The last seqno we've completed a wait for.
          *
          * This lets us slightly optimize our waits by skipping wait syscalls
@@ -74,18 +77,22 @@ struct vc4_screen {
                 struct list_head *size_list;
                 uint32_t size_list_size;
 
-                pipe_mutex lock;
+                mtx_t lock;
                 uint32_t bo_size;
                 uint32_t bo_count;
         } bo_cache;
 
         struct util_hash_table *bo_handles;
-        pipe_mutex bo_handles_mutex;
+        mtx_t bo_handles_mutex;
         uint32_t bo_size;
         uint32_t bo_count;
 
         bool has_control_flow;
+        bool has_etc1;
+        bool has_threaded_fs;
+
+        struct vc4_simulator_file *sim_file;
 };
 
 static inline struct vc4_screen *
@@ -105,7 +112,8 @@ vc4_screen_bo_from_handle(struct pipe_screen *pscreen,
 
 const void *
 vc4_screen_get_compiler_options(struct pipe_screen *pscreen,
-                                enum pipe_shader_ir ir, unsigned shader);
+                                enum pipe_shader_ir ir,
+                                enum pipe_shader_type shader);
 
 extern uint32_t vc4_debug;
diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c b/lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c
index 0291a4e14..9565c49ef 100644
--- a/lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c
+++ b/lib/mesa/src/gallium/drivers/vc4/vc4_simulator.c
@@ -21,9 +21,37 @@
  * IN THE SOFTWARE.
  */
 
+/**
+ * @file vc4_simulator.c
+ *
+ * Implements VC4 simulation on top of a non-VC4 GEM fd.
+ *
+ * This file's goal is to emulate the VC4 ioctls' behavior in the kernel on
+ * top of the simpenrose software simulator.  Generally, VC4 driver BOs have a
+ * GEM-side copy of their contents and a simulator-side memory area that the
+ * GEM contents get copied into during simulation.  Once simulation is done,
+ * the simulator's data is copied back out to the GEM BOs, so that rendering
+ * appears on the screen as if actual hardware rendering had been done.
+ *
+ * One of the limitations of this code is that we shouldn't really need a
+ * GEM-side BO for non-window-system BOs.  However, we do need unique BO
+ * handles for each of our GEM bos so that this file can look up its state
+ * from the handle passed in at submit ioctl time (also, a couple of places
+ * outside of this file still call ioctls directly on the fd).
+ *
+ * Another limitation is that BO import doesn't work unless the underlying
+ * window system's BO size matches what VC4 is going to use, which of course
+ * doesn't work out in practice.  This means that for now, only DRI3 (VC4
+ * makes the winsys BOs) is supported, not DRI2 (window system makes the
+ * winsys BOs).
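+ *
+ * (With DRI3 the client allocates the window-system BOs, so VC4 controls
+ * their size; with DRI2 the X server allocates them, and the sizes generally
+ * won't match.)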
+ */ + #ifdef USE_VC4_SIMULATOR +#include <sys/mman.h> +#include "xf86drm.h" #include "util/u_memory.h" +#include "util/u_mm.h" #include "util/ralloc.h" #include "vc4_screen.h" @@ -32,53 +60,160 @@ #include "vc4_simulator_validate.h" #include "simpenrose/simpenrose.h" -static mtx_t exec_mutex = _MTX_INITIALIZER_NP; +/** Global (across GEM fds) state for the simulator */ +static struct vc4_simulator_state { + mtx_t mutex; + + void *mem; + ssize_t mem_size; + struct mem_block *heap; + struct mem_block *overflow; + + /** Mapping from GEM handle to struct vc4_simulator_bo * */ + struct hash_table *fd_map; + + int refcount; +} sim_state = { + .mutex = _MTX_INITIALIZER_NP, +}; + +/** Per-GEM-fd state for the simulator. */ +struct vc4_simulator_file { + int fd; + + /* This is weird -- we make a "vc4_device" per file, even though on + * the kernel side this is a global. We do this so that kernel code + * calling us for BO allocation can get to our screen. + */ + struct drm_device dev; + + /** Mapping from GEM handle to struct vc4_simulator_bo * */ + struct hash_table *bo_map; +}; + +/** Wrapper for drm_vc4_bo tracking the simulator-specific state. */ +struct vc4_simulator_bo { + struct drm_vc4_bo base; + struct vc4_simulator_file *file; + + /** Area for this BO within sim_state->mem */ + struct mem_block *block; + void *winsys_map; + uint32_t winsys_stride; + + int handle; +}; + +static void * +int_to_key(int key) +{ + return (void *)(uintptr_t)key; +} + +static struct vc4_simulator_file * +vc4_get_simulator_file_for_fd(int fd) +{ + struct hash_entry *entry = _mesa_hash_table_search(sim_state.fd_map, + int_to_key(fd + 1)); + return entry ? entry->data : NULL; +} /* A marker placed just after each BO, then checked after rendering to make * sure it's still there. */ #define BO_SENTINEL 0xfedcba98 -#define OVERFLOW_SIZE (32 * 1024 * 1024) +#define PAGE_ALIGN2 12 -static struct drm_gem_cma_object * -vc4_wrap_bo_with_cma(struct drm_device *dev, struct vc4_bo *bo) +/** + * Allocates space in simulator memory and returns a tracking struct for it + * that also contains the drm_gem_cma_object struct. 
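+ *
+ * The space comes from the global sim_state heap via u_mmAllocMem(), and a
+ * BO_SENTINEL word is written just past the end of the allocation so the
+ * post-rendering checks can catch out-of-bounds writes.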
+ */ +static struct vc4_simulator_bo * +vc4_create_simulator_bo(int fd, int handle, unsigned size) { - struct vc4_context *vc4 = dev->vc4; - struct vc4_screen *screen = vc4->screen; - struct drm_vc4_bo *drm_bo = CALLOC_STRUCT(drm_vc4_bo); - struct drm_gem_cma_object *obj = &drm_bo->base; - uint32_t size = align(bo->size, 4096); + struct vc4_simulator_file *file = vc4_get_simulator_file_for_fd(fd); + struct vc4_simulator_bo *sim_bo = rzalloc(file, + struct vc4_simulator_bo); + struct drm_vc4_bo *bo = &sim_bo->base; + struct drm_gem_cma_object *obj = &bo->base; + size = align(size, 4096); + + sim_bo->file = file; + sim_bo->handle = handle; + + mtx_lock(&sim_state.mutex); + sim_bo->block = u_mmAllocMem(sim_state.heap, size + 4, PAGE_ALIGN2, 0); + mtx_unlock(&sim_state.mutex); + assert(sim_bo->block); - drm_bo->bo = bo; obj->base.size = size; - obj->base.dev = dev; - obj->vaddr = screen->simulator_mem_base + dev->simulator_mem_next; + obj->base.dev = &file->dev; + obj->vaddr = sim_state.mem + sim_bo->block->ofs; obj->paddr = simpenrose_hw_addr(obj->vaddr); - dev->simulator_mem_next += size + sizeof(uint32_t); - dev->simulator_mem_next = align(dev->simulator_mem_next, 4096); - assert(dev->simulator_mem_next <= screen->simulator_mem_size); + *(uint32_t *)(obj->vaddr + size) = BO_SENTINEL; - *(uint32_t *)(obj->vaddr + bo->size) = BO_SENTINEL; + /* A handle of 0 is used for vc4_gem.c internal allocations that + * don't need to go in the lookup table. + */ + if (handle != 0) { + mtx_lock(&sim_state.mutex); + _mesa_hash_table_insert(file->bo_map, int_to_key(handle), bo); + mtx_unlock(&sim_state.mutex); + } + + return sim_bo; +} - return obj; +static void +vc4_free_simulator_bo(struct vc4_simulator_bo *sim_bo) +{ + struct vc4_simulator_file *sim_file = sim_bo->file; + struct drm_vc4_bo *bo = &sim_bo->base; + struct drm_gem_cma_object *obj = &bo->base; + + if (sim_bo->winsys_map) + munmap(sim_bo->winsys_map, obj->base.size); + + mtx_lock(&sim_state.mutex); + u_mmFreeMem(sim_bo->block); + if (sim_bo->handle) { + struct hash_entry *entry = + _mesa_hash_table_search(sim_file->bo_map, + int_to_key(sim_bo->handle)); + _mesa_hash_table_remove(sim_file->bo_map, entry); + } + mtx_unlock(&sim_state.mutex); + ralloc_free(sim_bo); +} + +static struct vc4_simulator_bo * +vc4_get_simulator_bo(struct vc4_simulator_file *file, int gem_handle) +{ + mtx_lock(&sim_state.mutex); + struct hash_entry *entry = + _mesa_hash_table_search(file->bo_map, int_to_key(gem_handle)); + mtx_unlock(&sim_state.mutex); + + return entry ? 
entry->data : NULL; } struct drm_gem_cma_object * drm_gem_cma_create(struct drm_device *dev, size_t size) { - struct vc4_context *vc4 = dev->vc4; - struct vc4_screen *screen = vc4->screen; - - struct vc4_bo *bo = vc4_bo_alloc(screen, size, "simulator validate"); - return vc4_wrap_bo_with_cma(dev, bo); + struct vc4_screen *screen = dev->screen; + struct vc4_simulator_bo *sim_bo = vc4_create_simulator_bo(screen->fd, + 0, size); + return &sim_bo->base.base; } static int vc4_simulator_pin_bos(struct drm_device *dev, struct vc4_job *job, struct vc4_exec_info *exec) { + int fd = dev->screen->fd; + struct vc4_simulator_file *file = vc4_get_simulator_file_for_fd(fd); struct drm_vc4_submit_cl *args = exec->args; struct vc4_bo **bos = job->bo_pointers.base; @@ -86,9 +221,12 @@ vc4_simulator_pin_bos(struct drm_device *dev, struct vc4_job *job, exec->bo = calloc(exec->bo_count, sizeof(void *)); for (int i = 0; i < exec->bo_count; i++) { struct vc4_bo *bo = bos[i]; - struct drm_gem_cma_object *obj = vc4_wrap_bo_with_cma(dev, bo); + struct vc4_simulator_bo *sim_bo = + vc4_get_simulator_bo(file, bo->handle); + struct drm_vc4_bo *drm_bo = &sim_bo->base; + struct drm_gem_cma_object *obj = &drm_bo->base; - struct drm_vc4_bo *drm_bo = to_vc4_bo(&obj->base); + drm_bo->bo = bo; #if 0 fprintf(stderr, "bo hindex %d: %s\n", i, bo->name); #endif @@ -118,14 +256,14 @@ vc4_simulator_unpin_bos(struct vc4_exec_info *exec) struct drm_vc4_bo *drm_bo = to_vc4_bo(&obj->base); struct vc4_bo *bo = drm_bo->bo; - assert(*(uint32_t *)(obj->vaddr + bo->size) == BO_SENTINEL); + assert(*(uint32_t *)(obj->vaddr + + obj->base.size) == BO_SENTINEL); memcpy(bo->map, obj->vaddr, bo->size); if (drm_bo->validated_shader) { free(drm_bo->validated_shader->texture_samples); free(drm_bo->validated_shader); } - free(obj); } free(exec->bo); @@ -194,8 +332,8 @@ vc4_dump_to_file(struct vc4_exec_info *exec) /* Add the static overflow memory area. */ bo_state[i].handle = exec->bo_count; - bo_state[i].paddr = 0; - bo_state[i].size = OVERFLOW_SIZE; + bo_state[i].paddr = sim_state.overflow->ofs; + bo_state[i].size = sim_state.overflow->size; i++; fwrite(bo_state, sizeof(*bo_state), state->bo_count, f); @@ -211,8 +349,8 @@ vc4_dump_to_file(struct vc4_exec_info *exec) fwrite(cma_bo->vaddr, cma_bo->base.size, 1, f); } - void *overflow = calloc(1, OVERFLOW_SIZE); - fwrite(overflow, 1, OVERFLOW_SIZE, f); + void *overflow = calloc(1, sim_state.overflow->size); + fwrite(overflow, 1, sim_state.overflow->size, f); free(overflow); free(state); @@ -225,23 +363,22 @@ vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args, struct vc4_job *job) { struct vc4_screen *screen = vc4->screen; + int fd = screen->fd; + struct vc4_simulator_file *file = vc4_get_simulator_file_for_fd(fd); struct vc4_surface *csurf = vc4_surface(vc4->framebuffer.cbufs[0]); struct vc4_resource *ctex = csurf ? vc4_resource(csurf->base.texture) : NULL; - uint32_t winsys_stride = ctex ? ctex->bo->simulator_winsys_stride : 0; + struct vc4_simulator_bo *csim_bo = ctex ? vc4_get_simulator_bo(file, ctex->bo->handle) : NULL; + uint32_t winsys_stride = ctex ? csim_bo->winsys_stride : 0; uint32_t sim_stride = ctex ? 
ctex->slices[0].stride : 0; uint32_t row_len = MIN2(sim_stride, winsys_stride); struct vc4_exec_info exec; - struct drm_device local_dev = { - .vc4 = vc4, - .simulator_mem_next = OVERFLOW_SIZE, - }; - struct drm_device *dev = &local_dev; + struct drm_device *dev = &file->dev; int ret; memset(&exec, 0, sizeof(exec)); list_inithead(&exec.unref_list); - if (ctex && ctex->bo->simulator_winsys_map) { + if (ctex && csim_bo->winsys_map) { #if 0 fprintf(stderr, "%dx%d %d %d %d\n", ctex->base.b.width0, ctex->base.b.height0, @@ -252,7 +389,7 @@ vc4_simulator_flush(struct vc4_context *vc4, for (int y = 0; y < ctex->base.b.height0; y++) { memcpy(ctex->bo->map + y * sim_stride, - ctex->bo->simulator_winsys_map + y * winsys_stride, + csim_bo->winsys_map + y * winsys_stride, row_len); } } @@ -269,7 +406,7 @@ vc4_simulator_flush(struct vc4_context *vc4, if (vc4_debug & VC4_DEBUG_CL) { fprintf(stderr, "RCL:\n"); - vc4_dump_cl(screen->simulator_mem_base + exec.ct1ca, + vc4_dump_cl(sim_state.mem + exec.ct1ca, exec.ct1ea - exec.ct1ca, true); } @@ -281,7 +418,7 @@ vc4_simulator_flush(struct vc4_context *vc4, fprintf(stderr, "Binning returned %d flushes, should be 1.\n", bfc); fprintf(stderr, "Relocated binning command list:\n"); - vc4_dump_cl(screen->simulator_mem_base + exec.ct0ca, + vc4_dump_cl(sim_state.mem + exec.ct0ca, exec.ct0ea - exec.ct0ca, false); abort(); } @@ -291,7 +428,7 @@ vc4_simulator_flush(struct vc4_context *vc4, fprintf(stderr, "Rendering returned %d frames, should be 1.\n", rfc); fprintf(stderr, "Relocated render command list:\n"); - vc4_dump_cl(screen->simulator_mem_base + exec.ct1ca, + vc4_dump_cl(sim_state.mem + exec.ct1ca, exec.ct1ea - exec.ct1ca, true); abort(); } @@ -302,16 +439,17 @@ vc4_simulator_flush(struct vc4_context *vc4, list_for_each_entry_safe(struct drm_vc4_bo, bo, &exec.unref_list, unref_head) { + struct vc4_simulator_bo *sim_bo = (struct vc4_simulator_bo *)bo; + struct drm_gem_cma_object *obj = &sim_bo->base.base; list_del(&bo->unref_head); - assert(*(uint32_t *)(bo->base.vaddr + bo->bo->size) == + assert(*(uint32_t *)(obj->vaddr + obj->base.size) == BO_SENTINEL); - vc4_bo_unreference(&bo->bo); - free(bo); + vc4_free_simulator_bo(sim_bo); } - if (ctex && ctex->bo->simulator_winsys_map) { + if (ctex && csim_bo->winsys_map) { for (int y = 0; y < ctex->base.b.height0; y++) { - memcpy(ctex->bo->simulator_winsys_map + y * winsys_stride, + memcpy(csim_bo->winsys_map + y * winsys_stride, ctex->bo->map + y * sim_stride, row_len); } @@ -320,33 +458,234 @@ vc4_simulator_flush(struct vc4_context *vc4, return 0; } -static void *sim_mem_base = NULL; -static int sim_mem_refcount = 0; -static ssize_t sim_mem_size = 256 * 1024 * 1024; +/** + * Map the underlying GEM object from the real hardware GEM handle. 
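+ *
+ * The fd is a plain GEM fd rather than a VC4 one, so the mapping goes
+ * through DRM_IOCTL_MODE_MAP_DUMB and an mmap() of the fake offset it
+ * returns.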
+ */ +static void * +vc4_simulator_map_winsys_bo(int fd, struct vc4_simulator_bo *sim_bo) +{ + struct drm_vc4_bo *bo = &sim_bo->base; + struct drm_gem_cma_object *obj = &bo->base; + int ret; + void *map; -void -vc4_simulator_init(struct vc4_screen *screen) + struct drm_mode_map_dumb map_dumb = { + .handle = sim_bo->handle, + }; + ret = drmIoctl(fd, DRM_IOCTL_MODE_MAP_DUMB, &map_dumb); + if (ret != 0) { + fprintf(stderr, "map ioctl failure\n"); + abort(); + } + + map = mmap(NULL, obj->base.size, PROT_READ | PROT_WRITE, MAP_SHARED, + fd, map_dumb.offset); + if (map == MAP_FAILED) { + fprintf(stderr, + "mmap of bo %d (offset 0x%016llx, size %d) failed\n", + sim_bo->handle, (long long)map_dumb.offset, + (int)obj->base.size); + abort(); + } + + return map; +} + +/** + * Do fixups after a BO has been opened from a handle. + * + * This could be done at DRM_IOCTL_GEM_OPEN/DRM_IOCTL_GEM_PRIME_FD_TO_HANDLE + * time, but we're still using drmPrimeFDToHandle() so we have this helper to + * be called afterward instead. + */ +void vc4_simulator_open_from_handle(int fd, uint32_t winsys_stride, + int handle, uint32_t size) { - mtx_lock(&exec_mutex); - if (sim_mem_refcount++) { - screen->simulator_mem_size = sim_mem_size; - screen->simulator_mem_base = sim_mem_base; - mtx_unlock(&exec_mutex); + struct vc4_simulator_bo *sim_bo = + vc4_create_simulator_bo(fd, handle, size); + + sim_bo->winsys_stride = winsys_stride; + sim_bo->winsys_map = vc4_simulator_map_winsys_bo(fd, sim_bo); +} + +/** + * Simulated ioctl(fd, DRM_VC4_CREATE_BO) implementation. + * + * Making a VC4 BO is just a matter of making a corresponding BO on the host. + */ +static int +vc4_simulator_create_bo_ioctl(int fd, struct drm_vc4_create_bo *args) +{ + int ret; + struct drm_mode_create_dumb create = { + .width = 128, + .bpp = 8, + .height = (args->size + 127) / 128, + }; + + ret = drmIoctl(fd, DRM_IOCTL_MODE_CREATE_DUMB, &create); + assert(create.size >= args->size); + + args->handle = create.handle; + + vc4_create_simulator_bo(fd, create.handle, args->size); + + return ret; +} + +/** + * Simulated ioctl(fd, DRM_VC4_CREATE_SHADER_BO) implementation. + * + * In simulation we defer shader validation until exec time. Just make a host + * BO and memcpy the contents in. + */ +static int +vc4_simulator_create_shader_bo_ioctl(int fd, + struct drm_vc4_create_shader_bo *args) +{ + int ret; + struct drm_mode_create_dumb create = { + .width = 128, + .bpp = 8, + .height = (args->size + 127) / 128, + }; + + ret = drmIoctl(fd, DRM_IOCTL_MODE_CREATE_DUMB, &create); + if (ret) + return ret; + assert(create.size >= args->size); + + args->handle = create.handle; + + vc4_create_simulator_bo(fd, create.handle, args->size); + + struct drm_mode_map_dumb map = { + .handle = create.handle + }; + ret = drmIoctl(fd, DRM_IOCTL_MODE_MAP_DUMB, &map); + if (ret) + return ret; + + void *shader = mmap(NULL, args->size, PROT_READ | PROT_WRITE, MAP_SHARED, + fd, map.offset); + memcpy(shader, (void *)(uintptr_t)args->data, args->size); + munmap(shader, args->size); + + return 0; +} + +/** + * Simulated ioctl(fd, DRM_VC4_MMAP_BO) implementation. + * + * We just pass this straight through to dumb mmap. 
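+ * The handle is passed to DRM_IOCTL_MODE_MAP_DUMB, and the fake offset that
+ * comes back is returned for the caller to mmap(), just as for a real VC4
+ * BO.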
+ */ +static int +vc4_simulator_mmap_bo_ioctl(int fd, struct drm_vc4_mmap_bo *args) +{ + int ret; + struct drm_mode_map_dumb map = { + .handle = args->handle, + }; + + ret = drmIoctl(fd, DRM_IOCTL_MODE_MAP_DUMB, &map); + args->offset = map.offset; + + return ret; +} + +static int +vc4_simulator_gem_close_ioctl(int fd, struct drm_gem_close *args) +{ + /* Free the simulator's internal tracking. */ + struct vc4_simulator_file *file = vc4_get_simulator_file_for_fd(fd); + struct vc4_simulator_bo *sim_bo = vc4_get_simulator_bo(file, + args->handle); + + vc4_free_simulator_bo(sim_bo); + + /* Pass the call on down. */ + return drmIoctl(fd, DRM_IOCTL_GEM_CLOSE, args); +} + +static int +vc4_simulator_get_param_ioctl(int fd, struct drm_vc4_get_param *args) +{ + switch (args->param) { + case DRM_VC4_PARAM_SUPPORTS_BRANCHES: + case DRM_VC4_PARAM_SUPPORTS_ETC1: + case DRM_VC4_PARAM_SUPPORTS_THREADED_FS: + args->value = true; + return 0; + + case DRM_VC4_PARAM_V3D_IDENT0: + args->value = 0x02000000; + return 0; + + case DRM_VC4_PARAM_V3D_IDENT1: + args->value = 0x00000001; + return 0; + + default: + fprintf(stderr, "Unknown DRM_IOCTL_VC4_GET_PARAM(%lld)\n", + (long long)args->param); + abort(); + }; +} + +int +vc4_simulator_ioctl(int fd, unsigned long request, void *args) +{ + switch (request) { + case DRM_IOCTL_VC4_CREATE_BO: + return vc4_simulator_create_bo_ioctl(fd, args); + case DRM_IOCTL_VC4_CREATE_SHADER_BO: + return vc4_simulator_create_shader_bo_ioctl(fd, args); + case DRM_IOCTL_VC4_MMAP_BO: + return vc4_simulator_mmap_bo_ioctl(fd, args); + + case DRM_IOCTL_VC4_WAIT_BO: + case DRM_IOCTL_VC4_WAIT_SEQNO: + /* We do all of the vc4 rendering synchronously, so we just + * return immediately on the wait ioctls. This ignores any + * native rendering to the host BO, so it does mean we race on + * front buffer rendering. + */ + return 0; + + case DRM_IOCTL_VC4_GET_PARAM: + return vc4_simulator_get_param_ioctl(fd, args); + + case DRM_IOCTL_GEM_CLOSE: + return vc4_simulator_gem_close_ioctl(fd, args); + + case DRM_IOCTL_GEM_OPEN: + case DRM_IOCTL_GEM_FLINK: + return drmIoctl(fd, request, args); + default: + fprintf(stderr, "Unknown ioctl 0x%08x\n", (int)request); + abort(); + } +} + +static void +vc4_simulator_init_global(void) +{ + mtx_lock(&sim_state.mutex); + if (sim_state.refcount++) { + mtx_unlock(&sim_state.mutex); return; } - sim_mem_base = calloc(sim_mem_size, 1); - if (!sim_mem_base) + sim_state.mem_size = 256 * 1024 * 1024; + sim_state.mem = calloc(sim_state.mem_size, 1); + if (!sim_state.mem) abort(); - - screen->simulator_mem_size = sim_mem_size; - screen->simulator_mem_base = sim_mem_base; + sim_state.heap = u_mmInit(0, sim_state.mem_size); /* We supply our own memory so that we can have more aperture * available (256MB instead of simpenrose's default 64MB). */ - simpenrose_init_hardware_supply_mem(screen->simulator_mem_base, - screen->simulator_mem_size); + simpenrose_init_hardware_supply_mem(sim_state.mem, sim_state.mem_size); /* Carve out low memory for tile allocation overflow. The kernel * should be automatically handling overflow memory setup on real @@ -355,20 +694,50 @@ vc4_simulator_init(struct vc4_screen *screen) * up over the whole lifetime of simpenrose (not reused on each * flush), so it had better be big. 
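          *
          * The 32MB requested below matches the static OVERFLOW_SIZE area
          * this file used before its allocations moved onto the u_mmAllocMem
          * heap.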
*/ - simpenrose_supply_overflow_mem(0, OVERFLOW_SIZE); + sim_state.overflow = u_mmAllocMem(sim_state.heap, 32 * 1024 * 1024, + PAGE_ALIGN2, 0); + simpenrose_supply_overflow_mem(sim_state.overflow->ofs, + sim_state.overflow->size); + + mtx_unlock(&sim_state.mutex); + + sim_state.fd_map = + _mesa_hash_table_create(NULL, + _mesa_hash_pointer, + _mesa_key_pointer_equal); +} + +void +vc4_simulator_init(struct vc4_screen *screen) +{ + vc4_simulator_init_global(); + + screen->sim_file = rzalloc(screen, struct vc4_simulator_file); + + screen->sim_file->bo_map = + _mesa_hash_table_create(screen->sim_file, + _mesa_hash_pointer, + _mesa_key_pointer_equal); + + mtx_lock(&sim_state.mutex); + _mesa_hash_table_insert(sim_state.fd_map, int_to_key(screen->fd + 1), + screen->sim_file); + mtx_unlock(&sim_state.mutex); - mtx_unlock(&exec_mutex); + screen->sim_file->dev.screen = screen; } void vc4_simulator_destroy(struct vc4_screen *screen) { - mtx_lock(&exec_mutex); - if (!--sim_mem_refcount) { - free(sim_mem_base); - sim_mem_base = NULL; + mtx_lock(&sim_state.mutex); + if (!--sim_state.refcount) { + _mesa_hash_table_destroy(sim_state.fd_map, NULL); + u_mmDestroy(sim_state.heap); + free(sim_state.mem); + /* No memsetting it, because it contains the mutex. */ } - mtx_unlock(&exec_mutex); + mtx_unlock(&sim_state.mutex); } #endif /* USE_VC4_SIMULATOR */ diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_simulator_validate.h b/lib/mesa/src/gallium/drivers/vc4/vc4_simulator_validate.h index 1352c9baf..d507b5fb6 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_simulator_validate.h +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_simulator_validate.h @@ -78,8 +78,7 @@ typedef uint16_t u16; typedef uint32_t u32; struct drm_device { - struct vc4_context *vc4; - uint32_t simulator_mem_next; + struct vc4_screen *screen; }; struct drm_gem_object { diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_state.c b/lib/mesa/src/gallium/drivers/vc4/vc4_state.c index 124715895..2e00104e4 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_state.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_state.c @@ -374,7 +374,8 @@ vc4_vertex_state_bind(struct pipe_context *pctx, void *hwcso) } static void -vc4_set_constant_buffer(struct pipe_context *pctx, uint shader, uint index, +vc4_set_constant_buffer(struct pipe_context *pctx, + enum pipe_shader_type shader, uint index, const struct pipe_constant_buffer *cb) { struct vc4_context *vc4 = vc4_context(pctx); @@ -615,6 +616,9 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc, VC4_SET_FIELD(prsc->height0 & 2047, VC4_TEX_P1_HEIGHT) | VC4_SET_FIELD(prsc->width0 & 2047, VC4_TEX_P1_WIDTH)); + if (prsc->format == PIPE_FORMAT_ETC1_RGB8) + so->texture_p1 |= VC4_TEX_P1_ETCFLIP_MASK; + return &so->base; } diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.c b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.c index 4bcb85b16..07e1c9c5f 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.c +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.c @@ -52,41 +52,6 @@ #include "vc4_context.h" #include "vc4_tiling.h" -/** Return the width in pixels of a 64-byte microtile. */ -uint32_t -vc4_utile_width(int cpp) -{ - switch (cpp) { - case 1: - case 2: - return 8; - case 4: - return 4; - case 8: - return 2; - default: - fprintf(stderr, "unknown cpp: %d\n", cpp); - abort(); - } -} - -/** Return the height in pixels of a 64-byte microtile. 
*/ -uint32_t -vc4_utile_height(int cpp) -{ - switch (cpp) { - case 1: - return 8; - case 2: - case 4: - case 8: - return 4; - default: - fprintf(stderr, "unknown cpp: %d\n", cpp); - abort(); - } -} - /** * The texture unit decides what tiling format a particular miplevel is using * this function, so we lay out our miptrees accordingly. @@ -98,32 +63,6 @@ vc4_size_is_lt(uint32_t width, uint32_t height, int cpp) height <= 4 * vc4_utile_height(cpp)); } -void -vc4_load_utile(void *dst, void *src, uint32_t dst_stride, uint32_t cpp) -{ - uint32_t utile_h = vc4_utile_height(cpp); - uint32_t row_size = 64 / utile_h; - - for (int y = 0; y < utile_h; y++) { - memcpy(dst, src, row_size); - dst += dst_stride; - src += row_size; - } -} - -void -vc4_store_utile(void *dst, void *src, uint32_t src_stride, uint32_t cpp) -{ - uint32_t utile_h = vc4_utile_height(cpp); - uint32_t row_size = 64 / utile_h; - - for (int y = 0; y < utile_h; y++) { - memcpy(dst, src, row_size); - dst += row_size; - src += src_stride; - } -} - static void check_box_utile_alignment(const struct pipe_box *box, int cpp) { @@ -133,48 +72,6 @@ check_box_utile_alignment(const struct pipe_box *box, int cpp) assert(!(box->height & (vc4_utile_height(cpp) - 1))); } -static void -vc4_load_lt_image(void *dst, uint32_t dst_stride, - void *src, uint32_t src_stride, - int cpp, const struct pipe_box *box) -{ - uint32_t utile_w = vc4_utile_width(cpp); - uint32_t utile_h = vc4_utile_height(cpp); - uint32_t xstart = box->x; - uint32_t ystart = box->y; - - for (uint32_t y = 0; y < box->height; y += utile_h) { - for (int x = 0; x < box->width; x += utile_w) { - vc4_load_utile(dst + (dst_stride * y + - x * cpp), - src + ((ystart + y) * src_stride + - (xstart + x) * 64 / utile_w), - dst_stride, cpp); - } - } -} - -static void -vc4_store_lt_image(void *dst, uint32_t dst_stride, - void *src, uint32_t src_stride, - int cpp, const struct pipe_box *box) -{ - uint32_t utile_w = vc4_utile_width(cpp); - uint32_t utile_h = vc4_utile_height(cpp); - uint32_t xstart = box->x; - uint32_t ystart = box->y; - - for (uint32_t y = 0; y < box->height; y += utile_h) { - for (int x = 0; x < box->width; x += utile_w) { - vc4_store_utile(dst + ((ystart + y) * dst_stride + - (xstart + x) * 64 / utile_w), - src + (src_stride * y + - x * cpp), - src_stride, cpp); - } - } -} - /** * Takes a utile x and y (and the number of utiles of width of the image) and * returns the offset to the utile within a VC4_TILING_FORMAT_TF image. @@ -209,7 +106,10 @@ t_utile_address(uint32_t utile_x, uint32_t utile_y, odd_stile_map[stile_index] : even_stile_map[stile_index]); - uint32_t utile_offset = 64 * ((utile_y & 3) * 4 + (utile_x & 3)); + /* This function no longer handles the utile offset within a subtile. + * Walking subtiles is the job of the LT image handler. + */ + assert(!(utile_x & 3) && !(utile_y & 3)); #if 0 fprintf(stderr, "utile %d,%d -> %d + %d + %d (stride %d,%d) = %d\n", @@ -219,29 +119,70 @@ t_utile_address(uint32_t utile_x, uint32_t utile_y, tile_offset + stile_offset + utile_offset); #endif - return tile_offset + stile_offset + utile_offset; + return tile_offset + stile_offset; } -static void -vc4_load_t_image(void *dst, uint32_t dst_stride, - void *src, uint32_t src_stride, - int cpp, const struct pipe_box *box) +/** + * Loads or stores a T texture image by breaking it down into subtiles + * (1024-byte, 4x4-utile) sub-images that we can use the LT tiling functions + * on. 
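+ *
+ * A subtile is 4x4 utiles in LT layout, so at cpp == 4, for example, it
+ * covers a 16x16-pixel, 1024-byte region: t_utile_address() locates the
+ * subtile and the LT helpers then walk the utiles inside it.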
+ */ +static inline void +vc4_t_image_helper(void *gpu, uint32_t gpu_stride, + void *cpu, uint32_t cpu_stride, + int cpp, const struct pipe_box *box, + bool to_cpu) { uint32_t utile_w = vc4_utile_width(cpp); uint32_t utile_h = vc4_utile_height(cpp); - uint32_t utile_stride = src_stride / cpp / utile_w; - uint32_t xstart = box->x / utile_w; - uint32_t ystart = box->y / utile_h; + uint32_t utile_w_shift = ffs(utile_w) - 1; + uint32_t utile_h_shift = ffs(utile_h) - 1; + uint32_t stile_w = 4 * utile_w; + uint32_t stile_h = 4 * utile_h; + assert(stile_w * stile_h * cpp == 1024); + uint32_t utile_stride = gpu_stride / cpp / utile_w; + uint32_t x1 = box->x; + uint32_t y1 = box->y; + uint32_t x2 = box->x + box->width; + uint32_t y2 = box->y + box->height; + struct pipe_box partial_box; + uint32_t gpu_lt_stride = stile_w * cpp; + + for (uint32_t y = y1; y < y2; y = align(y + 1, stile_h)) { + partial_box.y = y & (stile_h - 1); + partial_box.height = MIN2(y2 - y, stile_h - partial_box.y); + + uint32_t cpu_offset = 0; + for (uint32_t x = x1; x < x2; x = align(x + 1, stile_w)) { + partial_box.x = x & (stile_w - 1); + partial_box.width = MIN2(x2 - x, + stile_w - partial_box.x); + + /* The dst offset we want is the start of this + * subtile + */ + uint32_t gpu_offset = + t_utile_address((x >> utile_w_shift) & ~0x3, + (y >> utile_h_shift) & ~0x3, + utile_stride); - for (uint32_t y = 0; y < box->height / utile_h; y++) { - for (int x = 0; x < box->width / utile_w; x++) { - vc4_load_utile(dst + (y * utile_h * dst_stride + - x * utile_w * cpp), - src + t_utile_address(xstart + x, - ystart + y, - utile_stride), - dst_stride, cpp); + if (to_cpu) { + vc4_load_lt_image(cpu + cpu_offset, + cpu_stride, + gpu + gpu_offset, + gpu_lt_stride, + cpp, &partial_box); + } else { + vc4_store_lt_image(gpu + gpu_offset, + gpu_lt_stride, + cpu + cpu_offset, + cpu_stride, + cpp, &partial_box); + } + + cpu_offset += partial_box.width * cpp; } + cpu += cpu_stride * partial_box.height; } } @@ -250,22 +191,19 @@ vc4_store_t_image(void *dst, uint32_t dst_stride, void *src, uint32_t src_stride, int cpp, const struct pipe_box *box) { - uint32_t utile_w = vc4_utile_width(cpp); - uint32_t utile_h = vc4_utile_height(cpp); - uint32_t utile_stride = dst_stride / cpp / utile_w; - uint32_t xstart = box->x / utile_w; - uint32_t ystart = box->y / utile_h; + vc4_t_image_helper(dst, dst_stride, + src, src_stride, + cpp, box, false); +} - for (uint32_t y = 0; y < box->height / utile_h; y++) { - for (int x = 0; x < box->width / utile_w; x++) { - vc4_store_utile(dst + t_utile_address(xstart + x, - ystart + y, - utile_stride), - src + (y * utile_h * src_stride + - x * utile_w * cpp), - src_stride, cpp); - } - } +static void +vc4_load_t_image(void *dst, uint32_t dst_stride, + void *src, uint32_t src_stride, + int cpp, const struct pipe_box *box) +{ + vc4_t_image_helper(src, src_stride, + dst, dst_stride, + cpp, box, true); } /** diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.h b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.h index b90bba702..ba1ad6fb3 100644 --- a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.h +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling.h @@ -24,11 +24,56 @@ #ifndef VC4_TILING_H #define VC4_TILING_H -uint32_t vc4_utile_width(int cpp) ATTRIBUTE_CONST; -uint32_t vc4_utile_height(int cpp) ATTRIBUTE_CONST; +#include <stdbool.h> +#include <stdint.h> +#include "util/macros.h" + +/** Return the width in pixels of a 64-byte microtile. 
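+ *
+ * A utile always holds 64 bytes, so the width halves as cpp doubles:
+ * 8 pixels at cpp 1 or 2, 4 pixels at cpp 4, and 2 pixels at cpp 8.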
*/ +static inline uint32_t +vc4_utile_width(int cpp) +{ + switch (cpp) { + case 1: + case 2: + return 8; + case 4: + return 4; + case 8: + return 2; + default: + unreachable("unknown cpp"); + } +} + +/** Return the height in pixels of a 64-byte microtile. */ +static inline uint32_t +vc4_utile_height(int cpp) +{ + switch (cpp) { + case 1: + return 8; + case 2: + case 4: + case 8: + return 4; + default: + unreachable("unknown cpp"); + } +} + bool vc4_size_is_lt(uint32_t width, uint32_t height, int cpp) ATTRIBUTE_CONST; -void vc4_load_utile(void *dst, void *src, uint32_t dst_stride, uint32_t cpp); -void vc4_store_utile(void *dst, void *src, uint32_t src_stride, uint32_t cpp); +void vc4_load_lt_image_base(void *dst, uint32_t dst_stride, + void *src, uint32_t src_stride, + int cpp, const struct pipe_box *box); +void vc4_store_lt_image_base(void *dst, uint32_t dst_stride, + void *src, uint32_t src_stride, + int cpp, const struct pipe_box *box); +void vc4_load_lt_image_neon(void *dst, uint32_t dst_stride, + void *src, uint32_t src_stride, + int cpp, const struct pipe_box *box); +void vc4_store_lt_image_neon(void *dst, uint32_t dst_stride, + void *src, uint32_t src_stride, + int cpp, const struct pipe_box *box); void vc4_load_tiled_image(void *dst, uint32_t dst_stride, void *src, uint32_t src_stride, uint8_t tiling_format, int cpp, @@ -38,4 +83,34 @@ void vc4_store_tiled_image(void *dst, uint32_t dst_stride, uint8_t tiling_format, int cpp, const struct pipe_box *box); +/* If we're building for ARMv7 (Pi 2+), assume it has NEON. For Raspbian we + * should extend this to have some runtime detection of being built for ARMv6 + * on a Pi 2+. + */ +#if defined(__ARM_ARCH) && __ARM_ARCH == 7 +#define NEON_SUFFIX(x) x ## _neon +#else +#define NEON_SUFFIX(x) x ## _base +#endif + +static inline void +vc4_load_lt_image(void *dst, uint32_t dst_stride, + void *src, uint32_t src_stride, + int cpp, const struct pipe_box *box) +{ + NEON_SUFFIX(vc4_load_lt_image)(dst, dst_stride, src, src_stride, + cpp, box); +} + +static inline void +vc4_store_lt_image(void *dst, uint32_t dst_stride, + void *src, uint32_t src_stride, + int cpp, const struct pipe_box *box) +{ + NEON_SUFFIX(vc4_store_lt_image)(dst, dst_stride, src, src_stride, + cpp, box); +} + +#undef NEON_SUFFIX + #endif /* VC4_TILING_H */ diff --git a/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c new file mode 100644 index 000000000..f37a92e93 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/vc4/vc4_tiling_lt.c @@ -0,0 +1,212 @@ +/* + * Copyright © 2017 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file vc4_tiling_lt.c
+ *
+ * Helper functions from vc4_tiling.c that are compiled either with or
+ * without NEON assembly.
+ *
+ * If VC4_BUILD_NEON is set, then the functions will be suffixed with _neon.
+ * They will only use NEON assembly if __ARM_ARCH is also set, to keep the x86
+ * sim build working.
+ */
+
+#include <string.h>
+#include "pipe/p_state.h"
+#include "vc4_tiling.h"
+
+#ifdef VC4_BUILD_NEON
+#define NEON_TAG(x) x ## _neon
+#else
+#define NEON_TAG(x) x ## _base
+#endif
+
+/** Returns the stride in bytes of a 64-byte microtile. */
+static uint32_t
+vc4_utile_stride(int cpp)
+{
+        switch (cpp) {
+        case 1:
+                return 8;
+        case 2:
+        case 4:
+        case 8:
+                return 16;
+        default:
+                unreachable("bad cpp");
+        }
+}
+
+static void
+vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
+{
+        uint32_t gpu_stride = vc4_utile_stride(cpp);
+#if defined(VC4_BUILD_NEON) && defined(PIPE_ARCH_ARM)
+        if (gpu_stride == 8) {
+                __asm__ volatile (
+                        /* Load from the GPU in one shot, no interleave, to
+                         * d0-d7.
+                         */
+                        "vldm %0, {q0, q1, q2, q3}\n"
+                        /* Store each 8-byte line to cpu-side destination,
+                         * incrementing it by the stride each time.
+                         */
+                        "vst1.8 d0, [%1], %2\n"
+                        "vst1.8 d1, [%1], %2\n"
+                        "vst1.8 d2, [%1], %2\n"
+                        "vst1.8 d3, [%1], %2\n"
+                        "vst1.8 d4, [%1], %2\n"
+                        "vst1.8 d5, [%1], %2\n"
+                        "vst1.8 d6, [%1], %2\n"
+                        "vst1.8 d7, [%1]\n"
+                        :
+                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
+                        : "q0", "q1", "q2", "q3");
+        } else {
+                assert(gpu_stride == 16);
+                __asm__ volatile (
+                        /* Load from the GPU in one shot, no interleave, to
+                         * d0-d7.
+                         */
+                        "vldm %0, {q0, q1, q2, q3}\n"
+                        /* Store each 16-byte line in 2 parts to the cpu-side
+                         * destination.  (vst1 can only store one d-register
+                         * at a time).
+                         */
+                        "vst1.8 d0, [%1], %3\n"
+                        "vst1.8 d1, [%2], %3\n"
+                        "vst1.8 d2, [%1], %3\n"
+                        "vst1.8 d3, [%2], %3\n"
+                        "vst1.8 d4, [%1], %3\n"
+                        "vst1.8 d5, [%2], %3\n"
+                        "vst1.8 d6, [%1]\n"
+                        "vst1.8 d7, [%2]\n"
+                        :
+                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
+                        : "q0", "q1", "q2", "q3");
+        }
+#else
+        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
+                memcpy(cpu, gpu + gpu_offset, gpu_stride);
+                cpu += cpu_stride;
+        }
+#endif
+}
+
+static void
+vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
+{
+        uint32_t gpu_stride = vc4_utile_stride(cpp);
+
+#if defined(VC4_BUILD_NEON) && defined(PIPE_ARCH_ARM)
+        if (gpu_stride == 8) {
+                __asm__ volatile (
+                        /* Load each 8-byte line from cpu-side source,
+                         * incrementing it by the stride each time.
+                         */
+                        "vld1.8 d0, [%1], %2\n"
+                        "vld1.8 d1, [%1], %2\n"
+                        "vld1.8 d2, [%1], %2\n"
+                        "vld1.8 d3, [%1], %2\n"
+                        "vld1.8 d4, [%1], %2\n"
+                        "vld1.8 d5, [%1], %2\n"
+                        "vld1.8 d6, [%1], %2\n"
+                        "vld1.8 d7, [%1]\n"
+                        /* Store to the GPU in one shot, no interleave. */
+                        "vstm %0, {q0, q1, q2, q3}\n"
+                        :
+                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
+                        : "q0", "q1", "q2", "q3");
+        } else {
+                assert(gpu_stride == 16);
+                __asm__ volatile (
+                        /* Load each 16-byte line in 2 parts from the
+                         * cpu-side source.  (vld1 can only load one
+                         * d-register at a time).
+ */ + "vld1.8 d0, [%1], %3\n" + "vld1.8 d1, [%2], %3\n" + "vld1.8 d2, [%1], %3\n" + "vld1.8 d3, [%2], %3\n" + "vld1.8 d4, [%1], %3\n" + "vld1.8 d5, [%2], %3\n" + "vld1.8 d6, [%1]\n" + "vld1.8 d7, [%2]\n" + /* Store to the GPU in one shot, no interleave. */ + "vstm %0, {q0, q1, q2, q3}\n" + : + : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride) + : "q0", "q1", "q2", "q3"); + } +#else + for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) { + memcpy(gpu + gpu_offset, cpu, gpu_stride); + cpu += cpu_stride; + } +#endif + +} + +void +NEON_TAG(vc4_load_lt_image)(void *dst, uint32_t dst_stride, + void *src, uint32_t src_stride, + int cpp, const struct pipe_box *box) +{ + uint32_t utile_w = vc4_utile_width(cpp); + uint32_t utile_h = vc4_utile_height(cpp); + uint32_t xstart = box->x; + uint32_t ystart = box->y; + + for (uint32_t y = 0; y < box->height; y += utile_h) { + for (int x = 0; x < box->width; x += utile_w) { + vc4_load_utile(dst + (dst_stride * y + + x * cpp), + src + ((ystart + y) * src_stride + + (xstart + x) * 64 / utile_w), + dst_stride, cpp); + } + } +} + +void +NEON_TAG(vc4_store_lt_image)(void *dst, uint32_t dst_stride, + void *src, uint32_t src_stride, + int cpp, const struct pipe_box *box) +{ + uint32_t utile_w = vc4_utile_width(cpp); + uint32_t utile_h = vc4_utile_height(cpp); + uint32_t xstart = box->x; + uint32_t ystart = box->y; + + for (uint32_t y = 0; y < box->height; y += utile_h) { + for (int x = 0; x < box->width; x += utile_w) { + vc4_store_utile(dst + ((ystart + y) * dst_stride + + (xstart + x) * 64 / utile_w), + src + (src_stride * y + + x * cpp), + src_stride, cpp); + } + } +} |