author     Jonathan Gray <jsg@cvs.openbsd.org>    2021-07-22 10:17:30 +0000
committer  Jonathan Gray <jsg@cvs.openbsd.org>    2021-07-22 10:17:30 +0000
commit     ca11beabae33eb59fb981b8adf50b1d47a2a98f0 (patch)
tree       3e4691a396e6e54cd54224a190663d5cf976625b /lib/mesa/src/gallium/drivers/r600
parent     27c8a50e8bbde7d28b1fc46d715a4c469e24f2c4 (diff)
Import Mesa 21.1.5
Diffstat (limited to 'lib/mesa/src/gallium/drivers/r600')
30 files changed, 1673 insertions, 497 deletions
diff --git a/lib/mesa/src/gallium/drivers/r600/Android.mk b/lib/mesa/src/gallium/drivers/r600/Android.mk
index 19a3ba820..b87fc91e6 100644
--- a/lib/mesa/src/gallium/drivers/r600/Android.mk
+++ b/lib/mesa/src/gallium/drivers/r600/Android.mk
@@ -32,8 +32,10 @@ LOCAL_SRC_FILES := $(C_SOURCES) $(CXX_SOURCES)
 LOCAL_C_INCLUDES += \
 	$(MESA_TOP)/src/amd/common \
-	$(MESA_TOP)/src/amd/llvm
+	$(MESA_TOP)/src/amd/llvm \
+	$(MESA_TOP)/src/mesa

+LOCAL_STATIC_LIBRARIES := libmesa_nir
 LOCAL_SHARED_LIBRARIES := libdrm_radeon
 LOCAL_MODULE := libmesa_pipe_r600

@@ -47,6 +49,15 @@ $(intermediates)/egd_tables.h: $(MESA_TOP)/src/gallium/drivers/r600/egd_tables.p
 	@echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))"
 	$(hide) $(MESA_PYTHON2) $(MESA_TOP)/src/gallium/drivers/r600/egd_tables.py $(MESA_TOP)/src/gallium/drivers/r600/evergreend.h > $@

+sfn_nir_algebraic_gen := $(LOCAL_PATH)/sfn/sfn_nir_algebraic.py
+sfn_nir_algebraic_deps := \
+	$(LOCAL_PATH)/sfn/sfn_nir_algebraic.py \
+	$(MESA_TOP)/src/compiler/nir/nir_algebraic.py
+
+$(intermediates)/sfn_nir_algebraic.c: $(sfn_nir_algebraic_deps)
+	@mkdir -p $(dir $@)
+	$(hide) $(MESA_PYTHON2) $(sfn_nir_algebraic_gen) -p $(MESA_TOP)/src/compiler/nir/ > $@
+
 ifeq ($(MESA_ENABLE_LLVM),true)
 $(call mesa-build-with-llvm)
 endif
diff --git a/lib/mesa/src/gallium/drivers/r600/compute_memory_pool.c b/lib/mesa/src/gallium/drivers/r600/compute_memory_pool.c
index 685c2b6d2..58a5dffdf 100644
--- a/lib/mesa/src/gallium/drivers/r600/compute_memory_pool.c
+++ b/lib/mesa/src/gallium/drivers/r600/compute_memory_pool.c
@@ -436,7 +436,7 @@ static void compute_memory_move_item(struct compute_memory_pool *pool,
 	if (pool->item_list != item->link.prev) {
 		ASSERTED struct compute_memory_item *prev;
-		prev = container_of(item->link.prev, item, link);
+		prev = container_of(item->link.prev, struct compute_memory_item, link);
 		assert(prev->start_in_dw + prev->size_in_dw <= new_start_in_dw);
 	}

@@ -479,7 +479,7 @@ static void compute_memory_move_item(struct compute_memory_pool *pool,
 		u_box_1d(new_start_in_dw * 4, (offset + item->size_in_dw) * 4, &box);

-		map = pipe->transfer_map(pipe, src, 0, PIPE_TRANSFER_READ_WRITE,
+		map = pipe->transfer_map(pipe, src, 0, PIPE_MAP_READ_WRITE,
 			&box, &trans);
 		assert(map);
@@ -495,7 +495,7 @@ static void compute_memory_move_item(struct compute_memory_pool *pool,
 }

 /**
- * Frees the memory asociated to the item with id \a id from the pool.
+ * Frees the memory associated to the item with id \a id from the pool.
  * \param id The id of the item to be freed.
  */
 void compute_memory_free(struct compute_memory_pool* pool, int64_t id)
@@ -614,7 +614,7 @@ static void compute_memory_transfer(
 		offset_in_chunk, size);

 	if (device_to_host) {
-		map = pipe->transfer_map(pipe, gart, 0, PIPE_TRANSFER_READ,
+		map = pipe->transfer_map(pipe, gart, 0, PIPE_MAP_READ,
 			&(struct pipe_box) { .width = aligned_size * 4,
 			.height = 1, .depth = 1 }, &xfer);
 		assert(xfer);
@@ -622,7 +622,7 @@
 		memcpy(data, map + internal_offset, size);
 		pipe->transfer_unmap(pipe, xfer);
 	} else {
-		map = pipe->transfer_map(pipe, gart, 0, PIPE_TRANSFER_WRITE,
+		map = pipe->transfer_map(pipe, gart, 0, PIPE_MAP_WRITE,
 			&(struct pipe_box) { .width = aligned_size * 4,
 			.height = 1, .depth = 1 }, &xfer);
 		assert(xfer);
diff --git a/lib/mesa/src/gallium/drivers/r600/compute_memory_pool.h b/lib/mesa/src/gallium/drivers/r600/compute_memory_pool.h
index 2064e5635..3b9097627 100644
--- a/lib/mesa/src/gallium/drivers/r600/compute_memory_pool.h
+++ b/lib/mesa/src/gallium/drivers/r600/compute_memory_pool.h
@@ -47,7 +47,7 @@ struct compute_memory_item
 	int64_t start_in_dw;
 	int64_t size_in_dw; /**< Size of the chunk in dwords */

-	/** Intermediate buffer asociated with an item. It is used mainly for mapping
+	/** Intermediate buffer associated with an item. It is used mainly for mapping
 	 * items against it. They are listed in the pool's unallocated list */
 	struct r600_resource *real_buffer;
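Note: the container_of() change above is more than a cleanup — the old call derived the containing type from a variable that was only being declared on the previous line. A minimal, self-contained sketch of the pointer arithmetic behind the three-argument form (type names mirror the driver; the macro body is the usual offsetof idiom, not copied from Mesa):

```c
#include <assert.h>
#include <stddef.h>

/* Minimal stand-ins for the driver's types. */
struct list_head { struct list_head *prev, *next; };

struct compute_memory_item {
	long id;
	struct list_head link;
};

/* Three-argument form adopted by the patch: the second argument is the
 * containing TYPE, so the macro no longer depends on an instance. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
	struct compute_memory_item item = { .id = 42 };
	struct list_head *node = &item.link;

	/* Recover the containing item from its embedded list node. */
	struct compute_memory_item *back =
		container_of(node, struct compute_memory_item, link);
	assert(back == &item && back->id == 42);
	return 0;
}
```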
diff --git a/lib/mesa/src/gallium/drivers/r600/meson.build b/lib/mesa/src/gallium/drivers/r600/meson.build
index 91f62a8a7..424ac3ca0 100644
--- a/lib/mesa/src/gallium/drivers/r600/meson.build
+++ b/lib/mesa/src/gallium/drivers/r600/meson.build
@@ -105,7 +105,80 @@ files_r600 = files(
   'sb/sb_shader.h',
   'sb/sb_ssa_builder.cpp',
   'sb/sb_valtable.cpp',
-)
+  'sfn/sfn_alu_defines.cpp',
+  'sfn/sfn_alu_defines.h',
+  'sfn/sfn_callstack.cpp',
+  'sfn/sfn_callstack.h',
+  'sfn/sfn_conditionaljumptracker.cpp',
+  'sfn/sfn_conditionaljumptracker.h',
+  'sfn/sfn_defines.h',
+  'sfn/sfn_debug.cpp',
+  'sfn/sfn_debug.h',
+  'sfn/sfn_emitaluinstruction.cpp',
+  'sfn/sfn_emitaluinstruction.h',
+  'sfn/sfn_emitinstruction.cpp',
+  'sfn/sfn_emitinstruction.h',
+  'sfn/sfn_emitssboinstruction.cpp',
+  'sfn/sfn_emitssboinstruction.h',
+  'sfn/sfn_emittexinstruction.cpp',
+  'sfn/sfn_emittexinstruction.h',
+  'sfn/sfn_emitinstruction.h',
+  'sfn/sfn_instruction_alu.cpp',
+  'sfn/sfn_instruction_alu.h',
+  'sfn/sfn_instruction_base.cpp',
+  'sfn/sfn_instruction_base.h',
+  'sfn/sfn_instruction_block.cpp',
+  'sfn/sfn_instruction_block.h',
+  'sfn/sfn_instruction_cf.cpp',
+  'sfn/sfn_instruction_cf.h',
+  'sfn/sfn_instruction_export.cpp',
+  'sfn/sfn_instruction_export.h',
+  'sfn/sfn_instruction_fetch.cpp',
+  'sfn/sfn_instruction_fetch.h',
+  'sfn/sfn_instruction_gds.cpp',
+  'sfn/sfn_instruction_gds.h',
+  'sfn/sfn_instruction_lds.cpp',
+  'sfn/sfn_instruction_lds.h',
+  'sfn/sfn_instruction_misc.cpp',
+  'sfn/sfn_instruction_misc.h',
+  'sfn/sfn_instruction_tex.cpp',
+  'sfn/sfn_instruction_tex.h',
+  'sfn/sfn_ir_to_assembly.cpp',
+  'sfn/sfn_ir_to_assembly.h',
+  'sfn/sfn_liverange.cpp',
+  'sfn/sfn_liverange.h',
+  'sfn/sfn_nir.cpp',
+  'sfn/sfn_nir.h',
+  'sfn/sfn_nir_lower_64bit.cpp',
+  'sfn/sfn_nir_lower_fs_out_to_vector.cpp',
+  'sfn/sfn_nir_lower_fs_out_to_vector.h',
+  'sfn/sfn_nir_lower_tess_io.cpp',
+  'sfn/sfn_nir_vectorize_vs_inputs.c',
+  'sfn/sfn_shader_base.cpp',
+  'sfn/sfn_shader_base.h',
+  'sfn/sfn_shader_compute.cpp',
+  'sfn/sfn_shader_compute.h',
+  'sfn/sfn_shader_fragment.cpp',
+  'sfn/sfn_shader_fragment.h',
+  'sfn/sfn_shader_geometry.cpp',
+  'sfn/sfn_shader_geometry.h',
+  'sfn/sfn_shader_tcs.cpp',
+  'sfn/sfn_shader_tcs.h',
+  'sfn/sfn_shader_tess_eval.cpp',
+  'sfn/sfn_shader_tess_eval.h',
+  'sfn/sfn_shader_vertex.cpp',
+  'sfn/sfn_shader_vertex.h',
+  'sfn/sfn_shaderio.cpp',
+  'sfn/sfn_shaderio.h',
+  'sfn/sfn_value.cpp',
+  'sfn/sfn_value.h',
+  'sfn/sfn_value_gpr.cpp',
+  'sfn/sfn_value_gpr.h',
+  'sfn/sfn_valuepool.cpp',
+  'sfn/sfn_valuepool.h',
+  'sfn/sfn_vertexstageexport.cpp',
+  'sfn/sfn_vertexstageexport.h',
+)

 egd_tables_h = custom_target(
   'egd_tables.h',
@@ -115,6 +188,19 @@ egd_tables_h = custom_target(
   capture : true,
 )

+sfn_nir_algebraic_c = custom_target(
+  'sfn_nir_algebraic.c',
+  input : 'sfn/sfn_nir_algebraic.py',
+  output : 'sfn_nir_algebraic.c',
+  command : [
+    prog_python, '@INPUT@',
+    '-p', join_paths(meson.source_root(), 'src/compiler/nir/'),
+  ],
+  capture : true,
+  depend_files : nir_algebraic_py,
+)
+
+
 r600_c_args = []
 if with_gallium_opencl
   r600_c_args += '-DHAVE_OPENCL'
@@ -122,17 +208,17 @@ endif

 libr600 = static_library(
   'r600',
-  [files_r600, egd_tables_h],
-  c_args : [c_vis_args, r600_c_args, '-Wstrict-overflow=0'],
-  cpp_args : [cpp_vis_args],
+  [files_r600, egd_tables_h, sfn_nir_algebraic_c],
+  c_args : [r600_c_args, '-Wstrict-overflow=0'],
+  gnu_symbol_visibility : 'hidden',
   include_directories : [
-    inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_amd_common,
+    inc_src, inc_mapi, inc_mesa, inc_include, inc_compiler, inc_gallium, inc_gallium_aux, inc_amd_common,
     inc_gallium_drivers,
   ],
-  dependencies: [dep_libdrm_radeon, dep_elf, dep_llvm],
+  dependencies: [dep_libdrm_radeon, dep_elf, dep_llvm, idep_nir, idep_nir_headers],
 )

 driver_r600 = declare_dependency(
   compile_args : '-DGALLIUM_R600',
-  link_with : [libr600, libradeonwinsys],
+  link_with : [libr600, libmesa_gallium, libradeonwinsys],
 )
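Note: both build systems above now run sfn/sfn_nir_algebraic.py through NIR's nir_algebraic.py and capture the emitted C into sfn_nir_algebraic.c. What such a generator emits is, in essence, table-driven algebraic rewriting. The toy below sketches that idea in plain C; every type and rule name here is illustrative, not the generated code:

```c
#include <stdio.h>

/* Toy expression: op applied to two integer operands. */
enum op { OP_ADD, OP_MUL, OP_CONST };
struct expr { enum op op; int lhs, rhs; };

/* Table-driven identity rules, analogous to the generated transform arrays:
 * "x OP identity -> x". */
struct rule { enum op match; int identity; const char *name; };

static const struct rule rules[] = {
	{ OP_ADD, 0, "x + 0 -> x" },
	{ OP_MUL, 1, "x * 1 -> x" },
};

static int try_simplify(struct expr *e)
{
	for (unsigned i = 0; i < sizeof(rules) / sizeof(rules[0]); i++) {
		if (e->op == rules[i].match && e->rhs == rules[i].identity) {
			printf("applied: %s\n", rules[i].name);
			e->op = OP_CONST; /* result is just lhs now */
			e->rhs = 0;
			return 1;
		}
	}
	return 0;
}

int main(void)
{
	struct expr e = { OP_MUL, 7, 1 };
	try_simplify(&e); /* prints "applied: x * 1 -> x" */
	return 0;
}
```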
diff --git a/lib/mesa/src/gallium/drivers/r600/r600_buffer_common.c b/lib/mesa/src/gallium/drivers/r600/r600_buffer_common.c
index d0f44dcb6..7dbf7a1ba 100644
--- a/lib/mesa/src/gallium/drivers/r600/r600_buffer_common.c
+++ b/lib/mesa/src/gallium/drivers/r600/r600_buffer_common.c
@@ -34,11 +34,11 @@ bool r600_rings_is_buffer_referenced(struct r600_common_context *ctx,
 				     struct pb_buffer *buf,
 				     enum radeon_bo_usage usage)
 {
-	if (ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, buf, usage)) {
+	if (ctx->ws->cs_is_buffer_referenced(&ctx->gfx.cs, buf, usage)) {
 		return true;
 	}
-	if (radeon_emitted(ctx->dma.cs, 0) &&
-	    ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, buf, usage)) {
+	if (radeon_emitted(&ctx->dma.cs, 0) &&
+	    ctx->ws->cs_is_buffer_referenced(&ctx->dma.cs, buf, usage)) {
 		return true;
 	}
 	return false;
@@ -53,19 +53,19 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,

 	assert(!(resource->flags & RADEON_FLAG_SPARSE));

-	if (usage & PIPE_TRANSFER_UNSYNCHRONIZED) {
-		return ctx->ws->buffer_map(resource->buf, NULL, usage);
+	if (usage & PIPE_MAP_UNSYNCHRONIZED) {
+		return ctx->ws->buffer_map(ctx->ws, resource->buf, NULL, usage);
 	}

-	if (!(usage & PIPE_TRANSFER_WRITE)) {
+	if (!(usage & PIPE_MAP_WRITE)) {
 		/* have to wait for the last write */
 		rusage = RADEON_USAGE_WRITE;
 	}

-	if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
-	    ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs,
+	if (radeon_emitted(&ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
+	    ctx->ws->cs_is_buffer_referenced(&ctx->gfx.cs,
 					     resource->buf, rusage)) {
-		if (usage & PIPE_TRANSFER_DONTBLOCK) {
+		if (usage & PIPE_MAP_DONTBLOCK) {
 			ctx->gfx.flush(ctx, PIPE_FLUSH_ASYNC, NULL);
 			return NULL;
 		} else {
@@ -73,10 +73,10 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
 			busy = true;
 		}
 	}
-	if (radeon_emitted(ctx->dma.cs, 0) &&
-	    ctx->ws->cs_is_buffer_referenced(ctx->dma.cs,
+	if (radeon_emitted(&ctx->dma.cs, 0) &&
+	    ctx->ws->cs_is_buffer_referenced(&ctx->dma.cs,
 					     resource->buf, rusage)) {
-		if (usage & PIPE_TRANSFER_DONTBLOCK) {
+		if (usage & PIPE_MAP_DONTBLOCK) {
 			ctx->dma.flush(ctx, PIPE_FLUSH_ASYNC, NULL);
 			return NULL;
 		} else {
@@ -85,20 +85,20 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
 		}
 	}

-	if (busy || !ctx->ws->buffer_wait(resource->buf, 0, rusage)) {
-		if (usage & PIPE_TRANSFER_DONTBLOCK) {
+	if (busy || !ctx->ws->buffer_wait(ctx->ws, resource->buf, 0, rusage)) {
+		if (usage & PIPE_MAP_DONTBLOCK) {
 			return NULL;
 		} else {
 			/* We will be wait for the GPU. Wait for any offloaded
 			 * CS flush to complete to avoid busy-waiting in the winsys. */
-			ctx->ws->cs_sync_flush(ctx->gfx.cs);
-			if (ctx->dma.cs)
-				ctx->ws->cs_sync_flush(ctx->dma.cs);
+			ctx->ws->cs_sync_flush(&ctx->gfx.cs);
+			if (ctx->dma.cs.priv)
+				ctx->ws->cs_sync_flush(&ctx->dma.cs);
 		}
 	}

 	/* Setting the CS to NULL will prevent doing checks we have done already. */
-	return ctx->ws->buffer_map(resource->buf, NULL, usage);
+	return ctx->ws->buffer_map(ctx->ws, resource->buf, NULL, usage);
 }

 void r600_init_resource_fields(struct r600_common_screen *rscreen,
@@ -116,7 +116,7 @@ void r600_init_resource_fields(struct r600_common_screen *rscreen,
 	switch (res->b.b.usage) {
 	case PIPE_USAGE_STREAM:
 		res->flags = RADEON_FLAG_GTT_WC;
-		/* fall through */
+		FALLTHROUGH;
 	case PIPE_USAGE_STAGING:
 		/* Transfers are likely to occur more often with these
 		 * resources. */
@@ -131,7 +131,7 @@ void r600_init_resource_fields(struct r600_common_screen *rscreen,
 			res->flags |= RADEON_FLAG_GTT_WC;
 			break;
 		}
-		/* fall through */
+		FALLTHROUGH;
 	case PIPE_USAGE_DEFAULT:
 	case PIPE_USAGE_IMMUTABLE:
 	default:
@@ -254,7 +254,7 @@ r600_invalidate_buffer(struct r600_common_context *rctx,
 	/* Check if mapping this buffer would cause waiting for the GPU. */
 	if (r600_rings_is_buffer_referenced(rctx, rbuffer->buf, RADEON_USAGE_READWRITE) ||
-	    !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
+	    !rctx->ws->buffer_wait(rctx->ws, rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
 		rctx->invalidate_buffer(&rctx->b, &rbuffer->b.b);
 	} else {
 		util_range_set_empty(&rbuffer->valid_buffer_range);
@@ -334,7 +334,7 @@ static bool r600_can_dma_copy_buffer(struct r600_common_context *rctx,
 	bool dword_aligned = !(dstx % 4) && !(srcx % 4) && !(size % 4);

 	return rctx->screen->has_cp_dma ||
-	       (dword_aligned && (rctx->dma.cs ||
+	       (dword_aligned && (rctx->dma.cs.priv ||
 				  rctx->screen->has_streamout));
 }

@@ -365,51 +365,51 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
 	 * So don't ever use staging buffers.
 	 */
 	if (rbuffer->b.is_user_ptr)
-		usage |= PIPE_TRANSFER_PERSISTENT;
+		usage |= PIPE_MAP_PERSISTENT;

 	/* See if the buffer range being mapped has never been initialized,
 	 * in which case it can be mapped unsynchronized. */
-	if (!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
+	if (!(usage & (PIPE_MAP_UNSYNCHRONIZED |
 		       TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED)) &&
-	    usage & PIPE_TRANSFER_WRITE &&
+	    usage & PIPE_MAP_WRITE &&
 	    !rbuffer->b.is_shared &&
 	    !util_ranges_intersect(&rbuffer->valid_buffer_range, box->x, box->x + box->width)) {
-		usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
+		usage |= PIPE_MAP_UNSYNCHRONIZED;
 	}

 	/* If discarding the entire range, discard the whole resource instead.
 	 */
-	if (usage & PIPE_TRANSFER_DISCARD_RANGE &&
+	if (usage & PIPE_MAP_DISCARD_RANGE &&
 	    box->x == 0 && box->width == resource->width0) {
-		usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE;
+		usage |= PIPE_MAP_DISCARD_WHOLE_RESOURCE;
 	}

-	if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE &&
-	    !(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
+	if (usage & PIPE_MAP_DISCARD_WHOLE_RESOURCE &&
+	    !(usage & (PIPE_MAP_UNSYNCHRONIZED |
 		       TC_TRANSFER_MAP_NO_INVALIDATE))) {
-		assert(usage & PIPE_TRANSFER_WRITE);
+		assert(usage & PIPE_MAP_WRITE);

 		if (r600_invalidate_buffer(rctx, rbuffer)) {
 			/* At this point, the buffer is always idle. */
-			usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
+			usage |= PIPE_MAP_UNSYNCHRONIZED;
 		} else {
 			/* Fall back to a temporary buffer. */
-			usage |= PIPE_TRANSFER_DISCARD_RANGE;
+			usage |= PIPE_MAP_DISCARD_RANGE;
 		}
 	}

-	if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
+	if ((usage & PIPE_MAP_DISCARD_RANGE) &&
 	    !(rscreen->debug_flags & DBG_NO_DISCARD_RANGE) &&
-	    ((!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
-			 PIPE_TRANSFER_PERSISTENT)) &&
+	    ((!(usage & (PIPE_MAP_UNSYNCHRONIZED |
+			 PIPE_MAP_PERSISTENT)) &&
 	      r600_can_dma_copy_buffer(rctx, box->x, 0, box->width)) ||
 	     (rbuffer->flags & RADEON_FLAG_SPARSE))) {
-		assert(usage & PIPE_TRANSFER_WRITE);
+		assert(usage & PIPE_MAP_WRITE);

 		/* Check if mapping this buffer would cause waiting for the GPU. */
 		if (rbuffer->flags & RADEON_FLAG_SPARSE ||
 		    r600_rings_is_buffer_referenced(rctx, rbuffer->buf, RADEON_USAGE_READWRITE) ||
-		    !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
+		    !rctx->ws->buffer_wait(rctx->ws, rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
 			/* Do a wait-free write-only transfer using a temporary buffer. */
 			unsigned offset;
 			struct r600_resource *staging = NULL;
@@ -429,12 +429,12 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
 			}
 		} else {
 			/* At this point, the buffer is always idle (we checked it above). */
-			usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
+			usage |= PIPE_MAP_UNSYNCHRONIZED;
 		}
 	}
 	/* Use a staging buffer in cached GTT for reads.
 	 */
-	else if (((usage & PIPE_TRANSFER_READ) &&
-		  !(usage & PIPE_TRANSFER_PERSISTENT) &&
+	else if (((usage & PIPE_MAP_READ) &&
+		  !(usage & PIPE_MAP_PERSISTENT) &&
 		  (rbuffer->domains & RADEON_DOMAIN_VRAM ||
 		   rbuffer->flags & RADEON_FLAG_GTT_WC) &&
 		  r600_can_dma_copy_buffer(rctx, 0, box->x, box->width)) ||
@@ -452,7 +452,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
 				  0, 0, resource, 0, box);

 		data = r600_buffer_map_sync_with_rings(rctx, staging,
-						       usage & ~PIPE_TRANSFER_UNSYNCHRONIZED);
+						       usage & ~PIPE_MAP_UNSYNCHRONIZED);
 		if (!data) {
 			r600_resource_reference(&staging, NULL);
 			return NULL;
@@ -506,8 +506,8 @@ static void r600_buffer_flush_region(struct pipe_context *ctx,
 				     struct pipe_transfer *transfer,
 				     const struct pipe_box *rel_box)
 {
-	unsigned required_usage = PIPE_TRANSFER_WRITE |
-				  PIPE_TRANSFER_FLUSH_EXPLICIT;
+	unsigned required_usage = PIPE_MAP_WRITE |
+				  PIPE_MAP_FLUSH_EXPLICIT;

 	if ((transfer->usage & required_usage) == required_usage) {
 		struct pipe_box box;
@@ -523,8 +523,8 @@ static void r600_buffer_transfer_unmap(struct pipe_context *ctx,
 	struct r600_common_context *rctx = (struct r600_common_context*)ctx;
 	struct r600_transfer *rtransfer = (struct r600_transfer*)transfer;

-	if (transfer->usage & PIPE_TRANSFER_WRITE &&
-	    !(transfer->usage & PIPE_TRANSFER_FLUSH_EXPLICIT))
+	if (transfer->usage & PIPE_MAP_WRITE &&
+	    !(transfer->usage & PIPE_MAP_FLUSH_EXPLICIT))
 		r600_buffer_do_flush_region(ctx, transfer, &transfer->box);

 	r600_resource_reference(&rtransfer->staging, NULL);
@@ -545,10 +545,10 @@ void r600_buffer_subdata(struct pipe_context *ctx,
 	struct pipe_box box;
 	uint8_t *map = NULL;

-	usage |= PIPE_TRANSFER_WRITE;
+	usage |= PIPE_MAP_WRITE;

-	if (!(usage & PIPE_TRANSFER_MAP_DIRECTLY))
-		usage |= PIPE_TRANSFER_DISCARD_RANGE;
+	if (!(usage & PIPE_MAP_DIRECTLY))
+		usage |= PIPE_MAP_DISCARD_RANGE;

 	u_box_1d(offset, size, &box);
 	map = r600_buffer_transfer_map(ctx, buffer, 0, usage, &box, &transfer);
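Note: the bulk of this file's churn is the gallium-wide rename PIPE_TRANSFER_* → PIPE_MAP_*, plus winsys entry points (buffer_map, buffer_wait) now taking the winsys as an explicit first argument. A compilable sketch of the usage-flag promotion seen in r600_buffer_transfer_map; the flag values are local stand-ins, not the real p_defines.h values:

```c
#include <stdbool.h>
#include <stdio.h>

/* Local stand-ins for the gallium map flags; numeric values are arbitrary
 * here, only the bit logic matters. */
enum {
	PIPE_MAP_READ                   = 1 << 0,
	PIPE_MAP_WRITE                  = 1 << 1,
	PIPE_MAP_UNSYNCHRONIZED         = 1 << 2,
	PIPE_MAP_DISCARD_RANGE          = 1 << 3,
	PIPE_MAP_DISCARD_WHOLE_RESOURCE = 1 << 4,
};

/* Mirrors the two promotions above: a write into a never-initialized range
 * can skip synchronization, and discarding the full width of the buffer is
 * upgraded to discarding the whole resource. */
static unsigned promote_usage(unsigned usage, bool range_initialized,
			      bool covers_whole_buffer)
{
	if ((usage & PIPE_MAP_WRITE) && !range_initialized)
		usage |= PIPE_MAP_UNSYNCHRONIZED;
	if ((usage & PIPE_MAP_DISCARD_RANGE) && covers_whole_buffer)
		usage |= PIPE_MAP_DISCARD_WHOLE_RESOURCE;
	return usage;
}

int main(void)
{
	unsigned u = promote_usage(PIPE_MAP_WRITE | PIPE_MAP_DISCARD_RANGE,
				   false, true);
	printf("unsynchronized=%d discard_whole=%d\n",
	       !!(u & PIPE_MAP_UNSYNCHRONIZED),
	       !!(u & PIPE_MAP_DISCARD_WHOLE_RESOURCE));
	return 0;
}
```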
diff --git a/lib/mesa/src/gallium/drivers/r600/r600_cs.h b/lib/mesa/src/gallium/drivers/r600/r600_cs.h
index 424adba27..71e606b9b 100644
--- a/lib/mesa/src/gallium/drivers/r600/r600_cs.h
+++ b/lib/mesa/src/gallium/drivers/r600/r600_cs.h
@@ -45,8 +45,8 @@ radeon_cs_memory_below_limit(struct r600_common_screen *screen,
 			     struct radeon_cmdbuf *cs,
 			     uint64_t vram, uint64_t gtt)
 {
-	vram += cs->used_vram;
-	gtt += cs->used_gart;
+	vram += (uint64_t)cs->used_vram_kb * 1024;
+	gtt += (uint64_t)cs->used_gart_kb * 1024;

 	/* Anything that goes above the VRAM size should go to GTT.
 	 */
 	if (vram > screen->info.vram_size)
@@ -74,7 +74,7 @@ static inline unsigned radeon_add_to_buffer_list(struct r600_common_context *rct
 {
 	assert(usage);
 	return rctx->ws->cs_add_buffer(
-		ring->cs, rbo->buf,
+		&ring->cs, rbo->buf,
 		(enum radeon_bo_usage)(usage | RADEON_USAGE_SYNCHRONIZED),
 		rbo->domains, priority) * 4;
 }
@@ -105,7 +105,7 @@ radeon_add_to_buffer_list_check_mem(struct r600_common_context *rctx,
 				    bool check_mem)
 {
 	if (check_mem &&
-	    !radeon_cs_memory_below_limit(rctx->screen, ring->cs,
+	    !radeon_cs_memory_below_limit(rctx->screen, &ring->cs,
 					  rctx->vram + rbo->vram_usage,
 					  rctx->gtt + rbo->gart_usage))
 		ring->flush(rctx, PIPE_FLUSH_ASYNC, NULL);
@@ -118,7 +118,7 @@ static inline void r600_emit_reloc(struct r600_common_context *rctx,
 				   enum radeon_bo_usage usage,
 				   enum radeon_bo_priority priority)
 {
-	struct radeon_cmdbuf *cs = ring->cs;
+	struct radeon_cmdbuf *cs = &ring->cs;
 	bool has_vm = ((struct r600_common_screen*)rctx->b.screen)->info.r600_has_virtual_memory;
 	unsigned reloc = radeon_add_to_buffer_list(rctx, ring, rbo, usage, priority);
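Note: radeon_cmdbuf now accounts memory in kilobytes in 32-bit fields (used_vram_kb, used_gart_kb), so callers widen to 64-bit bytes before comparing against limits, as radeon_cs_memory_below_limit does above. A tiny self-contained version of that arithmetic (the struct is a local stand-in):

```c
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct cmdbuf_kb { uint32_t used_vram_kb, used_gart_kb; };

static bool memory_below_limit(const struct cmdbuf_kb *cs,
			       uint64_t extra_vram, uint64_t extra_gtt,
			       uint64_t vram_size, uint64_t gtt_size)
{
	/* Widen before multiplying: the KB counts fit in 32 bits, the byte
	 * counts may not. */
	uint64_t vram = (uint64_t)cs->used_vram_kb * 1024 + extra_vram;
	uint64_t gtt = (uint64_t)cs->used_gart_kb * 1024 + extra_gtt;
	return vram <= vram_size && gtt <= gtt_size;
}

int main(void)
{
	/* 5 million KB in flight (~5.1 GB): fine as a 32-bit KB count, but
	 * the byte total would overflow a 32-bit accumulator. */
	struct cmdbuf_kb cs = { .used_vram_kb = 5u * 1000 * 1000 };
	assert(!memory_below_limit(&cs, 0, 0, 4ull << 30, 1ull << 30));
	return 0;
}
```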
diff --git a/lib/mesa/src/gallium/drivers/r600/r600_dump.c b/lib/mesa/src/gallium/drivers/r600/r600_dump.c
index 29a89605e..76b56bc7d 100644
--- a/lib/mesa/src/gallium/drivers/r600/r600_dump.c
+++ b/lib/mesa/src/gallium/drivers/r600/r600_dump.c
@@ -26,6 +26,7 @@

 #include "r600_dump.h"
 #include "r600_shader.h"
+#include "tgsi/tgsi_strings.h"

 void print_shader_info(FILE *f , int id, struct r600_shader *shader)
 {
@@ -165,4 +166,35 @@ void print_pipe_info(FILE *f, struct tgsi_shader_info *shader)
 	PRINT_UINT_MEMBER(writes_memory);
 	PRINT_UINT_MEMBER(file_mask[TGSI_FILE_HW_ATOMIC]);
 	PRINT_UINT_MEMBER(file_count[TGSI_FILE_HW_ATOMIC]);
+
+	for(unsigned int i = 0; i < TGSI_PROPERTY_COUNT; ++i) {
+		if (shader->properties[i] != 0)
+			fprintf(stderr, "PROP: %s = %d\n", tgsi_property_names[i], shader->properties[i]);
+	}
+
+#define PRINT_UINT_ARRAY_MEMBER(M, IDX) \
+	if (shader-> M [ IDX ]) fprintf(f, #M "[%d] = %d\n", IDX, (unsigned) shader-> M [ IDX ]);
+
+	for (int i = 0; i < shader->num_inputs; ++i) {
+		PRINT_UINT_ARRAY_MEMBER(input_semantic_name, i); /**< TGSI_SEMANTIC_x */
+		PRINT_UINT_ARRAY_MEMBER(input_semantic_index, i);
+		PRINT_UINT_ARRAY_MEMBER(input_interpolate, i);
+		PRINT_UINT_ARRAY_MEMBER(input_interpolate_loc, i);
+		PRINT_UINT_ARRAY_MEMBER(input_usage_mask, i);
+		PRINT_UINT_ARRAY_MEMBER(input_cylindrical_wrap, i);
+	}
+
+	for (int i = 0; i < shader->num_inputs; ++i) {
+		PRINT_UINT_ARRAY_MEMBER(output_semantic_name, i);
+		PRINT_UINT_ARRAY_MEMBER(output_semantic_index, i);
+		PRINT_UINT_ARRAY_MEMBER(output_usagemask, i);
+		PRINT_UINT_ARRAY_MEMBER(output_streams, i);
+	}
+
+	for (int i = 0; i < shader->num_system_values; ++i)
+		PRINT_UINT_ARRAY_MEMBER(system_value_semantic_name, i);
+
+	PRINT_UINT_MEMBER(reads_pervertex_outputs);
+	PRINT_UINT_MEMBER(reads_perpatch_outputs);
+	PRINT_UINT_MEMBER(reads_tessfactor_outputs);
 }
diff --git a/lib/mesa/src/gallium/drivers/r600/r600_isa.c b/lib/mesa/src/gallium/drivers/r600/r600_isa.c
index 57b0e044f..0a5c4dac1 100644
--- a/lib/mesa/src/gallium/drivers/r600/r600_isa.c
+++ b/lib/mesa/src/gallium/drivers/r600/r600_isa.c
@@ -194,8 +194,8 @@ const struct alu_op_info r600_alu_op_table[] = {
 	{"MULADD_IEEE_PREV", 2, { -1, 0xD5 },{ 0, 0, AF_V, AF_V}, AF_PREV_INTERLEAVE | AF_IEEE },
 	{"INTERP_XY", 2, { -1, 0xD6 },{ 0, 0, AF_4V, AF_4V}, AF_INTERP },
 	{"INTERP_ZW", 2, { -1, 0xD7 },{ 0, 0, AF_4V, AF_4V}, AF_INTERP },
-	{"INTERP_X", 2, { -1, 0xD8 },{ 0, 0, AF_V, AF_V}, AF_INTERP },
-	{"INTERP_Z", 2, { -1, 0xD9 },{ 0, 0, AF_V, AF_V}, AF_INTERP },
+	{"INTERP_X", 2, { -1, 0xD8 },{ 0, 0, AF_2V, AF_2V}, AF_INTERP },
+	{"INTERP_Z", 2, { -1, 0xD9 },{ 0, 0, AF_2V, AF_2V}, AF_INTERP },
 	{"STORE_FLAGS", 1, { -1, 0xDA },{ 0, 0, AF_V, AF_V}, 0 },
 	{"LOAD_STORE_FLAGS", 1, { -1, 0xDB },{ 0, 0, AF_V, AF_V}, 0 },
 	{"LDS_1A", 2, { -1, 0xDC },{ 0, 0, AF_V, AF_V}, 0 },
diff --git a/lib/mesa/src/gallium/drivers/r600/r600_isa.h b/lib/mesa/src/gallium/drivers/r600/r600_isa.h
index fcaf1f766..1c098fbb1 100644
--- a/lib/mesa/src/gallium/drivers/r600/r600_isa.h
+++ b/lib/mesa/src/gallium/drivers/r600/r600_isa.h
@@ -36,6 +36,7 @@ extern "C" {
 /* ALU flags */
 enum alu_op_flags
 {
+	AF_NONE = 0,
 	AF_V = (1<<0),    /* allowed in vector slots */

 	/* allowed in scalar(trans) slot (slots xyz on cayman, may be replicated
@@ -46,6 +47,9 @@ enum alu_op_flags
 	AF_4V = (AF_V | AF_4SLOT),
 	AF_VS = (AF_V | AF_S),	/* allowed in any slot */

+	AF_2SLOT = (1 << 3),
+	AF_2V = AF_V | AF_2SLOT,  /* XY or ZW */
+
 	AF_KILL = (1<<4),
 	AF_PRED = (1<<5),
 	AF_SET = (1<<6),
@@ -54,6 +58,7 @@ enum alu_op_flags
 	AF_PREV_INTERLEAVE = (1<<7),

 	AF_MOVA = (1<<8),	/* all MOVA instructions */
+
 	AF_IEEE = (1<<10),

 	AF_DST_TYPE_MASK = (3<<11),
@@ -106,6 +111,7 @@ enum alu_op_flags

 	/* condition codes - 3 bits */
 	AF_CC_SHIFT = 29,
+	AF_CC_MASK = (7U << AF_CC_SHIFT),
 	AF_CC_E = (0U << AF_CC_SHIFT),
 	AF_CC_GT = (1U << AF_CC_SHIFT),
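Note: r600_isa.h gains AF_2SLOT/AF_2V for the two-slot INTERP_X/INTERP_Z encodings above, and AF_CC_MASK covering the 3-bit condition-code field. With the mask, extracting the condition code becomes a plain AND, as sketched here (constants copied from the header; the helper is illustrative):

```c
#include <assert.h>
#include <stdint.h>

#define AF_CC_SHIFT 29
#define AF_CC_MASK  (7U << AF_CC_SHIFT)
#define AF_CC_E     (0U << AF_CC_SHIFT)
#define AF_CC_GT    (1U << AF_CC_SHIFT)

/* The condition code occupies the top 3 bits of the flags word. */
static uint32_t cc_of(uint32_t flags)
{
	return flags & AF_CC_MASK;
}

int main(void)
{
	uint32_t flags = AF_CC_GT | (1u << 4) | (1u << 0); /* plus other AF_* bits */
	assert(cc_of(flags) == AF_CC_GT);
	return 0;
}
```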
diff --git a/lib/mesa/src/gallium/drivers/r600/r600_pipe_common.c b/lib/mesa/src/gallium/drivers/r600/r600_pipe_common.c
index 91607ca71..fe3d0c616 100644
--- a/lib/mesa/src/gallium/drivers/r600/r600_pipe_common.c
+++ b/lib/mesa/src/gallium/drivers/r600/r600_pipe_common.c
@@ -38,6 +38,7 @@
 #include "radeon_video.h"
 #include <inttypes.h>
 #include <sys/utsname.h>
+#include <stdlib.h>

 #ifdef LLVM_AVAILABLE
 #include <llvm-c/TargetMachine.h>
@@ -76,7 +77,7 @@ void r600_gfx_write_event_eop(struct r600_common_context *ctx,
 			      struct r600_resource *buf, uint64_t va,
 			      uint32_t new_fence, unsigned query_type)
 {
-	struct radeon_cmdbuf *cs = ctx->gfx.cs;
+	struct radeon_cmdbuf *cs = &ctx->gfx.cs;
 	unsigned op = EVENT_TYPE(event) |
 		      EVENT_INDEX(5) |
 		      event_flags;
@@ -108,7 +109,7 @@ void r600_gfx_wait_fence(struct r600_common_context *ctx,
 			 struct r600_resource *buf,
 			 uint64_t va, uint32_t ref, uint32_t mask)
 {
-	struct radeon_cmdbuf *cs = ctx->gfx.cs;
+	struct radeon_cmdbuf *cs = &ctx->gfx.cs;

 	radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
 	radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1));
@@ -205,7 +206,7 @@ void r600_draw_rectangle(struct blitter_context *blitter,
 	vbuffer.stride = 2 * 4 * sizeof(float); /* vertex size */
 	vbuffer.buffer_offset = offset;

-	rctx->b.set_vertex_buffers(&rctx->b, blitter->vb_slot, 1, &vbuffer);
+	rctx->b.set_vertex_buffers(&rctx->b, blitter->vb_slot, 1, 0, false, &vbuffer);
 	util_draw_arrays_instanced(&rctx->b, R600_PRIM_RECTANGLE_LIST, 0, 3,
 				   0, num_instances);
 	pipe_resource_reference(&buf, NULL);
@@ -213,7 +214,7 @@ void r600_draw_rectangle(struct blitter_context *blitter,

 static void r600_dma_emit_wait_idle(struct r600_common_context *rctx)
 {
-	struct radeon_cmdbuf *cs = rctx->dma.cs;
+	struct radeon_cmdbuf *cs = &rctx->dma.cs;

 	if (rctx->chip_class >= EVERGREEN)
 		radeon_emit(cs, 0xf0000000); /* NOP */
@@ -226,8 +227,8 @@ static void r600_dma_emit_wait_idle(struct r600_common_context *rctx)
 void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
 			 struct r600_resource *dst, struct r600_resource *src)
 {
-	uint64_t vram = ctx->dma.cs->used_vram;
-	uint64_t gtt = ctx->dma.cs->used_gart;
+	uint64_t vram = (uint64_t)ctx->dma.cs.used_vram_kb * 1024;
+	uint64_t gtt = (uint64_t)ctx->dma.cs.used_gart_kb * 1024;

 	if (dst) {
 		vram += dst->vram_usage;
@@ -239,12 +240,12 @@ void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
 	}

 	/* Flush the GFX IB if DMA depends on it. */
-	if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
+	if (radeon_emitted(&ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
 	    ((dst &&
-	      ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, dst->buf,
+	      ctx->ws->cs_is_buffer_referenced(&ctx->gfx.cs, dst->buf,
 					       RADEON_USAGE_READWRITE)) ||
 	     (src &&
-	      ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, src->buf,
+	      ctx->ws->cs_is_buffer_referenced(&ctx->gfx.cs, src->buf,
 					       RADEON_USAGE_WRITE))))
 		ctx->gfx.flush(ctx, PIPE_FLUSH_ASYNC, NULL);

@@ -261,21 +262,21 @@ void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
 	 * engine busy while uploads are being submitted.
 	 */
 	num_dw++; /* for emit_wait_idle below */
-	if (!ctx->ws->cs_check_space(ctx->dma.cs, num_dw, false) ||
-	    ctx->dma.cs->used_vram + ctx->dma.cs->used_gart > 64 * 1024 * 1024 ||
-	    !radeon_cs_memory_below_limit(ctx->screen, ctx->dma.cs, vram, gtt)) {
+	if (!ctx->ws->cs_check_space(&ctx->dma.cs, num_dw, false) ||
+	    ctx->dma.cs.used_vram_kb + ctx->dma.cs.used_gart_kb > 64 * 1024 ||
+	    !radeon_cs_memory_below_limit(ctx->screen, &ctx->dma.cs, vram, gtt)) {
 		ctx->dma.flush(ctx, PIPE_FLUSH_ASYNC, NULL);
-		assert((num_dw + ctx->dma.cs->current.cdw) <= ctx->dma.cs->current.max_dw);
+		assert((num_dw + ctx->dma.cs.current.cdw) <= ctx->dma.cs.current.max_dw);
 	}

 	/* Wait for idle if either buffer has been used in the IB before to
 	 * prevent read-after-write hazards.
 	 */
 	if ((dst &&
-	     ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, dst->buf,
+	     ctx->ws->cs_is_buffer_referenced(&ctx->dma.cs, dst->buf,
 					      RADEON_USAGE_READWRITE)) ||
 	    (src &&
-	     ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, src->buf,
+	     ctx->ws->cs_is_buffer_referenced(&ctx->dma.cs, src->buf,
 					      RADEON_USAGE_WRITE)))
 		r600_dma_emit_wait_idle(ctx);

@@ -344,22 +345,22 @@ static void r600_flush_from_st(struct pipe_context *ctx,
 		rflags |= PIPE_FLUSH_END_OF_FRAME;

 	/* DMA IBs are preambles to gfx IBs, therefore must be flushed first. */
-	if (rctx->dma.cs)
+	if (rctx->dma.cs.priv)
 		rctx->dma.flush(rctx, rflags, fence ? &sdma_fence : NULL);

-	if (!radeon_emitted(rctx->gfx.cs, rctx->initial_gfx_cs_size)) {
+	if (!radeon_emitted(&rctx->gfx.cs, rctx->initial_gfx_cs_size)) {
 		if (fence)
 			ws->fence_reference(&gfx_fence, rctx->last_gfx_fence);
 		if (!(flags & PIPE_FLUSH_DEFERRED))
-			ws->cs_sync_flush(rctx->gfx.cs);
+			ws->cs_sync_flush(&rctx->gfx.cs);
 	} else {
 		/* Instead of flushing, create a deferred fence. Constraints:
-		 * - The state tracker must allow a deferred flush.
-		 * - The state tracker must request a fence.
-		 * Thread safety in fence_finish must be ensured by the state tracker.
+		 * - the gallium frontend must allow a deferred flush.
+		 * - the gallium frontend must request a fence.
+		 * Thread safety in fence_finish must be ensured by the gallium frontend.
 		 */
 		if (flags & PIPE_FLUSH_DEFERRED && fence) {
-			gfx_fence = rctx->ws->cs_get_next_fence(rctx->gfx.cs);
+			gfx_fence = rctx->ws->cs_get_next_fence(&rctx->gfx.cs);
 			deferred_fence = true;
 		} else {
 			rctx->gfx.flush(rctx, rflags, fence ? &gfx_fence : NULL);
@@ -391,9 +392,9 @@ static void r600_flush_from_st(struct pipe_context *ctx,
 	}
 finish:
 	if (!(flags & PIPE_FLUSH_DEFERRED)) {
-		if (rctx->dma.cs)
-			ws->cs_sync_flush(rctx->dma.cs);
-		ws->cs_sync_flush(rctx->gfx.cs);
+		if (rctx->dma.cs.priv)
+			ws->cs_sync_flush(&rctx->dma.cs);
+		ws->cs_sync_flush(&rctx->gfx.cs);
 	}
 }

@@ -401,7 +402,7 @@ static void r600_flush_dma_ring(void *ctx, unsigned flags,
 				struct pipe_fence_handle **fence)
 {
 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
-	struct radeon_cmdbuf *cs = rctx->dma.cs;
+	struct radeon_cmdbuf *cs = &rctx->dma.cs;
 	struct radeon_saved_cs saved;
 	bool check_vm =
 		(rctx->screen->debug_flags & DBG_CHECK_VM) &&
@@ -486,7 +487,7 @@ static enum pipe_reset_status r600_get_reset_status(struct pipe_context *ctx)
 {
 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;

-	return rctx->ws->ctx_query_reset_status(rctx->ctx);
+	return rctx->ws->ctx_query_reset_status(rctx->ctx, false, NULL);
 }

 static void r600_set_debug_callback(struct pipe_context *ctx,
@@ -555,23 +556,23 @@ static bool r600_resource_commit(struct pipe_context *pctx,
 	 * (b) wait for threaded submit to finish, including those that were
 	 *     triggered by some other, earlier operation.
 	 */
-	if (radeon_emitted(ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
-	    ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs,
+	if (radeon_emitted(&ctx->gfx.cs, ctx->initial_gfx_cs_size) &&
+	    ctx->ws->cs_is_buffer_referenced(&ctx->gfx.cs,
 					     res->buf, RADEON_USAGE_READWRITE)) {
 		ctx->gfx.flush(ctx, PIPE_FLUSH_ASYNC, NULL);
 	}
-	if (radeon_emitted(ctx->dma.cs, 0) &&
-	    ctx->ws->cs_is_buffer_referenced(ctx->dma.cs,
+	if (radeon_emitted(&ctx->dma.cs, 0) &&
+	    ctx->ws->cs_is_buffer_referenced(&ctx->dma.cs,
 					     res->buf, RADEON_USAGE_READWRITE)) {
 		ctx->dma.flush(ctx, PIPE_FLUSH_ASYNC, NULL);
 	}

-	ctx->ws->cs_sync_flush(ctx->dma.cs);
-	ctx->ws->cs_sync_flush(ctx->gfx.cs);
+	ctx->ws->cs_sync_flush(&ctx->dma.cs);
+	ctx->ws->cs_sync_flush(&ctx->gfx.cs);

 	assert(resource->target == PIPE_BUFFER);

-	return ctx->ws->buffer_commit(res->buf, box->x, box->width, commit);
+	return ctx->ws->buffer_commit(ctx->ws, res->buf, box->x, box->width, commit);
 }

 bool r600_common_context_init(struct r600_common_context *rctx,
@@ -615,11 +616,8 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 	r600_query_init(rctx);
 	cayman_init_msaa(&rctx->b);

-	rctx->allocator_zeroed_memory =
-		u_suballocator_create(&rctx->b, rscreen->info.gart_page_size,
-				      0, PIPE_USAGE_DEFAULT, 0, true);
-	if (!rctx->allocator_zeroed_memory)
-		return false;
+	u_suballocator_init(&rctx->allocator_zeroed_memory, &rctx->b, rscreen->info.gart_page_size,
+			    0, PIPE_USAGE_DEFAULT, 0, true);

 	rctx->b.stream_uploader = u_upload_create(&rctx->b, 1024 * 1024,
 						  0, PIPE_USAGE_STREAM, 0);
@@ -636,9 +634,8 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 		return false;

 	if (rscreen->info.num_rings[RING_DMA] && !(rscreen->debug_flags & DBG_NO_ASYNC_DMA)) {
-		rctx->dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA,
-						   r600_flush_dma_ring,
-						   rctx, false);
+		rctx->ws->cs_create(&rctx->dma.cs, rctx->ctx, RING_DMA,
+				    r600_flush_dma_ring, rctx, false);
 		rctx->dma.flush = r600_flush_dma_ring;
 	}

@@ -650,10 +647,8 @@ void r600_common_context_cleanup(struct r600_common_context *rctx)
 	if (rctx->query_result_shader)
 		rctx->b.delete_compute_state(&rctx->b, rctx->query_result_shader);

-	if (rctx->gfx.cs)
-		rctx->ws->cs_destroy(rctx->gfx.cs);
-	if (rctx->dma.cs)
-		rctx->ws->cs_destroy(rctx->dma.cs);
+	rctx->ws->cs_destroy(&rctx->gfx.cs);
+	rctx->ws->cs_destroy(&rctx->dma.cs);
 	if (rctx->ctx)
 		rctx->ws->ctx_destroy(rctx->ctx);

@@ -665,9 +660,7 @@ void r600_common_context_cleanup(struct r600_common_context *rctx)
 	slab_destroy_child(&rctx->pool_transfers);
 	slab_destroy_child(&rctx->pool_transfers_unsync);

-	if (rctx->allocator_zeroed_memory) {
-		u_suballocator_destroy(rctx->allocator_zeroed_memory);
-	}
+	u_suballocator_destroy(&rctx->allocator_zeroed_memory);
 	rctx->ws->fence_reference(&rctx->last_gfx_fence, NULL);
 	rctx->ws->fence_reference(&rctx->last_sdma_fence, NULL);
 	r600_resource_reference(&rctx->eop_bug_scratch, NULL);
@@ -810,8 +803,6 @@ static const char* r600_get_name(struct pipe_screen* pscreen)
 static float r600_get_paramf(struct pipe_screen* pscreen,
 			     enum pipe_capf param)
 {
-	struct r600_common_screen *rscreen = (struct r600_common_screen *)pscreen;
-
 	switch (param) {
 	case PIPE_CAPF_MAX_LINE_WIDTH:
 	case PIPE_CAPF_MAX_LINE_WIDTH_AA:
@@ -909,7 +900,8 @@ const char *r600_get_llvm_processor_name(enum radeon_family family)
 static unsigned get_max_threads_per_block(struct r600_common_screen *screen,
 					  enum pipe_shader_ir ir_type)
 {
-	if (ir_type != PIPE_SHADER_IR_TGSI)
+	if (ir_type != PIPE_SHADER_IR_TGSI &&
+	    ir_type != PIPE_SHADER_IR_NIR)
 		return 256;

 	if (screen->chip_class >= EVERGREEN)
 		return 1024;
@@ -1177,6 +1169,18 @@ struct pipe_resource *r600_resource_create_common(struct pipe_screen *screen,
 	}
 }

+static const void *
+r600_get_compiler_options(struct pipe_screen *screen,
+			  enum pipe_shader_ir ir,
+			  enum pipe_shader_type shader)
+{
+	assert(ir == PIPE_SHADER_IR_NIR);
+
+	struct r600_common_screen *rscreen = (struct r600_common_screen *)screen;
+
+	return &rscreen->nir_options;
+}
+
 bool r600_common_screen_init(struct r600_common_screen *rscreen,
 			     struct radeon_winsys *ws)
 {
@@ -1184,7 +1188,7 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen,
 	struct utsname uname_data;
 	const char *chip_name;

-	ws->query_info(ws, &rscreen->info);
+	ws->query_info(ws, &rscreen->info, false, false);
 	rscreen->ws = ws;

 	chip_name = r600_get_family_name(rscreen);
@@ -1210,6 +1214,7 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen,
 	rscreen->b.get_compute_param = r600_get_compute_param;
 	rscreen->b.get_paramf = r600_get_paramf;
 	rscreen->b.get_timestamp = r600_get_timestamp;
+	rscreen->b.get_compiler_options = r600_get_compiler_options;
 	rscreen->b.fence_finish = r600_fence_finish;
 	rscreen->b.fence_reference = r600_fence_reference;
 	rscreen->b.resource_destroy = u_resource_destroy_vtbl;
@@ -1284,17 +1289,51 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen,
 		printf("max_shader_clock = %i\n", rscreen->info.max_shader_clock);
 		printf("num_good_compute_units = %i\n", rscreen->info.num_good_compute_units);
 		printf("max_se = %i\n", rscreen->info.max_se);
-		printf("max_sh_per_se = %i\n", rscreen->info.max_sh_per_se);
+		printf("max_sh_per_se = %i\n", rscreen->info.max_sa_per_se);
 		printf("r600_gb_backend_map = %i\n", rscreen->info.r600_gb_backend_map);
 		printf("r600_gb_backend_map_valid = %i\n", rscreen->info.r600_gb_backend_map_valid);
 		printf("r600_num_banks = %i\n", rscreen->info.r600_num_banks);
-		printf("num_render_backends = %i\n", rscreen->info.num_render_backends);
+		printf("num_render_backends = %i\n", rscreen->info.max_render_backends);
 		printf("num_tile_pipes = %i\n", rscreen->info.num_tile_pipes);
 		printf("pipe_interleave_bytes = %i\n", rscreen->info.pipe_interleave_bytes);
 		printf("enabled_rb_mask = 0x%x\n", rscreen->info.enabled_rb_mask);
 		printf("max_alignment = %u\n", (unsigned)rscreen->info.max_alignment);
 	}
+
+	const struct nir_shader_compiler_options nir_options = {
+		.fuse_ffma16 = true,
+		.fuse_ffma32 = true,
+		.fuse_ffma64 = true,
+		.lower_flrp32 = true,
+		.lower_flrp64 = true,
+		.lower_fpow = true,
+		.lower_fdiv = true,
+		.lower_isign = true,
+		.lower_fsign = true,
+		.lower_fmod = true,
+		.lower_doubles_options = nir_lower_fp64_full_software,
+		.lower_int64_options = ~0,
+		.lower_extract_byte = true,
+		.lower_extract_word = true,
+		.lower_rotate = true,
+		.max_unroll_iterations = 32,
+		.lower_interpolate_at = true,
+		.vectorize_io = true,
+		.has_umad24 = true,
+		.has_umul24 = true,
+		.use_interpolated_input_intrinsics = true,
+		.has_fsub = true,
+		.has_isub = true,
+		.lower_iabs = true,
+		.lower_bitfield_extract = true,
+		.lower_bitfield_insert_to_bitfield_select = true,
+		.has_fused_comp_and_csel = true,
+		.lower_find_msb_to_reverse = true,
+	};
+
+	rscreen->nir_options = nir_options;
+
 	return true;
 }
diff --git a/lib/mesa/src/gallium/drivers/r600/r600_pipe_common.h b/lib/mesa/src/gallium/drivers/r600/r600_pipe_common.h
index 2ecd03845..b55a27d63 100644
--- a/lib/mesa/src/gallium/drivers/r600/r600_pipe_common.h
+++ b/lib/mesa/src/gallium/drivers/r600/r600_pipe_common.h
@@ -45,6 +45,8 @@
 #include "util/u_transfer.h"
 #include "util/u_threaded_context.h"

+#include "compiler/nir/nir.h"
+
 struct u_log_context;

 #define ATI_VENDOR_ID 0x1002
@@ -289,7 +291,7 @@ struct r600_mmio_counter {
 };

 union r600_mmio_counters {
-	struct {
+	struct r600_mmio_counters_named {
 		/* For global GPU load including SDMA. */
 		struct r600_mmio_counter gpu;

@@ -320,7 +322,7 @@ union r600_mmio_counters {
 		struct r600_mmio_counter cp_dma;
 		struct r600_mmio_counter scratch_ram;
 	} named;
-	unsigned array[0];
+	unsigned array[sizeof(struct r600_mmio_counters_named) / sizeof(unsigned)];
 };

 struct r600_memory_object {
@@ -404,6 +406,8 @@ struct r600_common_screen {
 		 */
 		unsigned compute_to_L2;
 	} barrier_flags;
+
+	struct nir_shader_compiler_options nir_options;
 };

 /* This encapsulates a state or an operation which can emitted into the GPU
@@ -474,7 +478,7 @@ struct r600_viewports {
 };

 struct r600_ring {
-	struct radeon_cmdbuf *cs;
+	struct radeon_cmdbuf cs;
 	void (*flush)(void *ctx, unsigned flags,
 		      struct pipe_fence_handle **fence);
 };

@@ -508,7 +512,7 @@ struct r600_common_context {
 	unsigned last_num_draw_calls;

 	struct threaded_context *tc;
-	struct u_suballocator *allocator_zeroed_memory;
+	struct u_suballocator allocator_zeroed_memory;
 	struct slab_child_pool pool_transfers;
 	struct slab_child_pool pool_transfers_unsync; /* for threaded_context */
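Note: the union r600_mmio_counters change replaces the GNU zero-length array with an array sized from the now-named struct, so the flat view stays in sync with the named counters automatically. A reduced, compilable model of the same pattern (two counters instead of the driver's full set):

```c
#include <assert.h>
#include <stdio.h>

struct counter { unsigned busy, idle; };

/* Naming the inner struct lets sizeof size the flat view; the struct type
 * is complete at its closing brace, before the array is declared. */
union counters {
	struct counters_named {
		struct counter gpu;
		struct counter dma;
	} named;
	unsigned array[sizeof(struct counters_named) / sizeof(unsigned)];
};

int main(void)
{
	union counters c = {0};
	c.named.dma.busy = 7;
	/* The flat view aliases the named fields one unsigned at a time:
	 * gpu.busy, gpu.idle, dma.busy, dma.idle -> indices 0..3. */
	assert(c.array[2] == 7);
	printf("%zu slots\n", sizeof(c.array) / sizeof(c.array[0]));
	return 0;
}
```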
diff --git a/lib/mesa/src/gallium/drivers/r600/r600_query.c b/lib/mesa/src/gallium/drivers/r600/r600_query.c
index 4ef7bc8ca..60e691f9e 100644
--- a/lib/mesa/src/gallium/drivers/r600/r600_query.c
+++ b/lib/mesa/src/gallium/drivers/r600/r600_query.c
@@ -430,7 +430,7 @@ static bool r600_query_sw_get_result(struct r600_common_context *rctx,
 		result->u32 = rctx->screen->info.num_good_compute_units;
 		return true;
 	case R600_QUERY_GPIN_NUM_RB:
-		result->u32 = rctx->screen->info.num_render_backends;
+		result->u32 = rctx->screen->info.max_render_backends;
 		return true;
 	case R600_QUERY_GPIN_NUM_SPI:
 		result->u32 = 1; /* all supported chips have one SPI per SE */
@@ -526,9 +526,9 @@ static bool r600_query_hw_prepare_buffer(struct r600_common_screen *rscreen,
 					 struct r600_resource *buffer)
 {
 	/* Callers ensure that the buffer is currently unused by the GPU.
 	 */
-	uint32_t *results = rscreen->ws->buffer_map(buffer->buf, NULL,
-						    PIPE_TRANSFER_WRITE |
-						    PIPE_TRANSFER_UNSYNCHRONIZED);
+	uint32_t *results = rscreen->ws->buffer_map(rscreen->ws, buffer->buf, NULL,
+						    PIPE_MAP_WRITE |
+						    PIPE_MAP_UNSYNCHRONIZED);
 	if (!results)
 		return false;

@@ -537,7 +537,7 @@ static bool r600_query_hw_prepare_buffer(struct r600_common_screen *rscreen,
 	if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
 	    query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
 	    query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
-		unsigned max_rbs = rscreen->info.num_render_backends;
+		unsigned max_rbs = rscreen->info.max_render_backends;
 		unsigned enabled_rb_mask = rscreen->info.enabled_rb_mask;
 		unsigned num_results;
 		unsigned i, j;
@@ -622,7 +622,7 @@ static struct pipe_query *r600_query_hw_create(struct r600_common_screen *rscree
 	case PIPE_QUERY_OCCLUSION_COUNTER:
 	case PIPE_QUERY_OCCLUSION_PREDICATE:
 	case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
-		query->result_size = 16 * rscreen->info.num_render_backends;
+		query->result_size = 16 * rscreen->info.max_render_backends;
 		query->result_size += 16; /* for the fence + alignment */
 		query->num_cs_dw_begin = 6;
 		query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rscreen);
@@ -728,7 +728,7 @@ static void r600_query_hw_do_emit_start(struct r600_common_context *ctx,
 					struct r600_resource *buffer, uint64_t va)
 {
-	struct radeon_cmdbuf *cs = ctx->gfx.cs;
+	struct radeon_cmdbuf *cs = &ctx->gfx.cs;

 	switch (query->b.type) {
 	case PIPE_QUERY_OCCLUSION_COUNTER:
@@ -808,7 +808,7 @@ static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
 				       struct r600_resource *buffer, uint64_t va)
 {
-	struct radeon_cmdbuf *cs = ctx->gfx.cs;
+	struct radeon_cmdbuf *cs = &ctx->gfx.cs;
 	uint64_t fence_va = 0;

 	switch (query->b.type) {
@@ -821,7 +821,7 @@ static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
 		radeon_emit(cs, va);
 		radeon_emit(cs, va >> 32);

-		fence_va = va + ctx->screen->info.num_render_backends * 16 - 8;
+		fence_va = va + ctx->screen->info.max_render_backends * 16 - 8;
 		break;
 	case PIPE_QUERY_PRIMITIVES_EMITTED:
 	case PIPE_QUERY_PRIMITIVES_GENERATED:
@@ -837,7 +837,7 @@ static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
 		break;
 	case PIPE_QUERY_TIME_ELAPSED:
 		va += 8;
-		/* fall through */
+		FALLTHROUGH;
 	case PIPE_QUERY_TIMESTAMP:
 		r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS,
 					 0, EOP_DATA_SEL_TIMESTAMP, NULL, va,
@@ -900,7 +900,7 @@ static void emit_set_predicate(struct r600_common_context *ctx,
 			       struct r600_resource *buf, uint64_t va,
 			       uint32_t op)
 {
-	struct radeon_cmdbuf *cs = ctx->gfx.cs;
+	struct radeon_cmdbuf *cs = &ctx->gfx.cs;

 	radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
 	radeon_emit(cs, va);
@@ -1021,7 +1021,7 @@ void r600_query_hw_reset_buffers(struct r600_common_context *rctx,
 	/* Obtain a new buffer if the current one can't be mapped without a stall.
 	 */
 	if (r600_rings_is_buffer_referenced(rctx, query->buffer.buf->buf, RADEON_USAGE_READWRITE) ||
-	    !rctx->ws->buffer_wait(query->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) {
+	    !rctx->ws->buffer_wait(rctx->ws, query->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) {
 		r600_resource_reference(&query->buffer.buf, NULL);
 		query->buffer.buf = r600_new_query_buffer(rctx->screen, query);
 	} else {
@@ -1082,7 +1082,7 @@ static void r600_get_hw_query_params(struct r600_common_context *rctx,
 				     struct r600_query_hw *rquery, int index,
 				     struct r600_hw_query_params *params)
 {
-	unsigned max_rbs = rctx->screen->info.num_render_backends;
+	unsigned max_rbs = rctx->screen->info.max_render_backends;

 	params->pair_stride = 0;
 	params->pair_count = 1;
@@ -1125,6 +1125,7 @@ static void r600_get_hw_query_params(struct r600_common_context *rctx,
 	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
 		params->pair_count = R600_MAX_STREAMS;
 		params->pair_stride = 32;
+		FALLTHROUGH;
 	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
 		params->start_offset = 0;
 		params->end_offset = 16;
@@ -1172,7 +1173,7 @@ static void r600_query_hw_add_result(struct r600_common_screen *rscreen,
 				     void *buffer,
 				     union pipe_query_result *result)
 {
-	unsigned max_rbs = rscreen->info.num_render_backends;
+	unsigned max_rbs = rscreen->info.max_render_backends;

 	switch (query->b.type) {
 	case PIPE_QUERY_OCCLUSION_COUNTER: {
@@ -1336,13 +1337,13 @@ bool r600_query_hw_get_result(struct r600_common_context *rctx,
 	query->ops->clear_result(query, result);

 	for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
-		unsigned usage = PIPE_TRANSFER_READ |
-				 (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
+		unsigned usage = PIPE_MAP_READ |
+				 (wait ? 0 : PIPE_MAP_DONTBLOCK);
 		unsigned results_base = 0;
 		void *map;

 		if (rquery->b.flushed)
-			map = rctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
+			map = rctx->ws->buffer_map(rctx->ws, qbuf->buf->buf, NULL, usage);
 		else
 			map = r600_buffer_map_sync_with_rings(rctx, qbuf->buf, usage);

@@ -1590,10 +1591,7 @@ static void r600_restore_qbo_state(struct r600_common_context *rctx,
 				   struct r600_qbo_state *st)
 {
 	rctx->b.bind_compute_state(&rctx->b, st->saved_compute);
-
-	rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &st->saved_const0);
-	pipe_resource_reference(&st->saved_const0.buffer, NULL);
-
+	rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, true, &st->saved_const0);
 	rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo, ~0);
 	for (unsigned i = 0; i < 3; ++i)
 		pipe_resource_reference(&st->saved_ssbo[i].buffer, NULL);
@@ -1636,7 +1634,7 @@ static void r600_query_hw_get_result_resource(struct r600_common_context *rctx,
 	}

 	if (query->buffer.previous) {
-		u_suballocator_alloc(rctx->allocator_zeroed_memory, 16, 256,
+		u_suballocator_alloc(&rctx->allocator_zeroed_memory, 16, 256,
 				     &tmp_buffer_offset, &tmp_buffer);
 		if (!tmp_buffer)
 			return;
@@ -1726,7 +1724,7 @@ static void r600_query_hw_get_result_resource(struct r600_common_context *rctx,
 	} else
 		consts.buffer_offset = 0;

-	rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
+	rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, false, &constant_buffer);

 	rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, ~0);

@@ -1833,7 +1831,7 @@ void r600_query_fix_enabled_rb_mask(struct r600_common_screen *rscreen)
 {
 	struct r600_common_context *ctx =
 		(struct r600_common_context*)rscreen->aux_context;
-	struct radeon_cmdbuf *cs = ctx->gfx.cs;
+	struct radeon_cmdbuf *cs = &ctx->gfx.cs;
 	struct r600_resource *buffer;
 	uint32_t *results;
 	unsigned i, mask = 0;
@@ -1847,9 +1845,9 @@ void r600_query_fix_enabled_rb_mask(struct r600_common_screen *rscreen)
 		 * written to. By increasing this number we'll write the
 		 * status bit for these as per the normal disabled rb logic.
 		 */
-		ctx->screen->info.num_render_backends = 8;
+		ctx->screen->info.max_render_backends = 8;
 	}
-	max_rbs = ctx->screen->info.num_render_backends;
+	max_rbs = ctx->screen->info.max_render_backends;

 	assert(rscreen->chip_class <= CAYMAN);

@@ -1895,7 +1893,7 @@ void r600_query_fix_enabled_rb_mask(struct r600_common_screen *rscreen)
 		return;

 	/* initialize buffer with zeroes */
-	results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_TRANSFER_WRITE);
+	results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_MAP_WRITE);
 	if (results) {
 		memset(results, 0, max_rbs * 4 * 4);

@@ -1909,7 +1907,7 @@ void r600_query_fix_enabled_rb_mask(struct r600_common_screen *rscreen)
 					  RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);

 		/* analyze results */
-		results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_TRANSFER_READ);
+		results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_MAP_READ);
 		if (results) {
 			for(i = 0; i < max_rbs; i++) {
 				/* at least highest bit will be set if backend is used */
@@ -2122,7 +2120,7 @@ void r600_query_init(struct r600_common_context *rctx)
 	rctx->b.get_query_result_resource = r600_get_query_result_resource;
 	rctx->render_cond_atom.emit = r600_emit_query_predication;

-	if (((struct r600_common_screen*)rctx->b.screen)->info.num_render_backends > 0)
+	if (((struct r600_common_screen*)rctx->b.screen)->info.max_render_backends > 0)
 		rctx->b.render_condition = r600_render_condition;

 	list_inithead(&rctx->active_queries);
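Note: u_suballocator is now embedded by value (u_suballocator_init/u_suballocator_destroy on &rctx->allocator_zeroed_memory, u_suballocator_alloc taking its address) instead of being heap-allocated, which removes an allocation and a failure path at context-init time. The ownership pattern, with a toy bump allocator standing in for the real u_suballocator:

```c
#include <stdint.h>
#include <stdio.h>

/* Toy stand-in: a bump allocator embedded by value in the context. */
struct suballocator { uint32_t offset, size; };
struct context { struct suballocator alloc; };

static void suballocator_init(struct suballocator *a, uint32_t size)
{
	a->offset = 0;
	a->size = size;
}

/* Returns the start offset of the new range, or -1 when full. */
static int64_t suballocator_alloc(struct suballocator *a,
				  uint32_t sz, uint32_t align)
{
	uint32_t start = (a->offset + align - 1) & ~(align - 1);
	if (start + sz > a->size)
		return -1;
	a->offset = start + sz;
	return start;
}

int main(void)
{
	struct context ctx;
	/* Was: ctx.alloc = suballocator_create(...); if (!ctx.alloc) fail; */
	suballocator_init(&ctx.alloc, 4096);
	printf("%lld\n", (long long)suballocator_alloc(&ctx.alloc, 16, 256));
	return 0;
}
```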
diff --git a/lib/mesa/src/gallium/drivers/r600/r600_streamout.c b/lib/mesa/src/gallium/drivers/r600/r600_streamout.c
index f925c07b2..f45561d29 100644
--- a/lib/mesa/src/gallium/drivers/r600/r600_streamout.c
+++ b/lib/mesa/src/gallium/drivers/r600/r600_streamout.c
@@ -51,7 +51,7 @@ r600_create_so_target(struct pipe_context *ctx,
 		return NULL;
 	}

-	u_suballocator_alloc(rctx->allocator_zeroed_memory, 4, 4,
+	u_suballocator_alloc(&rctx->allocator_zeroed_memory, 4, 4,
 			     &t->buf_filled_size_offset,
 			     (struct pipe_resource**)&t->buf_filled_size);
 	if (!t->buf_filled_size) {
@@ -154,7 +154,7 @@ void r600_set_streamout_targets(struct pipe_context *ctx,

 static void r600_flush_vgt_streamout(struct r600_common_context *rctx)
 {
-	struct radeon_cmdbuf *cs = rctx->gfx.cs;
+	struct radeon_cmdbuf *cs = &rctx->gfx.cs;
 	unsigned reg_strmout_cntl;

 	/* The register is at different places on different ASICs.
 	 */
@@ -180,7 +180,7 @@ static void r600_flush_vgt_streamout(struct r600_common_context *rctx)

 static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_cmdbuf *cs = rctx->gfx.cs;
+	struct radeon_cmdbuf *cs = &rctx->gfx.cs;
 	struct r600_so_target **t = rctx->streamout.targets;
 	uint16_t *stride_in_dw = rctx->streamout.stride_in_dw;
 	unsigned i, update_flags = 0;
@@ -253,7 +253,7 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r

 void r600_emit_streamout_end(struct r600_common_context *rctx)
 {
-	struct radeon_cmdbuf *cs = rctx->gfx.cs;
+	struct radeon_cmdbuf *cs = &rctx->gfx.cs;
 	struct r600_so_target **t = rctx->streamout.targets;
 	unsigned i;
 	uint64_t va;
@@ -315,8 +315,8 @@ static void r600_emit_streamout_enable(struct r600_common_context *rctx,
 			S_028B94_STREAMOUT_2_EN(r600_get_strmout_en(rctx)) |
 			S_028B94_STREAMOUT_3_EN(r600_get_strmout_en(rctx));
 	}
-	radeon_set_context_reg(rctx->gfx.cs, strmout_buffer_reg, strmout_buffer_val);
-	radeon_set_context_reg(rctx->gfx.cs, strmout_config_reg, strmout_config_val);
+	radeon_set_context_reg(&rctx->gfx.cs, strmout_buffer_reg, strmout_buffer_val);
+	radeon_set_context_reg(&rctx->gfx.cs, strmout_config_reg, strmout_config_val);
 }

 static void r600_set_streamout_enable(struct r600_common_context *rctx, bool enable)
diff --git a/lib/mesa/src/gallium/drivers/r600/r600_test_dma.c b/lib/mesa/src/gallium/drivers/r600/r600_test_dma.c
index 512e77420..e8e54fb99 100644
--- a/lib/mesa/src/gallium/drivers/r600/r600_test_dma.c
+++ b/lib/mesa/src/gallium/drivers/r600/r600_test_dma.c
@@ -59,7 +59,7 @@ static void set_random_pixels(struct pipe_context *ctx,
 	uint8_t *map;
 	unsigned x,y,z;

-	map = pipe_transfer_map_3d(ctx, tex, 0, PIPE_TRANSFER_WRITE,
+	map = pipe_transfer_map_3d(ctx, tex, 0, PIPE_MAP_WRITE,
 				   0, 0, 0, tex->width0, tex->height0,
 				   tex->array_size, &t);
 	assert(map);
@@ -94,7 +94,7 @@ static bool compare_textures(struct pipe_context *ctx,
 	int y,z;
 	bool pass = true;

-	map = pipe_transfer_map_3d(ctx, tex, 0, PIPE_TRANSFER_READ,
+	map = pipe_transfer_map_3d(ctx, tex, 0, PIPE_MAP_READ,
 				   0, 0, 0, tex->width0, tex->height0,
 				   tex->array_size, &t);
 	assert(map);
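Note: struct r600_ring now embeds struct radeon_cmdbuf by value, so the recurring "does this ring exist?" test changes from a NULL cs pointer to checking cs.priv, the winsys-private pointer inside the cmdbuf. A minimal sketch of the new shape (struct layout is illustrative):

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Illustrative stand-ins: the cmdbuf lives inside the ring by value, and
 * its winsys-private pointer doubles as the "created" flag. */
struct radeon_cmdbuf { void *priv; /* NULL = ring never created */ };
struct ring { struct radeon_cmdbuf cs; };

static bool ring_is_created(const struct ring *r)
{
	return r->cs.priv != NULL; /* was: r->cs != NULL */
}

int main(void)
{
	struct ring dma = { .cs = { .priv = NULL } };
	printf("dma ring usable: %d\n", ring_is_created(&dma));
	return 0;
}
```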
*/ - return rtex->surface.u.legacy.level[level].offset + + return (uint64_t)rtex->surface.u.legacy.level[level].offset_256B * 256 + box->z * (uint64_t)rtex->surface.u.legacy.level[level].slice_size_dw * 4 + (box->y / rtex->surface.blk_h * rtex->surface.u.legacy.level[level].nblk_x + @@ -243,8 +243,6 @@ static int r600_init_surface(struct r600_common_screen *rscreen, flags |= RADEON_SURF_SHAREABLE; if (is_imported) flags |= RADEON_SURF_IMPORTED | RADEON_SURF_SHAREABLE; - if (!(ptex->flags & R600_RESOURCE_FLAG_FORCE_TILING)) - flags |= RADEON_SURF_OPTIMIZE_FOR_SPACE; r = rscreen->ws->surface_init(rscreen->ws, ptex, flags, bpe, array_mode, surface); @@ -264,7 +262,7 @@ static int r600_init_surface(struct r600_common_screen *rscreen, if (offset) { for (i = 0; i < ARRAY_SIZE(surface->u.legacy.level); ++i) - surface->u.legacy.level[i].offset += offset; + surface->u.legacy.level[i].offset_256B += offset / 256; } return 0; @@ -457,7 +455,7 @@ static void r600_texture_get_info(struct pipe_screen* screen, return; if (resource->target != PIPE_BUFFER) { - offset = rtex->surface.u.legacy.level[0].offset; + offset = (uint64_t)rtex->surface.u.legacy.level[0].offset_256B * 256; stride = rtex->surface.u.legacy.level[0].nblk_x * rtex->surface.bpe; } @@ -521,7 +519,7 @@ static bool r600_texture_get_handle(struct pipe_screen* screen, if (!res->b.is_shared || update_metadata) { r600_texture_init_metadata(rscreen, rtex, &metadata); - rscreen->ws->buffer_set_metadata(res->buf, &metadata); + rscreen->ws->buffer_set_metadata(rscreen->ws, res->buf, &metadata, NULL); } slice_size = (uint64_t)rtex->surface.u.legacy.level[0].slice_size_dw * 4; @@ -655,7 +653,7 @@ void r600_texture_get_fmask_info(struct r600_common_screen *rscreen, out->pitch_in_pixels = fmask.u.legacy.level[0].nblk_x; out->bank_height = fmask.u.legacy.bankh; out->tile_swizzle = fmask.tile_swizzle; - out->alignment = MAX2(256, fmask.surf_alignment); + out->alignment = MAX2(256, 1 << fmask.surf_alignment_log2); out->size = fmask.surf_size; } @@ -759,7 +757,7 @@ static void r600_texture_get_htile_size(struct r600_common_screen *rscreen, unsigned slice_elements, slice_bytes, pipe_interleave_bytes, base_align; unsigned num_pipes = rscreen->info.num_tile_pipes; - rtex->surface.htile_size = 0; + rtex->surface.meta_size = 0; if (rscreen->chip_class <= EVERGREEN && rscreen->info.drm_minor < 26) @@ -806,8 +804,8 @@ static void r600_texture_get_htile_size(struct r600_common_screen *rscreen, pipe_interleave_bytes = rscreen->info.pipe_interleave_bytes; base_align = num_pipes * pipe_interleave_bytes; - rtex->surface.htile_alignment = base_align; - rtex->surface.htile_size = + rtex->surface.meta_alignment_log2 = util_logbase2(base_align); + rtex->surface.meta_size = util_num_layers(&rtex->resource.b.b, 0) * align(slice_bytes, base_align); } @@ -817,11 +815,11 @@ static void r600_texture_allocate_htile(struct r600_common_screen *rscreen, { r600_texture_get_htile_size(rscreen, rtex); - if (!rtex->surface.htile_size) + if (!rtex->surface.meta_size) return; - rtex->htile_offset = align(rtex->size, rtex->surface.htile_alignment); - rtex->size = rtex->htile_offset + rtex->surface.htile_size; + rtex->htile_offset = align(rtex->size, 1 << rtex->surface.meta_alignment_log2); + rtex->size = rtex->htile_offset + rtex->surface.meta_size; } void r600_print_texture_info(struct r600_common_screen *rscreen, @@ -832,7 +830,7 @@ void r600_print_texture_info(struct r600_common_screen *rscreen, /* Common parameters. 
*/ u_log_printf(log, " Info: npix_x=%u, npix_y=%u, npix_z=%u, blk_w=%u, " "blk_h=%u, array_size=%u, last_level=%u, " - "bpe=%u, nsamples=%u, flags=0x%x, %s\n", + "bpe=%u, nsamples=%u, flags=0x%"PRIx64", %s\n", rtex->resource.b.b.width0, rtex->resource.b.b.height0, rtex->resource.b.b.depth0, rtex->surface.blk_w, rtex->surface.blk_h, @@ -842,7 +840,7 @@ void r600_print_texture_info(struct r600_common_screen *rscreen, u_log_printf(log, " Layout: size=%"PRIu64", alignment=%u, bankw=%u, " "bankh=%u, nbanks=%u, mtilea=%u, tilesplit=%u, pipeconfig=%u, scanout=%u\n", - rtex->surface.surf_size, rtex->surface.surf_alignment, rtex->surface.u.legacy.bankw, + rtex->surface.surf_size, 1 << rtex->surface.surf_alignment_log2, rtex->surface.u.legacy.bankw, rtex->surface.u.legacy.bankh, rtex->surface.u.legacy.num_banks, rtex->surface.u.legacy.mtilea, rtex->surface.u.legacy.tile_split, rtex->surface.u.legacy.pipe_config, (rtex->surface.flags & RADEON_SURF_SCANOUT) != 0); @@ -863,14 +861,14 @@ void r600_print_texture_info(struct r600_common_screen *rscreen, if (rtex->htile_offset) u_log_printf(log, " HTile: offset=%"PRIu64", size=%u " "alignment=%u\n", - rtex->htile_offset, rtex->surface.htile_size, - rtex->surface.htile_alignment); + rtex->htile_offset, rtex->surface.meta_size, + 1 << rtex->surface.meta_alignment_log2); for (i = 0; i <= rtex->resource.b.b.last_level; i++) u_log_printf(log, " Level[%i]: offset=%"PRIu64", slice_size=%"PRIu64", " "npix_x=%u, npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, " "mode=%u, tiling_index = %u\n", - i, rtex->surface.u.legacy.level[i].offset, + i, (uint64_t)rtex->surface.u.legacy.level[i].offset_256B * 256, (uint64_t)rtex->surface.u.legacy.level[i].slice_size_dw * 4, u_minify(rtex->resource.b.b.width0, i), u_minify(rtex->resource.b.b.height0, i), @@ -888,15 +886,15 @@ void r600_print_texture_info(struct r600_common_screen *rscreen, "slice_size=%"PRIu64", npix_x=%u, " "npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, " "mode=%u, tiling_index = %u\n", - i, rtex->surface.u.legacy.stencil_level[i].offset, - (uint64_t)rtex->surface.u.legacy.stencil_level[i].slice_size_dw * 4, + i, (uint64_t)rtex->surface.u.legacy.zs.stencil_level[i].offset_256B * 256, + (uint64_t)rtex->surface.u.legacy.zs.stencil_level[i].slice_size_dw * 4, u_minify(rtex->resource.b.b.width0, i), u_minify(rtex->resource.b.b.height0, i), u_minify(rtex->resource.b.b.depth0, i), - rtex->surface.u.legacy.stencil_level[i].nblk_x, - rtex->surface.u.legacy.stencil_level[i].nblk_y, - rtex->surface.u.legacy.stencil_level[i].mode, - rtex->surface.u.legacy.stencil_tiling_index[i]); + rtex->surface.u.legacy.zs.stencil_level[i].nblk_x, + rtex->surface.u.legacy.zs.stencil_level[i].nblk_y, + rtex->surface.u.legacy.zs.stencil_level[i].mode, + rtex->surface.u.legacy.zs.stencil_tiling_index[i]); } } } @@ -918,7 +916,6 @@ r600_texture_create_object(struct pipe_screen *screen, resource = &rtex->resource; resource->b.b = *base; - resource->b.b.next = NULL; resource->b.vtbl = &r600_texture_vtbl; pipe_reference_init(&resource->b.b.reference, 1); resource->b.b.screen = screen; @@ -974,7 +971,7 @@ r600_texture_create_object(struct pipe_screen *screen, /* Now create the backing buffer. 
*/ if (!buf) { r600_init_resource_fields(rscreen, resource, rtex->size, - rtex->surface.surf_alignment); + 1 << rtex->surface.surf_alignment_log2); if (!r600_alloc_resource(rscreen, resource)) { FREE(rtex); @@ -984,7 +981,7 @@ r600_texture_create_object(struct pipe_screen *screen, resource->buf = buf; resource->gpu_address = rscreen->ws->buffer_get_virtual_address(resource->buf); resource->bo_size = buf->size; - resource->bo_alignment = buf->alignment; + resource->bo_alignment = 1 << buf->alignment_log2; resource->domains = rscreen->ws->buffer_get_initial_domain(resource->buf); if (resource->domains & RADEON_DOMAIN_VRAM) resource->vram_usage = buf->size; @@ -1003,7 +1000,7 @@ r600_texture_create_object(struct pipe_screen *screen, r600_screen_clear_buffer(rscreen, &rtex->resource.b.b, rtex->htile_offset, - rtex->surface.htile_size, + rtex->surface.meta_size, clear_value); } @@ -1135,7 +1132,7 @@ static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen if (!buf) return NULL; - rscreen->ws->buffer_get_metadata(buf, &metadata); + rscreen->ws->buffer_get_metadata(rscreen->ws, buf, &metadata, NULL); r600_surface_import_metadata(rscreen, &surface, &metadata, &array_mode, &is_scanout); @@ -1262,7 +1259,7 @@ static bool r600_can_invalidate_texture(struct r600_common_screen *rscreen, /* r600g doesn't react to dirty_tex_descriptor_counter */ return rscreen->chip_class >= GFX6 && !rtex->resource.b.is_shared && - !(transfer_usage & PIPE_TRANSFER_READ) && + !(transfer_usage & PIPE_MAP_READ) && rtex->resource.b.b.last_level == 0 && util_texrange_covers_whole_level(&rtex->resource.b.b, 0, box->x, box->y, box->z, @@ -1339,14 +1336,14 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx, */ if (!rtex->surface.is_linear) use_staging_texture = true; - else if (usage & PIPE_TRANSFER_READ) + else if (usage & PIPE_MAP_READ) use_staging_texture = rtex->resource.domains & RADEON_DOMAIN_VRAM || rtex->resource.flags & RADEON_FLAG_GTT_WC; /* Write & linear only: */ else if (r600_rings_is_buffer_referenced(rctx, rtex->resource.buf, RADEON_USAGE_READWRITE) || - !rctx->ws->buffer_wait(rtex->resource.buf, 0, + !rctx->ws->buffer_wait(rctx->ws, rtex->resource.buf, 0, RADEON_USAGE_READWRITE)) { /* It's busy. */ if (r600_can_invalidate_texture(rctx->screen, rtex, @@ -1389,7 +1386,7 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx, return NULL; } - if (usage & PIPE_TRANSFER_READ) { + if (usage & PIPE_MAP_READ) { struct pipe_resource *temp = ctx->screen->resource_create(ctx->screen, &resource); if (!temp) { R600_ERR("failed to create a temporary depth texture\n"); @@ -1435,7 +1432,7 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx, r600_init_temp_resource_from_box(&resource, texture, box, level, R600_RESOURCE_FLAG_TRANSFER); - resource.usage = (usage & PIPE_TRANSFER_READ) ? + resource.usage = (usage & PIPE_MAP_READ) ? PIPE_USAGE_STAGING : PIPE_USAGE_STREAM; /* Create the temporary texture. 
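Alignments shrink to log2 form in this release: pb_buffer carries alignment_log2 and radeon_surf carries surf_alignment_log2 / meta_alignment_log2, so consumers reconstitute the byte value with a shift and producers encode with util_logbase2(), as in the HTile sizing hunk earlier. The round trip, with helper names assumed for illustration:

   #include <cstdint>

   // Decode: alignments are powers of two, so a shift recovers the bytes.
   static inline uint64_t alignment_bytes(unsigned log2_align)
   {
      return (uint64_t)1 << log2_align;
   }

   // Encode, i.e. what util_logbase2() returns for a power of two.
   static inline unsigned alignment_log2(uint64_t bytes)
   {
      unsigned l = 0;
      while (((uint64_t)1 << (l + 1)) <= bytes)
         ++l;
      return l;
   }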
*/ @@ -1452,10 +1449,10 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx, &trans->b.b.stride, &trans->b.b.layer_stride); - if (usage & PIPE_TRANSFER_READ) + if (usage & PIPE_MAP_READ) r600_copy_to_staging_texture(ctx, trans); else - usage |= PIPE_TRANSFER_UNSYNCHRONIZED; + usage |= PIPE_MAP_UNSYNCHRONIZED; buf = trans->staging; } else { @@ -1484,7 +1481,7 @@ static void r600_texture_transfer_unmap(struct pipe_context *ctx, struct pipe_resource *texture = transfer->resource; struct r600_texture *rtex = (struct r600_texture*)texture; - if ((transfer->usage & PIPE_TRANSFER_WRITE) && rtransfer->staging) { + if ((transfer->usage & PIPE_MAP_WRITE) && rtransfer->staging) { if (rtex->is_depth && rtex->resource.b.b.nr_samples <= 1) { ctx->resource_copy_region(ctx, texture, transfer->level, transfer->box.x, transfer->box.y, transfer->box.z, @@ -1617,8 +1614,6 @@ static void r600_clear_texture(struct pipe_context *pipe, struct r600_texture *rtex = (struct r600_texture*)tex; struct pipe_surface tmpl = {{0}}; struct pipe_surface *sf; - const struct util_format_description *desc = - util_format_description(tex->format); tmpl.format = tex->format; tmpl.u.tex.first_layer = box->z; @@ -1635,11 +1630,11 @@ static void r600_clear_texture(struct pipe_context *pipe, /* Depth is always present. */ clear = PIPE_CLEAR_DEPTH; - desc->unpack_z_float(&depth, 0, data, 0, 1, 1); + util_format_unpack_z_float(tex->format, &depth, data, 1); if (rtex->surface.has_stencil) { clear |= PIPE_CLEAR_STENCIL; - desc->unpack_s_8uint(&stencil, 0, data, 0, 1, 1); + util_format_unpack_s_8uint(tex->format, &stencil, data, 1); } pipe->clear_depth_stencil(pipe, sf, clear, depth, stencil, @@ -1648,13 +1643,7 @@ static void r600_clear_texture(struct pipe_context *pipe, } else { union pipe_color_union color; - /* pipe_color_union requires the full vec4 representation. 
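The depth/stencil clear path above, and the color path that follows, stop going through util_format_description() and its per-format unpack hooks: the util_format_unpack_*() helpers take the format enum directly, and util_pack_color_union() chooses the uint/sint/float packing route that the old three-way branch selected by hand. The calls as they appear in the hunks, wrapped into a sketch (header paths as in the Mesa tree, one texel unpacked):

   #include "pipe/p_state.h"
   #include "util/format/u_format.h"

   static void unpack_clear_values(enum pipe_format format, const void *data)
   {
      float depth;
      uint8_t stencil;
      union pipe_color_union color;

      util_format_unpack_z_float(format, &depth, data, 1);
      util_format_unpack_s_8uint(format, &stencil, data, 1);
      util_format_unpack_rgba(format, color.ui, data, 1);  // int or float, by format class
   }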
*/ - if (util_format_is_pure_uint(tex->format)) - desc->unpack_rgba_uint(color.ui, 0, data, 0, 1, 1); - else if (util_format_is_pure_sint(tex->format)) - desc->unpack_rgba_sint(color.i, 0, data, 0, 1, 1); - else - desc->unpack_rgba_float(color.f, 0, data, 0, 1, 1); + util_format_unpack_rgba(tex->format, color.ui, data, 1); if (screen->is_format_supported(screen, tex->format, tex->target, 0, 0, @@ -1751,12 +1740,8 @@ static void evergreen_set_clear_color(struct r600_texture *rtex, color->ui[0] == color->ui[2]); uc.ui[0] = color->ui[0]; uc.ui[1] = color->ui[3]; - } else if (util_format_is_pure_uint(surface_format)) { - util_format_write_4ui(surface_format, color->ui, 0, &uc, 0, 0, 0, 1, 1); - } else if (util_format_is_pure_sint(surface_format)) { - util_format_write_4i(surface_format, color->i, 0, &uc, 0, 0, 0, 1, 1); } else { - util_pack_color(color->f, surface_format, &uc); + util_pack_color_union(surface_format, &uc, color); } memcpy(rtex->color_clear_value, &uc, 2 * sizeof(uint32_t)); @@ -1914,7 +1899,7 @@ r600_texture_from_memobj(struct pipe_screen *screen, struct pb_buffer *buf = NULL; if (memobj->b.dedicated) { - rscreen->ws->buffer_get_metadata(memobj->buf, &metadata); + rscreen->ws->buffer_get_metadata(rscreen->ws, memobj->buf, &metadata, NULL); r600_surface_import_metadata(rscreen, &surface, &metadata, &array_mode, &is_scanout); } else { diff --git a/lib/mesa/src/gallium/drivers/r600/r600_viewport.c b/lib/mesa/src/gallium/drivers/r600/r600_viewport.c index 7a5bf8f39..a8ed01a0c 100644 --- a/lib/mesa/src/gallium/drivers/r600/r600_viewport.c +++ b/lib/mesa/src/gallium/drivers/r600/r600_viewport.c @@ -185,7 +185,7 @@ static void r600_emit_one_scissor(struct r600_common_context *rctx, static void r600_emit_guardband(struct r600_common_context *rctx, struct r600_signed_scissor *vp_as_scissor) { - struct radeon_cmdbuf *cs = rctx->gfx.cs; + struct radeon_cmdbuf *cs = &rctx->gfx.cs; struct pipe_viewport_state vp; float left, top, right, bottom, max_range, guardband_x, guardband_y; @@ -235,7 +235,7 @@ static void r600_emit_guardband(struct r600_common_context *rctx, static void r600_emit_scissors(struct r600_common_context *rctx, struct r600_atom *atom) { - struct radeon_cmdbuf *cs = rctx->gfx.cs; + struct radeon_cmdbuf *cs = &rctx->gfx.cs; struct pipe_scissor_state *states = rctx->scissors.states; unsigned mask = rctx->scissors.dirty_mask; bool scissor_enabled = rctx->scissor_enabled; @@ -306,7 +306,7 @@ static void r600_set_viewport_states(struct pipe_context *ctx, static void r600_emit_one_viewport(struct r600_common_context *rctx, struct pipe_viewport_state *state) { - struct radeon_cmdbuf *cs = rctx->gfx.cs; + struct radeon_cmdbuf *cs = &rctx->gfx.cs; radeon_emit(cs, fui(state->scale[0])); radeon_emit(cs, fui(state->translate[0])); @@ -318,7 +318,7 @@ static void r600_emit_one_viewport(struct r600_common_context *rctx, static void r600_emit_viewports(struct r600_common_context *rctx) { - struct radeon_cmdbuf *cs = rctx->gfx.cs; + struct radeon_cmdbuf *cs = &rctx->gfx.cs; struct pipe_viewport_state *states = rctx->viewports.states; unsigned mask = rctx->viewports.dirty_mask; @@ -348,7 +348,7 @@ static void r600_emit_viewports(struct r600_common_context *rctx) static void r600_emit_depth_ranges(struct r600_common_context *rctx) { - struct radeon_cmdbuf *cs = rctx->gfx.cs; + struct radeon_cmdbuf *cs = &rctx->gfx.cs; struct pipe_viewport_state *states = rctx->viewports.states; unsigned mask = rctx->viewports.depth_range_dirty_mask; float zmin, zmax; diff --git 
a/lib/mesa/src/gallium/drivers/r600/radeon_uvd.c b/lib/mesa/src/gallium/drivers/r600/radeon_uvd.c index e7107c0b4..e4766a72f 100644 --- a/lib/mesa/src/gallium/drivers/r600/radeon_uvd.c +++ b/lib/mesa/src/gallium/drivers/r600/radeon_uvd.c @@ -73,7 +73,7 @@ struct ruvd_decoder { struct pipe_screen *screen; struct radeon_winsys* ws; - struct radeon_cmdbuf* cs; + struct radeon_cmdbuf cs; unsigned cur_buffer; @@ -102,14 +102,14 @@ struct ruvd_decoder { /* flush IB to the hardware */ static int flush(struct ruvd_decoder *dec, unsigned flags) { - return dec->ws->cs_flush(dec->cs, flags, NULL); + return dec->ws->cs_flush(&dec->cs, flags, NULL); } /* add a new set register command to the IB */ static void set_reg(struct ruvd_decoder *dec, unsigned reg, uint32_t val) { - radeon_emit(dec->cs, RUVD_PKT0(reg >> 2, 0)); - radeon_emit(dec->cs, val); + radeon_emit(&dec->cs, RUVD_PKT0(reg >> 2, 0)); + radeon_emit(&dec->cs, val); } /* send a command to the VCPU through the GPCOM registers */ @@ -119,7 +119,7 @@ static void send_cmd(struct ruvd_decoder *dec, unsigned cmd, { int reloc_idx; - reloc_idx = dec->ws->cs_add_buffer(dec->cs, buf, usage | RADEON_USAGE_SYNCHRONIZED, + reloc_idx = dec->ws->cs_add_buffer(&dec->cs, buf, usage | RADEON_USAGE_SYNCHRONIZED, domain, 0); if (!dec->use_legacy) { uint64_t addr; @@ -152,8 +152,8 @@ static void map_msg_fb_it_buf(struct ruvd_decoder *dec) buf = &dec->msg_fb_it_buffers[dec->cur_buffer]; /* and map it for CPU access */ - ptr = dec->ws->buffer_map(buf->res->buf, dec->cs, - PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY); + ptr = dec->ws->buffer_map(dec->ws, buf->res->buf, &dec->cs, + PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY); /* calc buffer offsets */ dec->msg = (struct ruvd_msg *)ptr; @@ -177,7 +177,8 @@ static void send_msg_buf(struct ruvd_decoder *dec) buf = &dec->msg_fb_it_buffers[dec->cur_buffer]; /* unmap the buffer */ - dec->ws->buffer_unmap(buf->res->buf); + dec->ws->buffer_unmap(dec->ws, buf->res->buf); + dec->bs_ptr = NULL; dec->msg = NULL; dec->fb = NULL; dec->it = NULL; @@ -225,55 +226,6 @@ static uint32_t profile2stream_type(struct ruvd_decoder *dec, unsigned family) } } -static unsigned calc_ctx_size_h265_main(struct ruvd_decoder *dec) -{ - unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH); - unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT); - - unsigned max_references = dec->base.max_references + 1; - - if (dec->base.width * dec->base.height >= 4096*2000) - max_references = MAX2(max_references, 8); - else - max_references = MAX2(max_references, 17); - - width = align (width, 16); - height = align (height, 16); - return ((width + 255) / 16) * ((height + 255) / 16) * 16 * max_references + 52 * 1024; -} - -static unsigned calc_ctx_size_h265_main10(struct ruvd_decoder *dec, struct pipe_h265_picture_desc *pic) -{ - unsigned log2_ctb_size, width_in_ctb, height_in_ctb, num_16x16_block_per_ctb; - unsigned context_buffer_size_per_ctb_row, cm_buffer_size, max_mb_address, db_left_tile_pxl_size; - unsigned db_left_tile_ctx_size = 4096 / 16 * (32 + 16 * 4); - - unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH); - unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT); - unsigned coeff_10bit = (pic->pps->sps->bit_depth_luma_minus8 || pic->pps->sps->bit_depth_chroma_minus8) ? 
2 : 1; - - unsigned max_references = dec->base.max_references + 1; - - if (dec->base.width * dec->base.height >= 4096*2000) - max_references = MAX2(max_references, 8); - else - max_references = MAX2(max_references, 17); - - log2_ctb_size = pic->pps->sps->log2_min_luma_coding_block_size_minus3 + 3 + - pic->pps->sps->log2_diff_max_min_luma_coding_block_size; - - width_in_ctb = (width + ((1 << log2_ctb_size) - 1)) >> log2_ctb_size; - height_in_ctb = (height + ((1 << log2_ctb_size) - 1)) >> log2_ctb_size; - - num_16x16_block_per_ctb = ((1 << log2_ctb_size) >> 4) * ((1 << log2_ctb_size) >> 4); - context_buffer_size_per_ctb_row = align(width_in_ctb * num_16x16_block_per_ctb * 16, 256); - max_mb_address = (unsigned) ceil(height * 8 / 2048.0); - - cm_buffer_size = max_references * context_buffer_size_per_ctb_row * height_in_ctb; - db_left_tile_pxl_size = coeff_10bit * (max_mb_address * 2 * 2048 + 1024); - - return cm_buffer_size + db_left_tile_ctx_size + db_left_tile_pxl_size; -} static unsigned get_db_pitch_alignment(struct ruvd_decoder *dec) { @@ -514,156 +466,6 @@ static struct ruvd_h264 get_h264_msg(struct ruvd_decoder *dec, struct pipe_h264_ return result; } -/* get h265 specific message bits */ -static struct ruvd_h265 get_h265_msg(struct ruvd_decoder *dec, struct pipe_video_buffer *target, - struct pipe_h265_picture_desc *pic) -{ - struct ruvd_h265 result; - unsigned i; - - memset(&result, 0, sizeof(result)); - - result.sps_info_flags = 0; - result.sps_info_flags |= pic->pps->sps->scaling_list_enabled_flag << 0; - result.sps_info_flags |= pic->pps->sps->amp_enabled_flag << 1; - result.sps_info_flags |= pic->pps->sps->sample_adaptive_offset_enabled_flag << 2; - result.sps_info_flags |= pic->pps->sps->pcm_enabled_flag << 3; - result.sps_info_flags |= pic->pps->sps->pcm_loop_filter_disabled_flag << 4; - result.sps_info_flags |= pic->pps->sps->long_term_ref_pics_present_flag << 5; - result.sps_info_flags |= pic->pps->sps->sps_temporal_mvp_enabled_flag << 6; - result.sps_info_flags |= pic->pps->sps->strong_intra_smoothing_enabled_flag << 7; - result.sps_info_flags |= pic->pps->sps->separate_colour_plane_flag << 8; - if (pic->UseRefPicList == true) - result.sps_info_flags |= 1 << 10; - - result.chroma_format = pic->pps->sps->chroma_format_idc; - result.bit_depth_luma_minus8 = pic->pps->sps->bit_depth_luma_minus8; - result.bit_depth_chroma_minus8 = pic->pps->sps->bit_depth_chroma_minus8; - result.log2_max_pic_order_cnt_lsb_minus4 = pic->pps->sps->log2_max_pic_order_cnt_lsb_minus4; - result.sps_max_dec_pic_buffering_minus1 = pic->pps->sps->sps_max_dec_pic_buffering_minus1; - result.log2_min_luma_coding_block_size_minus3 = pic->pps->sps->log2_min_luma_coding_block_size_minus3; - result.log2_diff_max_min_luma_coding_block_size = pic->pps->sps->log2_diff_max_min_luma_coding_block_size; - result.log2_min_transform_block_size_minus2 = pic->pps->sps->log2_min_transform_block_size_minus2; - result.log2_diff_max_min_transform_block_size = pic->pps->sps->log2_diff_max_min_transform_block_size; - result.max_transform_hierarchy_depth_inter = pic->pps->sps->max_transform_hierarchy_depth_inter; - result.max_transform_hierarchy_depth_intra = pic->pps->sps->max_transform_hierarchy_depth_intra; - result.pcm_sample_bit_depth_luma_minus1 = pic->pps->sps->pcm_sample_bit_depth_luma_minus1; - result.pcm_sample_bit_depth_chroma_minus1 = pic->pps->sps->pcm_sample_bit_depth_chroma_minus1; - result.log2_min_pcm_luma_coding_block_size_minus3 = pic->pps->sps->log2_min_pcm_luma_coding_block_size_minus3; - 
result.log2_diff_max_min_pcm_luma_coding_block_size = pic->pps->sps->log2_diff_max_min_pcm_luma_coding_block_size; - result.num_short_term_ref_pic_sets = pic->pps->sps->num_short_term_ref_pic_sets; - - result.pps_info_flags = 0; - result.pps_info_flags |= pic->pps->dependent_slice_segments_enabled_flag << 0; - result.pps_info_flags |= pic->pps->output_flag_present_flag << 1; - result.pps_info_flags |= pic->pps->sign_data_hiding_enabled_flag << 2; - result.pps_info_flags |= pic->pps->cabac_init_present_flag << 3; - result.pps_info_flags |= pic->pps->constrained_intra_pred_flag << 4; - result.pps_info_flags |= pic->pps->transform_skip_enabled_flag << 5; - result.pps_info_flags |= pic->pps->cu_qp_delta_enabled_flag << 6; - result.pps_info_flags |= pic->pps->pps_slice_chroma_qp_offsets_present_flag << 7; - result.pps_info_flags |= pic->pps->weighted_pred_flag << 8; - result.pps_info_flags |= pic->pps->weighted_bipred_flag << 9; - result.pps_info_flags |= pic->pps->transquant_bypass_enabled_flag << 10; - result.pps_info_flags |= pic->pps->tiles_enabled_flag << 11; - result.pps_info_flags |= pic->pps->entropy_coding_sync_enabled_flag << 12; - result.pps_info_flags |= pic->pps->uniform_spacing_flag << 13; - result.pps_info_flags |= pic->pps->loop_filter_across_tiles_enabled_flag << 14; - result.pps_info_flags |= pic->pps->pps_loop_filter_across_slices_enabled_flag << 15; - result.pps_info_flags |= pic->pps->deblocking_filter_override_enabled_flag << 16; - result.pps_info_flags |= pic->pps->pps_deblocking_filter_disabled_flag << 17; - result.pps_info_flags |= pic->pps->lists_modification_present_flag << 18; - result.pps_info_flags |= pic->pps->slice_segment_header_extension_present_flag << 19; - //result.pps_info_flags |= pic->pps->deblocking_filter_control_present_flag; ??? 
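The HEVC message packing removed here ORs each one-bit bitstream flag into a fixed position of the firmware message word. The idiom in isolation, with illustrative field names; bit positions mirror the first three flags of the deleted code, not a spec table:

   #include <cstdint>

   // Pack boolean codec flags into the positions the firmware expects.
   static uint32_t pack_sps_flags(bool scaling_list, bool amp, bool sao)
   {
      uint32_t flags = 0;
      flags |= (uint32_t)scaling_list << 0;
      flags |= (uint32_t)amp << 1;
      flags |= (uint32_t)sao << 2;
      return flags;
   }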
- - result.num_extra_slice_header_bits = pic->pps->num_extra_slice_header_bits; - result.num_long_term_ref_pic_sps = pic->pps->sps->num_long_term_ref_pics_sps; - result.num_ref_idx_l0_default_active_minus1 = pic->pps->num_ref_idx_l0_default_active_minus1; - result.num_ref_idx_l1_default_active_minus1 = pic->pps->num_ref_idx_l1_default_active_minus1; - result.pps_cb_qp_offset = pic->pps->pps_cb_qp_offset; - result.pps_cr_qp_offset = pic->pps->pps_cr_qp_offset; - result.pps_beta_offset_div2 = pic->pps->pps_beta_offset_div2; - result.pps_tc_offset_div2 = pic->pps->pps_tc_offset_div2; - result.diff_cu_qp_delta_depth = pic->pps->diff_cu_qp_delta_depth; - result.num_tile_columns_minus1 = pic->pps->num_tile_columns_minus1; - result.num_tile_rows_minus1 = pic->pps->num_tile_rows_minus1; - result.log2_parallel_merge_level_minus2 = pic->pps->log2_parallel_merge_level_minus2; - result.init_qp_minus26 = pic->pps->init_qp_minus26; - - for (i = 0; i < 19; ++i) - result.column_width_minus1[i] = pic->pps->column_width_minus1[i]; - - for (i = 0; i < 21; ++i) - result.row_height_minus1[i] = pic->pps->row_height_minus1[i]; - - result.num_delta_pocs_ref_rps_idx = pic->NumDeltaPocsOfRefRpsIdx; - result.curr_idx = pic->CurrPicOrderCntVal; - result.curr_poc = pic->CurrPicOrderCntVal; - - vl_video_buffer_set_associated_data(target, &dec->base, - (void *)(uintptr_t)pic->CurrPicOrderCntVal, - &ruvd_destroy_associated_data); - - for (i = 0; i < 16; ++i) { - struct pipe_video_buffer *ref = pic->ref[i]; - uintptr_t ref_pic = 0; - - result.poc_list[i] = pic->PicOrderCntVal[i]; - - if (ref) - ref_pic = (uintptr_t)vl_video_buffer_get_associated_data(ref, &dec->base); - else - ref_pic = 0x7F; - result.ref_pic_list[i] = ref_pic; - } - - for (i = 0; i < 8; ++i) { - result.ref_pic_set_st_curr_before[i] = 0xFF; - result.ref_pic_set_st_curr_after[i] = 0xFF; - result.ref_pic_set_lt_curr[i] = 0xFF; - } - - for (i = 0; i < pic->NumPocStCurrBefore; ++i) - result.ref_pic_set_st_curr_before[i] = pic->RefPicSetStCurrBefore[i]; - - for (i = 0; i < pic->NumPocStCurrAfter; ++i) - result.ref_pic_set_st_curr_after[i] = pic->RefPicSetStCurrAfter[i]; - - for (i = 0; i < pic->NumPocLtCurr; ++i) - result.ref_pic_set_lt_curr[i] = pic->RefPicSetLtCurr[i]; - - for (i = 0; i < 6; ++i) - result.ucScalingListDCCoefSizeID2[i] = pic->pps->sps->ScalingListDCCoeff16x16[i]; - - for (i = 0; i < 2; ++i) - result.ucScalingListDCCoefSizeID3[i] = pic->pps->sps->ScalingListDCCoeff32x32[i]; - - memcpy(dec->it, pic->pps->sps->ScalingList4x4, 6 * 16); - memcpy(dec->it + 96, pic->pps->sps->ScalingList8x8, 6 * 64); - memcpy(dec->it + 480, pic->pps->sps->ScalingList16x16, 6 * 64); - memcpy(dec->it + 864, pic->pps->sps->ScalingList32x32, 2 * 64); - - for (i = 0 ; i < 2 ; i++) { - for (int j = 0 ; j < 15 ; j++) - result.direct_reflist[i][j] = pic->RefPicList[i][j]; - } - - /* TODO - result.highestTid; - result.isNonRef; - - IDRPicFlag; - RAPPicFlag; - NumPocTotalCurr; - NumShortTermPictureSliceHeaderBits; - NumLongTermPictureSliceHeaderBits; - - IsLongTerm[16]; - */ - - return result; -} - /* get vc1 specific message bits */ static struct ruvd_vc1 get_vc1_msg(struct pipe_vc1_picture_desc *pic) { @@ -1007,7 +809,7 @@ static void ruvd_destroy(struct pipe_video_codec *decoder) flush(dec, 0); - dec->ws->cs_destroy(dec->cs); + dec->ws->cs_destroy(&dec->cs); for (i = 0; i < NUM_BUFFERS; ++i) { rvid_destroy_buffer(&dec->msg_fb_it_buffers[i]); @@ -1038,9 +840,9 @@ static void ruvd_begin_frame(struct pipe_video_codec *decoder, &ruvd_destroy_associated_data); dec->bs_size = 0; 
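Three interface changes run through all of the UVD and VCE hunks: the command stream is embedded in the decoder by value rather than owned through a winsys pointer, cs_create() now fills a caller-provided struct and returns success, and buffer_map()/buffer_unmap() take the winsys as their first argument, with the PIPE_TRANSFER_*/RADEON_TRANSFER_TEMPORARY flags renamed to PIPE_MAP_*/RADEON_MAP_TEMPORARY. A condensed sketch of the updated idiom, fields abbreviated and error handling trimmed:

   // Assuming the 21.x radeon_winsys vtable used in these hunks.
   static bool map_bitstream(struct radeon_winsys *ws, struct ruvd_decoder *dec,
                             struct pb_buffer *buf)
   {
      if (!ws->cs_create(&dec->cs, dec->ctx, RING_UVD, NULL, NULL, false))
         return false;                    // creation now reports failure directly

      dec->bs_ptr = ws->buffer_map(ws, buf, &dec->cs,
                                   PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY);
      if (!dec->bs_ptr)
         return false;

      // ... write the bitstream ...

      ws->buffer_unmap(ws, buf);
      dec->bs_ptr = NULL;                 // the import clears cached pointers on unmap
      return true;
   }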
- dec->bs_ptr = dec->ws->buffer_map( + dec->bs_ptr = dec->ws->buffer_map(dec->ws, dec->bs_buffers[dec->cur_buffer].res->buf, - dec->cs, PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY); + &dec->cs, PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY); } /** @@ -1086,15 +888,16 @@ static void ruvd_decode_bitstream(struct pipe_video_codec *decoder, new_size += 2; /* save for EOI */ if (new_size > buf->res->buf->size) { - dec->ws->buffer_unmap(buf->res->buf); - if (!rvid_resize_buffer(dec->screen, dec->cs, buf, new_size)) { + dec->ws->buffer_unmap(dec->ws, buf->res->buf); + dec->bs_ptr = NULL; + if (!rvid_resize_buffer(dec->screen, &dec->cs, buf, new_size)) { RVID_ERR("Can't resize bitstream buffer!"); return; } - dec->bs_ptr = dec->ws->buffer_map(buf->res->buf, dec->cs, - PIPE_TRANSFER_WRITE | - RADEON_TRANSFER_TEMPORARY); + dec->bs_ptr = dec->ws->buffer_map(dec->ws, buf->res->buf, &dec->cs, + PIPE_MAP_WRITE | + RADEON_MAP_TEMPORARY); if (!dec->bs_ptr) return; @@ -1136,7 +939,8 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder, bs_size = align(dec->bs_size, 128); memset(dec->bs_ptr, 0, bs_size - dec->bs_size); - dec->ws->buffer_unmap(bs_buf->res->buf); + dec->ws->buffer_unmap(dec->ws, bs_buf->res->buf); + dec->bs_ptr = NULL; map_msg_fb_it_buf(dec); dec->msg->size = sizeof(*dec->msg); @@ -1240,14 +1044,14 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, struct ruvd_decoder *dec; int r, i; - ws->query_info(ws, &info); + ws->query_info(ws, &info, false, false); switch(u_reduce_video_profile(templ->profile)) { case PIPE_VIDEO_FORMAT_MPEG12: if (templ->entrypoint > PIPE_VIDEO_ENTRYPOINT_BITSTREAM || info.family < CHIP_PALM) return vl_create_mpeg12_decoder(context, templ); - /* fall through */ + FALLTHROUGH; case PIPE_VIDEO_FORMAT_MPEG4: width = align(width, VL_MACROBLOCK_WIDTH); height = align(height, VL_MACROBLOCK_HEIGHT); @@ -1286,8 +1090,8 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, dec->stream_handle = rvid_alloc_stream_handle(); dec->screen = context->screen; dec->ws = ws; - dec->cs = ws->cs_create(rctx->ctx, RING_UVD, NULL, NULL, false); - if (!dec->cs) { + + if (!ws->cs_create(&dec->cs, rctx->ctx, RING_UVD, NULL, NULL, false)) { RVID_ERR("Can't get command submission context.\n"); goto error; } @@ -1347,7 +1151,7 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, return &dec->base; error: - if (dec->cs) dec->ws->cs_destroy(dec->cs); + dec->ws->cs_destroy(&dec->cs); for (i = 0; i < NUM_BUFFERS; ++i) { rvid_destroy_buffer(&dec->msg_fb_it_buffers[i]); @@ -1366,7 +1170,7 @@ error: /* calculate top/bottom offset */ static unsigned texture_offset(struct radeon_surf *surface, unsigned layer) { - return surface->u.legacy.level[0].offset + + return (uint64_t)surface->u.legacy.level[0].offset_256B * 256 + layer * (uint64_t)surface->u.legacy.level[0].slice_size_dw * 4; } diff --git a/lib/mesa/src/gallium/drivers/r600/radeon_vce.c b/lib/mesa/src/gallium/drivers/r600/radeon_vce.c index 16f48c694..1cf8522ba 100644 --- a/lib/mesa/src/gallium/drivers/r600/radeon_vce.c +++ b/lib/mesa/src/gallium/drivers/r600/radeon_vce.c @@ -63,7 +63,7 @@ static void (*get_pic_param)(struct rvce_encoder *enc, */ static void flush(struct rvce_encoder *enc) { - enc->ws->cs_flush(enc->cs, PIPE_FLUSH_ASYNC, NULL); + enc->ws->cs_flush(&enc->cs, PIPE_FLUSH_ASYNC, NULL); enc->task_info_idx = 0; enc->bs_idx = 0; } @@ -71,7 +71,7 @@ static void flush(struct rvce_encoder *enc) #if 0 static void dump_feedback(struct rvce_encoder *enc, struct 
rvid_buffer *fb) { - uint32_t *ptr = enc->ws->buffer_map(fb->res->buf, enc->cs, PIPE_TRANSFER_READ_WRITE); + uint32_t *ptr = enc->ws->buffer_map(fb->res->buf, &enc->cs, PIPE_MAP_READ_WRITE); unsigned i = 0; fprintf(stderr, "\n"); fprintf(stderr, "encStatus:\t\t\t%08x\n", ptr[i++]); @@ -105,7 +105,7 @@ static void reset_cpb(struct rvce_encoder *enc) for (i = 0; i < enc->cpb_num; ++i) { struct rvce_cpb_slot *slot = &enc->cpb_array[i]; slot->index = i; - slot->picture_type = PIPE_H264_ENC_PICTURE_TYPE_SKIP; + slot->picture_type = PIPE_H2645_ENC_PICTURE_TYPE_SKIP; slot->frame_num = 0; slot->pic_order_cnt = 0; list_addtail(&slot->list, &enc->cpb_slots); @@ -126,10 +126,10 @@ static void sort_cpb(struct rvce_encoder *enc) if (i->frame_num == enc->pic.ref_idx_l1) l1 = i; - if (enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_P && l0) + if (enc->pic.picture_type == PIPE_H2645_ENC_PICTURE_TYPE_P && l0) break; - if (enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_B && + if (enc->pic.picture_type == PIPE_H2645_ENC_PICTURE_TYPE_B && l0 && l1) break; } @@ -256,7 +256,7 @@ static void rvce_destroy(struct pipe_video_codec *encoder) rvid_destroy_buffer(&fb); } rvid_destroy_buffer(&enc->cpb); - enc->ws->cs_destroy(enc->cs); + enc->ws->cs_destroy(&enc->cs); FREE(enc->cpb_array); FREE(enc); } @@ -281,10 +281,10 @@ static void rvce_begin_frame(struct pipe_video_codec *encoder, enc->get_buffer(vid_buf->resources[0], &enc->handle, &enc->luma); enc->get_buffer(vid_buf->resources[1], NULL, &enc->chroma); - if (pic->picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR) + if (pic->picture_type == PIPE_H2645_ENC_PICTURE_TYPE_IDR) reset_cpb(enc); - else if (pic->picture_type == PIPE_H264_ENC_PICTURE_TYPE_P || - pic->picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) + else if (pic->picture_type == PIPE_H2645_ENC_PICTURE_TYPE_P || + pic->picture_type == PIPE_H2645_ENC_PICTURE_TYPE_B) sort_cpb(enc); if (!enc->stream_handle) { @@ -323,7 +323,7 @@ static void rvce_encode_bitstream(struct pipe_video_codec *encoder, RVID_ERR("Can't create feedback buffer.\n"); return; } - if (!radeon_emitted(enc->cs, 0)) + if (!radeon_emitted(&enc->cs, 0)) enc->session(enc); enc->encode(enc); enc->feedback(enc); @@ -357,9 +357,9 @@ static void rvce_get_feedback(struct pipe_video_codec *encoder, struct rvid_buffer *fb = feedback; if (size) { - uint32_t *ptr = enc->ws->buffer_map( - fb->res->buf, enc->cs, - PIPE_TRANSFER_READ_WRITE | RADEON_TRANSFER_TEMPORARY); + uint32_t *ptr = enc->ws->buffer_map(enc->ws, + fb->res->buf, &enc->cs, + PIPE_MAP_READ_WRITE | RADEON_MAP_TEMPORARY); if (ptr[1]) { *size = ptr[4] - ptr[9]; @@ -367,7 +367,7 @@ static void rvce_get_feedback(struct pipe_video_codec *encoder, *size = 0; } - enc->ws->buffer_unmap(fb->res->buf); + enc->ws->buffer_unmap(enc->ws, fb->res->buf); } //dump_feedback(enc, fb); rvid_destroy_buffer(fb); @@ -431,14 +431,13 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, enc->screen = context->screen; enc->ws = ws; - enc->cs = ws->cs_create(rctx->ctx, RING_VCE, rvce_cs_flush, enc, false); - if (!enc->cs) { + + if (!ws->cs_create(&enc->cs, rctx->ctx, RING_VCE, rvce_cs_flush, enc, false)) { RVID_ERR("Can't get command submission context.\n"); goto error; } templat.buffer_format = PIPE_FORMAT_NV12; - templat.chroma_format = PIPE_VIDEO_CHROMA_FORMAT_420; templat.width = enc->base.width; templat.height = enc->base.height; templat.interlaced = false; @@ -478,8 +477,7 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, return &enc->base; error: - if 
(enc->cs) - enc->ws->cs_destroy(enc->cs); + enc->ws->cs_destroy(&enc->cs); rvid_destroy_buffer(&enc->cpb); @@ -520,7 +518,7 @@ void rvce_add_buffer(struct rvce_encoder *enc, struct pb_buffer *buf, { int reloc_idx; - reloc_idx = enc->ws->cs_add_buffer(enc->cs, buf, usage | RADEON_USAGE_SYNCHRONIZED, + reloc_idx = enc->ws->cs_add_buffer(&enc->cs, buf, usage | RADEON_USAGE_SYNCHRONIZED, domain, 0); if (enc->use_vm) { uint64_t addr; diff --git a/lib/mesa/src/gallium/drivers/r600/radeon_vce.h b/lib/mesa/src/gallium/drivers/r600/radeon_vce.h index c5e054777..a437336bc 100644 --- a/lib/mesa/src/gallium/drivers/r600/radeon_vce.h +++ b/lib/mesa/src/gallium/drivers/r600/radeon_vce.h @@ -36,14 +36,14 @@ #include "util/list.h" -#define RVCE_CS(value) (enc->cs->current.buf[enc->cs->current.cdw++] = (value)) +#define RVCE_CS(value) (enc->cs.current.buf[enc->cs.current.cdw++] = (value)) #define RVCE_BEGIN(cmd) { \ - uint32_t *begin = &enc->cs->current.buf[enc->cs->current.cdw++]; \ + uint32_t *begin = &enc->cs.current.buf[enc->cs.current.cdw++]; \ RVCE_CS(cmd) #define RVCE_READ(buf, domain, off) rvce_add_buffer(enc, (buf), RADEON_USAGE_READ, (domain), (off)) #define RVCE_WRITE(buf, domain, off) rvce_add_buffer(enc, (buf), RADEON_USAGE_WRITE, (domain), (off)) #define RVCE_READWRITE(buf, domain, off) rvce_add_buffer(enc, (buf), RADEON_USAGE_READWRITE, (domain), (off)) -#define RVCE_END() *begin = (&enc->cs->current.buf[enc->cs->current.cdw] - begin) * 4; } +#define RVCE_END() *begin = (&enc->cs.current.buf[enc->cs.current.cdw] - begin) * 4; } #define RVCE_MAX_BITSTREAM_OUTPUT_ROW_SIZE (4096 * 16 * 2.5) #define RVCE_MAX_AUX_BUFFER_NUM 4 @@ -60,7 +60,7 @@ struct rvce_cpb_slot { struct list_head list; unsigned index; - enum pipe_h264_enc_picture_type picture_type; + enum pipe_h2645_enc_picture_type picture_type; unsigned frame_num; unsigned pic_order_cnt; }; @@ -340,7 +340,7 @@ struct rvce_h264_enc_pic { unsigned quant_p_frames; unsigned quant_b_frames; - enum pipe_h264_enc_picture_type picture_type; + enum pipe_h2645_enc_picture_type picture_type; unsigned frame_num; unsigned frame_num_cnt; unsigned p_remain; @@ -387,7 +387,7 @@ struct rvce_encoder { struct pipe_screen *screen; struct radeon_winsys* ws; - struct radeon_cmdbuf* cs; + struct radeon_cmdbuf cs; rvce_get_buffer get_buffer; diff --git a/lib/mesa/src/gallium/drivers/r600/radeon_video.c b/lib/mesa/src/gallium/drivers/r600/radeon_video.c index 81c1a5e51..6ada9ba18 100644 --- a/lib/mesa/src/gallium/drivers/r600/radeon_video.c +++ b/lib/mesa/src/gallium/drivers/r600/radeon_video.c @@ -97,13 +97,13 @@ bool rvid_resize_buffer(struct pipe_screen *screen, struct radeon_cmdbuf *cs, if (!rvid_create_buffer(screen, new_buf, new_size, new_buf->usage)) goto error; - src = ws->buffer_map(old_buf.res->buf, cs, - PIPE_TRANSFER_READ | RADEON_TRANSFER_TEMPORARY); + src = ws->buffer_map(ws, old_buf.res->buf, cs, + PIPE_MAP_READ | RADEON_MAP_TEMPORARY); if (!src) goto error; - dst = ws->buffer_map(new_buf->res->buf, cs, - PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY); + dst = ws->buffer_map(ws, new_buf->res->buf, cs, + PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY); if (!dst) goto error; @@ -113,14 +113,14 @@ bool rvid_resize_buffer(struct pipe_screen *screen, struct radeon_cmdbuf *cs, dst += bytes; memset(dst, 0, new_size); } - ws->buffer_unmap(new_buf->res->buf); - ws->buffer_unmap(old_buf.res->buf); + ws->buffer_unmap(ws, new_buf->res->buf); + ws->buffer_unmap(ws, old_buf.res->buf); rvid_destroy_buffer(&old_buf); return true; error: if (src) - 
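The RVCE_BEGIN/RVCE_END pair above implements VCE's length-prefixed packets: the first dword of each packet is its total size in bytes, which is only known once the body has been emitted, so BEGIN reserves the slot and END back-patches it. The same logic over a plain array, with a placeholder command id:

   #include <cstdint>

   static void emit_packet(void)
   {
      uint32_t buf[64];
      unsigned cdw = 0;

      uint32_t *begin = &buf[cdw++];     // RVCE_BEGIN: reserve the size dword
      buf[cdw++] = 0x00000001;           // the cmd argument (placeholder id)
      buf[cdw++] = 0;                    // body dwords, normally emitted via RVCE_CS()
      *begin = (uint32_t)((&buf[cdw] - begin) * 4);  // RVCE_END: byte length, size dword included
   }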
ws->buffer_unmap(old_buf.res->buf); + ws->buffer_unmap(ws, old_buf.res->buf); rvid_destroy_buffer(new_buf); *new_buf = old_buf; return false; @@ -171,7 +171,7 @@ void rvid_join_surfaces(struct r600_common_context *rctx, continue; /* adjust the texture layer offsets */ - off = align(off, surfaces[i]->surf_alignment); + off = align(off, 1 << surfaces[i]->surf_alignment_log2); /* copy the tiling parameters */ surfaces[i]->u.legacy.bankw = surfaces[best_tiling]->u.legacy.bankw; @@ -180,7 +180,7 @@ void rvid_join_surfaces(struct r600_common_context *rctx, surfaces[i]->u.legacy.tile_split = surfaces[best_tiling]->u.legacy.tile_split; for (j = 0; j < ARRAY_SIZE(surfaces[i]->u.legacy.level); ++j) - surfaces[i]->u.legacy.level[j].offset += off; + surfaces[i]->u.legacy.level[j].offset_256B += off / 256; off += surfaces[i]->surf_size; } @@ -189,9 +189,9 @@ void rvid_join_surfaces(struct r600_common_context *rctx, if (!buffers[i] || !*buffers[i]) continue; - size = align(size, (*buffers[i])->alignment); + size = align(size, 1 << (*buffers[i])->alignment_log2); size += (*buffers[i])->size; - alignment = MAX2(alignment, (*buffers[i])->alignment * 1); + alignment = MAX2(alignment, 1 << (*buffers[i])->alignment_log2); } if (!size) @@ -224,7 +224,7 @@ int rvid_get_video_param(struct pipe_screen *screen, enum pipe_video_format codec = u_reduce_video_profile(profile); struct radeon_info info; - rscreen->ws->query_info(rscreen->ws, &info); + rscreen->ws->query_info(rscreen->ws, &info, false, false); if (entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) { switch (param) { diff --git a/lib/mesa/src/gallium/drivers/r600/radeon_video.h b/lib/mesa/src/gallium/drivers/r600/radeon_video.h index 4777c6c0e..59c9377de 100644 --- a/lib/mesa/src/gallium/drivers/r600/radeon_video.h +++ b/lib/mesa/src/gallium/drivers/r600/radeon_video.h @@ -65,7 +65,7 @@ bool rvid_resize_buffer(struct pipe_screen *screen, struct radeon_cmdbuf *cs, void rvid_clear_buffer(struct pipe_context *context, struct rvid_buffer* buffer); /* join surfaces into the same buffer with identical tiling params - sumup their sizes and replace the backend buffers with a single bo */ + sum up their sizes and replace the backend buffers with a single bo */ void rvid_join_surfaces(struct r600_common_context *rctx, struct pb_buffer** buffers[VL_NUM_COMPONENTS], struct radeon_surf *surfaces[VL_NUM_COMPONENTS]); diff --git a/lib/mesa/src/gallium/drivers/r600/sb/sb_bc_decoder.cpp b/lib/mesa/src/gallium/drivers/r600/sb/sb_bc_decoder.cpp index 4a7f82ba7..b04cb73e2 100644 --- a/lib/mesa/src/gallium/drivers/r600/sb/sb_bc_decoder.cpp +++ b/lib/mesa/src/gallium/drivers/r600/sb/sb_bc_decoder.cpp @@ -540,7 +540,8 @@ int bc_decoder::decode_fetch_mem(unsigned & i, bc_fetch& bc) { uint32_t dw2 = dw[i+2]; i += 4; // MEM instructions align to 4 words boundaries - assert(i < ndw); + + assert(i <= ndw); MEM_RD_WORD0_R7EGCM w0(dw0); bc.elem_size = w0.get_ELEM_SIZE(); diff --git a/lib/mesa/src/gallium/drivers/r600/sb/sb_bc_parser.cpp b/lib/mesa/src/gallium/drivers/r600/sb/sb_bc_parser.cpp index 6b19d61ba..446486c36 100644 --- a/lib/mesa/src/gallium/drivers/r600/sb/sb_bc_parser.cpp +++ b/lib/mesa/src/gallium/drivers/r600/sb/sb_bc_parser.cpp @@ -385,6 +385,9 @@ int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) { if (ctx.alu_slots(n->bc.op) & AF_4SLOT) n->flags |= NF_ALU_4SLOT; + if (ctx.alu_slots(n->bc.op) & AF_2SLOT) + n->flags |= NF_ALU_2SLOT; + n->src.resize(src_count); unsigned flags = n->bc.op_ptr->flags; @@ -476,7 +479,7 @@ int bc_parser::prepare_alu_group(cf_node* cf, 
alu_group_node *g) { n->src[s] = sh->get_const_value(src.value); } else if (src.sel == ALU_SRC_PS || src.sel == ALU_SRC_PV) { unsigned pgroup = !cgroup, prev_slot = src.sel == ALU_SRC_PS ? - SLOT_TRANS : src.chan; + ((unsigned)SLOT_TRANS) : src.chan; // XXX shouldn't happen but llvm backend uses PS on cayman if (prev_slot == SLOT_TRANS && ctx.is_cayman()) @@ -586,12 +589,16 @@ int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) { alu_node *a = static_cast<alu_node*>(*I); unsigned sflags = a->bc.slot_flags; - if (sflags == AF_4V || (ctx.is_cayman() && sflags == AF_S)) { + if (sflags == AF_4V || sflags == AF_2V || (ctx.is_cayman() && sflags == AF_S)) { if (!p) p = sh->create_alu_packed(); a->remove(); p->push_back(a); + if (sflags == AF_2V && p->count() == 2) { + g->push_front(p); + p = NULL; + } } } diff --git a/lib/mesa/src/gallium/drivers/r600/sb/sb_dump.cpp b/lib/mesa/src/gallium/drivers/r600/sb/sb_dump.cpp index 57dded5ef..402ba357f 100644 --- a/lib/mesa/src/gallium/drivers/r600/sb/sb_dump.cpp +++ b/lib/mesa/src/gallium/drivers/r600/sb/sb_dump.cpp @@ -396,6 +396,8 @@ void dump::dump_flags(node &n) { sblog << "CH_CONS "; if (n.flags & NF_ALU_4SLOT) sblog << "4S "; + if (n.flags & NF_ALU_2SLOT) + sblog << "2S "; } void dump::dump_val(value* v) { diff --git a/lib/mesa/src/gallium/drivers/r600/sb/sb_pass.h b/lib/mesa/src/gallium/drivers/r600/sb/sb_pass.h index a21b0bf99..179eab478 100644 --- a/lib/mesa/src/gallium/drivers/r600/sb/sb_pass.h +++ b/lib/mesa/src/gallium/drivers/r600/sb/sb_pass.h @@ -546,10 +546,10 @@ private: void add_prev_chan(unsigned chan); unsigned get_preferable_chan_mask(); - void ra_node(container_node *c); - void process_op(node *n); + bool ra_node(container_node *c); + bool process_op(node *n); - void color(value *v); + bool color(value *v); void color_bs_constraint(ra_constraint *c); diff --git a/lib/mesa/src/gallium/drivers/r600/sb/sb_peephole.cpp b/lib/mesa/src/gallium/drivers/r600/sb/sb_peephole.cpp index 4390a8f52..979f4bc13 100644 --- a/lib/mesa/src/gallium/drivers/r600/sb/sb_peephole.cpp +++ b/lib/mesa/src/gallium/drivers/r600/sb/sb_peephole.cpp @@ -131,8 +131,8 @@ void peephole::optimize_cc_op2(alu_node* a) { std::swap(a->src[0],a->src[1]); swapped = true; // clear modifiers - memset(&a->bc.src[0], 0, sizeof(bc_alu_src)); - memset(&a->bc.src[1], 0, sizeof(bc_alu_src)); + a->bc.src[0].clear(); + a->bc.src[1].clear(); } if (swapped || (a->src[1]->is_const() && diff --git a/lib/mesa/src/gallium/drivers/r600/sfn/sfn_instructionvisitor.cpp b/lib/mesa/src/gallium/drivers/r600/sfn/sfn_instructionvisitor.cpp new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/lib/mesa/src/gallium/drivers/r600/sfn/sfn_instructionvisitor.cpp diff --git a/lib/mesa/src/gallium/drivers/r600/sfn/sfn_instructionvisitor.h b/lib/mesa/src/gallium/drivers/r600/sfn/sfn_instructionvisitor.h new file mode 100644 index 000000000..9b34fcd4d --- /dev/null +++ b/lib/mesa/src/gallium/drivers/r600/sfn/sfn_instructionvisitor.h @@ -0,0 +1,91 @@ +#ifndef INSTRUCTIONVISITOR_H +#define INSTRUCTIONVISITOR_H + +namespace r600 { + + +class AluInstruction; +class ExportInstruction; +class TexInstruction; +class FetchInstruction; +class IfInstruction; +class ElseInstruction; +class IfElseEndInstruction; +class LoopBeginInstruction; +class LoopEndInstruction; +class LoopBreakInstruction; +class LoopContInstruction; +class StreamOutIntruction; +class MemRingOutIntruction; +class EmitVertex; +class WaitAck; +class WriteScratchInstruction; +class GDSInstr; +class RatInstruction; +class 
LDSWriteInstruction; +class LDSReadInstruction; +class LDSAtomicInstruction; +class GDSStoreTessFactor; +class InstructionBlock; + +class InstructionVisitor +{ +public: + virtual ~InstructionVisitor() {}; + virtual bool visit(AluInstruction& i) = 0; + virtual bool visit(ExportInstruction& i) = 0; + virtual bool visit(TexInstruction& i) = 0; + virtual bool visit(FetchInstruction& i) = 0; + virtual bool visit(IfInstruction& i) = 0; + virtual bool visit(ElseInstruction& i) = 0; + virtual bool visit(IfElseEndInstruction& i) = 0; + virtual bool visit(LoopBeginInstruction& i) = 0; + virtual bool visit(LoopEndInstruction& i) = 0; + virtual bool visit(LoopBreakInstruction& i) = 0; + virtual bool visit(LoopContInstruction& i) = 0; + virtual bool visit(StreamOutIntruction& i) = 0; + virtual bool visit(MemRingOutIntruction& i) = 0; + virtual bool visit(EmitVertex& i) = 0; + virtual bool visit(WaitAck& i) = 0; + virtual bool visit(WriteScratchInstruction& i) = 0; + virtual bool visit(GDSInstr& i) = 0; + virtual bool visit(RatInstruction& i) = 0; + virtual bool visit(LDSWriteInstruction& i) = 0; + virtual bool visit(LDSReadInstruction& i) = 0; + virtual bool visit(LDSAtomicInstruction& i) = 0; + virtual bool visit(GDSStoreTessFactor& i) = 0; + virtual bool visit(InstructionBlock& i) = 0; +}; + +class ConstInstructionVisitor +{ +public: + virtual ~ConstInstructionVisitor() {}; + virtual bool visit(const AluInstruction& i) = 0; + virtual bool visit(const ExportInstruction& i) = 0; + virtual bool visit(const TexInstruction& i) = 0; + virtual bool visit(const FetchInstruction& i) = 0; + virtual bool visit(const IfInstruction& i) = 0; + virtual bool visit(const ElseInstruction& i) = 0; + virtual bool visit(const IfElseEndInstruction& i) = 0; + virtual bool visit(const LoopBeginInstruction& i) = 0; + virtual bool visit(const LoopEndInstruction& i) = 0; + virtual bool visit(const LoopBreakInstruction& i) = 0; + virtual bool visit(const LoopContInstruction& i) = 0; + virtual bool visit(const StreamOutIntruction& i) = 0; + virtual bool visit(const MemRingOutIntruction& i) = 0; + virtual bool visit(const EmitVertex& i) = 0; + virtual bool visit(const WaitAck& i) = 0; + virtual bool visit(const WriteScratchInstruction& i) = 0; + virtual bool visit(const GDSInstr& i) = 0; + virtual bool visit(const RatInstruction& i) = 0; + virtual bool visit(const LDSWriteInstruction& i) = 0; + virtual bool visit(const LDSReadInstruction& i) = 0; + virtual bool visit(const LDSAtomicInstruction& i) = 0; + virtual bool visit(const GDSStoreTessFactor& i) = 0; + virtual bool visit(const InstructionBlock& i) = 0; +}; + +} + +#endif // INSTRUCTIONVISITOR_H diff --git a/lib/mesa/src/gallium/drivers/r600/sfn/sfn_nir_algebraic.py b/lib/mesa/src/gallium/drivers/r600/sfn/sfn_nir_algebraic.py new file mode 100644 index 000000000..2ef064111 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/r600/sfn/sfn_nir_algebraic.py @@ -0,0 +1,49 @@ +# +# Copyright (C) 2021 Collabora Ltd. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +import argparse +import sys + +lower_alu = [ + # For chipfamily r600 one must do fma (2*pi ffract() - 0.5) + (('fsin', "a@32"), ('fsin_r600', ('fadd', ('ffract', ('ffma', 'a', 0.15915494, 0.5)), -0.5))), + (('fcos', "a@32"), ('fcos_r600', ('fadd', ('ffract', ('ffma', 'a', 0.15915494, 0.5)), -0.5))), +] + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-p', '--import-path', required=True) + args = parser.parse_args() + sys.path.insert(0, args.import_path) + run() + + +def run(): + import nir_algebraic # pylint: disable=import-error + + print('#include "sfn/sfn_nir.h"') + + print(nir_algebraic.AlgebraicPass("r600_lower_alu", + lower_alu).render()) + +if __name__ == '__main__': + main() diff --git a/lib/mesa/src/gallium/drivers/r600/sfn/sfn_nir_lower_64bit.cpp b/lib/mesa/src/gallium/drivers/r600/sfn/sfn_nir_lower_64bit.cpp new file mode 100644 index 000000000..88e0085fa --- /dev/null +++ b/lib/mesa/src/gallium/drivers/r600/sfn/sfn_nir_lower_64bit.cpp @@ -0,0 +1,1063 @@ +/* -*- mesa-c++ -*- + * + * Copyright (c) 2020 Collabora LTD + * + * Author: Gert Wollny <gert.wollny@collabora.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
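The two algebraic rules above rewrite fsin/fcos into the hardware fsin_r600/fcos_r600 opcodes, whose operand must be pre-normalized; the ('ffma', 'a', 0.15915494, 0.5) / ffract / -0.5 chain is exactly that normalization. What it computes, in scalar form:

   #include <cmath>

   // 0.15915494 is 1/(2*pi): convert radians to turns, bias by half a
   // turn, wrap into [0, 1) with fract, then re-center to [-0.5, 0.5),
   // the input form the lowered opcodes take per the rules above.
   static float r600_trig_arg(float a)
   {
      float x = a * 0.15915494f + 0.5f;  // ffma
      x = x - std::floor(x);             // ffract
      return x - 0.5f;                   // fadd -0.5
   }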
+ */ + +#include "sfn_nir.h" + +#include "nir.h" +#include "nir_builder.h" + +#include <map> +#include <vector> +#include <iostream> + +namespace r600 { + +using std::map; +using std::pair; +using std::make_pair; +using std::vector; + +class LowerSplit64BitVar : public NirLowerInstruction { +public: + + ~LowerSplit64BitVar(); + using VarSplit = pair<nir_variable*, nir_variable*>; + using VarMap = map<unsigned, VarSplit>; + + nir_ssa_def * + split_double_load_deref(nir_intrinsic_instr *intr); + + nir_ssa_def * + split_double_store_deref(nir_intrinsic_instr *intr); + +private: + nir_ssa_def * + split_load_deref_array(nir_intrinsic_instr *intr, nir_src& index); + + nir_ssa_def * + split_load_deref_var(nir_intrinsic_instr *intr); + + nir_ssa_def * + split_store_deref_array(nir_intrinsic_instr *intr, nir_deref_instr *deref); + + nir_ssa_def * + split_store_deref_var(nir_intrinsic_instr *intr, nir_deref_instr *deref1); + + VarSplit get_var_pair(nir_variable *old_var); + + nir_ssa_def * + merge_64bit_loads(nir_ssa_def *load1, nir_ssa_def *load2, bool out_is_vec3); + + nir_ssa_def *split_double_load(nir_intrinsic_instr *load1); + + nir_ssa_def * + split_store_output(nir_intrinsic_instr *store1); + + nir_ssa_def *split_double_load_uniform(nir_intrinsic_instr *intr); + + nir_ssa_def * + split_double_load_ssbo(nir_intrinsic_instr *intr); + + nir_ssa_def * + split_double_load_ubo(nir_intrinsic_instr *intr); + + nir_ssa_def * + split_reduction(nir_ssa_def *src[2][2], nir_op op1, nir_op op2, nir_op reduction); + + nir_ssa_def * + split_reduction3(nir_alu_instr *alu, + nir_op op1, nir_op op2, nir_op reduction); + + nir_ssa_def * + split_reduction4(nir_alu_instr *alu, + nir_op op1, nir_op op2, nir_op reduction); + + nir_ssa_def *split_bcsel(nir_alu_instr *alu); + + nir_ssa_def *split_load_const(nir_load_const_instr *lc); + + bool filter(const nir_instr *instr) const override; + nir_ssa_def *lower(nir_instr *instr) override; + + VarMap m_varmap; + vector<nir_variable*> m_old_vars; + vector<nir_instr *> m_old_stores; +}; + + +bool +LowerSplit64BitVar::filter(const nir_instr *instr) const +{ + switch (instr->type) { + case nir_instr_type_intrinsic: { + auto intr = nir_instr_as_intrinsic(instr); + + switch (intr->intrinsic) { + case nir_intrinsic_load_deref: + case nir_intrinsic_load_uniform: + case nir_intrinsic_load_input: + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_ssbo: + if (nir_dest_bit_size(intr->dest) != 64) + return false; + return nir_dest_num_components(intr->dest) >= 3; + case nir_intrinsic_store_output: + if (nir_src_bit_size(intr->src[0]) != 64) + return false; + return nir_src_num_components(intr->src[0]) >= 3; + case nir_intrinsic_store_deref: + if (nir_src_bit_size(intr->src[1]) != 64) + return false; + return nir_src_num_components(intr->src[1]) >= 3; + default: + return false; + } + } + case nir_instr_type_alu: { + auto alu = nir_instr_as_alu(instr); + switch (alu->op) { + case nir_op_bcsel: + if (nir_dest_num_components(alu->dest.dest) < 3) + return false; + return nir_dest_bit_size(alu->dest.dest) == 64; + case nir_op_bany_fnequal3: + case nir_op_bany_fnequal4: + case nir_op_ball_fequal3: + case nir_op_ball_fequal4: + case nir_op_bany_inequal3: + case nir_op_bany_inequal4: + case nir_op_ball_iequal3: + case nir_op_ball_iequal4: + case nir_op_fdot3: + case nir_op_fdot4: + return nir_src_bit_size(alu->src[1].src) == 64; + default: + return false; + } + } + case nir_instr_type_load_const: { + auto lc = nir_instr_as_load_const(instr); + if (lc->def.bit_size != 64) + return false; 
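LowerSplit64BitVar follows the filter()/lower() protocol of NirLowerInstruction: filter() selects the instructions to touch (here, 64-bit operations with three or more components) and lower() returns the replacement SSA value or one of the NIR_LOWER_INSTR_PROGRESS tokens. The base class is not part of this hunk; under the assumption that it is a thin wrapper over nir_shader_lower_instructions(), its run() would look roughly like this:

   // Sketch only: the real NirLowerInstruction lives in sfn_nir.h and
   // also exposes the nir_builder as the protected member `b`.
   bool NirLowerInstruction::run(nir_shader *sh)
   {
      return nir_shader_lower_instructions(
         sh,
         [](const nir_instr *instr, const void *data) {
            return static_cast<const NirLowerInstruction *>(data)->filter(instr);
         },
         [](nir_builder *b, nir_instr *instr, void *data) {
            auto self = static_cast<NirLowerInstruction *>(data);
            self->b = b;
            return self->lower(instr);
         },
         this);
   }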
+ return lc->def.num_components >= 3; + } + default: + return false; + } +} + +nir_ssa_def * +LowerSplit64BitVar::merge_64bit_loads(nir_ssa_def *load1, + nir_ssa_def *load2, bool out_is_vec3) +{ + if (out_is_vec3) + return nir_vec3(b, nir_channel(b, load1, 0), + nir_channel(b, load1, 1), + nir_channel(b, load2, 0)); + else + return nir_vec4(b, nir_channel(b, load1, 0), + nir_channel(b, load1, 1), + nir_channel(b, load2, 0), + nir_channel(b, load2, 1)); +} + +LowerSplit64BitVar::~LowerSplit64BitVar() +{ + for(auto&& v: m_old_vars) + exec_node_remove(&v->node); + + for(auto&& v: m_old_stores) + nir_instr_remove(v); +} + +nir_ssa_def * +LowerSplit64BitVar::split_double_store_deref(nir_intrinsic_instr *intr) +{ + auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr); + if (deref->deref_type == nir_deref_type_var) + return split_store_deref_var(intr, deref); + else if (deref->deref_type == nir_deref_type_array) + return split_store_deref_array(intr, deref); + else { + unreachable("only splitting of stores to vars and arrays is supported"); + } +} + +nir_ssa_def * +LowerSplit64BitVar::split_double_load_deref(nir_intrinsic_instr *intr) +{ + auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr); + if (deref->deref_type == nir_deref_type_var) + return split_load_deref_var(intr); + else if (deref->deref_type == nir_deref_type_array) + return split_load_deref_array(intr, deref->arr.index); + else { + unreachable(0 && "only splitting of loads from vars and arrays is supported"); + } + m_old_stores.push_back(&intr->instr); +} + +nir_ssa_def * +LowerSplit64BitVar::split_load_deref_array(nir_intrinsic_instr *intr, nir_src& index) +{ + auto old_var = nir_intrinsic_get_var(intr, 0); + unsigned old_components = old_var->type->without_array()->components(); + + assert(old_components > 2 && old_components <= 4); + + auto vars = get_var_pair(old_var); + + auto deref1 = nir_build_deref_var(b, vars.first); + auto deref_array1 = nir_build_deref_array(b, deref1, nir_ssa_for_src(b, index, 1)); + auto load1 = nir_build_load_deref(b, 2, 64, &deref_array1->dest.ssa, (enum gl_access_qualifier)0); + + auto deref2 = nir_build_deref_var(b, vars.second); + auto deref_array2 = nir_build_deref_array(b, deref2, nir_ssa_for_src(b, index, 1)); + + auto load2 = nir_build_load_deref(b, old_components - 2, 64, &deref_array2->dest.ssa, (enum gl_access_qualifier)0); + + return merge_64bit_loads(load1, load2, old_components == 3); +} + +nir_ssa_def * +LowerSplit64BitVar::split_store_deref_array(nir_intrinsic_instr *intr, nir_deref_instr *deref) +{ + auto old_var = nir_intrinsic_get_var(intr, 0); + unsigned old_components = old_var->type->without_array()->components(); + + assert(old_components > 2 && old_components <= 4); + + auto src_xy = nir_channels(b, intr->src[1].ssa, 3); + + auto vars = get_var_pair(old_var); + + auto deref1 = nir_build_deref_var(b, vars.first); + auto deref_array1 = nir_build_deref_array(b, deref1, nir_ssa_for_src(b, deref->arr.index, 1)); + + nir_build_store_deref(b, &deref_array1->dest.ssa, src_xy, 3); + + auto deref2 = nir_build_deref_var(b, vars.second); + auto deref_array2 = nir_build_deref_array(b, deref2, nir_ssa_for_src(b, deref->arr.index, 1)); + + if (old_components == 3) + nir_build_store_deref(b, &deref_array2->dest.ssa, nir_channel(b, intr->src[1].ssa, 2), 1); + else + nir_build_store_deref(b, &deref_array2->dest.ssa, nir_channels(b, intr->src[1].ssa, 0xc), 3); + + return NIR_LOWER_INSTR_PROGRESS_REPLACE; +} + +nir_ssa_def * 
+LowerSplit64BitVar::split_store_deref_var(nir_intrinsic_instr *intr, nir_deref_instr *deref) +{ + auto old_var = nir_intrinsic_get_var(intr, 0); + unsigned old_components = old_var->type->without_array()->components(); + + assert(old_components > 2 && old_components <= 4); + + auto src_xy = nir_channels(b, intr->src[1].ssa, 3); + + auto vars = get_var_pair(old_var); + + auto deref1 = nir_build_deref_var(b, vars.first); + nir_build_store_deref(b, &deref1->dest.ssa, src_xy, 3); + + auto deref2 = nir_build_deref_var(b, vars.second); + if (old_components == 3) + nir_build_store_deref(b, &deref2->dest.ssa, nir_channel(b, intr->src[1].ssa, 2), 1); + else + nir_build_store_deref(b, &deref2->dest.ssa, nir_channels(b, intr->src[1].ssa, 0xc), 3); + + return NIR_LOWER_INSTR_PROGRESS_REPLACE; +} + +nir_ssa_def * +LowerSplit64BitVar::split_load_deref_var(nir_intrinsic_instr *intr) +{ + auto old_var = nir_intrinsic_get_var(intr, 0); + auto vars = get_var_pair(old_var); + unsigned old_components = old_var->type->components(); + + nir_deref_instr *deref1 = nir_build_deref_var(b, vars.first); + auto *load1 = nir_load_deref(b, deref1); + + nir_deref_instr *deref2 = nir_build_deref_var(b, vars.second); + deref2->type = vars.second->type; + + auto *load2 = nir_load_deref(b, deref2); + + return merge_64bit_loads(load1, load2, old_components == 3); +} + +LowerSplit64BitVar::VarSplit +LowerSplit64BitVar::get_var_pair(nir_variable *old_var) +{ + auto split_vars = m_varmap.find(old_var->data.driver_location); + + assert(old_var->type->without_array()->components() > 2); + + if (split_vars == m_varmap.end()) { + auto var1 = nir_variable_clone(old_var, b->shader); + auto var2 = nir_variable_clone(old_var, b->shader); + + var1->type = glsl_dvec_type(2); + var2->type = glsl_dvec_type(old_var->type->without_array()->components() - 2); + + if (old_var->type->is_array()) { + var1->type = glsl_array_type(var1->type, old_var->type->array_size(), 0); + var2->type = glsl_array_type(var2->type, old_var->type->array_size(), 0); + } + + if (old_var->data.mode == nir_var_shader_in || + old_var->data.mode == nir_var_shader_out) { + ++var2->data.driver_location; + ++var2->data.location; + nir_shader_add_variable(b->shader, var1); + nir_shader_add_variable(b->shader, var2); + } else if (old_var->data.mode == nir_var_function_temp) { + exec_list_push_tail(&b->impl->locals, &var1->node); + exec_list_push_tail(&b->impl->locals, &var2->node); + } + + m_varmap[old_var->data.driver_location] = make_pair(var1, var2); + } + return m_varmap[old_var->data.driver_location]; +} + + +nir_ssa_def * +LowerSplit64BitVar::split_double_load(nir_intrinsic_instr *load1) +{ + unsigned old_components = nir_dest_num_components(load1->dest); + auto load2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &load1->instr)); + nir_io_semantics sem = nir_intrinsic_io_semantics(load1); + + load1->dest.ssa.num_components = 2; + sem.num_slots = 1; + nir_intrinsic_set_io_semantics(load1, sem); + + load2->dest.ssa.num_components = old_components - 2; + sem.location += 1; + nir_intrinsic_set_io_semantics(load2, sem); + nir_intrinsic_set_base(load2, nir_intrinsic_base(load1) + 1); + nir_builder_instr_insert(b, &load2->instr); + + return merge_64bit_loads(&load1->dest.ssa, &load2->dest.ssa, old_components == 3); +} + + +nir_ssa_def * +LowerSplit64BitVar::split_store_output(nir_intrinsic_instr *store1) +{ + auto src = store1->src[0]; + unsigned old_components = nir_src_num_components(src); + nir_io_semantics sem = nir_intrinsic_io_semantics(store1); + + auto store2 = 
nir_instr_as_intrinsic(nir_instr_clone(b->shader, &store1->instr)); + auto src1 = nir_channels(b, src.ssa, 3); + auto src2 = nir_channels(b, src.ssa, old_components == 3 ? 4 : 0xc); + + nir_instr_rewrite_src(&store1->instr, &src, nir_src_for_ssa(src1)); + nir_intrinsic_set_write_mask(store1, 3); + + nir_instr_rewrite_src(&store2->instr, &src, nir_src_for_ssa(src2)); + nir_intrinsic_set_write_mask(store2, old_components == 3 ? 1 : 3); + + sem.num_slots = 1; + nir_intrinsic_set_io_semantics(store1, sem); + + sem.location += 1; + nir_intrinsic_set_io_semantics(store2, sem); + nir_intrinsic_set_base(store2, nir_intrinsic_base(store1)); + + nir_builder_instr_insert(b, &store2->instr); + return NIR_LOWER_INSTR_PROGRESS; +} + + +nir_ssa_def * +LowerSplit64BitVar::split_double_load_uniform(nir_intrinsic_instr *intr) +{ + unsigned second_components = nir_dest_num_components(intr->dest) - 2; + nir_intrinsic_instr *load2 = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_uniform); + load2->src[0] = nir_src_for_ssa(nir_iadd_imm(b, intr->src[0].ssa, 1)); + nir_intrinsic_set_dest_type(load2, nir_intrinsic_dest_type(intr)); + nir_intrinsic_set_base(load2, nir_intrinsic_base(intr)); + nir_intrinsic_set_range(load2, nir_intrinsic_range(intr)); + load2->num_components = second_components; + + nir_ssa_dest_init(&load2->instr, &load2->dest, second_components, 64, nullptr); + nir_builder_instr_insert(b, &load2->instr); + + intr->dest.ssa.num_components = intr->num_components = 2; + + if (second_components == 1) + return nir_vec3(b, nir_channel(b, &intr->dest.ssa, 0), + nir_channel(b, &intr->dest.ssa, 1), + nir_channel(b, &load2->dest.ssa, 0)); + else + return nir_vec4(b, nir_channel(b, &intr->dest.ssa, 0), + nir_channel(b, &intr->dest.ssa, 1), + nir_channel(b, &load2->dest.ssa, 0), + nir_channel(b, &load2->dest.ssa, 1)); +} + +nir_ssa_def * +LowerSplit64BitVar::split_double_load_ssbo(nir_intrinsic_instr *intr) +{ + unsigned second_components = nir_dest_num_components(intr->dest) - 2; + nir_intrinsic_instr *load2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr)); + + auto new_src0 = nir_src_for_ssa(nir_iadd_imm(b, intr->src[0].ssa, 1)); + nir_instr_rewrite_src(&load2->instr, &load2->src[0], new_src0); + load2->num_components = second_components; + nir_ssa_dest_init(&load2->instr, &load2->dest, second_components, 64, nullptr); + + nir_intrinsic_set_dest_type(load2, nir_intrinsic_dest_type(intr)); + nir_builder_instr_insert(b, &load2->instr); + + intr->dest.ssa.num_components = intr->num_components = 2; + + return merge_64bit_loads(&intr->dest.ssa, &load2->dest.ssa, second_components == 1); +} + + +nir_ssa_def * +LowerSplit64BitVar::split_double_load_ubo(nir_intrinsic_instr *intr) +{ + unsigned second_components = nir_dest_num_components(intr->dest) - 2; + nir_intrinsic_instr *load2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr)); + load2->src[0] = intr->src[0]; + load2->src[1] = nir_src_for_ssa(nir_iadd_imm(b, intr->src[1].ssa, 16)); + nir_intrinsic_set_range_base(load2, nir_intrinsic_range_base(intr) + 16); + nir_intrinsic_set_range(load2, nir_intrinsic_range(intr)); + nir_intrinsic_set_access(load2, nir_intrinsic_access(intr)); + nir_intrinsic_set_align_mul(load2, nir_intrinsic_align_mul(intr)); + nir_intrinsic_set_align_offset(load2, nir_intrinsic_align_offset(intr) + 16); + + load2->num_components = second_components; + + nir_ssa_dest_init(&load2->instr, &load2->dest, second_components, 64, nullptr); + nir_builder_instr_insert(b, &load2->instr); + + 
+
+nir_ssa_def *
+LowerSplit64BitVar::split_reduction(nir_ssa_def *src[2][2], nir_op op1, nir_op op2, nir_op reduction)
+{
+   auto cmp0 = nir_build_alu(b, op1, src[0][0], src[0][1], nullptr, nullptr);
+   auto cmp1 = nir_build_alu(b, op2, src[1][0], src[1][1], nullptr, nullptr);
+   return nir_build_alu(b, reduction, cmp0, cmp1, nullptr, nullptr);
+}
+
+nir_ssa_def *
+LowerSplit64BitVar::split_reduction3(nir_alu_instr *alu,
+                                     nir_op op1, nir_op op2, nir_op reduction)
+{
+   nir_ssa_def *src[2][2];
+
+   src[0][0] = nir_channels(b, nir_ssa_for_src(b, alu->src[0].src, 2), 3);
+   src[0][1] = nir_channels(b, nir_ssa_for_src(b, alu->src[1].src, 2), 3);
+
+   src[1][0] = nir_channel(b, nir_ssa_for_src(b, alu->src[0].src, 3), 2);
+   src[1][1] = nir_channel(b, nir_ssa_for_src(b, alu->src[1].src, 3), 2);
+
+   return split_reduction(src, op1, op2, reduction);
+}
+
+nir_ssa_def *
+LowerSplit64BitVar::split_reduction4(nir_alu_instr *alu,
+                                     nir_op op1, nir_op op2, nir_op reduction)
+{
+   nir_ssa_def *src[2][2];
+
+   src[0][0] = nir_channels(b, nir_ssa_for_src(b, alu->src[0].src, 2), 3);
+   src[0][1] = nir_channels(b, nir_ssa_for_src(b, alu->src[1].src, 2), 3);
+
+   src[1][0] = nir_channels(b, nir_ssa_for_src(b, alu->src[0].src, 4), 0xc);
+   src[1][1] = nir_channels(b, nir_ssa_for_src(b, alu->src[1].src, 4), 0xc);
+
+   return split_reduction(src, op1, op2, reduction);
+}
+
+nir_ssa_def *
+LowerSplit64BitVar::split_bcsel(nir_alu_instr *alu)
+{
+   nir_ssa_def *dest[4];
+   for (unsigned i = 0; i < nir_dest_num_components(alu->dest.dest); ++i) {
+      dest[i] = nir_bcsel(b,
+                          nir_channel(b, alu->src[0].src.ssa, i),
+                          nir_channel(b, alu->src[1].src.ssa, i),
+                          nir_channel(b, alu->src[2].src.ssa, i));
+   }
+   return nir_vec(b, dest, nir_dest_num_components(alu->dest.dest));
+}
+
+nir_ssa_def *
+LowerSplit64BitVar::split_load_const(nir_load_const_instr *lc)
+{
+   nir_ssa_def *ir[4];
+   for (unsigned i = 0; i < lc->def.num_components; ++i)
+      ir[i] = nir_imm_double(b, lc->value[i].f64);
+
+   return nir_vec(b, ir, lc->def.num_components);
+}
+
+nir_ssa_def *
+LowerSplit64BitVar::lower(nir_instr *instr)
+{
+   switch (instr->type) {
+   case nir_instr_type_intrinsic: {
+      auto intr = nir_instr_as_intrinsic(instr);
+      switch (intr->intrinsic) {
+      case nir_intrinsic_load_deref:
+         return this->split_double_load_deref(intr);
+      case nir_intrinsic_load_uniform:
+         return split_double_load_uniform(intr);
+      case nir_intrinsic_load_ubo:
+         return split_double_load_ubo(intr);
+      case nir_intrinsic_load_ssbo:
+         return split_double_load_ssbo(intr);
+      case nir_intrinsic_load_input:
+         return split_double_load(intr);
+      case nir_intrinsic_store_output:
+         return split_store_output(intr);
+      case nir_intrinsic_store_deref:
+         return split_double_store_deref(intr);
+      default:
+         assert(0);
+      }
+   }
+   case nir_instr_type_alu: {
+      auto alu = nir_instr_as_alu(instr);
+      switch (alu->op) {
+      case nir_op_bany_fnequal3:
+         return split_reduction3(alu, nir_op_bany_fnequal2, nir_op_fneu, nir_op_ior);
+      case nir_op_ball_fequal3:
+         return split_reduction3(alu, nir_op_ball_fequal2, nir_op_feq, nir_op_iand);
+      case nir_op_bany_inequal3:
+         return split_reduction3(alu, nir_op_bany_inequal2, nir_op_ine, nir_op_ior);
+      case nir_op_ball_iequal3:
+         return split_reduction3(alu, nir_op_ball_iequal2, nir_op_ieq, nir_op_iand);
+      case nir_op_fdot3:
+         return split_reduction3(alu, nir_op_fdot2, nir_op_fmul, nir_op_fadd);
+      case nir_op_bany_fnequal4:
+         return split_reduction4(alu, nir_op_bany_fnequal2, nir_op_bany_fnequal2, nir_op_ior);
+      case nir_op_ball_fequal4:
+         return split_reduction4(alu, nir_op_ball_fequal2, nir_op_ball_fequal2, nir_op_iand);
+      case nir_op_bany_inequal4:
+         return split_reduction4(alu, nir_op_bany_inequal2, nir_op_bany_inequal2, nir_op_ior);
+      case nir_op_ball_iequal4:
+         return split_reduction4(alu, nir_op_ball_iequal2, nir_op_ball_iequal2, nir_op_iand);
+      case nir_op_fdot4:
+         return split_reduction4(alu, nir_op_fdot2, nir_op_fdot2, nir_op_fadd);
+      case nir_op_bcsel:
+         return split_bcsel(alu);
+      default:
+         assert(0);
+      }
+   }
+   case nir_instr_type_load_const: {
+      auto lc = nir_instr_as_load_const(instr);
+      return split_load_const(lc);
+   }
+   default:
+      assert(0);
+   }
+   return nullptr;
+}
+
+/* Split 64 bit instructions so that at most two 64 bit components are
+ * used in one instruction */
+
+bool
+r600_nir_split_64bit_io(nir_shader *sh)
+{
+   return LowerSplit64BitVar().run(sh);
+}
+
+/* Lower 64 bit values to a vec2 of 32 bit values */
+class Lower64BitToVec2 : public NirLowerInstruction {
+
+private:
+   bool filter(const nir_instr *instr) const override;
+   nir_ssa_def *lower(nir_instr *instr) override;
+
+   nir_ssa_def *load_deref_64_to_vec2(nir_intrinsic_instr *intr);
+   nir_ssa_def *load_uniform_64_to_vec2(nir_intrinsic_instr *intr);
+   nir_ssa_def *load_ssbo_64_to_vec2(nir_intrinsic_instr *intr);
+   nir_ssa_def *load_64_to_vec2(nir_intrinsic_instr *intr);
+   nir_ssa_def *store_64_to_vec2(nir_intrinsic_instr *intr);
+};
+
+bool
+Lower64BitToVec2::filter(const nir_instr *instr) const
+{
+   switch (instr->type) {
+   case nir_instr_type_intrinsic: {
+      auto intr = nir_instr_as_intrinsic(instr);
+
+      switch (intr->intrinsic) {
+      case nir_intrinsic_load_deref:
+      case nir_intrinsic_load_input:
+      case nir_intrinsic_load_uniform:
+      case nir_intrinsic_load_ubo:
+      case nir_intrinsic_load_ubo_vec4:
+      case nir_intrinsic_load_ssbo:
+         return nir_dest_bit_size(intr->dest) == 64;
+      case nir_intrinsic_store_deref: {
+         if (nir_src_bit_size(intr->src[1]) == 64)
+            return true;
+         auto var = nir_intrinsic_get_var(intr, 0);
+         if (var->type->without_array()->bit_size() == 64)
+            return true;
+         return (var->type->without_array()->components() != intr->num_components);
+      }
+      default:
+         return false;
+      }
+   }
+   case nir_instr_type_alu: {
+      auto alu = nir_instr_as_alu(instr);
+      return nir_dest_bit_size(alu->dest.dest) == 64;
+   }
+   case nir_instr_type_phi: {
+      auto phi = nir_instr_as_phi(instr);
+      return nir_dest_bit_size(phi->dest) == 64;
+   }
+   case nir_instr_type_load_const: {
+      auto lc = nir_instr_as_load_const(instr);
+      return lc->def.bit_size == 64;
+   }
+   case nir_instr_type_ssa_undef: {
+      auto undef = nir_instr_as_ssa_undef(instr);
+      return undef->def.bit_size == 64;
+   }
+   default:
+      return false;
+   }
+}
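+
+/* In this lowering every 64 bit value is reinterpreted as a vec2 of 32 bit
+ * values: the bit size drops from 64 to 32, num_components doubles, and
+ * 64 bit component k maps to 32 bit components 2k (low word) and 2k + 1
+ * (high word).  E.g. a dvec2 constant becomes a vec4 of 32 bit words
+ * (lo0, hi0, lo1, hi1), which is what the load_const case below builds
+ * with val[2 * i] and val[2 * i + 1].
+ */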
+
+nir_ssa_def *
+Lower64BitToVec2::lower(nir_instr *instr)
+{
+   switch (instr->type) {
+   case nir_instr_type_intrinsic: {
+      auto intr = nir_instr_as_intrinsic(instr);
+      switch (intr->intrinsic) {
+      case nir_intrinsic_load_deref:
+         return load_deref_64_to_vec2(intr);
+      case nir_intrinsic_load_uniform:
+         return load_uniform_64_to_vec2(intr);
+      case nir_intrinsic_load_ssbo:
+         return load_ssbo_64_to_vec2(intr);
+      case nir_intrinsic_load_input:
+      case nir_intrinsic_load_ubo:
+      case nir_intrinsic_load_ubo_vec4:
+         return load_64_to_vec2(intr);
+      case nir_intrinsic_store_deref:
+         return store_64_to_vec2(intr);
+      default:
+         return nullptr;
+      }
+   }
+   case nir_instr_type_alu: {
+      auto alu = nir_instr_as_alu(instr);
+      alu->dest.dest.ssa.bit_size = 32;
+      alu->dest.dest.ssa.num_components *= 2;
+      alu->dest.write_mask = (1 << alu->dest.dest.ssa.num_components) - 1;
+      switch (alu->op) {
+      case nir_op_pack_64_2x32_split:
+         alu->op = nir_op_vec2;
+         break;
+      case nir_op_pack_64_2x32:
+         alu->op = nir_op_mov;
+         break;
+      case nir_op_vec2:
+         return nir_vec4(b,
+                         nir_channel(b, alu->src[0].src.ssa, 0),
+                         nir_channel(b, alu->src[0].src.ssa, 1),
+                         nir_channel(b, alu->src[1].src.ssa, 0),
+                         nir_channel(b, alu->src[1].src.ssa, 1));
+      default:
+         return NULL;
+      }
+      return NIR_LOWER_INSTR_PROGRESS;
+   }
+   case nir_instr_type_phi: {
+      auto phi = nir_instr_as_phi(instr);
+      phi->dest.ssa.bit_size = 32;
+      phi->dest.ssa.num_components = 2;
+      return NIR_LOWER_INSTR_PROGRESS;
+   }
+   case nir_instr_type_load_const: {
+      auto lc = nir_instr_as_load_const(instr);
+      assert(lc->def.num_components < 3);
+      nir_const_value val[4] = {0};
+      for (uint i = 0; i < lc->def.num_components; ++i) {
+         uint64_t v = lc->value[i].u64;
+         val[2 * i].u32 = v & 0xffffffff;
+         val[2 * i + 1].u32 = (v >> 32) & 0xffffffff;
+      }
+
+      return nir_build_imm(b, 2 * lc->def.num_components, 32, val);
+   }
+   case nir_instr_type_ssa_undef: {
+      auto undef = nir_instr_as_ssa_undef(instr);
+      undef->def.num_components *= 2;
+      undef->def.bit_size = 32;
+      return NIR_LOWER_INSTR_PROGRESS;
+   }
+   default:
+      return nullptr;
+   }
+}
+
+
+nir_ssa_def *
+Lower64BitToVec2::load_deref_64_to_vec2(nir_intrinsic_instr *intr)
+{
+   auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
+   auto var = nir_intrinsic_get_var(intr, 0);
+   unsigned components = var->type->without_array()->components();
+   if (var->type->without_array()->bit_size() == 64) {
+      components *= 2;
+      if (deref->deref_type == nir_deref_type_var) {
+         var->type = glsl_vec_type(components);
+      } else if (deref->deref_type == nir_deref_type_array) {
+         var->type = glsl_array_type(glsl_vec_type(components),
+                                     var->type->array_size(), 0);
+      } else {
+         nir_print_shader(b->shader, stderr);
+         assert(0 && "Only lowering of var and array derefs supported\n");
+      }
+   }
+   deref->type = var->type;
+   if (deref->deref_type == nir_deref_type_array) {
+      auto deref_array = nir_instr_as_deref(deref->parent.ssa->parent_instr);
+      deref_array->type = var->type;
+      deref->type = deref_array->type->without_array();
+   }
+
+   intr->num_components = components;
+   intr->dest.ssa.bit_size = 32;
+   intr->dest.ssa.num_components = components;
+   return NIR_LOWER_INSTR_PROGRESS;
+}
+
+nir_ssa_def *
+Lower64BitToVec2::store_64_to_vec2(nir_intrinsic_instr *intr)
+{
+   auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
+   auto var = nir_intrinsic_get_var(intr, 0);
+
+   unsigned components = var->type->without_array()->components();
+   unsigned wrmask = nir_intrinsic_write_mask(intr);
+   if (var->type->without_array()->bit_size() == 64) {
+      components *= 2;
+      if (deref->deref_type == nir_deref_type_var) {
+         var->type = glsl_vec_type(components);
+      } else if (deref->deref_type == nir_deref_type_array) {
+         var->type = glsl_array_type(glsl_vec_type(components),
+                                     var->type->array_size(), 0);
+      } else {
+         nir_print_shader(b->shader, stderr);
+         assert(0 && "Only lowering of var and array derefs supported\n");
+      }
+   }
+   deref->type = var->type;
+   if (deref->deref_type == nir_deref_type_array) {
+      auto deref_array = nir_instr_as_deref(deref->parent.ssa->parent_instr);
+      deref_array->type = var->type;
+      deref->type = deref_array->type->without_array();
+   }
+   intr->num_components = components;
+   nir_intrinsic_set_write_mask(intr, wrmask == 1 ? 3 : 0xf);
+   return NIR_LOWER_INSTR_PROGRESS;
+}
+
+
+nir_ssa_def *
+Lower64BitToVec2::load_uniform_64_to_vec2(nir_intrinsic_instr *intr)
+{
+   intr->num_components *= 2;
+   intr->dest.ssa.bit_size = 32;
+   intr->dest.ssa.num_components *= 2;
+   nir_intrinsic_set_dest_type(intr, nir_type_float32);
+   return NIR_LOWER_INSTR_PROGRESS;
+}
+
+nir_ssa_def *
+Lower64BitToVec2::load_64_to_vec2(nir_intrinsic_instr *intr)
+{
+   intr->num_components *= 2;
+   intr->dest.ssa.bit_size = 32;
+   intr->dest.ssa.num_components *= 2;
+   nir_intrinsic_set_component(intr, nir_intrinsic_component(intr) * 2);
+   return NIR_LOWER_INSTR_PROGRESS;
+}
+
+nir_ssa_def *
+Lower64BitToVec2::load_ssbo_64_to_vec2(nir_intrinsic_instr *intr)
+{
+   intr->num_components *= 2;
+   intr->dest.ssa.bit_size = 32;
+   intr->dest.ssa.num_components *= 2;
+   return NIR_LOWER_INSTR_PROGRESS;
+}
+
+static bool store_64bit_intr(nir_src *src, void *state)
+{
+   bool *s = (bool *)state;
+   *s = nir_src_bit_size(*src) == 64;
+   return !*s;
+}
+
+static bool double2vec2(nir_src *src, void *state)
+{
+   if (nir_src_bit_size(*src) != 64)
+      return true;
+
+   assert(src->is_ssa);
+   src->ssa->bit_size = 32;
+   src->ssa->num_components *= 2;
+   return true;
+}
+
+bool
+r600_nir_64_to_vec2(nir_shader *sh)
+{
+   vector<nir_instr*> intr64bit;
+   nir_foreach_function(function, sh) {
+      if (function->impl) {
+         nir_builder b;
+         nir_builder_init(&b, function->impl);
+
+         nir_foreach_block(block, function->impl) {
+            nir_foreach_instr_safe(instr, block) {
+               switch (instr->type) {
+               case nir_instr_type_alu: {
+                  bool success = false;
+                  nir_foreach_src(instr, store_64bit_intr, &success);
+                  if (success)
+                     intr64bit.push_back(instr);
+                  break;
+               }
+               case nir_instr_type_intrinsic: {
+                  auto ir = nir_instr_as_intrinsic(instr);
+                  switch (ir->intrinsic) {
+                  case nir_intrinsic_store_output:
+                  case nir_intrinsic_store_ssbo: {
+                     bool success = false;
+                     nir_foreach_src(instr, store_64bit_intr, &success);
+                     if (success) {
+                        auto wm = nir_intrinsic_write_mask(ir);
+                        nir_intrinsic_set_write_mask(ir, (wm == 1) ? 3 : 0xf);
+                        ir->num_components *= 2;
+                     }
+                     break;
+                  }
+                  default:
+                     ;
+                  }
+               }
+               default:
+                  ;
+               }
+            }
+         }
+      }
+   }
+
+   bool result = Lower64BitToVec2().run(sh);
+
+   if (result || !intr64bit.empty()) {
+
+      for (auto&& instr : intr64bit) {
+         if (instr->type == nir_instr_type_alu) {
+            auto alu = nir_instr_as_alu(instr);
+            auto alu_info = nir_op_infos[alu->op];
+            for (unsigned i = 0; i < alu_info.num_inputs; ++i) {
+               int swizzle[NIR_MAX_VEC_COMPONENTS] = {0};
+               for (unsigned k = 0; k < NIR_MAX_VEC_COMPONENTS / 2; k++) {
+                  if (!nir_alu_instr_channel_used(alu, i, k)) {
+                     continue;
+                  }
+
+                  switch (alu->op) {
+                  case nir_op_unpack_64_2x32_split_x:
+                     swizzle[2 * k] = alu->src[i].swizzle[k] * 2;
+                     alu->op = nir_op_mov;
+                     break;
+                  case nir_op_unpack_64_2x32_split_y:
+                     swizzle[2 * k] = alu->src[i].swizzle[k] * 2 + 1;
+                     alu->op = nir_op_mov;
+                     break;
+                  case nir_op_unpack_64_2x32:
+                     alu->op = nir_op_mov;
+                     break;
+                  case nir_op_bcsel:
+                     if (i == 0) {
+                        swizzle[2 * k] = swizzle[2 * k + 1] = alu->src[i].swizzle[k] * 2;
+                        break;
+                     }
+                     FALLTHROUGH;
+                  default:
+                     swizzle[2 * k] = alu->src[i].swizzle[k] * 2;
+                     swizzle[2 * k + 1] = alu->src[i].swizzle[k] * 2 + 1;
+                  }
+               }
+               for (unsigned k = 0; k < NIR_MAX_VEC_COMPONENTS; ++k) {
+                  alu->src[i].swizzle[k] = swizzle[k];
+               }
+            }
+         } else
+            nir_foreach_src(instr, double2vec2, nullptr);
+      }
+      result = true;
+   }
+
+   return result;
+}
+
+using std::map;
+using std::vector;
+using std::pair;
+
+class StoreMerger {
+public:
+   StoreMerger(nir_shader *shader);
+   void collect_stores();
+   bool combine();
+   void combine_one_slot(vector<nir_intrinsic_instr*>& stores);
+
+   using StoreCombos = map<unsigned, vector<nir_intrinsic_instr*>>;
+
+   StoreCombos m_stores;
+   nir_shader *sh;
+};
+
+StoreMerger::StoreMerger(nir_shader *shader):
+   sh(shader)
+{
+}
+
+
+void StoreMerger::collect_stores()
+{
+   unsigned vertex = 0;
+   nir_foreach_function(function, sh) {
+      if (function->impl) {
+         nir_foreach_block(block, function->impl) {
+            nir_foreach_instr_safe(instr, block) {
+               if (instr->type != nir_instr_type_intrinsic)
+                  continue;
+
+               auto ir = nir_instr_as_intrinsic(instr);
+               if (ir->intrinsic == nir_intrinsic_emit_vertex ||
+                   ir->intrinsic == nir_intrinsic_emit_vertex_with_counter) {
+                  ++vertex;
+                  continue;
+               }
+               if (ir->intrinsic != nir_intrinsic_store_output)
+                  continue;
+
+               unsigned index = nir_intrinsic_base(ir) + 64 * vertex +
+                                8 * 64 * nir_intrinsic_io_semantics(ir).gs_streams;
+               m_stores[index].push_back(ir);
+            }
+         }
+      }
+   }
+}
+
+bool StoreMerger::combine()
+{
+   bool progress = false;
+   for (auto&& i : m_stores) {
+      if (i.second.size() < 2)
+         continue;
+
+      combine_one_slot(i.second);
+      progress = true;
+   }
+   return progress;
+}
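+
+/* combine_one_slot() below relies on nir_intrinsic_component() and the
+ * write mask describing disjoint channels of one output slot: e.g. two
+ * vec2 stores at components 0 and 2 become a single vec4 store with
+ * write mask 0xf, with the source rebuilt channel by channel via
+ * nir_channel() and nir_vec().
+ */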
+
+void StoreMerger::combine_one_slot(vector<nir_intrinsic_instr*>& stores)
+{
+   nir_ssa_def *srcs[4] = {nullptr};
+
+   nir_builder b;
+   nir_builder_init(&b, nir_shader_get_entrypoint(sh));
+   auto last_store = *stores.rbegin();
+
+   b.cursor = nir_before_instr(&last_store->instr);
+
+   unsigned comps = 0;
+   unsigned writemask = 0;
+   unsigned first_comp = 4;
+   for (auto&& store : stores) {
+      int cmp = nir_intrinsic_component(store);
+      for (unsigned i = 0; i < nir_src_num_components(store->src[0]); ++i, ++comps) {
+         unsigned out_comp = i + cmp;
+         srcs[out_comp] = nir_channel(&b, store->src[0].ssa, i);
+         writemask |= 1 << out_comp;
+         if (first_comp > out_comp)
+            first_comp = out_comp;
+      }
+   }
+
+   auto new_src = nir_vec(&b, srcs, comps);
+
+   nir_instr_rewrite_src(&last_store->instr, &last_store->src[0], nir_src_for_ssa(new_src));
+   last_store->num_components = comps;
+   nir_intrinsic_set_component(last_store, first_comp);
+   nir_intrinsic_set_write_mask(last_store, writemask);
+
+   for (auto i = stores.begin(); i != stores.end() - 1; ++i)
+      nir_instr_remove(&(*i)->instr);
+}
+
+bool r600_merge_vec2_stores(nir_shader *shader)
+{
+   r600::StoreMerger merger(shader);
+   merger.collect_stores();
+   return merger.combine();
+}
+
+} // end namespace r600
+
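The reduction splitting in LowerSplit64BitVar rests on a simple algebraic identity: a 3- or 4-component horizontal operation equals a 2-component piece combined with a smaller piece. A minimal standalone check of the fdot3 decomposition used above (plain C++, independent of Mesa and NIR; the helper name fdot2 is chosen here only to mirror nir_op_fdot2):

    #include <cassert>

    /* dot product of the .xy halves, standing in for nir_op_fdot2 */
    static double fdot2(const double *a, const double *b)
    {
       return a[0] * b[0] + a[1] * b[1];
    }

    int main()
    {
       double a[3] = {1.0, 2.0, 3.0};
       double b[3] = {4.0, 5.0, 6.0};

       /* op1 = fdot2 on .xy, op2 = fmul on .z, reduction = fadd */
       double split = fdot2(a, b) + a[2] * b[2];
       double full = a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
       assert(split == full);

       return 0;
    }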
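Likewise, the 64 bit to 2x32 bit view taken by Lower64BitToVec2 can be checked in isolation; this round trip mirrors the val[2 * i] (low word) and val[2 * i + 1] (high word) split in the load_const case (again plain C++, not Mesa code):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main()
    {
       double d = 3.5;
       uint64_t v;
       std::memcpy(&v, &d, sizeof(v));          /* view the double as 64 bits */

       uint32_t lo = v & 0xffffffff;            /* becomes component 2 * i     */
       uint32_t hi = (v >> 32) & 0xffffffff;    /* becomes component 2 * i + 1 */

       /* reassembling the two 32 bit words must reproduce the double */
       uint64_t back = ((uint64_t)hi << 32) | lo;
       double d2;
       std::memcpy(&d2, &back, sizeof(d2));
       assert(d2 == d);

       return 0;
    }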