author     Jonathan Gray <jsg@cvs.openbsd.org>    2020-01-22 02:09:34 +0000
committer  Jonathan Gray <jsg@cvs.openbsd.org>    2020-01-22 02:09:34 +0000
commit     53b0736c56ca5142a5722eb827a3675ca08e123d (patch)
tree       52fd72557407af997e5b871b29a378c9bfa58299 /lib/mesa/src/freedreno
parent     4bb763fef12ec314b7ed27d8c928ee833fddb0a3 (diff)
Import Mesa 19.2.8
Diffstat (limited to 'lib/mesa/src/freedreno')
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_android.c           391
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_cmd_buffer.c       6752
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_cs.c                237
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_descriptor_set.c   1500
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_device.c           3527
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_drm.c              1287
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_formats.c          1143
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_image.c            1021
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_pass.c             1294
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_pipeline.c         5917
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_query.c            1692
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_shader.c           1050
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_util.c              318
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_wsi.c               278
14 files changed, 6494 insertions, 19913 deletions
diff --git a/lib/mesa/src/freedreno/vulkan/tu_android.c b/lib/mesa/src/freedreno/vulkan/tu_android.c index d1f6bb3ab..1ebc9e726 100644 --- a/lib/mesa/src/freedreno/vulkan/tu_android.c +++ b/lib/mesa/src/freedreno/vulkan/tu_android.c @@ -1,26 +1,35 @@ /* * Copyright © 2017, Google Inc. - * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. */ -#include "tu_android.h" +#include "tu_private.h" #include <hardware/gralloc.h> - -#if ANDROID_API_LEVEL >= 26 -#include <hardware/gralloc1.h> -#endif - #include <hardware/hardware.h> #include <hardware/hwvulkan.h> +#include <libsync.h> -#include "drm-uapi/drm_fourcc.h" - -#include "util/libsync.h" -#include "util/os_file.h" - -#include "tu_device.h" -#include "tu_image.h" +#include <vulkan/vk_android_native_buffer.h> +#include <vulkan/vk_icd.h> static int tu_hal_open(const struct hw_module_t *mod, @@ -42,7 +51,7 @@ PUBLIC struct hwvulkan_module_t HAL_MODULE_INFO_SYM = { .module_api_version = HWVULKAN_MODULE_API_VERSION_0_1, .hal_api_version = HARDWARE_MAKE_API_VERSION(1, 0), .id = HWVULKAN_HARDWARE_MODULE_ID, - .name = "Turnip Vulkan HAL", + .name = "AMD Vulkan HAL", .author = "Google", .methods = &(hw_module_methods_t){ @@ -97,161 +106,41 @@ tu_hal_close(struct hw_device_t *dev) return -1; } -/* get dma-buf and modifier from gralloc info */ -static VkResult -tu_gralloc_info_other(struct tu_device *device, +VkResult +tu_image_from_gralloc(VkDevice device_h, + const VkImageCreateInfo *base_info, const VkNativeBufferANDROID *gralloc_info, - int *dma_buf, - uint64_t *modifier) + const VkAllocationCallbacks *alloc, + VkImage *out_image_h) { - const uint32_t *handle_fds = (uint32_t *)gralloc_info->handle->data; - const uint32_t *handle_data = &handle_fds[gralloc_info->handle->numFds]; - bool ubwc = false; - - if (gralloc_info->handle->numFds == 1) { - /* gbm_gralloc. TODO: modifiers support */ - *dma_buf = handle_fds[0]; - } else if (gralloc_info->handle->numFds == 2) { - /* Qualcomm gralloc, find it at: - * - * https://android.googlesource.com/platform/hardware/qcom/display/. - * - * The gralloc_info->handle is a pointer to a struct private_handle_t - * from your platform's gralloc. On msm8996 (a5xx) and newer grallocs - * that's libgralloc1/gr_priv_handle.h, while previously it was - * libgralloc/gralloc_priv.h. 
- */ - - if (gralloc_info->handle->numInts < 2) { - return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, - "VkNativeBufferANDROID::handle::numInts is %d, " - "expected at least 2 for qcom gralloc", - gralloc_info->handle->numFds); - } + TU_FROM_HANDLE(tu_device, device, device_h); + VkImage image_h = VK_NULL_HANDLE; + struct tu_image *image = NULL; + struct tu_bo *bo = NULL; + VkResult result; - uint32_t gmsm = ('g' << 24) | ('m' << 16) | ('s' << 8) | 'm'; - if (handle_data[0] != gmsm) { - return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, - "private_handle_t::magic is %x, expected %x", - handle_data[0], gmsm); - } + result = tu_image_create( + device_h, + &(struct tu_image_create_info) { + .vk_info = base_info, .scanout = true, .no_metadata_planes = true }, + alloc, &image_h); - /* This UBWC flag was introduced in a5xx. */ - ubwc = handle_data[1] & 0x08000000; + if (result != VK_SUCCESS) + return result; - /* QCOM gralloc has two fds passed in: the actual GPU buffer, and a buffer - * of CPU-side metadata. I haven't found any need for the metadata buffer - * yet. See qdMetaData.h for what's in the metadata fd. - */ - *dma_buf = handle_fds[0]; - } else { - return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, + if (gralloc_info->handle->numFds != 1) { + return vk_errorf(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE, "VkNativeBufferANDROID::handle::numFds is %d, " - "expected 1 (gbm_gralloc) or 2 (qcom gralloc)", + "expected 1", gralloc_info->handle->numFds); } - *modifier = ubwc ? DRM_FORMAT_MOD_QCOM_COMPRESSED : DRM_FORMAT_MOD_LINEAR; - return VK_SUCCESS; -} - -static const char cros_gralloc_module_name[] = "CrOS Gralloc"; - -#define CROS_GRALLOC_DRM_GET_BUFFER_INFO 4 -#define CROS_GRALLOC_DRM_GET_USAGE 5 -#define CROS_GRALLOC_DRM_GET_USAGE_FRONT_RENDERING_BIT 0x1 - -struct cros_gralloc0_buffer_info { - uint32_t drm_fourcc; - int num_fds; - int fds[4]; - uint64_t modifier; - int offset[4]; - int stride[4]; -}; - -static VkResult -tu_gralloc_info_cros(struct tu_device *device, - const VkNativeBufferANDROID *gralloc_info, - int *dma_buf, - uint64_t *modifier) - -{ - const gralloc_module_t *gralloc = device->gralloc; - struct cros_gralloc0_buffer_info info; - int ret; - - ret = gralloc->perform(gralloc, CROS_GRALLOC_DRM_GET_BUFFER_INFO, - gralloc_info->handle, &info); - if (ret) - return VK_ERROR_INVALID_EXTERNAL_HANDLE; - - *dma_buf = info.fds[0]; - *modifier = info.modifier; - - return VK_SUCCESS; -} - -VkResult -tu_gralloc_info(struct tu_device *device, - const VkNativeBufferANDROID *gralloc_info, - int *dma_buf, - uint64_t *modifier) - -{ - if (!device->gralloc) { - /* get gralloc module for gralloc buffer info query */ - int ret = hw_get_module(GRALLOC_HARDWARE_MODULE_ID, - (const hw_module_t **)&device->gralloc); - - if (ret) { - /* This is *slightly* awkward, but if we are asked to import - * a gralloc handle, and there is no gralloc, it is some sort - * of invalid handle. 
- */ - return vk_startup_errorf(device->instance, - VK_ERROR_INVALID_EXTERNAL_HANDLE, - "Could not open gralloc\n"); - } - - const gralloc_module_t *gralloc = device->gralloc; - - mesa_logi("opened gralloc module name: %s", gralloc->common.name); - - /* TODO not sure qcom gralloc module name, but we should check - * for it here and move the special gmsm handling out of - * tu_gralloc_info_other() - */ - if (!strcmp(gralloc->common.name, cros_gralloc_module_name) && gralloc->perform) { - device->gralloc_type = TU_GRALLOC_CROS; - } else { - device->gralloc_type = TU_GRALLOC_OTHER; - } - } - - if (device->gralloc_type == TU_GRALLOC_CROS) { - return tu_gralloc_info_cros(device, gralloc_info, dma_buf, modifier); - } else { - return tu_gralloc_info_other(device, gralloc_info, dma_buf, modifier); - } -} - -/** - * Creates the VkImage using the gralloc handle in *gralloc_info. - * - * We support two different grallocs here, gbm_gralloc, and the qcom gralloc - * used on Android phones. - */ -VkResult -tu_import_memory_from_gralloc_handle(VkDevice device_h, - int dma_buf, - const VkAllocationCallbacks *alloc, - VkImage image_h) - -{ - struct tu_image *image = NULL; - VkResult result; + /* Do not close the gralloc handle's dma_buf. The lifetime of the dma_buf + * must exceed that of the gralloc handle, and we do not own the gralloc + * handle. + */ + int dma_buf = gralloc_info->handle->data[0]; image = tu_image_from_handle(image_h); @@ -264,52 +153,70 @@ tu_import_memory_from_gralloc_handle(VkDevice device_h, .image = image_h }; - const VkImportMemoryFdInfoKHR import_info = { - .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR, + const VkImportMemoryFdInfo import_info = { + .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO, .pNext = &ded_alloc, .handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT, - .fd = os_dupfd_cloexec(dma_buf), + .fd = dup(dma_buf), }; + /* Find the first VRAM memory type, or GART for PRIME images. */ + int memory_type_index = -1; + for (int i = 0; + i < device->physical_device->memory_properties.memoryTypeCount; ++i) { + bool is_local = + !!(device->physical_device->memory_properties.memoryTypes[i] + .propertyFlags & + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); + if (is_local) { + memory_type_index = i; + break; + } + } + + /* fallback */ + if (memory_type_index == -1) + memory_type_index = 0; result = tu_AllocateMemory(device_h, &(VkMemoryAllocateInfo) { .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, .pNext = &import_info, - .allocationSize = image->total_size, - .memoryTypeIndex = 0, + .allocationSize = image->size, + .memoryTypeIndex = memory_type_index, }, alloc, &memory_h); if (result != VK_SUCCESS) goto fail_create_image; - VkBindImageMemoryInfo bind_info = { - .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO, - .image = image_h, - .memory = memory_h, - .memoryOffset = 0, - }; - tu_BindImageMemory2(device_h, 1, &bind_info); + tu_BindImageMemory(device_h, image_h, memory_h, 0); image->owned_memory = memory_h; + /* Don't clobber the out-parameter until success is certain. 
*/ + *out_image_h = image_h; return VK_SUCCESS; fail_create_image: +fail_size: tu_DestroyImage(device_h, image_h, alloc); return result; } -static VkResult -format_supported_with_usage(VkDevice device_h, VkFormat format, - VkImageUsageFlags imageUsage) +VkResult +tu_GetSwapchainGrallocUsageANDROID(VkDevice device_h, + VkFormat format, + VkImageUsageFlags imageUsage, + int *grallocUsage) { TU_FROM_HANDLE(tu_device, device, device_h); struct tu_physical_device *phys_dev = device->physical_device; VkPhysicalDevice phys_dev_h = tu_physical_device_to_handle(phys_dev); VkResult result; + *grallocUsage = 0; + /* WARNING: Android Nougat's libvulkan.so hardcodes the VkImageUsageFlags * returned to applications via * VkSurfaceCapabilitiesKHR::supportedUsageFlags. @@ -340,19 +247,12 @@ format_supported_with_usage(VkDevice device_h, VkFormat format, result = tu_GetPhysicalDeviceImageFormatProperties2( phys_dev_h, &image_format_info, &image_format_props); if (result != VK_SUCCESS) { - return vk_errorf(device, result, + return vk_errorf(device->instance, result, "tu_GetPhysicalDeviceImageFormatProperties2 failed " "inside %s", __func__); } - return VK_SUCCESS; -} - -static VkResult -setup_gralloc0_usage(struct tu_device *device, VkFormat format, - VkImageUsageFlags imageUsage, int *grallocUsage) -{ if (unmask32(&imageUsage, VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT)) *grallocUsage |= GRALLOC_USAGE_HW_RENDER; @@ -367,7 +267,7 @@ setup_gralloc0_usage(struct tu_device *device, VkFormat format, * gralloc swapchains. */ if (imageUsage != 0) { - return vk_errorf(device, VK_ERROR_FORMAT_NOT_SUPPORTED, + return vk_errorf(device->instance, VK_ERROR_FORMAT_NOT_SUPPORTED, "unsupported VkImageUsageFlags(0x%x) for gralloc " "swapchain", imageUsage); @@ -390,66 +290,93 @@ setup_gralloc0_usage(struct tu_device *device, VkFormat format, return VK_SUCCESS; } -VKAPI_ATTR VkResult VKAPI_CALL -tu_GetSwapchainGrallocUsageANDROID(VkDevice device_h, - VkFormat format, - VkImageUsageFlags imageUsage, - int *grallocUsage) +VkResult +tu_AcquireImageANDROID(VkDevice device, + VkImage image_h, + int nativeFenceFd, + VkSemaphore semaphore, + VkFence fence) { - TU_FROM_HANDLE(tu_device, device, device_h); - VkResult result; + VkResult semaphore_result = VK_SUCCESS, fence_result = VK_SUCCESS; + + if (semaphore != VK_NULL_HANDLE) { + int semaphore_fd = + nativeFenceFd >= 0 ? dup(nativeFenceFd) : nativeFenceFd; + semaphore_result = tu_ImportSemaphoreFdKHR( + device, &(VkImportSemaphoreFdInfoKHR) { + .sType = VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_FD_INFO_KHR, + .flags = VK_SEMAPHORE_IMPORT_TEMPORARY_BIT, + .fd = semaphore_fd, + .semaphore = semaphore, + }); + } - result = format_supported_with_usage(device_h, format, imageUsage); - if (result != VK_SUCCESS) - return result; + if (fence != VK_NULL_HANDLE) { + int fence_fd = nativeFenceFd >= 0 ? 
dup(nativeFenceFd) : nativeFenceFd; + fence_result = tu_ImportFenceFdKHR( + device, &(VkImportFenceFdInfoKHR) { + .sType = VK_STRUCTURE_TYPE_IMPORT_FENCE_FD_INFO_KHR, + .flags = VK_FENCE_IMPORT_TEMPORARY_BIT, + .fd = fence_fd, + .fence = fence, + }); + } - *grallocUsage = 0; - return setup_gralloc0_usage(device, format, imageUsage, grallocUsage); + close(nativeFenceFd); + + if (semaphore_result != VK_SUCCESS) + return semaphore_result; + return fence_result; } -#if ANDROID_API_LEVEL >= 26 -VKAPI_ATTR VkResult VKAPI_CALL -tu_GetSwapchainGrallocUsage2ANDROID(VkDevice device_h, - VkFormat format, - VkImageUsageFlags imageUsage, - VkSwapchainImageUsageFlagsANDROID swapchainImageUsage, - uint64_t *grallocConsumerUsage, - uint64_t *grallocProducerUsage) +VkResult +tu_QueueSignalReleaseImageANDROID(VkQueue _queue, + uint32_t waitSemaphoreCount, + const VkSemaphore *pWaitSemaphores, + VkImage image, + int *pNativeFenceFd) { - TU_FROM_HANDLE(tu_device, device, device_h); - VkResult result; - - *grallocConsumerUsage = 0; - *grallocProducerUsage = 0; - mesa_logd("%s: format=%d, usage=0x%x", __func__, format, imageUsage); + TU_FROM_HANDLE(tu_queue, queue, _queue); + VkResult result = VK_SUCCESS; - result = format_supported_with_usage(device_h, format, imageUsage); - if (result != VK_SUCCESS) - return result; - - int32_t grallocUsage = 0; - result = setup_gralloc0_usage(device, format, imageUsage, &grallocUsage); - if (result != VK_SUCCESS) - return result; + if (waitSemaphoreCount == 0) { + if (pNativeFenceFd) + *pNativeFenceFd = -1; + return VK_SUCCESS; + } - /* Setup gralloc1 usage flags from gralloc0 flags. */ + int fd = -1; - if (grallocUsage & GRALLOC_USAGE_HW_RENDER) { - *grallocProducerUsage |= GRALLOC1_PRODUCER_USAGE_GPU_RENDER_TARGET; - *grallocConsumerUsage |= GRALLOC1_CONSUMER_USAGE_CLIENT_TARGET; - } + for (uint32_t i = 0; i < waitSemaphoreCount; ++i) { + int tmp_fd; + result = tu_GetSemaphoreFdKHR( + tu_device_to_handle(queue->device), + &(VkSemaphoreGetFdInfoKHR) { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR, + .handleType = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT, + .semaphore = pWaitSemaphores[i], + }, + &tmp_fd); + if (result != VK_SUCCESS) { + if (fd >= 0) + close(fd); + return result; + } - if (grallocUsage & GRALLOC_USAGE_HW_TEXTURE) { - *grallocConsumerUsage |= GRALLOC1_CONSUMER_USAGE_GPU_TEXTURE; + if (fd < 0) + fd = tmp_fd; + else if (tmp_fd >= 0) { + sync_accumulate("tu", &fd, tmp_fd); + close(tmp_fd); + } } - if (grallocUsage & (GRALLOC_USAGE_HW_FB | - GRALLOC_USAGE_HW_COMPOSER | - GRALLOC_USAGE_EXTERNAL_DISP)) { - *grallocProducerUsage |= GRALLOC1_PRODUCER_USAGE_GPU_RENDER_TARGET; - *grallocConsumerUsage |= GRALLOC1_CONSUMER_USAGE_HWCOMPOSER; + if (pNativeFenceFd) { + *pNativeFenceFd = fd; + } else if (fd >= 0) { + close(fd); + /* We still need to do the exports, to reset the semaphores, but + * otherwise we don't wait on them. */ } - return VK_SUCCESS; } -#endif diff --git a/lib/mesa/src/freedreno/vulkan/tu_cmd_buffer.c b/lib/mesa/src/freedreno/vulkan/tu_cmd_buffer.c index 0acb45d71..fe436e595 100644 --- a/lib/mesa/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/lib/mesa/src/freedreno/vulkan/tu_cmd_buffer.c @@ -1,879 +1,807 @@ /* * Copyright © 2016 Red Hat. 
* Copyright © 2016 Bas Nieuwenhuizen - * SPDX-License-Identifier: MIT * * based in part on anv driver which is: * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. */ -#include "tu_cmd_buffer.h" +#include "tu_private.h" + +#include "registers/adreno_pm4.xml.h" +#include "registers/adreno_common.xml.h" +#include "registers/a6xx.xml.h" -#include "vk_render_pass.h" -#include "vk_util.h" -#include "vk_common_entrypoints.h" +#include "vk_format.h" -#include "tu_clear_blit.h" #include "tu_cs.h" -#include "tu_image.h" -#include "tu_tracepoints.h" -static void -tu_clone_trace_range(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - struct u_trace_iterator begin, struct u_trace_iterator end) +void +tu_bo_list_init(struct tu_bo_list *list) { - if (u_trace_iterator_equal(begin, end)) - return; - - tu_cs_emit_wfi(cs); - tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0); - u_trace_clone_append(begin, end, &cmd->trace, cs, - tu_copy_timestamp_buffer); + list->count = list->capacity = 0; + list->bo_infos = NULL; } -static void -tu_clone_trace(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - struct u_trace *trace) +void +tu_bo_list_destroy(struct tu_bo_list *list) { - tu_clone_trace_range(cmd, cs, u_trace_begin_iterator(trace), - u_trace_end_iterator(trace)); + free(list->bo_infos); } void -tu6_emit_event_write(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - enum vgt_event_type event) +tu_bo_list_reset(struct tu_bo_list *list) { - bool need_seqno = false; - switch (event) { - case CACHE_FLUSH_TS: - case WT_DONE_TS: - case RB_DONE_TS: - case PC_CCU_FLUSH_DEPTH_TS: - case PC_CCU_FLUSH_COLOR_TS: - case PC_CCU_RESOLVE_TS: - need_seqno = true; - break; - default: - break; + list->count = 0; +} + +/** + * \a flags consists of MSM_SUBMIT_BO_FLAGS. + */ +static uint32_t +tu_bo_list_add_info(struct tu_bo_list *list, + const struct drm_msm_gem_submit_bo *bo_info) +{ + for (uint32_t i = 0; i < list->count; ++i) { + if (list->bo_infos[i].handle == bo_info->handle) { + assert(list->bo_infos[i].presumed == bo_info->presumed); + list->bo_infos[i].flags |= bo_info->flags; + return i; + } } - tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, need_seqno ? 
4 : 1); - tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(event)); - if (need_seqno) { - tu_cs_emit_qw(cs, global_iova(cmd, seqno_dummy)); - tu_cs_emit(cs, 0); + /* grow list->bo_infos if needed */ + if (list->count == list->capacity) { + uint32_t new_capacity = MAX2(2 * list->count, 16); + struct drm_msm_gem_submit_bo *new_bo_infos = realloc( + list->bo_infos, new_capacity * sizeof(struct drm_msm_gem_submit_bo)); + if (!new_bo_infos) + return TU_BO_LIST_FAILED; + list->bo_infos = new_bo_infos; + list->capacity = new_capacity; } + + list->bo_infos[list->count] = *bo_info; + return list->count++; } -/* Emits the tessfactor address to the top-level CS if it hasn't been already. - * Updating this register requires a WFI if outstanding drawing is using it, but - * tu6_init_hardware() will have WFIed before we started and no other draws - * could be using the tessfactor address yet since we only emit one per cmdbuf. - */ -static void -tu6_lazy_emit_tessfactor_addr(struct tu_cmd_buffer *cmd) +uint32_t +tu_bo_list_add(struct tu_bo_list *list, + const struct tu_bo *bo, + uint32_t flags) { - if (cmd->state.tessfactor_addr_set) - return; - - tu_cs_emit_regs(&cmd->cs, A6XX_PC_TESSFACTOR_ADDR(.qword = cmd->device->tess_bo->iova)); - /* Updating PC_TESSFACTOR_ADDR could race with the next draw which uses it. */ - cmd->state.cache.flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE; - cmd->state.tessfactor_addr_set = true; + return tu_bo_list_add_info(list, &(struct drm_msm_gem_submit_bo) { + .flags = flags, + .handle = bo->gem_handle, + .presumed = bo->iova, + }); } -static void -tu6_lazy_emit_vsc(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +VkResult +tu_bo_list_merge(struct tu_bo_list *list, const struct tu_bo_list *other) { - struct tu_device *dev = cmd->device; - - /* VSC buffers: - * use vsc pitches from the largest values used so far with this device - * if there hasn't been overflow, there will already be a scratch bo - * allocated for these sizes - * - * if overflow is detected, the stream size is increased by 2x - */ - mtx_lock(&dev->mutex); + for (uint32_t i = 0; i < other->count; i++) { + if (tu_bo_list_add_info(list, other->bo_infos + i) == TU_BO_LIST_FAILED) + return VK_ERROR_OUT_OF_HOST_MEMORY; + } - struct tu6_global *global = dev->global_bo->map; + return VK_SUCCESS; +} - uint32_t vsc_draw_overflow = global->vsc_draw_overflow; - uint32_t vsc_prim_overflow = global->vsc_prim_overflow; +static VkResult +tu_tiling_config_update_gmem_layout(struct tu_tiling_config *tiling, + const struct tu_device *dev) +{ + const uint32_t gmem_size = dev->physical_device->gmem_size; + uint32_t offset = 0; - if (vsc_draw_overflow >= dev->vsc_draw_strm_pitch) - dev->vsc_draw_strm_pitch = (dev->vsc_draw_strm_pitch - VSC_PAD) * 2 + VSC_PAD; + for (uint32_t i = 0; i < tiling->buffer_count; i++) { + /* 16KB-aligned */ + offset = align(offset, 0x4000); - if (vsc_prim_overflow >= dev->vsc_prim_strm_pitch) - dev->vsc_prim_strm_pitch = (dev->vsc_prim_strm_pitch - VSC_PAD) * 2 + VSC_PAD; + tiling->gmem_offsets[i] = offset; + offset += tiling->tile0.extent.width * tiling->tile0.extent.height * + tiling->buffer_cpp[i]; + } - cmd->vsc_prim_strm_pitch = dev->vsc_prim_strm_pitch; - cmd->vsc_draw_strm_pitch = dev->vsc_draw_strm_pitch; + return offset <= gmem_size ? 
VK_SUCCESS : VK_ERROR_OUT_OF_DEVICE_MEMORY; +} - mtx_unlock(&dev->mutex); +static void +tu_tiling_config_update_tile_layout(struct tu_tiling_config *tiling, + const struct tu_device *dev) +{ + const uint32_t tile_align_w = dev->physical_device->tile_align_w; + const uint32_t tile_align_h = dev->physical_device->tile_align_h; + const uint32_t max_tile_width = 1024; /* A6xx */ + + tiling->tile0.offset = (VkOffset2D) { + .x = tiling->render_area.offset.x & ~(tile_align_w - 1), + .y = tiling->render_area.offset.y & ~(tile_align_h - 1), + }; + + const uint32_t ra_width = + tiling->render_area.extent.width + + (tiling->render_area.offset.x - tiling->tile0.offset.x); + const uint32_t ra_height = + tiling->render_area.extent.height + + (tiling->render_area.offset.y - tiling->tile0.offset.y); + + /* start from 1 tile */ + tiling->tile_count = (VkExtent2D) { + .width = 1, + .height = 1, + }; + tiling->tile0.extent = (VkExtent2D) { + .width = align(ra_width, tile_align_w), + .height = align(ra_height, tile_align_h), + }; + + /* do not exceed max tile width */ + while (tiling->tile0.extent.width > max_tile_width) { + tiling->tile_count.width++; + tiling->tile0.extent.width = + align(ra_width / tiling->tile_count.width, tile_align_w); + } + + /* do not exceed gmem size */ + while (tu_tiling_config_update_gmem_layout(tiling, dev) != VK_SUCCESS) { + if (tiling->tile0.extent.width > tiling->tile0.extent.height) { + tiling->tile_count.width++; + tiling->tile0.extent.width = + align(ra_width / tiling->tile_count.width, tile_align_w); + } else { + tiling->tile_count.height++; + tiling->tile0.extent.height = + align(ra_height / tiling->tile_count.height, tile_align_h); + } + } +} - struct tu_bo *vsc_bo; - uint32_t size0 = cmd->vsc_prim_strm_pitch * MAX_VSC_PIPES + - cmd->vsc_draw_strm_pitch * MAX_VSC_PIPES; +static void +tu_tiling_config_update_pipe_layout(struct tu_tiling_config *tiling, + const struct tu_device *dev) +{ + const uint32_t max_pipe_count = 32; /* A6xx */ - tu_get_scratch_bo(dev, size0 + MAX_VSC_PIPES * 4, &vsc_bo); + /* start from 1 tile per pipe */ + tiling->pipe0 = (VkExtent2D) { + .width = 1, + .height = 1, + }; + tiling->pipe_count = tiling->tile_count; - tu_cs_emit_regs(cs, - A6XX_VSC_DRAW_STRM_SIZE_ADDRESS(.bo = vsc_bo, .bo_offset = size0)); - tu_cs_emit_regs(cs, - A6XX_VSC_PRIM_STRM_ADDRESS(.bo = vsc_bo)); - tu_cs_emit_regs(cs, - A6XX_VSC_DRAW_STRM_ADDRESS(.bo = vsc_bo, - .bo_offset = cmd->vsc_prim_strm_pitch * MAX_VSC_PIPES)); + /* do not exceed max pipe count vertically */ + while (tiling->pipe_count.height > max_pipe_count) { + tiling->pipe0.height += 2; + tiling->pipe_count.height = + (tiling->tile_count.height + tiling->pipe0.height - 1) / + tiling->pipe0.height; + } - cmd->vsc_initialized = true; + /* do not exceed max pipe count */ + while (tiling->pipe_count.width * tiling->pipe_count.height > + max_pipe_count) { + tiling->pipe0.width += 1; + tiling->pipe_count.width = + (tiling->tile_count.width + tiling->pipe0.width - 1) / + tiling->pipe0.width; + } } static void -tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer, - struct tu_cs *cs, - enum tu_cmd_flush_bits flushes) +tu_tiling_config_update_pipes(struct tu_tiling_config *tiling, + const struct tu_device *dev) { - if (unlikely(cmd_buffer->device->physical_device->instance->debug_flags & TU_DEBUG_FLUSHALL)) - flushes |= TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_ALL_INVALIDATE; - - if (unlikely(cmd_buffer->device->physical_device->instance->debug_flags & TU_DEBUG_SYNCDRAW)) - flushes |= TU_CMD_FLAG_WAIT_MEM_WRITES | - 
TU_CMD_FLAG_WAIT_FOR_IDLE | - TU_CMD_FLAG_WAIT_FOR_ME; - - /* Experiments show that invalidating CCU while it still has data in it - * doesn't work, so make sure to always flush before invalidating in case - * any data remains that hasn't yet been made available through a barrier. - * However it does seem to work for UCHE. - */ - if (flushes & (TU_CMD_FLAG_CCU_FLUSH_COLOR | - TU_CMD_FLAG_CCU_INVALIDATE_COLOR)) - tu6_emit_event_write(cmd_buffer, cs, PC_CCU_FLUSH_COLOR_TS); - if (flushes & (TU_CMD_FLAG_CCU_FLUSH_DEPTH | - TU_CMD_FLAG_CCU_INVALIDATE_DEPTH)) - tu6_emit_event_write(cmd_buffer, cs, PC_CCU_FLUSH_DEPTH_TS); - if (flushes & TU_CMD_FLAG_CCU_INVALIDATE_COLOR) - tu6_emit_event_write(cmd_buffer, cs, PC_CCU_INVALIDATE_COLOR); - if (flushes & TU_CMD_FLAG_CCU_INVALIDATE_DEPTH) - tu6_emit_event_write(cmd_buffer, cs, PC_CCU_INVALIDATE_DEPTH); - if (flushes & TU_CMD_FLAG_CACHE_FLUSH) - tu6_emit_event_write(cmd_buffer, cs, CACHE_FLUSH_TS); - if (flushes & TU_CMD_FLAG_CACHE_INVALIDATE) - tu6_emit_event_write(cmd_buffer, cs, CACHE_INVALIDATE); - if (flushes & TU_CMD_FLAG_WAIT_MEM_WRITES) - tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); - if ((flushes & TU_CMD_FLAG_WAIT_FOR_IDLE) || - (cmd_buffer->device->physical_device->info->a6xx.has_ccu_flush_bug && - (flushes & (TU_CMD_FLAG_CCU_FLUSH_COLOR | TU_CMD_FLAG_CCU_FLUSH_DEPTH)))) - tu_cs_emit_wfi(cs); - if (flushes & TU_CMD_FLAG_WAIT_FOR_ME) - tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0); -} + const uint32_t max_pipe_count = 32; /* A6xx */ + const uint32_t used_pipe_count = + tiling->pipe_count.width * tiling->pipe_count.height; + const VkExtent2D last_pipe = { + .width = tiling->tile_count.width % tiling->pipe0.width, + .height = tiling->tile_count.height % tiling->pipe0.height, + }; + + assert(used_pipe_count <= max_pipe_count); + assert(max_pipe_count <= ARRAY_SIZE(tiling->pipe_config)); + + for (uint32_t y = 0; y < tiling->pipe_count.height; y++) { + for (uint32_t x = 0; x < tiling->pipe_count.width; x++) { + const uint32_t pipe_x = tiling->pipe0.width * x; + const uint32_t pipe_y = tiling->pipe0.height * y; + const uint32_t pipe_w = (x == tiling->pipe_count.width - 1) + ? last_pipe.width + : tiling->pipe0.width; + const uint32_t pipe_h = (y == tiling->pipe_count.height - 1) + ? 
last_pipe.height + : tiling->pipe0.height; + const uint32_t n = tiling->pipe_count.width * y + x; + + tiling->pipe_config[n] = A6XX_VSC_PIPE_CONFIG_REG_X(pipe_x) | + A6XX_VSC_PIPE_CONFIG_REG_Y(pipe_y) | + A6XX_VSC_PIPE_CONFIG_REG_W(pipe_w) | + A6XX_VSC_PIPE_CONFIG_REG_H(pipe_h); + tiling->pipe_sizes[n] = CP_SET_BIN_DATA5_0_VSC_SIZE(pipe_w * pipe_h); + } + } -/* "Normal" cache flushes, that don't require any special handling */ + memset(tiling->pipe_config + used_pipe_count, 0, + sizeof(uint32_t) * (max_pipe_count - used_pipe_count)); +} static void -tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer, - struct tu_cs *cs) -{ - tu6_emit_flushes(cmd_buffer, cs, cmd_buffer->state.cache.flush_bits); - cmd_buffer->state.cache.flush_bits = 0; +tu_tiling_config_update(struct tu_tiling_config *tiling, + const struct tu_device *dev, + const uint32_t *buffer_cpp, + uint32_t buffer_count, + const VkRect2D *render_area) +{ + /* see if there is any real change */ + const bool ra_changed = + render_area && + memcmp(&tiling->render_area, render_area, sizeof(*render_area)); + const bool buf_changed = tiling->buffer_count != buffer_count || + memcmp(tiling->buffer_cpp, buffer_cpp, + sizeof(*buffer_cpp) * buffer_count); + if (!ra_changed && !buf_changed) + return; + + if (ra_changed) + tiling->render_area = *render_area; + + if (buf_changed) { + memcpy(tiling->buffer_cpp, buffer_cpp, + sizeof(*buffer_cpp) * buffer_count); + tiling->buffer_count = buffer_count; + } + + tu_tiling_config_update_tile_layout(tiling, dev); + tu_tiling_config_update_pipe_layout(tiling, dev); + tu_tiling_config_update_pipes(tiling, dev); } -/* Renderpass cache flushes */ +static void +tu_tiling_config_get_tile(const struct tu_tiling_config *tiling, + const struct tu_device *dev, + uint32_t tx, + uint32_t ty, + struct tu_tile *tile) +{ + /* find the pipe and the slot for tile (tx, ty) */ + const uint32_t px = tx / tiling->pipe0.width; + const uint32_t py = ty / tiling->pipe0.height; + const uint32_t sx = tx - tiling->pipe0.width * px; + const uint32_t sy = ty - tiling->pipe0.height * py; + + assert(tx < tiling->tile_count.width && ty < tiling->tile_count.height); + assert(px < tiling->pipe_count.width && py < tiling->pipe_count.height); + assert(sx < tiling->pipe0.width && sy < tiling->pipe0.height); + + /* convert to 1D indices */ + tile->pipe = tiling->pipe_count.width * py + px; + tile->slot = tiling->pipe0.width * sy + sx; + + /* get the blit area for the tile */ + tile->begin = (VkOffset2D) { + .x = tiling->tile0.offset.x + tiling->tile0.extent.width * tx, + .y = tiling->tile0.offset.y + tiling->tile0.extent.height * ty, + }; + tile->end.x = + (tx == tiling->tile_count.width - 1) + ? tiling->render_area.offset.x + tiling->render_area.extent.width + : tile->begin.x + tiling->tile0.extent.width; + tile->end.y = + (ty == tiling->tile_count.height - 1) + ? 
tiling->render_area.offset.y + tiling->render_area.extent.height + : tile->begin.y + tiling->tile0.extent.height; +} + +static enum a3xx_msaa_samples +tu6_msaa_samples(uint32_t samples) +{ + switch (samples) { + case 1: + return MSAA_ONE; + case 2: + return MSAA_TWO; + case 4: + return MSAA_FOUR; + case 8: + return MSAA_EIGHT; + default: + assert(!"invalid sample count"); + return MSAA_ONE; + } +} -void -tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer, - struct tu_cs *cs) +static enum a4xx_index_size +tu6_index_size(VkIndexType type) { - if (!cmd_buffer->state.renderpass_cache.flush_bits && - likely(!cmd_buffer->device->physical_device->instance->debug_flags)) - return; - tu6_emit_flushes(cmd_buffer, cs, cmd_buffer->state.renderpass_cache.flush_bits); - cmd_buffer->state.renderpass_cache.flush_bits = 0; + switch (type) { + case VK_INDEX_TYPE_UINT16: + return INDEX4_SIZE_16_BIT; + case VK_INDEX_TYPE_UINT32: + return INDEX4_SIZE_32_BIT; + default: + unreachable("invalid VkIndexType"); + return INDEX4_SIZE_8_BIT; + } } -/* Cache flushes for things that use the color/depth read/write path (i.e. - * blits and draws). This deals with changing CCU state as well as the usual - * cache flushing. - */ +static void +tu6_emit_marker(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + tu_cs_emit_write_reg(cs, cmd->marker_reg, ++cmd->marker_seqno); +} void -tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer, - struct tu_cs *cs, - enum tu_cmd_ccu_state ccu_state) +tu6_emit_event_write(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + enum vgt_event_type event, + bool need_seqno) { - enum tu_cmd_flush_bits flushes = cmd_buffer->state.cache.flush_bits; + tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, need_seqno ? 4 : 1); + tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(event)); + if (need_seqno) { + tu_cs_emit_qw(cs, cmd->scratch_bo.iova); + tu_cs_emit(cs, ++cmd->scratch_seqno); + } +} - assert(ccu_state != TU_CMD_CCU_UNKNOWN); - /* It's unsafe to flush inside condition because we clear flush_bits */ - assert(!cs->cond_stack_depth); +static void +tu6_emit_cache_flush(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + tu6_emit_event_write(cmd, cs, 0x31, false); +} - /* Changing CCU state must involve invalidating the CCU. In sysmem mode, - * the CCU may also contain data that we haven't flushed out yet, so we - * also need to flush. Also, in order to program RB_CCU_CNTL, we need to - * emit a WFI as it isn't pipelined. - */ - if (ccu_state != cmd_buffer->state.ccu_state) { - if (cmd_buffer->state.ccu_state != TU_CMD_CCU_GMEM) { - flushes |= - TU_CMD_FLAG_CCU_FLUSH_COLOR | - TU_CMD_FLAG_CCU_FLUSH_DEPTH; - cmd_buffer->state.cache.pending_flush_bits &= ~( - TU_CMD_FLAG_CCU_FLUSH_COLOR | - TU_CMD_FLAG_CCU_FLUSH_DEPTH); - } - flushes |= - TU_CMD_FLAG_CCU_INVALIDATE_COLOR | - TU_CMD_FLAG_CCU_INVALIDATE_DEPTH | - TU_CMD_FLAG_WAIT_FOR_IDLE; - cmd_buffer->state.cache.pending_flush_bits &= ~( - TU_CMD_FLAG_CCU_INVALIDATE_COLOR | - TU_CMD_FLAG_CCU_INVALIDATE_DEPTH | - TU_CMD_FLAG_WAIT_FOR_IDLE); - } +static void +tu6_emit_lrz_flush(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + tu6_emit_event_write(cmd, cs, LRZ_FLUSH, false); +} - tu6_emit_flushes(cmd_buffer, cs, flushes); - cmd_buffer->state.cache.flush_bits = 0; - - if (ccu_state != cmd_buffer->state.ccu_state) { - struct tu_physical_device *phys_dev = cmd_buffer->device->physical_device; - tu_cs_emit_regs(cs, - A6XX_RB_CCU_CNTL(.color_offset = - ccu_state == TU_CMD_CCU_GMEM ? 
- phys_dev->ccu_offset_gmem : - phys_dev->ccu_offset_bypass, - .gmem = ccu_state == TU_CMD_CCU_GMEM)); - cmd_buffer->state.ccu_state = ccu_state; +static void +tu6_emit_wfi(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + if (cmd->wait_for_idle) { + tu_cs_emit_wfi(cs); + cmd->wait_for_idle = false; } } static void -tu6_emit_zs(struct tu_cmd_buffer *cmd, - const struct tu_subpass *subpass, - struct tu_cs *cs) +tu6_emit_zs(struct tu_cmd_buffer *cmd, struct tu_cs *cs) { + const struct tu_subpass *subpass = cmd->state.subpass; + const uint32_t a = subpass->depth_stencil_attachment.attachment; if (a == VK_ATTACHMENT_UNUSED) { - tu_cs_emit_regs(cs, - A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE), - A6XX_RB_DEPTH_BUFFER_PITCH(0), - A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(0), - A6XX_RB_DEPTH_BUFFER_BASE(0), - A6XX_RB_DEPTH_BUFFER_BASE_GMEM(0)); + tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_BUFFER_INFO, 6); + tu_cs_emit(cs, A6XX_RB_DEPTH_BUFFER_INFO_DEPTH_FORMAT(DEPTH6_NONE)); + tu_cs_emit(cs, 0x00000000); /* RB_DEPTH_BUFFER_PITCH */ + tu_cs_emit(cs, 0x00000000); /* RB_DEPTH_BUFFER_ARRAY_PITCH */ + tu_cs_emit(cs, 0x00000000); /* RB_DEPTH_BUFFER_BASE_LO */ + tu_cs_emit(cs, 0x00000000); /* RB_DEPTH_BUFFER_BASE_HI */ + tu_cs_emit(cs, 0x00000000); /* RB_DEPTH_BUFFER_BASE_GMEM */ + + tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_DEPTH_BUFFER_INFO, 1); + tu_cs_emit(cs, + A6XX_GRAS_SU_DEPTH_BUFFER_INFO_DEPTH_FORMAT(DEPTH6_NONE)); - tu_cs_emit_regs(cs, - A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE)); + tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_LRZ_BUFFER_BASE_LO, 5); + tu_cs_emit(cs, 0x00000000); /* RB_DEPTH_FLAG_BUFFER_BASE_LO */ + tu_cs_emit(cs, 0x00000000); /* RB_DEPTH_FLAG_BUFFER_BASE_HI */ + tu_cs_emit(cs, 0x00000000); /* GRAS_LRZ_BUFFER_PITCH */ + tu_cs_emit(cs, 0x00000000); /* GRAS_LRZ_FAST_CLEAR_BUFFER_BASE_LO */ + tu_cs_emit(cs, 0x00000000); /* GRAS_LRZ_FAST_CLEAR_BUFFER_BASE_HI */ - tu_cs_emit_regs(cs, A6XX_RB_STENCIL_INFO(0)); + tu_cs_emit_pkt4(cs, REG_A6XX_RB_STENCIL_INFO, 1); + tu_cs_emit(cs, 0x00000000); /* RB_STENCIL_INFO */ return; } - const struct tu_image_view *iview = cmd->state.attachments[a]; - const struct tu_render_pass_attachment *attachment = - &cmd->state.pass->attachments[a]; - enum a6xx_depth_format fmt = tu6_pipe2depth(attachment->format); - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_BUFFER_INFO, 6); - tu_cs_emit(cs, A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = fmt).value); - if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) - tu_cs_image_depth_ref(cs, iview, 0); - else - tu_cs_image_ref(cs, &iview->view, 0); - tu_cs_emit(cs, tu_attachment_gmem_offset(cmd, attachment)); - - tu_cs_emit_regs(cs, - A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt)); - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_FLAG_BUFFER_BASE, 3); - tu_cs_image_flag_ref(cs, &iview->view, 0); - - if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT || - attachment->format == VK_FORMAT_S8_UINT) { - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_STENCIL_INFO, 6); - tu_cs_emit(cs, A6XX_RB_STENCIL_INFO(.separate_stencil = true).value); - if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { - tu_cs_image_stencil_ref(cs, iview, 0); - tu_cs_emit(cs, tu_attachment_gmem_offset_stencil(cmd, attachment)); - } else { - tu_cs_image_ref(cs, &iview->view, 0); - tu_cs_emit(cs, tu_attachment_gmem_offset(cmd, attachment)); - } - } else { - tu_cs_emit_regs(cs, - A6XX_RB_STENCIL_INFO(0)); - } + /* enable zs? 
*/ } static void -tu6_emit_mrt(struct tu_cmd_buffer *cmd, - const struct tu_subpass *subpass, - struct tu_cs *cs) +tu6_emit_mrt(struct tu_cmd_buffer *cmd, struct tu_cs *cs) { const struct tu_framebuffer *fb = cmd->state.framebuffer; + const struct tu_subpass *subpass = cmd->state.subpass; + const struct tu_tiling_config *tiling = &cmd->state.tiling_config; + unsigned char mrt_comp[MAX_RTS] = { 0 }; + unsigned srgb_cntl = 0; - enum a6xx_format mrt0_format = 0; - + uint32_t gmem_index = 0; for (uint32_t i = 0; i < subpass->color_count; ++i) { uint32_t a = subpass->color_attachments[i].attachment; - if (a == VK_ATTACHMENT_UNUSED) { - /* From the VkPipelineRenderingCreateInfo definition: - * - * Valid formats indicate that an attachment can be used - but it - * is still valid to set the attachment to NULL when beginning - * rendering. - * - * This means that with dynamic rendering, pipelines may write to - * some attachments that are UNUSED here. Setting the format to 0 - * here should prevent them from writing to anything. This also seems - * to also be required for alpha-to-coverage which can use the alpha - * value for an otherwise-unused attachment. - */ - tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(i), 6); - for (unsigned i = 0; i < 6; i++) - tu_cs_emit(cs, 0); - - tu_cs_emit_regs(cs, - A6XX_SP_FS_MRT_REG(i, .dword = 0)); + if (a == VK_ATTACHMENT_UNUSED) continue; - } - const struct tu_image_view *iview = cmd->state.attachments[a]; + const struct tu_image_view *iview = fb->attachments[a].attachment; + const struct tu_image_level *slice = + &iview->image->levels[iview->base_mip]; + const enum a6xx_tile_mode tile_mode = TILE6_LINEAR; + uint32_t stride = 0; + uint32_t offset = 0; - tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(i), 6); - tu_cs_emit(cs, iview->view.RB_MRT_BUF_INFO); - tu_cs_image_ref(cs, &iview->view, 0); - tu_cs_emit(cs, tu_attachment_gmem_offset(cmd, &cmd->state.pass->attachments[a])); + mrt_comp[i] = 0xf; - tu_cs_emit_regs(cs, - A6XX_SP_FS_MRT_REG(i, .dword = iview->view.SP_FS_MRT_REG)); + if (vk_format_is_srgb(iview->vk_format)) + srgb_cntl |= (1 << i); - tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER_ADDR(i), 3); - tu_cs_image_flag_ref(cs, &iview->view, 0); + const struct tu_native_format *format = + tu6_get_native_format(iview->vk_format); + assert(format && format->rb >= 0); - if (i == 0) - mrt0_format = iview->view.SP_FS_MRT_REG & 0xff; - } + offset = slice->offset + slice->size * iview->base_layer; + stride = slice->pitch * vk_format_get_blocksize(iview->vk_format); + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(i), 6); + tu_cs_emit(cs, A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT(format->rb) | + A6XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(tile_mode) | + A6XX_RB_MRT_BUF_INFO_COLOR_SWAP(format->swap)); + tu_cs_emit(cs, A6XX_RB_MRT_PITCH(stride)); + tu_cs_emit(cs, A6XX_RB_MRT_ARRAY_PITCH(slice->size)); + tu_cs_emit_qw(cs, iview->image->bo->iova + iview->image->bo_offset + + offset); /* BASE_LO/HI */ + tu_cs_emit( + cs, tiling->gmem_offsets[gmem_index++]); /* RB_MRT[i].BASE_GMEM */ + + tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_MRT_REG(i), 1); + tu_cs_emit(cs, A6XX_SP_FS_MRT_REG_COLOR_FORMAT(format->rb)); + +#if 0 + /* when we support UBWC, these would be the system memory + * addr/pitch/etc: + */ + tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(i), 4); + tu_cs_emit(cs, 0x00000000); /* RB_MRT_FLAG_BUFFER[i].ADDR_LO */ + tu_cs_emit(cs, 0x00000000); /* RB_MRT_FLAG_BUFFER[i].ADDR_HI */ + tu_cs_emit(cs, A6XX_RB_MRT_FLAG_BUFFER_PITCH(0)); + tu_cs_emit(cs, A6XX_RB_MRT_FLAG_BUFFER_ARRAY_PITCH(0)); 
+#endif + } + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_SRGB_CNTL, 1); + tu_cs_emit(cs, srgb_cntl); + + tu_cs_emit_pkt4(cs, REG_A6XX_SP_SRGB_CNTL, 1); + tu_cs_emit(cs, srgb_cntl); + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_COMPONENTS, 1); + tu_cs_emit(cs, A6XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) | + A6XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) | + A6XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) | + A6XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) | + A6XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) | + A6XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) | + A6XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) | + A6XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7])); + + tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_RENDER_COMPONENTS, 1); + tu_cs_emit(cs, A6XX_SP_FS_RENDER_COMPONENTS_RT0(mrt_comp[0]) | + A6XX_SP_FS_RENDER_COMPONENTS_RT1(mrt_comp[1]) | + A6XX_SP_FS_RENDER_COMPONENTS_RT2(mrt_comp[2]) | + A6XX_SP_FS_RENDER_COMPONENTS_RT3(mrt_comp[3]) | + A6XX_SP_FS_RENDER_COMPONENTS_RT4(mrt_comp[4]) | + A6XX_SP_FS_RENDER_COMPONENTS_RT5(mrt_comp[5]) | + A6XX_SP_FS_RENDER_COMPONENTS_RT6(mrt_comp[6]) | + A6XX_SP_FS_RENDER_COMPONENTS_RT7(mrt_comp[7])); +} - tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_MRT_BUF_INFO_0(.color_format = mrt0_format)); +static void +tu6_emit_msaa(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + const struct tu_subpass *subpass = cmd->state.subpass; + const enum a3xx_msaa_samples samples = + tu6_msaa_samples(subpass->max_sample_count); + + tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_RAS_MSAA_CNTL, 2); + tu_cs_emit(cs, A6XX_SP_TP_RAS_MSAA_CNTL_SAMPLES(samples)); + tu_cs_emit( + cs, A6XX_SP_TP_DEST_MSAA_CNTL_SAMPLES(samples) | + ((samples == MSAA_ONE) ? A6XX_SP_TP_DEST_MSAA_CNTL_MSAA_DISABLE + : 0)); - tu_cs_emit_regs(cs, - A6XX_RB_SRGB_CNTL(.dword = subpass->srgb_cntl)); - tu_cs_emit_regs(cs, - A6XX_SP_SRGB_CNTL(.dword = subpass->srgb_cntl)); + tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_RAS_MSAA_CNTL, 2); + tu_cs_emit(cs, A6XX_GRAS_RAS_MSAA_CNTL_SAMPLES(samples)); + tu_cs_emit( + cs, + A6XX_GRAS_DEST_MSAA_CNTL_SAMPLES(samples) | + ((samples == MSAA_ONE) ? A6XX_GRAS_DEST_MSAA_CNTL_MSAA_DISABLE : 0)); - unsigned layers = MAX2(fb->layers, util_logbase2(subpass->multiview_mask) + 1); - tu_cs_emit_regs(cs, A6XX_GRAS_MAX_LAYER_INDEX(layers - 1)); + tu_cs_emit_pkt4(cs, REG_A6XX_RB_RAS_MSAA_CNTL, 2); + tu_cs_emit(cs, A6XX_RB_RAS_MSAA_CNTL_SAMPLES(samples)); + tu_cs_emit( + cs, + A6XX_RB_DEST_MSAA_CNTL_SAMPLES(samples) | + ((samples == MSAA_ONE) ? A6XX_RB_DEST_MSAA_CNTL_MSAA_DISABLE : 0)); + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_MSAA_CNTL, 1); + tu_cs_emit(cs, A6XX_RB_MSAA_CNTL_SAMPLES(samples)); } static void -tu6_emit_bin_size(struct tu_cs *cs, - uint32_t bin_w, uint32_t bin_h, uint32_t flags) +tu6_emit_bin_size(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t flags) { - tu_cs_emit_regs(cs, - A6XX_GRAS_BIN_CONTROL(.binw = bin_w, - .binh = bin_h, - .dword = flags)); + const struct tu_tiling_config *tiling = &cmd->state.tiling_config; + const uint32_t bin_w = tiling->tile0.extent.width; + const uint32_t bin_h = tiling->tile0.extent.height; + + tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_BIN_CONTROL, 1); + tu_cs_emit(cs, A6XX_GRAS_BIN_CONTROL_BINW(bin_w) | + A6XX_GRAS_BIN_CONTROL_BINH(bin_h) | flags); - tu_cs_emit_regs(cs, - A6XX_RB_BIN_CONTROL(.binw = bin_w, - .binh = bin_h, - .dword = flags)); + tu_cs_emit_pkt4(cs, REG_A6XX_RB_BIN_CONTROL, 1); + tu_cs_emit(cs, A6XX_RB_BIN_CONTROL_BINW(bin_w) | + A6XX_RB_BIN_CONTROL_BINH(bin_h) | flags); /* no flag for RB_BIN_CONTROL2... 
*/ - tu_cs_emit_regs(cs, - A6XX_RB_BIN_CONTROL2(.binw = bin_w, - .binh = bin_h)); + tu_cs_emit_pkt4(cs, REG_A6XX_RB_BIN_CONTROL2, 1); + tu_cs_emit(cs, A6XX_RB_BIN_CONTROL2_BINW(bin_w) | + A6XX_RB_BIN_CONTROL2_BINH(bin_h)); } static void tu6_emit_render_cntl(struct tu_cmd_buffer *cmd, - const struct tu_subpass *subpass, struct tu_cs *cs, bool binning) { - /* doesn't RB_RENDER_CNTL set differently for binning pass: */ - bool no_track = !cmd->device->physical_device->info->a6xx.has_cp_reg_write; uint32_t cntl = 0; - cntl |= A6XX_RB_RENDER_CNTL_CCUSINGLECACHELINESIZE(2); - if (binning) { - if (no_track) - return; + cntl |= A6XX_RB_RENDER_CNTL_UNK4; + if (binning) cntl |= A6XX_RB_RENDER_CNTL_BINNING; - } else { - uint32_t mrts_ubwc_enable = 0; - for (uint32_t i = 0; i < subpass->color_count; ++i) { - uint32_t a = subpass->color_attachments[i].attachment; - if (a == VK_ATTACHMENT_UNUSED) - continue; - - const struct tu_image_view *iview = cmd->state.attachments[a]; - if (iview->view.ubwc_enabled) - mrts_ubwc_enable |= 1 << i; - } - - cntl |= A6XX_RB_RENDER_CNTL_FLAG_MRTS(mrts_ubwc_enable); - - const uint32_t a = subpass->depth_stencil_attachment.attachment; - if (a != VK_ATTACHMENT_UNUSED) { - const struct tu_image_view *iview = cmd->state.attachments[a]; - if (iview->view.ubwc_enabled) - cntl |= A6XX_RB_RENDER_CNTL_FLAG_DEPTH; - } - - if (no_track) { - tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_CNTL, 1); - tu_cs_emit(cs, cntl); - return; - } - - /* In the !binning case, we need to set RB_RENDER_CNTL in the draw_cs - * in order to set it correctly for the different subpasses. However, - * that means the packets we're emitting also happen during binning. So - * we need to guard the write on !BINNING at CP execution time. - */ - tu_cs_reserve(cs, 3 + 4); - tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2); - tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) | - CP_COND_REG_EXEC_0_GMEM | CP_COND_REG_EXEC_0_SYSMEM); - tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(4)); - } tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3); - tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_RENDER_CNTL)); + tu_cs_emit(cs, 0x2); tu_cs_emit(cs, REG_A6XX_RB_RENDER_CNTL); tu_cs_emit(cs, cntl); } static void -tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align) +tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs) { - struct tu_physical_device *phys_dev = cmd->device->physical_device; - const VkRect2D *render_area = &cmd->state.render_area; - - /* Avoid assertion fails with an empty render area at (0, 0) where the - * subtraction below wraps around. Empty render areas should be forced to - * the sysmem path by use_sysmem_rendering(). It's not even clear whether - * an empty scissor here works, and the blob seems to force sysmem too as - * it sets something wrong (non-empty) for the scissor. 
- */ - if (render_area->extent.width == 0 || - render_area->extent.height == 0) - return; - - uint32_t x1 = render_area->offset.x; - uint32_t y1 = render_area->offset.y; - uint32_t x2 = x1 + render_area->extent.width - 1; - uint32_t y2 = y1 + render_area->extent.height - 1; - - if (align) { - x1 = x1 & ~(phys_dev->info->gmem_align_w - 1); - y1 = y1 & ~(phys_dev->info->gmem_align_h - 1); - x2 = ALIGN_POT(x2 + 1, phys_dev->info->gmem_align_w) - 1; - y2 = ALIGN_POT(y2 + 1, phys_dev->info->gmem_align_h) - 1; - } + const VkRect2D *render_area = &cmd->state.tiling_config.render_area; + const uint32_t x1 = render_area->offset.x; + const uint32_t y1 = render_area->offset.y; + const uint32_t x2 = x1 + render_area->extent.width - 1; + const uint32_t y2 = y1 + render_area->extent.height - 1; - tu_cs_emit_regs(cs, - A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1), - A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2)); + tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2); + tu_cs_emit(cs, + A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1)); + tu_cs_emit(cs, + A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2)); } -void -tu6_emit_window_scissor(struct tu_cs *cs, - uint32_t x1, - uint32_t y1, - uint32_t x2, - uint32_t y2) -{ - tu_cs_emit_regs(cs, - A6XX_GRAS_SC_WINDOW_SCISSOR_TL(.x = x1, .y = y1), - A6XX_GRAS_SC_WINDOW_SCISSOR_BR(.x = x2, .y = y2)); - - tu_cs_emit_regs(cs, - A6XX_GRAS_2D_RESOLVE_CNTL_1(.x = x1, .y = y1), - A6XX_GRAS_2D_RESOLVE_CNTL_2(.x = x2, .y = y2)); +static void +tu6_emit_blit_info(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + const struct tu_image_view *iview, + uint32_t gmem_offset, + uint32_t blit_info) +{ + const struct tu_image_level *slice = + &iview->image->levels[iview->base_mip]; + const uint32_t offset = slice->offset + slice->size * iview->base_layer; + const uint32_t stride = + slice->pitch * vk_format_get_blocksize(iview->vk_format); + const enum a6xx_tile_mode tile_mode = TILE6_LINEAR; + const enum a3xx_msaa_samples samples = tu6_msaa_samples(1); + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_INFO, 1); + tu_cs_emit(cs, blit_info); + + /* tile mode? 
*/ + const struct tu_native_format *format = + tu6_get_native_format(iview->vk_format); + assert(format && format->rb >= 0); + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 5); + tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_TILE_MODE(tile_mode) | + A6XX_RB_BLIT_DST_INFO_SAMPLES(samples) | + A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(format->rb) | + A6XX_RB_BLIT_DST_INFO_COLOR_SWAP(format->swap)); + tu_cs_emit_qw(cs, + iview->image->bo->iova + iview->image->bo_offset + offset); + tu_cs_emit(cs, A6XX_RB_BLIT_DST_PITCH(stride)); + tu_cs_emit(cs, A6XX_RB_BLIT_DST_ARRAY_PITCH(slice->size)); + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1); + tu_cs_emit(cs, gmem_offset); } -void -tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1) -{ - tu_cs_emit_regs(cs, - A6XX_RB_WINDOW_OFFSET(.x = x1, .y = y1)); - - tu_cs_emit_regs(cs, - A6XX_RB_WINDOW_OFFSET2(.x = x1, .y = y1)); +static void +tu6_emit_blit_clear(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + const struct tu_image_view *iview, + uint32_t gmem_offset, + const VkClearValue *clear_value) +{ + const enum a6xx_tile_mode tile_mode = TILE6_LINEAR; + const enum a3xx_msaa_samples samples = tu6_msaa_samples(1); + + const struct tu_native_format *format = + tu6_get_native_format(iview->vk_format); + assert(format && format->rb >= 0); + /* must be WZYX; other values are ignored */ + const enum a3xx_color_swap swap = WZYX; + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1); + tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_TILE_MODE(tile_mode) | + A6XX_RB_BLIT_DST_INFO_SAMPLES(samples) | + A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(format->rb) | + A6XX_RB_BLIT_DST_INFO_COLOR_SWAP(swap)); + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_INFO, 1); + tu_cs_emit(cs, A6XX_RB_BLIT_INFO_GMEM | A6XX_RB_BLIT_INFO_CLEAR_MASK(0xf)); + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1); + tu_cs_emit(cs, gmem_offset); + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1); + tu_cs_emit(cs, 0); - tu_cs_emit_regs(cs, - A6XX_SP_WINDOW_OFFSET(.x = x1, .y = y1)); + /* pack clear_value into WZYX order */ + uint32_t clear_vals[4] = { 0 }; + tu_pack_clear_value(clear_value, iview->vk_format, clear_vals); - tu_cs_emit_regs(cs, - A6XX_SP_TP_WINDOW_OFFSET(.x = x1, .y = y1)); + tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4); + tu_cs_emit(cs, clear_vals[0]); + tu_cs_emit(cs, clear_vals[1]); + tu_cs_emit(cs, clear_vals[2]); + tu_cs_emit(cs, clear_vals[3]); } -void -tu6_apply_depth_bounds_workaround(struct tu_device *device, - uint32_t *rb_depth_cntl) +static void +tu6_emit_blit(struct tu_cmd_buffer *cmd, struct tu_cs *cs) { - if (!device->physical_device->info->a6xx.depth_bounds_require_depth_test_quirk) - return; - - /* On some GPUs it is necessary to enable z test for depth bounds test when - * UBWC is enabled. Otherwise, the GPU would hang. FUNC_ALWAYS is required to - * pass z test. 
Relevant tests: - * dEQP-VK.pipeline.extended_dynamic_state.two_draws_dynamic.depth_bounds_test_disable - * dEQP-VK.dynamic_state.ds_state.depth_bounds_1 - */ - *rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE | - A6XX_RB_DEPTH_CNTL_ZFUNC(FUNC_ALWAYS); + tu6_emit_marker(cmd, cs); + tu6_emit_event_write(cmd, cs, BLIT, false); + tu6_emit_marker(cmd, cs); } static void -tu_cs_emit_draw_state(struct tu_cs *cs, uint32_t id, struct tu_draw_state state) +tu6_emit_window_scissor(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + uint32_t x1, + uint32_t y1, + uint32_t x2, + uint32_t y2) { - uint32_t enable_mask; - switch (id) { - case TU_DRAW_STATE_PROGRAM: - /* The blob seems to not enable this (DESC_SETS_LOAD) for binning, even - * when resources would actually be used in the binning shader. - * Presumably the overhead of prefetching the resources isn't - * worth it. - */ - case TU_DRAW_STATE_DESC_SETS_LOAD: - enable_mask = CP_SET_DRAW_STATE__0_GMEM | - CP_SET_DRAW_STATE__0_SYSMEM; - break; - case TU_DRAW_STATE_PROGRAM_BINNING: - enable_mask = CP_SET_DRAW_STATE__0_BINNING; - break; - case TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM: - case TU_DRAW_STATE_PRIM_MODE_GMEM: - enable_mask = CP_SET_DRAW_STATE__0_GMEM; - break; - case TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM: - case TU_DRAW_STATE_PRIM_MODE_SYSMEM: - enable_mask = CP_SET_DRAW_STATE__0_SYSMEM; - break; - default: - enable_mask = CP_SET_DRAW_STATE__0_GMEM | - CP_SET_DRAW_STATE__0_SYSMEM | - CP_SET_DRAW_STATE__0_BINNING; - break; - } + tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_WINDOW_SCISSOR_TL, 2); + tu_cs_emit(cs, A6XX_GRAS_SC_WINDOW_SCISSOR_TL_X(x1) | + A6XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(y1)); + tu_cs_emit(cs, A6XX_GRAS_SC_WINDOW_SCISSOR_BR_X(x2) | + A6XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(y2)); - STATIC_ASSERT(TU_DRAW_STATE_COUNT <= 32); - - /* We need to reload the descriptors every time the descriptor sets - * change. However, the commands we send only depend on the pipeline - * because the whole point is to cache descriptors which are used by the - * pipeline. There's a problem here, in that the firmware has an - * "optimization" which skips executing groups that are set to the same - * value as the last draw. This means that if the descriptor sets change - * but not the pipeline, we'd try to re-execute the same buffer which - * the firmware would ignore and we wouldn't pre-load the new - * descriptors. 
Set the DIRTY bit to avoid this optimization - */ - if (id == TU_DRAW_STATE_DESC_SETS_LOAD) - enable_mask |= CP_SET_DRAW_STATE__0_DIRTY; - - tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(state.size) | - enable_mask | - CP_SET_DRAW_STATE__0_GROUP_ID(id) | - COND(!state.size || !state.iova, CP_SET_DRAW_STATE__0_DISABLE)); - tu_cs_emit_qw(cs, state.iova); -} - -void -tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits vk_samples, - bool msaa_disable) -{ - const enum a3xx_msaa_samples samples = tu_msaa_samples(vk_samples); - msaa_disable |= (samples == MSAA_ONE); - tu_cs_emit_regs(cs, - A6XX_SP_TP_RAS_MSAA_CNTL(samples), - A6XX_SP_TP_DEST_MSAA_CNTL(.samples = samples, - .msaa_disable = msaa_disable)); - - tu_cs_emit_regs(cs, - A6XX_GRAS_RAS_MSAA_CNTL(samples), - A6XX_GRAS_DEST_MSAA_CNTL(.samples = samples, - .msaa_disable = msaa_disable)); - - tu_cs_emit_regs(cs, - A6XX_RB_RAS_MSAA_CNTL(samples), - A6XX_RB_DEST_MSAA_CNTL(.samples = samples, - .msaa_disable = msaa_disable)); + tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_RESOLVE_CNTL_1, 2); + tu_cs_emit( + cs, A6XX_GRAS_RESOLVE_CNTL_1_X(x1) | A6XX_GRAS_RESOLVE_CNTL_1_Y(y1)); + tu_cs_emit( + cs, A6XX_GRAS_RESOLVE_CNTL_2_X(x2) | A6XX_GRAS_RESOLVE_CNTL_2_Y(y2)); } static void -tu6_update_msaa(struct tu_cmd_buffer *cmd, VkSampleCountFlagBits samples) -{ - bool is_line = - tu6_primtype_line(cmd->state.primtype) || - (tu6_primtype_patches(cmd->state.primtype) && - cmd->state.pipeline && - cmd->state.pipeline->tess.patch_type == IR3_TESS_ISOLINES); - bool msaa_disable = is_line && cmd->state.line_mode == BRESENHAM; - - if (cmd->state.msaa_disable != msaa_disable || - cmd->state.samples != samples) { - struct tu_cs cs; - cmd->state.msaa = tu_cs_draw_state(&cmd->sub_cs, &cs, 9); - tu6_emit_msaa(&cs, samples, msaa_disable); - if (!(cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) { - tu_cs_emit_pkt7(&cmd->draw_cs, CP_SET_DRAW_STATE, 3); - tu_cs_emit_draw_state(&cmd->draw_cs, TU_DRAW_STATE_MSAA, cmd->state.msaa); - } - cmd->state.msaa_disable = msaa_disable; - cmd->state.samples = samples; - } -} - -static bool -use_hw_binning(struct tu_cmd_buffer *cmd) +tu6_emit_window_offset(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + uint32_t x1, + uint32_t y1) { - const struct tu_framebuffer *fb = cmd->state.framebuffer; - const struct tu_tiling_config *tiling = &fb->tiling[cmd->state.gmem_layout]; + tu_cs_emit_pkt4(cs, REG_A6XX_RB_WINDOW_OFFSET, 1); + tu_cs_emit(cs, A6XX_RB_WINDOW_OFFSET_X(x1) | A6XX_RB_WINDOW_OFFSET_Y(y1)); - /* XFB commands are emitted for BINNING || SYSMEM, which makes it - * incompatible with non-hw binning GMEM rendering. this is required because - * some of the XFB commands need to only be executed once. - * use_sysmem_rendering() should have made sure we only ended up here if no - * XFB was used. - */ - if (cmd->state.rp.xfb_used) { - assert(tiling->binning_possible); - return true; - } + tu_cs_emit_pkt4(cs, REG_A6XX_RB_WINDOW_OFFSET2, 1); + tu_cs_emit(cs, + A6XX_RB_WINDOW_OFFSET2_X(x1) | A6XX_RB_WINDOW_OFFSET2_Y(y1)); - /* VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT emulates GL_PRIMITIVES_GENERATED, - * which wasn't designed to care about tilers and expects the result not to - * be multiplied by tile count. 
- * See https://gitlab.khronos.org/vulkan/vulkan/-/issues/3131 - */ - if (cmd->state.rp.has_prim_generated_query_in_rp || - cmd->state.prim_generated_query_running_before_rp) { - assert(tiling->binning_possible); - return true; - } + tu_cs_emit_pkt4(cs, REG_A6XX_SP_WINDOW_OFFSET, 1); + tu_cs_emit(cs, A6XX_SP_WINDOW_OFFSET_X(x1) | A6XX_SP_WINDOW_OFFSET_Y(y1)); - return tiling->binning; -} - -static bool -use_sysmem_rendering(struct tu_cmd_buffer *cmd, - struct tu_renderpass_result **autotune_result) -{ - if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_SYSMEM)) - return true; - - /* can't fit attachments into gmem */ - if (!cmd->state.pass->gmem_pixels[cmd->state.gmem_layout]) - return true; - - if (cmd->state.framebuffer->layers > 1) - return true; - - /* Use sysmem for empty render areas */ - if (cmd->state.render_area.extent.width == 0 || - cmd->state.render_area.extent.height == 0) - return true; - - if (cmd->state.rp.has_tess) - return true; - - if (cmd->state.rp.disable_gmem) - return true; - - /* XFB is incompatible with non-hw binning GMEM rendering, see use_hw_binning */ - if (cmd->state.rp.xfb_used && !cmd->state.tiling->binning_possible) - return true; - - /* QUERY_TYPE_PRIMITIVES_GENERATED is incompatible with non-hw binning - * GMEM rendering, see use_hw_binning. - */ - if ((cmd->state.rp.has_prim_generated_query_in_rp || - cmd->state.prim_generated_query_running_before_rp) && - !cmd->state.tiling->binning_possible) - return true; - - if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_GMEM)) - return false; - - bool use_sysmem = tu_autotune_use_bypass(&cmd->device->autotune, - cmd, autotune_result); - if (*autotune_result) { - list_addtail(&(*autotune_result)->node, &cmd->renderpass_autotune_results); - } - - return use_sysmem; -} - -/* Optimization: there is no reason to load gmem if there is no - * geometry to process. COND_REG_EXEC predicate is set here, - * but the actual skip happens in tu6_emit_tile_load() and tile_store_cs, - * for each blit separately. 
- */ -static void -tu6_emit_cond_for_load_stores(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - uint32_t pipe, uint32_t slot, bool wfm) -{ - if (cmd->state.tiling->binning_possible) { - tu_cs_emit_pkt7(cs, CP_REG_TEST, 1); - tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(REG_A6XX_VSC_STATE_REG(pipe)) | - A6XX_CP_REG_TEST_0_BIT(slot) | - COND(wfm, A6XX_CP_REG_TEST_0_WAIT_FOR_ME)); - } else { - /* COND_REG_EXECs are not emitted in non-binning case */ - } + tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_WINDOW_OFFSET, 1); + tu_cs_emit( + cs, A6XX_SP_TP_WINDOW_OFFSET_X(x1) | A6XX_SP_TP_WINDOW_OFFSET_Y(y1)); } static void tu6_emit_tile_select(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - uint32_t tx, uint32_t ty, uint32_t pipe, uint32_t slot) + const struct tu_tile *tile) { - const struct tu_tiling_config *tiling = cmd->state.tiling; + tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); + tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(0x7)); + tu6_emit_marker(cmd, cs); tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); - tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_GMEM)); + tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_GMEM) | 0x10); + tu6_emit_marker(cmd, cs); - const uint32_t x1 = tiling->tile0.width * tx; - const uint32_t y1 = tiling->tile0.height * ty; - const uint32_t x2 = MIN2(x1 + tiling->tile0.width - 1, MAX_VIEWPORT_SIZE - 1); - const uint32_t y2 = MIN2(y1 + tiling->tile0.height - 1, MAX_VIEWPORT_SIZE - 1); - tu6_emit_window_scissor(cs, x1, y1, x2, y2); - tu6_emit_window_offset(cs, x1, y1); + const uint32_t x1 = tile->begin.x; + const uint32_t y1 = tile->begin.y; + const uint32_t x2 = tile->end.x - 1; + const uint32_t y2 = tile->end.y - 1; + tu6_emit_window_scissor(cmd, cs, x1, y1, x2, y2); + tu6_emit_window_offset(cmd, cs, x1, y1); - bool hw_binning = use_hw_binning(cmd); + tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_OVERRIDE, 1); + tu_cs_emit(cs, A6XX_VPC_SO_OVERRIDE_SO_DISABLE); - if (hw_binning) { - tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0); + if (false) { + /* hw binning? */ + } else { + tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1); + tu_cs_emit(cs, 0x1); tu_cs_emit_pkt7(cs, CP_SET_MODE, 1); tu_cs_emit(cs, 0x0); - - tu_cs_emit_pkt7(cs, CP_SET_BIN_DATA5_OFFSET, 4); - tu_cs_emit(cs, tiling->pipe_sizes[pipe] | - CP_SET_BIN_DATA5_0_VSC_N(slot)); - tu_cs_emit(cs, pipe * cmd->vsc_draw_strm_pitch); - tu_cs_emit(cs, pipe * 4); - tu_cs_emit(cs, pipe * cmd->vsc_prim_strm_pitch); } - - tu6_emit_cond_for_load_stores(cmd, cs, pipe, slot, hw_binning); - - tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1); - tu_cs_emit(cs, !hw_binning); - - tu_cs_emit_pkt7(cs, CP_SET_MODE, 1); - tu_cs_emit(cs, 0x0); } static void -tu6_emit_sysmem_resolve(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - uint32_t layer_mask, - uint32_t a, - uint32_t gmem_a) +tu6_emit_tile_load(struct tu_cmd_buffer *cmd, struct tu_cs *cs) { const struct tu_framebuffer *fb = cmd->state.framebuffer; - const struct tu_image_view *dst = cmd->state.attachments[a]; - const struct tu_image_view *src = cmd->state.attachments[gmem_a]; - - tu_resolve_sysmem(cmd, cs, src, dst, layer_mask, fb->layers, &cmd->state.render_area); -} - -static void -tu6_emit_sysmem_resolves(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - const struct tu_subpass *subpass) -{ - if (subpass->resolve_attachments) { - /* From the documentation for vkCmdNextSubpass, section 7.4 "Render Pass - * Commands": - * - * End-of-subpass multisample resolves are treated as color - * attachment writes for the purposes of synchronization. - * This applies to resolve operations for both color and - * depth/stencil attachments. 
That is, they are considered to - * execute in the VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT - * pipeline stage and their writes are synchronized with - * VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT. Synchronization between - * rendering within a subpass and any resolve operations at the end - * of the subpass occurs automatically, without need for explicit - * dependencies or pipeline barriers. However, if the resolve - * attachment is also used in a different subpass, an explicit - * dependency is needed. - * - * We use the CP_BLIT path for sysmem resolves, which is really a - * transfer command, so we have to manually flush similar to the gmem - * resolve case. However, a flush afterwards isn't needed because of the - * last sentence and the fact that we're in sysmem mode. - */ - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); - if (subpass->resolve_depth_stencil) - tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS); + const struct tu_subpass *subpass = cmd->state.subpass; + const struct tu_tiling_config *tiling = &cmd->state.tiling_config; + const struct tu_attachment_state *attachments = cmd->state.attachments; - tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); + tu6_emit_blit_scissor(cmd, cs); - /* Wait for the flushes to land before using the 2D engine */ - tu_cs_emit_wfi(cs); - - for (unsigned i = 0; i < subpass->resolve_count; i++) { - uint32_t a = subpass->resolve_attachments[i].attachment; - if (a == VK_ATTACHMENT_UNUSED) - continue; - - uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i); + uint32_t gmem_index = 0; + for (uint32_t i = 0; i < subpass->color_count; ++i) { + const uint32_t a = subpass->color_attachments[i].attachment; + if (a == VK_ATTACHMENT_UNUSED) + continue; - tu6_emit_sysmem_resolve(cmd, cs, subpass->multiview_mask, a, gmem_a); + const struct tu_image_view *iview = fb->attachments[a].attachment; + const struct tu_attachment_state *att = attachments + a; + if (att->pending_clear_aspects) { + assert(att->pending_clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT); + tu6_emit_blit_clear(cmd, cs, iview, + tiling->gmem_offsets[gmem_index++], + &att->clear_value); + } else { + tu6_emit_blit_info(cmd, cs, iview, + tiling->gmem_offsets[gmem_index++], + A6XX_RB_BLIT_INFO_UNK0 | A6XX_RB_BLIT_INFO_GMEM); } - } -} -static void -tu6_emit_tile_load(struct tu_cmd_buffer *cmd, struct tu_cs *cs) -{ - tu6_emit_blit_scissor(cmd, cs, true); + tu6_emit_blit(cmd, cs); + } - for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i) - tu_load_gmem_attachment(cmd, cs, i, cmd->state.tiling->binning, false); + /* load/clear zs? */ } static void tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs) { - const struct tu_render_pass *pass = cmd->state.pass; - const struct tu_subpass *subpass = &pass->subpasses[pass->subpass_count-1]; - - tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); - tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_RESOLVE)); - - tu6_emit_blit_scissor(cmd, cs, true); + const struct tu_framebuffer *fb = cmd->state.framebuffer; + const struct tu_tiling_config *tiling = &cmd->state.tiling_config; - for (uint32_t a = 0; a < pass->attachment_count; ++a) { - if (pass->attachments[a].gmem) - tu_store_gmem_attachment(cmd, cs, a, a, cmd->state.tiling->binning_possible); + if (false) { + /* hw binning? 
*/ } - if (subpass->resolve_attachments) { - for (unsigned i = 0; i < subpass->resolve_count; i++) { - uint32_t a = subpass->resolve_attachments[i].attachment; - if (a != VK_ATTACHMENT_UNUSED) { - uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i); - tu_store_gmem_attachment(cmd, cs, a, gmem_a, false); - } - } - } -} - -void -tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs) -{ tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3); tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) | CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS | @@ -881,80 +809,84 @@ tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0)); tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0)); - cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE; + tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1); + tu_cs_emit(cs, 0x0); + + tu6_emit_marker(cmd, cs); + tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); + tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_RESOLVE) | 0x10); + tu6_emit_marker(cmd, cs); + + tu6_emit_blit_scissor(cmd, cs); + + uint32_t gmem_index = 0; + for (uint32_t i = 0; i < cmd->state.subpass->color_count; ++i) { + uint32_t a = cmd->state.subpass->color_attachments[i].attachment; + if (a == VK_ATTACHMENT_UNUSED) + continue; + + const struct tu_image_view *iview = fb->attachments[a].attachment; + tu6_emit_blit_info(cmd, cs, iview, tiling->gmem_offsets[gmem_index++], + 0); + tu6_emit_blit(cmd, cs); + } +} + +static void +tu6_emit_restart_index(struct tu_cs *cs, uint32_t restart_index) +{ + tu_cs_emit_pkt4(cs, REG_A6XX_PC_RESTART_INDEX, 1); + tu_cs_emit(cs, restart_index); } static void tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs) { - struct tu_device *dev = cmd->device; - const struct tu_physical_device *phys_dev = dev->physical_device; - - tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); - - tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD( - .vs_state = true, - .hs_state = true, - .ds_state = true, - .gs_state = true, - .fs_state = true, - .cs_state = true, - .gfx_ibo = true, - .cs_ibo = true, - .gfx_shared_const = true, - .cs_shared_const = true, - .gfx_bindless = 0x1f, - .cs_bindless = 0x1f)); - - tu_cs_emit_wfi(cs); - - cmd->state.cache.pending_flush_bits &= - ~(TU_CMD_FLAG_WAIT_FOR_IDLE | TU_CMD_FLAG_CACHE_INVALIDATE); - - tu_cs_emit_regs(cs, - A6XX_RB_CCU_CNTL(.color_offset = phys_dev->ccu_offset_bypass)); - cmd->state.ccu_state = TU_CMD_CCU_SYSMEM; - tu_cs_emit_write_reg(cs, REG_A6XX_RB_DBG_ECO_CNTL, 0x00100000); - tu_cs_emit_write_reg(cs, REG_A6XX_SP_FLOAT_CNTL, 0); - tu_cs_emit_write_reg(cs, REG_A6XX_SP_DBG_ECO_CNTL, - phys_dev->info->a6xx.magic.SP_DBG_ECO_CNTL); - tu_cs_emit_write_reg(cs, REG_A6XX_SP_PERFCTR_ENABLE, 0x3f); - tu_cs_emit_write_reg(cs, REG_A6XX_TPL1_UNKNOWN_B605, 0x44); - tu_cs_emit_write_reg(cs, REG_A6XX_TPL1_DBG_ECO_CNTL, - phys_dev->info->a6xx.magic.TPL1_DBG_ECO_CNTL); + VkResult result = tu_cs_reserve_space(cmd->device, cs, 256); + if (result != VK_SUCCESS) { + cmd->record_result = result; + return; + } + + tu6_emit_cache_flush(cmd, cs); + + tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UPDATE_CNTL, 0xfffff); + + tu_cs_emit_write_reg(cs, REG_A6XX_RB_CCU_CNTL, 0x7c400004); + tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E04, 0x00100000); + tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE04, 0x8); + tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE00, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE0F, 0x3f); + tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B605, 0x44); + tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B600, 0x100000); 
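   /* A guess at what each tu_cs_emit_write_reg() above expands to: the same
    * raw PKT4 sequence used elsewhere in this file for single-register
    * writes, roughly
    *
    *    tu_cs_emit_pkt4(cs, reg, 1);   // PKT4 header: start register, 1 dword
    *    tu_cs_emit(cs, value);         // register payload
    *
    * i.e. shorthand for the packet pair, not a different hardware path.
    */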
tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80); tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE01, 0); - tu_cs_emit_write_reg(cs, REG_A6XX_VPC_DBG_ECO_CNTL, - phys_dev->info->a6xx.magic.VPC_DBG_ECO_CNTL); - tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_DBG_ECO_CNTL, - phys_dev->info->a6xx.magic.GRAS_DBG_ECO_CNTL); - tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_DBG_ECO_CNTL, - phys_dev->info->a6xx.magic.HLSQ_DBG_ECO_CNTL); - tu_cs_emit_write_reg(cs, REG_A6XX_SP_CHICKEN_BITS, - phys_dev->info->a6xx.magic.SP_CHICKEN_BITS); + tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9600, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8600, 0x880); + tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE04, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE03, 0x00000410); tu_cs_emit_write_reg(cs, REG_A6XX_SP_IBO_COUNT, 0); tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B182, 0); - tu_cs_emit_regs(cs, A6XX_HLSQ_SHARED_CONSTS(.enable = false)); - tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_UNKNOWN_0E12, - phys_dev->info->a6xx.magic.UCHE_UNKNOWN_0E12); - tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_CLIENT_PF, - phys_dev->info->a6xx.magic.UCHE_CLIENT_PF); - tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E01, - phys_dev->info->a6xx.magic.RB_UNKNOWN_8E01); - tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A9A8, 0); - tu_cs_emit_regs(cs, A6XX_SP_MODE_CONTROL(.constant_demotion_enable = true, - .isammode = ISAMMODE_GL, - .shared_consts_enable = false)); - - /* TODO: set A6XX_VFD_ADD_OFFSET_INSTANCE and fix ir3 to avoid adding base instance */ - tu_cs_emit_write_reg(cs, REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX); + tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BB11, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_UNKNOWN_0E12, 0x3200000); + tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_CLIENT_PF, 4); + tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E01, 0x0); + tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AB00, 0x5); + tu_cs_emit_write_reg(cs, REG_A6XX_VFD_UNKNOWN_A009, 0x00000001); tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8811, 0x00000010); - tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, - phys_dev->info->a6xx.magic.PC_MODE_CNTL); + tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x1f); + + tu_cs_emit_write_reg(cs, REG_A6XX_RB_SRGB_CNTL, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8101, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SAMPLE_CNTL, 0); tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8110, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_RB_RENDER_CONTROL0, 0x401); + tu_cs_emit_write_reg(cs, REG_A6XX_RB_RENDER_CONTROL1, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_RB_SAMPLE_CNTL, 0); tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8818, 0); tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8819, 0); tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881A, 0); @@ -964,834 +896,705 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881E, 0); tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_88F0, 0); - tu_cs_emit_regs(cs, A6XX_VPC_POINT_COORD_INVERT(false)); + tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9101, 0xffff00); + tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9107, 0); + + tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9236, 1); tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9300, 0); - tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(true)); + tu_cs_emit_write_reg(cs, REG_A6XX_VPC_SO_OVERRIDE, + A6XX_VPC_SO_OVERRIDE_SO_DISABLE); + + tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9801, 0); + tu_cs_emit_write_reg(cs, 
REG_A6XX_PC_UNKNOWN_9806, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9980, 0); + + tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9B06, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9B06, 0); + + tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A81B, 0); tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B183, 0); - tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SU_CONSERVATIVE_RAS_CNTL, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8099, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_809B, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A0, 2); tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80AF, 0); tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9210, 0); tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9211, 0); tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9602, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9981, 0x3); tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9E72, 0); - tu_cs_emit_write_reg(cs, REG_A6XX_SP_TP_MODE_CNTL, - 0x000000a0 | - A6XX_SP_TP_MODE_CNTL_ISAMMODE(ISAMMODE_GL)); + tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9108, 0x3); + tu_cs_emit_write_reg(cs, REG_A6XX_SP_TP_UNKNOWN_B304, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_SP_TP_UNKNOWN_B309, 0x000000a2); + tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8804, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A4, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A5, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A6, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8805, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8806, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8878, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8879, 0); tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_CONTROL_5_REG, 0xfc); + tu6_emit_marker(cmd, cs); + tu_cs_emit_write_reg(cs, REG_A6XX_VFD_MODE_CNTL, 0x00000000); + tu_cs_emit_write_reg(cs, REG_A6XX_VFD_UNKNOWN_A008, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x0000001f); - tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL()); /* always disable alpha test */ - tu_cs_emit_regs(cs, A6XX_RB_DITHER_CNTL()); /* always disable dithering */ + /* we don't use this yet.. probably best to disable.. 
*/ + tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3); + tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) | + CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS | + CP_SET_DRAW_STATE__0_GROUP_ID(0)); + tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0)); + tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0)); - tu_disable_draw_states(cmd, cs); + tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUFFER_BASE_LO(0), 3); + tu_cs_emit(cs, 0x00000000); /* VPC_SO_BUFFER_BASE_LO_0 */ + tu_cs_emit(cs, 0x00000000); /* VPC_SO_BUFFER_BASE_HI_0 */ + tu_cs_emit(cs, 0x00000000); /* VPC_SO_BUFFER_SIZE_0 */ - tu_cs_emit_regs(cs, - A6XX_SP_TP_BORDER_COLOR_BASE_ADDR(.bo = dev->global_bo, - .bo_offset = gb_offset(bcolor_builtin))); - tu_cs_emit_regs(cs, - A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR(.bo = dev->global_bo, - .bo_offset = gb_offset(bcolor_builtin))); + tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_FLUSH_BASE_LO(0), 2); + tu_cs_emit(cs, 0x00000000); /* VPC_SO_FLUSH_BASE_LO_0 */ + tu_cs_emit(cs, 0x00000000); /* VPC_SO_FLUSH_BASE_HI_0 */ - tu_cs_sanity_check(cs); -} + tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUF_CNTL, 1); + tu_cs_emit(cs, 0x00000000); /* VPC_SO_BUF_CNTL */ -static void -update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs) -{ - const struct tu_tiling_config *tiling = cmd->state.tiling; + tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUFFER_OFFSET(0), 1); + tu_cs_emit(cs, 0x00000000); /* UNKNOWN_E2AB */ - tu_cs_emit_regs(cs, - A6XX_VSC_BIN_SIZE(.width = tiling->tile0.width, - .height = tiling->tile0.height)); + tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUFFER_BASE_LO(1), 3); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, 0x00000000); - tu_cs_emit_regs(cs, - A6XX_VSC_BIN_COUNT(.nx = tiling->tile_count.width, - .ny = tiling->tile_count.height)); + tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUFFER_OFFSET(1), 6); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, 0x00000000); - tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_CONFIG_REG(0), 32); - tu_cs_emit_array(cs, tiling->pipe_config, 32); + tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUFFER_OFFSET(2), 6); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, 0x00000000); - tu_cs_emit_regs(cs, - A6XX_VSC_PRIM_STRM_PITCH(cmd->vsc_prim_strm_pitch), - A6XX_VSC_PRIM_STRM_LIMIT(cmd->vsc_prim_strm_pitch - VSC_PAD)); + tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUFFER_OFFSET(3), 3); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, 0x00000000); + tu_cs_emit(cs, 0x00000000); - tu_cs_emit_regs(cs, - A6XX_VSC_DRAW_STRM_PITCH(cmd->vsc_draw_strm_pitch), - A6XX_VSC_DRAW_STRM_LIMIT(cmd->vsc_draw_strm_pitch - VSC_PAD)); -} + tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_CTRL_REG0, 1); + tu_cs_emit(cs, 0x00000000); -static void -emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs) -{ - const struct tu_tiling_config *tiling = cmd->state.tiling; - const uint32_t used_pipe_count = - tiling->pipe_count.width * tiling->pipe_count.height; + tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_CTRL_REG0, 1); + tu_cs_emit(cs, 0x00000000); - for (int i = 0; i < used_pipe_count; i++) { - tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8); - tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) | - CP_COND_WRITE5_0_WRITE_MEMORY); - tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_DRAW_STRM_SIZE_REG(i))); - tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0)); - tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_draw_strm_pitch - VSC_PAD)); - 
tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0)); - tu_cs_emit_qw(cs, global_iova(cmd, vsc_draw_overflow)); - tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(cmd->vsc_draw_strm_pitch)); - - tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8); - tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) | - CP_COND_WRITE5_0_WRITE_MEMORY); - tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_PRIM_STRM_SIZE_REG(i))); - tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0)); - tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_prim_strm_pitch - VSC_PAD)); - tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0)); - tu_cs_emit_qw(cs, global_iova(cmd, vsc_prim_overflow)); - tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(cmd->vsc_prim_strm_pitch)); - } + tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_LRZ_CNTL, 1); + tu_cs_emit(cs, 0x00000000); - tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); + tu_cs_emit_pkt4(cs, REG_A6XX_RB_LRZ_CNTL, 1); + tu_cs_emit(cs, 0x00000000); + + tu_cs_sanity_check(cs); } static void -tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +tu6_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs) { - struct tu_physical_device *phys_dev = cmd->device->physical_device; - const struct tu_framebuffer *fb = cmd->state.framebuffer; - - tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1); - - tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); - tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BINNING)); - - tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1); - tu_cs_emit(cs, 0x1); - - tu_cs_emit_pkt7(cs, CP_SET_MODE, 1); - tu_cs_emit(cs, 0x1); - - tu_cs_emit_wfi(cs); - - tu_cs_emit_regs(cs, - A6XX_VFD_MODE_CNTL(.render_mode = BINNING_PASS)); - - update_vsc_pipe(cmd, cs); - - tu_cs_emit_regs(cs, - A6XX_PC_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL)); - - tu_cs_emit_regs(cs, - A6XX_VFD_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL)); - - tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1); - tu_cs_emit(cs, UNK_2C); - - tu_cs_emit_regs(cs, - A6XX_RB_WINDOW_OFFSET(.x = 0, .y = 0)); - - tu_cs_emit_regs(cs, - A6XX_SP_TP_WINDOW_OFFSET(.x = 0, .y = 0)); - - trace_start_binning_ib(&cmd->trace, cs); - - /* emit IB to binning drawcmds: */ - tu_cs_emit_call(cs, &cmd->draw_cs); + VkResult result = tu_cs_reserve_space(cmd->device, cs, 256); + if (result != VK_SUCCESS) { + cmd->record_result = result; + return; + } - trace_end_binning_ib(&cmd->trace, cs); + tu6_emit_lrz_flush(cmd, cs); - /* switching from binning pass to GMEM pass will cause a switch from - * PROGRAM_BINNING to PROGRAM, which invalidates const state (XS_CONST states) - * so make sure these states are re-emitted - * (eventually these states shouldn't exist at all with shader prologue) - * only VS and GS are invalidated, as FS isn't emitted in binning pass, - * and we don't use HW binning when tesselation is used - */ - tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3); - tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) | - CP_SET_DRAW_STATE__0_DISABLE | - CP_SET_DRAW_STATE__0_GROUP_ID(TU_DRAW_STATE_CONST)); - tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0)); - tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0)); + /* lrz clear? */ - tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1); - tu_cs_emit(cs, UNK_2D); + tu6_emit_cache_flush(cmd, cs); - /* This flush is probably required because the VSC, which produces the - * visibility stream, is a client of UCHE, whereas the CP needs to read the - * visibility stream (without caching) to do draw skipping. 
The - * WFI+WAIT_FOR_ME combination guarantees that the binning commands - * submitted are finished before reading the VSC regs (in - * emit_vsc_overflow_test) or the VSC_DATA buffer directly (implicitly as - * part of draws). - */ - tu6_emit_event_write(cmd, cs, CACHE_FLUSH_TS); + tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1); + tu_cs_emit(cs, 0x0); - tu_cs_emit_wfi(cs); + /* 0x10000000 for BYPASS.. 0x7c13c080 for GMEM: */ + tu6_emit_wfi(cmd, cs); + tu_cs_emit_pkt4(cs, REG_A6XX_RB_CCU_CNTL, 1); + tu_cs_emit(cs, 0x7c400004); /* RB_CCU_CNTL */ - tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0); + tu6_emit_zs(cmd, cs); + tu6_emit_mrt(cmd, cs); + tu6_emit_msaa(cmd, cs); - emit_vsc_overflow_test(cmd, cs); + if (false) { + /* hw binning? */ + } else { + tu6_emit_bin_size(cmd, cs, 0x6000000); + /* no draws */ + } - tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1); - tu_cs_emit(cs, 0x0); + tu6_emit_render_cntl(cmd, cs, false); - tu_cs_emit_pkt7(cs, CP_SET_MODE, 1); - tu_cs_emit(cs, 0x0); + tu_cs_sanity_check(cs); } -static struct tu_draw_state -tu_emit_input_attachments(struct tu_cmd_buffer *cmd, - const struct tu_subpass *subpass, - bool gmem) +static void +tu6_render_tile(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + const struct tu_tile *tile) { - const struct tu_tiling_config *tiling = cmd->state.tiling; - - /* note: we can probably emit input attachments just once for the whole - * renderpass, this would avoid emitting both sysmem/gmem versions - * - * emit two texture descriptors for each input, as a workaround for - * d24s8/d32s8, which can be sampled as both float (depth) and integer (stencil) - * tu_shader lowers uint input attachment loads to use the 2nd descriptor - * in the pair - * TODO: a smarter workaround - */ - - if (!subpass->input_count) - return (struct tu_draw_state) {}; - - struct tu_cs_memory texture; - VkResult result = tu_cs_alloc(&cmd->sub_cs, subpass->input_count * 2, - A6XX_TEX_CONST_DWORDS, &texture); + const uint32_t render_tile_space = 64 + tu_cs_get_call_size(&cmd->draw_cs); + VkResult result = tu_cs_reserve_space(cmd->device, cs, render_tile_space); if (result != VK_SUCCESS) { - vk_command_buffer_set_error(&cmd->vk, result); - return (struct tu_draw_state) {}; + cmd->record_result = result; + return; } - for (unsigned i = 0; i < subpass->input_count * 2; i++) { - uint32_t a = subpass->input_attachments[i / 2].attachment; - if (a == VK_ATTACHMENT_UNUSED) - continue; - - const struct tu_image_view *iview = cmd->state.attachments[a]; - const struct tu_render_pass_attachment *att = - &cmd->state.pass->attachments[a]; - uint32_t *dst = &texture.map[A6XX_TEX_CONST_DWORDS * i]; - uint32_t gmem_offset = tu_attachment_gmem_offset(cmd, att); - uint32_t cpp = att->cpp; - - memcpy(dst, iview->view.descriptor, A6XX_TEX_CONST_DWORDS * 4); - - /* Cube descriptors require a different sampling instruction in shader, - * however we don't know whether image is a cube or not until the start - * of a renderpass. We have to patch the descriptor to make it compatible - * with how it is sampled in shader. 
- */ - enum a6xx_tex_type tex_type = (dst[2] & A6XX_TEX_CONST_2_TYPE__MASK) >> - A6XX_TEX_CONST_2_TYPE__SHIFT; - if (tex_type == A6XX_TEX_CUBE) { - dst[2] &= ~A6XX_TEX_CONST_2_TYPE__MASK; - dst[2] |= A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D); - - uint32_t depth = (dst[5] & A6XX_TEX_CONST_5_DEPTH__MASK) >> - A6XX_TEX_CONST_5_DEPTH__SHIFT; - dst[5] &= ~A6XX_TEX_CONST_5_DEPTH__MASK; - dst[5] |= A6XX_TEX_CONST_5_DEPTH(depth * 6); - } + tu6_emit_tile_select(cmd, cs, tile); + tu_cs_emit_ib(cs, &cmd->state.tile_load_ib); - if (i % 2 == 1 && att->format == VK_FORMAT_D24_UNORM_S8_UINT) { - /* note this works because spec says fb and input attachments - * must use identity swizzle - * - * Also we clear swap to WZYX. This is because the view might have - * picked XYZW to work better with border colors. - */ - dst[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK | - A6XX_TEX_CONST_0_SWAP__MASK | - A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK | - A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK); - if (!cmd->device->physical_device->info->a6xx.has_z24uint_s8uint) { - dst[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_8_8_8_UINT) | - A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_W) | - A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_ZERO) | - A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_ZERO) | - A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_ONE); - } else { - dst[0] |= A6XX_TEX_CONST_0_FMT(FMT6_Z24_UINT_S8_UINT) | - A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_Y) | - A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_ZERO) | - A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_ZERO) | - A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_ONE); - } - } + tu_cs_emit_call(cs, &cmd->draw_cs); + cmd->wait_for_idle = true; - if (i % 2 == 1 && att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { - dst[0] &= ~A6XX_TEX_CONST_0_FMT__MASK; - dst[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_UINT); - dst[2] &= ~(A6XX_TEX_CONST_2_PITCHALIGN__MASK | A6XX_TEX_CONST_2_PITCH__MASK); - dst[2] |= A6XX_TEX_CONST_2_PITCH(iview->stencil_PITCH << 6); - dst[3] = 0; - dst[4] = iview->stencil_base_addr; - dst[5] = (dst[5] & 0xffff) | iview->stencil_base_addr >> 32; - - cpp = att->samples; - gmem_offset = att->gmem_offset_stencil[cmd->state.gmem_layout]; - } + tu_cs_emit_ib(cs, &cmd->state.tile_store_ib); - if (!gmem || !subpass->input_attachments[i / 2].patch_input_gmem) - continue; + tu_cs_sanity_check(cs); +} - /* patched for gmem */ - dst[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK); - dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2); - dst[2] = - A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) | - A6XX_TEX_CONST_2_PITCH(tiling->tile0.width * cpp); - dst[3] = 0; - dst[4] = cmd->device->physical_device->gmem_base + gmem_offset; - dst[5] = A6XX_TEX_CONST_5_DEPTH(1); - for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++) - dst[i] = 0; +static void +tu6_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + VkResult result = tu_cs_reserve_space(cmd->device, cs, 16); + if (result != VK_SUCCESS) { + cmd->record_result = result; + return; } - struct tu_cs cs; - struct tu_draw_state ds = tu_cs_draw_state(&cmd->sub_cs, &cs, 9); + tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_LRZ_CNTL, 1); + tu_cs_emit(cs, A6XX_GRAS_LRZ_CNTL_ENABLE | A6XX_GRAS_LRZ_CNTL_UNK3); - tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_FRAG, 3); - tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) | - CP_LOAD_STATE6_0_NUM_UNIT(subpass->input_count * 2)); - tu_cs_emit_qw(&cs, texture.iova); + tu6_emit_lrz_flush(cmd, cs); - tu_cs_emit_regs(&cs, A6XX_SP_FS_TEX_CONST(.qword = 
texture.iova)); + tu6_emit_event_write(cmd, cs, CACHE_FLUSH_TS, true); - tu_cs_emit_regs(&cs, A6XX_SP_FS_TEX_COUNT(subpass->input_count * 2)); - - assert(cs.cur == cs.end); /* validate draw state size */ - - return ds; + tu_cs_sanity_check(cs); } static void -tu_set_input_attachments(struct tu_cmd_buffer *cmd, const struct tu_subpass *subpass) +tu_cmd_render_tiles(struct tu_cmd_buffer *cmd) { - struct tu_cs *cs = &cmd->draw_cs; + const struct tu_tiling_config *tiling = &cmd->state.tiling_config; - tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 6); - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM, - tu_emit_input_attachments(cmd, subpass, true)); - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM, - tu_emit_input_attachments(cmd, subpass, false)); -} + tu6_render_begin(cmd, &cmd->cs); + for (uint32_t y = 0; y < tiling->tile_count.height; y++) { + for (uint32_t x = 0; x < tiling->tile_count.width; x++) { + struct tu_tile tile; + tu_tiling_config_get_tile(tiling, cmd->device, x, y, &tile); + tu6_render_tile(cmd, &cmd->cs, &tile); + } + } + + tu6_render_end(cmd, &cmd->cs); +} static void -tu_emit_renderpass_begin(struct tu_cmd_buffer *cmd, - const VkClearValue *clear_values) +tu_cmd_prepare_tile_load_ib(struct tu_cmd_buffer *cmd) { - struct tu_cs *cs = &cmd->draw_cs; - - tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM); - - tu6_emit_tile_load(cmd, cs); + const uint32_t tile_load_space = 16 + 32 * MAX_RTS; + const struct tu_subpass *subpass = cmd->state.subpass; + struct tu_attachment_state *attachments = cmd->state.attachments; + struct tu_cs sub_cs; - tu6_emit_blit_scissor(cmd, cs, false); - - for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i) - tu_clear_gmem_attachment(cmd, cs, i, &clear_values[i]); - - tu_cond_exec_end(cs); + VkResult result = tu_cs_begin_sub_stream(cmd->device, &cmd->tile_cs, + tile_load_space, &sub_cs); + if (result != VK_SUCCESS) { + cmd->record_result = result; + return; + } - tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM); + /* emit to tile-load sub_cs */ + tu6_emit_tile_load(cmd, &sub_cs); - for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i) - tu_clear_sysmem_attachment(cmd, cs, i, &clear_values[i]); + cmd->state.tile_load_ib = tu_cs_end_sub_stream(&cmd->tile_cs, &sub_cs); - tu_cond_exec_end(cs); + for (uint32_t i = 0; i < subpass->color_count; ++i) { + const uint32_t a = subpass->color_attachments[i].attachment; + if (a != VK_ATTACHMENT_UNUSED) + attachments[a].pending_clear_aspects = 0; + } } static void -tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - struct tu_renderpass_result *autotune_result) +tu_cmd_prepare_tile_store_ib(struct tu_cmd_buffer *cmd) { - const struct tu_framebuffer *fb = cmd->state.framebuffer; - - tu_lrz_sysmem_begin(cmd, cs); - - assert(fb->width > 0 && fb->height > 0); - tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1); - tu6_emit_window_offset(cs, 0, 0); - - tu6_emit_bin_size(cs, 0, 0, - A6XX_RB_BIN_CONTROL_BUFFERS_LOCATION(BUFFERS_IN_SYSMEM) | - A6XX_RB_BIN_CONTROL_FORCE_LRZ_WRITE_DIS); - - tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); - tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BYPASS)); - - tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1); - tu_cs_emit(cs, 0x0); - - tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM); - - tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1); - tu_cs_emit(cs, 0x1); + const uint32_t tile_store_space = 32 + 32 * MAX_RTS; + struct tu_cs sub_cs; - tu_cs_emit_pkt7(cs, CP_SET_MODE, 1); - tu_cs_emit(cs, 0x0); + 
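   /* Sketch of the sub-stream pattern used here and in
    * tu_cmd_prepare_tile_load_ib() above, assuming only the calls visible in
    * this file: the store commands are recorded once into a sub-stream of
    * cmd->tile_cs and replayed for every tile as an indirect buffer.
    *
    *    struct tu_cs sub_cs;
    *    tu_cs_begin_sub_stream(cmd->device, &cmd->tile_cs, space, &sub_cs);
    *    ...emit blit/store packets into sub_cs...
    *    cmd->state.tile_store_ib = tu_cs_end_sub_stream(&cmd->tile_cs, &sub_cs);
    *    // later, per tile in tu6_render_tile():
    *    //    tu_cs_emit_ib(cs, &cmd->state.tile_store_ib);
    */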
VkResult result = tu_cs_begin_sub_stream(cmd->device, &cmd->tile_cs, + tile_store_space, &sub_cs); + if (result != VK_SUCCESS) { + cmd->record_result = result; + return; + } - tu_autotune_begin_renderpass(cmd, cs, autotune_result); + /* emit to tile-store sub_cs */ + tu6_emit_tile_store(cmd, &sub_cs); - tu_cs_sanity_check(cs); + cmd->state.tile_store_ib = tu_cs_end_sub_stream(&cmd->tile_cs, &sub_cs); } static void -tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - struct tu_renderpass_result *autotune_result) +tu_cmd_update_tiling_config(struct tu_cmd_buffer *cmd, + const VkRect2D *render_area) { - tu_autotune_end_renderpass(cmd, cs, autotune_result); - - /* Do any resolves of the last subpass. These are handled in the - * tile_store_cs in the gmem path. - */ - tu6_emit_sysmem_resolves(cmd, cs, cmd->state.subpass); + const struct tu_device *dev = cmd->device; + const struct tu_render_pass *pass = cmd->state.pass; + const struct tu_subpass *subpass = cmd->state.subpass; + struct tu_tiling_config *tiling = &cmd->state.tiling_config; - tu_cs_emit_call(cs, &cmd->draw_epilogue_cs); + uint32_t buffer_cpp[MAX_RTS + 2]; + uint32_t buffer_count = 0; - tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1); - tu_cs_emit(cs, 0x0); + for (uint32_t i = 0; i < subpass->color_count; ++i) { + const uint32_t a = subpass->color_attachments[i].attachment; + if (a == VK_ATTACHMENT_UNUSED) + continue; - tu_lrz_sysmem_end(cmd, cs); + const struct tu_render_pass_attachment *att = &pass->attachments[a]; + buffer_cpp[buffer_count++] = + vk_format_get_blocksize(att->format) * att->samples; + } - tu_cs_sanity_check(cs); -} + if (subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) { + const uint32_t a = subpass->depth_stencil_attachment.attachment; + const struct tu_render_pass_attachment *att = &pass->attachments[a]; + + /* TODO */ + assert(att->format != VK_FORMAT_D32_SFLOAT_S8_UINT); + + buffer_cpp[buffer_count++] = + vk_format_get_blocksize(att->format) * att->samples; + } + + tu_tiling_config_update(tiling, dev, buffer_cpp, buffer_count, + render_area); +} + +const struct tu_dynamic_state default_dynamic_state = { + .viewport = + { + .count = 0, + }, + .scissor = + { + .count = 0, + }, + .line_width = 1.0f, + .depth_bias = + { + .bias = 0.0f, + .clamp = 0.0f, + .slope = 0.0f, + }, + .blend_constants = { 0.0f, 0.0f, 0.0f, 0.0f }, + .depth_bounds = + { + .min = 0.0f, + .max = 1.0f, + }, + .stencil_compare_mask = + { + .front = ~0u, + .back = ~0u, + }, + .stencil_write_mask = + { + .front = ~0u, + .back = ~0u, + }, + .stencil_reference = + { + .front = 0u, + .back = 0u, + }, +}; -static void -tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - struct tu_renderpass_result *autotune_result) +static void UNUSED /* FINISHME */ +tu_bind_dynamic_state(struct tu_cmd_buffer *cmd_buffer, + const struct tu_dynamic_state *src) { - struct tu_physical_device *phys_dev = cmd->device->physical_device; - const struct tu_tiling_config *tiling = cmd->state.tiling; - tu_lrz_tiling_begin(cmd, cs); + struct tu_dynamic_state *dest = &cmd_buffer->state.dynamic; + uint32_t copy_mask = src->mask; + uint32_t dest_mask = 0; - tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1); - tu_cs_emit(cs, 0x0); - - tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_GMEM); + tu_use_args(cmd_buffer); /* FINISHME */ - if (use_hw_binning(cmd)) { - if (!cmd->vsc_initialized) { - tu6_lazy_emit_vsc(cmd, cs); + /* Make sure to copy the number of viewports/scissors because they can + * only be specified at pipeline 
creation time. + */ + dest->viewport.count = src->viewport.count; + dest->scissor.count = src->scissor.count; + dest->discard_rectangle.count = src->discard_rectangle.count; + + if (copy_mask & TU_DYNAMIC_VIEWPORT) { + if (memcmp(&dest->viewport.viewports, &src->viewport.viewports, + src->viewport.count * sizeof(VkViewport))) { + typed_memcpy(dest->viewport.viewports, src->viewport.viewports, + src->viewport.count); + dest_mask |= TU_DYNAMIC_VIEWPORT; } + } - tu6_emit_bin_size(cs, tiling->tile0.width, tiling->tile0.height, - A6XX_RB_BIN_CONTROL_RENDER_MODE(BINNING_PASS) | - A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6)); - - tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, true); + if (copy_mask & TU_DYNAMIC_SCISSOR) { + if (memcmp(&dest->scissor.scissors, &src->scissor.scissors, + src->scissor.count * sizeof(VkRect2D))) { + typed_memcpy(dest->scissor.scissors, src->scissor.scissors, + src->scissor.count); + dest_mask |= TU_DYNAMIC_SCISSOR; + } + } - tu6_emit_binning_pass(cmd, cs); + if (copy_mask & TU_DYNAMIC_LINE_WIDTH) { + if (dest->line_width != src->line_width) { + dest->line_width = src->line_width; + dest_mask |= TU_DYNAMIC_LINE_WIDTH; + } + } - tu6_emit_bin_size(cs, tiling->tile0.width, tiling->tile0.height, - A6XX_RB_BIN_CONTROL_FORCE_LRZ_WRITE_DIS | - A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6)); + if (copy_mask & TU_DYNAMIC_DEPTH_BIAS) { + if (memcmp(&dest->depth_bias, &src->depth_bias, + sizeof(src->depth_bias))) { + dest->depth_bias = src->depth_bias; + dest_mask |= TU_DYNAMIC_DEPTH_BIAS; + } + } - tu_cs_emit_regs(cs, - A6XX_VFD_MODE_CNTL(0)); + if (copy_mask & TU_DYNAMIC_BLEND_CONSTANTS) { + if (memcmp(&dest->blend_constants, &src->blend_constants, + sizeof(src->blend_constants))) { + typed_memcpy(dest->blend_constants, src->blend_constants, 4); + dest_mask |= TU_DYNAMIC_BLEND_CONSTANTS; + } + } - tu_cs_emit_regs(cs, - A6XX_PC_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL)); + if (copy_mask & TU_DYNAMIC_DEPTH_BOUNDS) { + if (memcmp(&dest->depth_bounds, &src->depth_bounds, + sizeof(src->depth_bounds))) { + dest->depth_bounds = src->depth_bounds; + dest_mask |= TU_DYNAMIC_DEPTH_BOUNDS; + } + } - tu_cs_emit_regs(cs, - A6XX_VFD_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL)); + if (copy_mask & TU_DYNAMIC_STENCIL_COMPARE_MASK) { + if (memcmp(&dest->stencil_compare_mask, &src->stencil_compare_mask, + sizeof(src->stencil_compare_mask))) { + dest->stencil_compare_mask = src->stencil_compare_mask; + dest_mask |= TU_DYNAMIC_STENCIL_COMPARE_MASK; + } + } - tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1); - tu_cs_emit(cs, 0x1); - tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_LOCAL, 1); - tu_cs_emit(cs, 0x1); - } else { - tu6_emit_bin_size(cs, tiling->tile0.width, tiling->tile0.height, - A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6)); - - if (tiling->binning_possible) { - /* Mark all tiles as visible for tu6_emit_cond_for_load_stores(), since - * the actual binner didn't run. 
- */ - int pipe_count = tiling->pipe_count.width * tiling->pipe_count.height; - tu_cs_emit_pkt4(cs, REG_A6XX_VSC_STATE_REG(0), pipe_count); - for (int i = 0; i < pipe_count; i++) - tu_cs_emit(cs, ~0); + if (copy_mask & TU_DYNAMIC_STENCIL_WRITE_MASK) { + if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask, + sizeof(src->stencil_write_mask))) { + dest->stencil_write_mask = src->stencil_write_mask; + dest_mask |= TU_DYNAMIC_STENCIL_WRITE_MASK; } } - tu_autotune_begin_renderpass(cmd, cs, autotune_result); + if (copy_mask & TU_DYNAMIC_STENCIL_REFERENCE) { + if (memcmp(&dest->stencil_reference, &src->stencil_reference, + sizeof(src->stencil_reference))) { + dest->stencil_reference = src->stencil_reference; + dest_mask |= TU_DYNAMIC_STENCIL_REFERENCE; + } + } - tu_cs_sanity_check(cs); + if (copy_mask & TU_DYNAMIC_DISCARD_RECTANGLE) { + if (memcmp(&dest->discard_rectangle.rectangles, + &src->discard_rectangle.rectangles, + src->discard_rectangle.count * sizeof(VkRect2D))) { + typed_memcpy(dest->discard_rectangle.rectangles, + src->discard_rectangle.rectangles, + src->discard_rectangle.count); + dest_mask |= TU_DYNAMIC_DISCARD_RECTANGLE; + } + } } -static void -tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - uint32_t tx, uint32_t ty, uint32_t pipe, uint32_t slot) +static VkResult +tu_create_cmd_buffer(struct tu_device *device, + struct tu_cmd_pool *pool, + VkCommandBufferLevel level, + VkCommandBuffer *pCommandBuffer) { - tu6_emit_tile_select(cmd, &cmd->cs, tx, ty, pipe, slot); - - trace_start_draw_ib_gmem(&cmd->trace, &cmd->cs); - - /* Primitives that passed all tests are still counted in in each - * tile even with HW binning beforehand. Do not permit it. - */ - if (cmd->state.prim_generated_query_running_before_rp) - tu6_emit_event_write(cmd, cs, STOP_PRIMITIVE_CTRS); + struct tu_cmd_buffer *cmd_buffer; + cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (cmd_buffer == NULL) + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - tu_cs_emit_call(cs, &cmd->draw_cs); + cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC; + cmd_buffer->device = device; + cmd_buffer->pool = pool; + cmd_buffer->level = level; - if (cmd->state.prim_generated_query_running_before_rp) - tu6_emit_event_write(cmd, cs, START_PRIMITIVE_CTRS); + if (pool) { + list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers); + cmd_buffer->queue_family_index = pool->queue_family_index; - if (use_hw_binning(cmd)) { - tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); - tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_ENDVIS)); + } else { + /* Init the pool_link so we can safely call list_del when we destroy + * the command buffer + */ + list_inithead(&cmd_buffer->pool_link); + cmd_buffer->queue_family_index = TU_QUEUE_GENERAL; } - /* Predicate is changed in draw_cs so we have to re-emit it */ - if (cmd->state.rp.draw_cs_writes_to_cond_pred) - tu6_emit_cond_for_load_stores(cmd, cs, pipe, slot, false); + tu_bo_list_init(&cmd_buffer->bo_list); + tu_cs_init(&cmd_buffer->cs, TU_CS_MODE_GROW, 4096); + tu_cs_init(&cmd_buffer->draw_cs, TU_CS_MODE_GROW, 4096); + tu_cs_init(&cmd_buffer->tile_cs, TU_CS_MODE_SUB_STREAM, 1024); - tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1); - tu_cs_emit(cs, 0x0); + *pCommandBuffer = tu_cmd_buffer_to_handle(cmd_buffer); - tu_cs_emit_call(cs, &cmd->tile_store_cs); + list_inithead(&cmd_buffer->upload.list); - tu_clone_trace_range(cmd, cs, cmd->trace_renderpass_start, - cmd->trace_renderpass_end); + cmd_buffer->marker_reg = 
REG_A6XX_CP_SCRATCH_REG( + cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY ? 7 : 6); - tu_cs_sanity_check(cs); + VkResult result = tu_bo_init_new(device, &cmd_buffer->scratch_bo, 0x1000); + if (result != VK_SUCCESS) + return result; - trace_end_draw_ib_gmem(&cmd->trace, &cmd->cs); + return VK_SUCCESS; } static void -tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - struct tu_renderpass_result *autotune_result) +tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer) { - tu_autotune_end_renderpass(cmd, cs, autotune_result); + tu_bo_finish(cmd_buffer->device, &cmd_buffer->scratch_bo); - tu_cs_emit_call(cs, &cmd->draw_epilogue_cs); + list_del(&cmd_buffer->pool_link); - tu_lrz_tiling_end(cmd, cs); + for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++) + free(cmd_buffer->descriptors[i].push_set.set.mapped_ptr); - tu6_emit_event_write(cmd, cs, PC_CCU_RESOLVE_TS); + tu_cs_finish(cmd_buffer->device, &cmd_buffer->cs); + tu_cs_finish(cmd_buffer->device, &cmd_buffer->draw_cs); + tu_cs_finish(cmd_buffer->device, &cmd_buffer->tile_cs); - tu_cs_sanity_check(cs); + tu_bo_list_destroy(&cmd_buffer->bo_list); + vk_free(&cmd_buffer->pool->alloc, cmd_buffer); } -static void -tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, - struct tu_renderpass_result *autotune_result) +static VkResult +tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer) { - const struct tu_framebuffer *fb = cmd->state.framebuffer; - const struct tu_tiling_config *tiling = cmd->state.tiling; + cmd_buffer->wait_for_idle = true; - /* Create gmem stores now (at EndRenderPass time)) because they needed to - * know whether to allow their conditional execution, which was tied to a - * state that was known only at the end of the renderpass. They will be - * called from tu6_render_tile(). - */ - tu_cs_begin(&cmd->tile_store_cs); - tu6_emit_tile_store(cmd, &cmd->tile_store_cs); - tu_cs_end(&cmd->tile_store_cs); - - cmd->trace_renderpass_end = u_trace_end_iterator(&cmd->trace); + cmd_buffer->record_result = VK_SUCCESS; - tu6_tile_render_begin(cmd, &cmd->cs, autotune_result); + tu_bo_list_reset(&cmd_buffer->bo_list); + tu_cs_reset(cmd_buffer->device, &cmd_buffer->cs); + tu_cs_reset(cmd_buffer->device, &cmd_buffer->draw_cs); + tu_cs_reset(cmd_buffer->device, &cmd_buffer->tile_cs); - /* Note: we reverse the order of walking the pipes and tiles on every - * other row, to improve texture cache locality compared to raster order. 
- */ - for (uint32_t py = 0; py < tiling->pipe_count.height; py++) { - uint32_t pipe_row = py * tiling->pipe_count.width; - for (uint32_t pipe_row_i = 0; pipe_row_i < tiling->pipe_count.width; pipe_row_i++) { - uint32_t px; - if (py & 1) - px = tiling->pipe_count.width - 1 - pipe_row_i; - else - px = pipe_row_i; - uint32_t pipe = pipe_row + px; - uint32_t tx1 = px * tiling->pipe0.width; - uint32_t ty1 = py * tiling->pipe0.height; - uint32_t tx2 = MIN2(tx1 + tiling->pipe0.width, tiling->tile_count.width); - uint32_t ty2 = MIN2(ty1 + tiling->pipe0.height, tiling->tile_count.height); - uint32_t tile_row_stride = tx2 - tx1; - uint32_t slot_row = 0; - for (uint32_t ty = ty1; ty < ty2; ty++) { - for (uint32_t tile_row_i = 0; tile_row_i < tile_row_stride; tile_row_i++) { - uint32_t tx; - if (ty & 1) - tx = tile_row_stride - 1 - tile_row_i; - else - tx = tile_row_i; - uint32_t slot = slot_row + tx; - tu6_render_tile(cmd, &cmd->cs, tx1 + tx, ty, pipe, slot); - } - slot_row += tile_row_stride; - } - } + for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++) { + cmd_buffer->descriptors[i].dirty = 0; + cmd_buffer->descriptors[i].valid = 0; + cmd_buffer->descriptors[i].push_dirty = false; } - tu6_tile_render_end(cmd, &cmd->cs, autotune_result); - - trace_end_render_pass(&cmd->trace, &cmd->cs, fb, tiling); - - /* tu6_render_tile has cloned these tracepoints for each tile */ - if (!u_trace_iterator_equal(cmd->trace_renderpass_start, cmd->trace_renderpass_end)) - u_trace_disable_event_range(cmd->trace_renderpass_start, - cmd->trace_renderpass_end); + cmd_buffer->status = TU_CMD_BUFFER_STATUS_INITIAL; - /* Reset the gmem store CS entry lists so that the next render pass - * does its own stores. - */ - tu_cs_discard_entries(&cmd->tile_store_cs); + return cmd_buffer->record_result; } -static void -tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd, - struct tu_renderpass_result *autotune_result) +static VkResult +tu_cmd_state_setup_attachments(struct tu_cmd_buffer *cmd_buffer, + const VkRenderPassBeginInfo *info) { - cmd->trace_renderpass_end = u_trace_end_iterator(&cmd->trace); - - tu6_sysmem_render_begin(cmd, &cmd->cs, autotune_result); + struct tu_cmd_state *state = &cmd_buffer->state; + const struct tu_framebuffer *fb = state->framebuffer; + const struct tu_render_pass *pass = state->pass; - trace_start_draw_ib_sysmem(&cmd->trace, &cmd->cs); - - tu_cs_emit_call(&cmd->cs, &cmd->draw_cs); + for (uint32_t i = 0; i < fb->attachment_count; ++i) { + const struct tu_image_view *iview = fb->attachments[i].attachment; + tu_bo_list_add(&cmd_buffer->bo_list, iview->image->bo, + MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE); + } - trace_end_draw_ib_sysmem(&cmd->trace, &cmd->cs); + if (pass->attachment_count == 0) { + state->attachments = NULL; + return VK_SUCCESS; + } - tu6_sysmem_render_end(cmd, &cmd->cs, autotune_result); + state->attachments = + vk_alloc(&cmd_buffer->pool->alloc, + pass->attachment_count * sizeof(state->attachments[0]), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (state->attachments == NULL) { + cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY; + return cmd_buffer->record_result; + } - trace_end_render_pass(&cmd->trace, &cmd->cs, cmd->state.framebuffer, cmd->state.tiling); -} + for (uint32_t i = 0; i < pass->attachment_count; ++i) { + const struct tu_render_pass_attachment *att = &pass->attachments[i]; + VkImageAspectFlags att_aspects = vk_format_aspects(att->format); + VkImageAspectFlags clear_aspects = 0; -void -tu_cmd_render(struct tu_cmd_buffer *cmd_buffer) -{ - if 
(cmd_buffer->state.rp.has_tess) - tu6_lazy_emit_tessfactor_addr(cmd_buffer); + if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) { + /* color attachment */ + if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { + clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT; + } + } else { + /* depthstencil attachment */ + if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && + att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { + clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT; + if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && + att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_DONT_CARE) + clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; + } + if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && + att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { + clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; + } + } - struct tu_renderpass_result *autotune_result = NULL; - if (use_sysmem_rendering(cmd_buffer, &autotune_result)) - tu_cmd_render_sysmem(cmd_buffer, autotune_result); - else - tu_cmd_render_tiles(cmd_buffer, autotune_result); + state->attachments[i].pending_clear_aspects = clear_aspects; + state->attachments[i].cleared_views = 0; + if (clear_aspects && info) { + assert(info->clearValueCount > i); + state->attachments[i].clear_value = info->pClearValues[i]; + } - /* Outside of renderpasses we assume all draw states are disabled. We do - * this outside the draw CS for the normal case where 3d gmem stores aren't - * used. - */ - tu_disable_draw_states(cmd_buffer, &cmd_buffer->cs); + state->attachments[i].current_layout = att->initial_layout; + } + return VK_SUCCESS; } -static void tu_reset_render_pass(struct tu_cmd_buffer *cmd_buffer) +VkResult +tu_AllocateCommandBuffers(VkDevice _device, + const VkCommandBufferAllocateInfo *pAllocateInfo, + VkCommandBuffer *pCommandBuffers) { - /* discard draw_cs and draw_epilogue_cs entries now that the tiles are - rendered */ - tu_cs_discard_entries(&cmd_buffer->draw_cs); - tu_cs_begin(&cmd_buffer->draw_cs); - tu_cs_discard_entries(&cmd_buffer->draw_epilogue_cs); - tu_cs_begin(&cmd_buffer->draw_epilogue_cs); + TU_FROM_HANDLE(tu_device, device, _device); + TU_FROM_HANDLE(tu_cmd_pool, pool, pAllocateInfo->commandPool); - cmd_buffer->state.pass = NULL; - cmd_buffer->state.subpass = NULL; - cmd_buffer->state.framebuffer = NULL; - cmd_buffer->state.attachments = NULL; - cmd_buffer->state.gmem_layout = TU_GMEM_LAYOUT_COUNT; /* invalid value to prevent looking up gmem offsets */ - memset(&cmd_buffer->state.rp, 0, sizeof(cmd_buffer->state.rp)); + VkResult result = VK_SUCCESS; + uint32_t i; - /* LRZ is not valid next time we use it */ - cmd_buffer->state.lrz.valid = false; - cmd_buffer->state.dirty |= TU_CMD_DIRTY_LRZ; -} + for (i = 0; i < pAllocateInfo->commandBufferCount; i++) { -static VkResult -tu_create_cmd_buffer(struct vk_command_pool *pool, - struct vk_command_buffer **cmd_buffer_out) -{ - struct tu_device *device = - container_of(pool->base.device, struct tu_device, vk); - struct tu_cmd_buffer *cmd_buffer; + if (!list_empty(&pool->free_cmd_buffers)) { + struct tu_cmd_buffer *cmd_buffer = list_first_entry( + &pool->free_cmd_buffers, struct tu_cmd_buffer, pool_link); - cmd_buffer = vk_zalloc2(&device->vk.alloc, NULL, sizeof(*cmd_buffer), 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + list_del(&cmd_buffer->pool_link); + list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers); - if (cmd_buffer == NULL) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + result = tu_reset_cmd_buffer(cmd_buffer); + cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC; + cmd_buffer->level = pAllocateInfo->level; - 
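   /* In the newer path the command buffer appears to be hooked into the
    * common vk_command_buffer runtime: vk_command_buffer_init() below
    * registers the tu_cmd_buffer_ops create/reset/destroy table defined
    * later in this file, and the per-purpose command streams ("cmd cs",
    * "draw cs", "tile store cs", "draw epilogue cs", plus the SUB_STREAM
    * "draw sub cs" used for draw states and texture descriptors) are then
    * set up with tu_cs_init().
    */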
VkResult result = vk_command_buffer_init(pool, &cmd_buffer->vk, - &tu_cmd_buffer_ops, 0); - if (result != VK_SUCCESS) { - vk_free2(&device->vk.alloc, NULL, cmd_buffer); - return result; + pCommandBuffers[i] = tu_cmd_buffer_to_handle(cmd_buffer); + } else { + result = tu_create_cmd_buffer(device, pool, pAllocateInfo->level, + &pCommandBuffers[i]); + } + if (result != VK_SUCCESS) + break; } - cmd_buffer->device = device; - - u_trace_init(&cmd_buffer->trace, &device->trace_context); - list_inithead(&cmd_buffer->renderpass_autotune_results); - - tu_cs_init(&cmd_buffer->cs, device, TU_CS_MODE_GROW, 4096, "cmd cs"); - tu_cs_init(&cmd_buffer->draw_cs, device, TU_CS_MODE_GROW, 4096, "draw cs"); - tu_cs_init(&cmd_buffer->tile_store_cs, device, TU_CS_MODE_GROW, 2048, "tile store cs"); - tu_cs_init(&cmd_buffer->draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096, "draw epilogue cs"); - tu_cs_init(&cmd_buffer->sub_cs, device, TU_CS_MODE_SUB_STREAM, 2048, "draw sub cs"); - tu_cs_init(&cmd_buffer->pre_chain.draw_cs, device, TU_CS_MODE_GROW, 4096, "prechain draw cs"); - tu_cs_init(&cmd_buffer->pre_chain.draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096, "prechain draw epiligoue cs"); - - *cmd_buffer_out = &cmd_buffer->vk; - - return VK_SUCCESS; -} + if (result != VK_SUCCESS) { + tu_FreeCommandBuffers(_device, pAllocateInfo->commandPool, i, + pCommandBuffers); -static void -tu_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer) -{ - struct tu_cmd_buffer *cmd_buffer = - container_of(vk_cmd_buffer, struct tu_cmd_buffer, vk); - - tu_cs_finish(&cmd_buffer->cs); - tu_cs_finish(&cmd_buffer->draw_cs); - tu_cs_finish(&cmd_buffer->tile_store_cs); - tu_cs_finish(&cmd_buffer->draw_epilogue_cs); - tu_cs_finish(&cmd_buffer->sub_cs); - tu_cs_finish(&cmd_buffer->pre_chain.draw_cs); - tu_cs_finish(&cmd_buffer->pre_chain.draw_epilogue_cs); - - u_trace_fini(&cmd_buffer->trace); - - tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results); - - for (unsigned i = 0; i < MAX_BIND_POINTS; i++) { - if (cmd_buffer->descriptors[i].push_set.layout) - vk_descriptor_set_layout_unref(&cmd_buffer->device->vk, - &cmd_buffer->descriptors[i].push_set.layout->vk); - vk_free(&cmd_buffer->device->vk.alloc, - cmd_buffer->descriptors[i].push_set.mapped_ptr); + /* From the Vulkan 1.0.66 spec: + * + * "vkAllocateCommandBuffers can be used to create multiple + * command buffers. If the creation of any of those command + * buffers fails, the implementation must destroy all + * successfully created command buffer objects from this + * command, set all entries of the pCommandBuffers array to + * NULL and return the error." 
+ */ + memset(pCommandBuffers, 0, + sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount); } - vk_command_buffer_finish(&cmd_buffer->vk); - vk_free2(&cmd_buffer->device->vk.alloc, &cmd_buffer->vk.pool->alloc, - cmd_buffer); + return result; } -static void -tu_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer, - UNUSED VkCommandBufferResetFlags flags) +void +tu_FreeCommandBuffers(VkDevice device, + VkCommandPool commandPool, + uint32_t commandBufferCount, + const VkCommandBuffer *pCommandBuffers) { - struct tu_cmd_buffer *cmd_buffer = - container_of(vk_cmd_buffer, struct tu_cmd_buffer, vk); - - vk_command_buffer_reset(&cmd_buffer->vk); - - tu_cs_reset(&cmd_buffer->cs); - tu_cs_reset(&cmd_buffer->draw_cs); - tu_cs_reset(&cmd_buffer->tile_store_cs); - tu_cs_reset(&cmd_buffer->draw_epilogue_cs); - tu_cs_reset(&cmd_buffer->sub_cs); - tu_cs_reset(&cmd_buffer->pre_chain.draw_cs); - tu_cs_reset(&cmd_buffer->pre_chain.draw_epilogue_cs); - - tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results); - - for (unsigned i = 0; i < MAX_BIND_POINTS; i++) { - memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets)); - if (cmd_buffer->descriptors[i].push_set.layout) { - vk_descriptor_set_layout_unref(&cmd_buffer->device->vk, - &cmd_buffer->descriptors[i].push_set.layout->vk); + for (uint32_t i = 0; i < commandBufferCount; i++) { + TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, pCommandBuffers[i]); + + if (cmd_buffer) { + if (cmd_buffer->pool) { + list_del(&cmd_buffer->pool_link); + list_addtail(&cmd_buffer->pool_link, + &cmd_buffer->pool->free_cmd_buffers); + } else + tu_cmd_buffer_destroy(cmd_buffer); } - memset(&cmd_buffer->descriptors[i].push_set, 0, sizeof(cmd_buffer->descriptors[i].push_set)); - cmd_buffer->descriptors[i].push_set.base.type = VK_OBJECT_TYPE_DESCRIPTOR_SET; - cmd_buffer->descriptors[i].max_sets_bound = 0; - cmd_buffer->descriptors[i].dynamic_bound = 0; } - - u_trace_fini(&cmd_buffer->trace); - u_trace_init(&cmd_buffer->trace, &cmd_buffer->device->trace_context); - - cmd_buffer->state.max_vbs_bound = 0; - cmd_buffer->state.last_prim_params.valid = false; - - cmd_buffer->vsc_initialized = false; - - cmd_buffer->status = TU_CMD_BUFFER_STATUS_INITIAL; } -const struct vk_command_buffer_ops tu_cmd_buffer_ops = { - .create = tu_create_cmd_buffer, - .reset = tu_reset_cmd_buffer, - .destroy = tu_cmd_buffer_destroy, -}; - -/* Initialize the cache, assuming all necessary flushes have happened but *not* - * invalidations. - */ -static void -tu_cache_init(struct tu_cache_state *cache) +VkResult +tu_ResetCommandBuffer(VkCommandBuffer commandBuffer, + VkCommandBufferResetFlags flags) { - cache->flush_bits = 0; - cache->pending_flush_bits = TU_CMD_FLAG_ALL_INVALIDATE; + TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); + return tu_reset_cmd_buffer(cmd_buffer); } -/* Unlike the public entrypoint, this doesn't handle cache tracking, and - * tracking the CCU state. It's used for the driver to insert its own command - * buffer in the middle of a submit. - */ VkResult -tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer, - VkCommandBufferUsageFlags usage_flags) +tu_BeginCommandBuffer(VkCommandBuffer commandBuffer, + const VkCommandBufferBeginInfo *pBeginInfo) { + TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); + VkResult result = VK_SUCCESS; + if (cmd_buffer->status != TU_CMD_BUFFER_STATUS_INITIAL) { /* If the command buffer has already been resetted with * vkResetCommandBuffer, no need to do it again. 
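The implicit reset handled here has a direct caller-side counterpart. A minimal sketch, assuming `cmd` was allocated from a pool created with VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT (an assumption, not something this diff establishes): re-recording without an explicit vkResetCommandBuffer relies on vkBeginCommandBuffer performing the reset.

#include <vulkan/vulkan.h>

static void
rerecord_commands(VkCommandBuffer cmd)
{
   const VkCommandBufferBeginInfo begin_info = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
      .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
   };

   /* Implicitly resets `cmd` if it was recorded before; the pool must have
    * been created with VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT.
    */
   vkBeginCommandBuffer(cmd, &begin_info);
   /* ... record new work ... */
   vkEndCommandBuffer(cmd);
}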
*/ - tu_reset_cmd_buffer(&cmd_buffer->vk, 0); + result = tu_reset_cmd_buffer(cmd_buffer); + if (result != VK_SUCCESS) + return result; } memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state)); - cmd_buffer->state.index_size = 0xff; /* dirty restart index */ - cmd_buffer->state.line_mode = RECTANGULAR; - cmd_buffer->state.gmem_layout = TU_GMEM_LAYOUT_COUNT; /* dirty value */ - - tu_cache_init(&cmd_buffer->state.cache); - tu_cache_init(&cmd_buffer->state.renderpass_cache); - cmd_buffer->usage_flags = usage_flags; + cmd_buffer->usage_flags = pBeginInfo->flags; tu_cs_begin(&cmd_buffer->cs); - tu_cs_begin(&cmd_buffer->draw_cs); - tu_cs_begin(&cmd_buffer->draw_epilogue_cs); - cmd_buffer->status = TU_CMD_BUFFER_STATUS_RECORDING; - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -tu_BeginCommandBuffer(VkCommandBuffer commandBuffer, - const VkCommandBufferBeginInfo *pBeginInfo) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); - VkResult result = tu_cmd_buffer_begin(cmd_buffer, pBeginInfo->flags); - if (result != VK_SUCCESS) - return result; + cmd_buffer->marker_seqno = 0; + cmd_buffer->scratch_seqno = 0; /* setup initial configuration into command buffer */ - if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) { - trace_start_cmd_buffer(&cmd_buffer->trace, &cmd_buffer->cs); - + if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) { switch (cmd_buffer->queue_family_index) { case TU_QUEUE_GENERAL: tu6_init_hw(cmd_buffer, &cmd_buffer->cs); @@ -1799,212 +1602,35 @@ tu_BeginCommandBuffer(VkCommandBuffer commandBuffer, default: break; } - } else if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) { - const bool pass_continue = - pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT; - - trace_start_cmd_buffer(&cmd_buffer->trace, - pass_continue ? &cmd_buffer->draw_cs : &cmd_buffer->cs); - - assert(pBeginInfo->pInheritanceInfo); - - cmd_buffer->inherited_pipeline_statistics = - pBeginInfo->pInheritanceInfo->pipelineStatistics; - - vk_foreach_struct_const(ext, pBeginInfo->pInheritanceInfo) { - switch (ext->sType) { - case VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT: { - const VkCommandBufferInheritanceConditionalRenderingInfoEXT *cond_rend = (void *) ext; - cmd_buffer->state.predication_active = cond_rend->conditionalRenderingEnable; - break; - default: - break; - } - } - } - - if (pass_continue) { - const VkCommandBufferInheritanceRenderingInfo *rendering_info = - vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext, - COMMAND_BUFFER_INHERITANCE_RENDERING_INFO); - - if (unlikely(cmd_buffer->device->instance->debug_flags & TU_DEBUG_DYNAMIC)) { - rendering_info = - vk_get_command_buffer_inheritance_rendering_info(cmd_buffer->vk.level, - pBeginInfo); - } - - if (rendering_info) { - tu_setup_dynamic_inheritance(cmd_buffer, rendering_info); - cmd_buffer->state.pass = &cmd_buffer->dynamic_pass; - cmd_buffer->state.subpass = &cmd_buffer->dynamic_subpass; - } else { - cmd_buffer->state.pass = tu_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass); - cmd_buffer->state.subpass = - &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass]; - } - - /* We can't set the gmem layout here, because the state.pass only has - * to be compatible (same formats/sample counts) with the primary's - * renderpass, rather than exactly equal. - */ - - tu_lrz_begin_secondary_cmdbuf(cmd_buffer); - } else { - /* When executing in the middle of another command buffer, the CCU - * state is unknown. 
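A minimal caller-side sketch of the render-pass-continue path handled above, with `secondary`, `render_pass`, and the subpass index assumed to come from elsewhere (they are not defined in this diff): a secondary command buffer that continues a render pass passes inheritance info together with the RENDER_PASS_CONTINUE usage flag.

#include <vulkan/vulkan.h>

static void
begin_pass_continue_secondary(VkCommandBuffer secondary,
                              VkRenderPass render_pass,
                              uint32_t subpass)
{
   const VkCommandBufferInheritanceInfo inherit = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO,
      .renderPass = render_pass,
      .subpass = subpass,
      .framebuffer = VK_NULL_HANDLE, /* legal when not known up front */
   };
   const VkCommandBufferBeginInfo begin_info = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
      .flags = VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT,
      .pInheritanceInfo = &inherit,
   };

   vkBeginCommandBuffer(secondary, &begin_info);
}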
- */ - cmd_buffer->state.ccu_state = TU_CMD_CCU_UNKNOWN; - } } - return VK_SUCCESS; -} - -static void -tu6_emit_vertex_strides(struct tu_cmd_buffer *cmd, unsigned num_vbs) -{ - struct tu_cs cs; - cmd->state.dynamic_state[TU_DYNAMIC_STATE_VB_STRIDE].iova = - tu_cs_draw_state(&cmd->sub_cs, &cs, 2 * num_vbs).iova; - - for (uint32_t i = 0; i < num_vbs; i++) - tu_cs_emit_regs(&cs, A6XX_VFD_FETCH_STRIDE(i, cmd->state.vb[i].stride)); - - cmd->state.dirty |= TU_CMD_DIRTY_VB_STRIDE; -} - -static struct tu_cs -tu_cmd_dynamic_state(struct tu_cmd_buffer *cmd, uint32_t id, uint32_t size) -{ - struct tu_cs cs; - - assert(id < ARRAY_SIZE(cmd->state.dynamic_state)); - cmd->state.dynamic_state[id] = tu_cs_draw_state(&cmd->sub_cs, &cs, size); - - /* note: this also avoids emitting draw states before renderpass clears, - * which may use the 3D clear path (for MSAA cases) - */ - if (cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE) - return cs; - - tu_cs_emit_pkt7(&cmd->draw_cs, CP_SET_DRAW_STATE, 3); - tu_cs_emit_draw_state(&cmd->draw_cs, TU_DRAW_STATE_DYNAMIC + id, cmd->state.dynamic_state[id]); - - return cs; -} - -static void -tu_cmd_end_dynamic_state(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - uint32_t id) -{ - assert(id < ARRAY_SIZE(cmd->state.dynamic_state)); - cmd->state.dynamic_state[id] = tu_cs_end_draw_state(&cmd->sub_cs, cs); - - /* note: this also avoids emitting draw states before renderpass clears, - * which may use the 3D clear path (for MSAA cases) - */ - if (cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE) - return; - - tu_cs_emit_pkt7(&cmd->draw_cs, CP_SET_DRAW_STATE, 3); - tu_cs_emit_draw_state(&cmd->draw_cs, TU_DRAW_STATE_DYNAMIC + id, cmd->state.dynamic_state[id]); -} - -static void -tu_update_num_vbs(struct tu_cmd_buffer *cmd, unsigned num_vbs) -{ - /* the vertex_buffers draw state always contains all the currently - * bound vertex buffers. 
update its size to only emit the vbs which - * are actually used by the pipeline - * note there is a HW optimization which makes it so the draw state - * is not re-executed completely when only the size changes - */ - if (cmd->state.vertex_buffers.size != num_vbs * 4) { - cmd->state.vertex_buffers.size = num_vbs * 4; - cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS; - } - - if (cmd->state.dynamic_state[TU_DYNAMIC_STATE_VB_STRIDE].size != num_vbs * 2) { - cmd->state.dynamic_state[TU_DYNAMIC_STATE_VB_STRIDE].size = num_vbs * 2; - cmd->state.dirty |= TU_CMD_DIRTY_VB_STRIDE; - } -} - -VKAPI_ATTR void VKAPI_CALL -tu_CmdSetVertexInputEXT(VkCommandBuffer commandBuffer, - uint32_t vertexBindingDescriptionCount, - const VkVertexInputBindingDescription2EXT *pVertexBindingDescriptions, - uint32_t vertexAttributeDescriptionCount, - const VkVertexInputAttributeDescription2EXT *pVertexAttributeDescriptions) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - struct tu_cs cs; - - unsigned num_vbs = 0; - for (unsigned i = 0; i < vertexBindingDescriptionCount; i++) { - const VkVertexInputBindingDescription2EXT *binding = - &pVertexBindingDescriptions[i]; - num_vbs = MAX2(num_vbs, binding->binding + 1); - cmd->state.vb[binding->binding].stride = binding->stride; - } - - tu6_emit_vertex_strides(cmd, num_vbs); - tu_update_num_vbs(cmd, num_vbs); + cmd_buffer->status = TU_CMD_BUFFER_STATUS_RECORDING; - tu_cs_begin_sub_stream(&cmd->sub_cs, TU6_EMIT_VERTEX_INPUT_MAX_DWORDS, &cs); - tu6_emit_vertex_input(&cs, vertexBindingDescriptionCount, - pVertexBindingDescriptions, - vertexAttributeDescriptionCount, - pVertexAttributeDescriptions); - tu_cmd_end_dynamic_state(cmd, &cs, TU_DYNAMIC_STATE_VERTEX_INPUT); + return VK_SUCCESS; } -VKAPI_ATTR void VKAPI_CALL -tu_CmdBindVertexBuffers2EXT(VkCommandBuffer commandBuffer, - uint32_t firstBinding, - uint32_t bindingCount, - const VkBuffer* pBuffers, - const VkDeviceSize* pOffsets, - const VkDeviceSize* pSizes, - const VkDeviceSize* pStrides) +void +tu_CmdBindVertexBuffers(VkCommandBuffer commandBuffer, + uint32_t firstBinding, + uint32_t bindingCount, + const VkBuffer *pBuffers, + const VkDeviceSize *pOffsets) { TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - struct tu_cs cs; - cmd->state.max_vbs_bound = MAX2( - cmd->state.max_vbs_bound, firstBinding + bindingCount); - - cmd->state.vertex_buffers.iova = - tu_cs_draw_state(&cmd->sub_cs, &cs, 4 * cmd->state.max_vbs_bound).iova; + assert(firstBinding + bindingCount <= MAX_VBS); for (uint32_t i = 0; i < bindingCount; i++) { - if (pBuffers[i] == VK_NULL_HANDLE) { - cmd->state.vb[firstBinding + i].base = 0; - cmd->state.vb[firstBinding + i].size = 0; - } else { - struct tu_buffer *buf = tu_buffer_from_handle(pBuffers[i]); - cmd->state.vb[firstBinding + i].base = buf->iova + pOffsets[i]; - cmd->state.vb[firstBinding + i].size = pSizes ? 
pSizes[i] : (buf->vk.size - pOffsets[i]); - } - - if (pStrides) - cmd->state.vb[firstBinding + i].stride = pStrides[i]; - } - - for (uint32_t i = 0; i < cmd->state.max_vbs_bound; i++) { - tu_cs_emit_regs(&cs, - A6XX_VFD_FETCH_BASE(i, .qword = cmd->state.vb[i].base), - A6XX_VFD_FETCH_SIZE(i, cmd->state.vb[i].size)); + cmd->state.vb.buffers[firstBinding + i] = + tu_buffer_from_handle(pBuffers[i]); + cmd->state.vb.offsets[firstBinding + i] = pOffsets[i]; } + /* VB states depend on VkPipelineVertexInputStateCreateInfo */ cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS; - - if (pStrides) - tu6_emit_vertex_strides(cmd, cmd->state.max_vbs_bound); } -VKAPI_ATTR void VKAPI_CALL +void tu_CmdBindIndexBuffer(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, @@ -2013,42 +1639,31 @@ tu_CmdBindIndexBuffer(VkCommandBuffer commandBuffer, TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); TU_FROM_HANDLE(tu_buffer, buf, buffer); + /* initialize/update the restart index */ + if (!cmd->state.index_buffer || cmd->state.index_type != indexType) { + struct tu_cs *draw_cs = &cmd->draw_cs; + VkResult result = tu_cs_reserve_space(cmd->device, draw_cs, 2); + if (result != VK_SUCCESS) { + cmd->record_result = result; + return; + } + tu6_emit_restart_index( + draw_cs, indexType == VK_INDEX_TYPE_UINT32 ? 0xffffffff : 0xffff); - uint32_t index_size, index_shift, restart_index; - - switch (indexType) { - case VK_INDEX_TYPE_UINT16: - index_size = INDEX4_SIZE_16_BIT; - index_shift = 1; - restart_index = 0xffff; - break; - case VK_INDEX_TYPE_UINT32: - index_size = INDEX4_SIZE_32_BIT; - index_shift = 2; - restart_index = 0xffffffff; - break; - case VK_INDEX_TYPE_UINT8_EXT: - index_size = INDEX4_SIZE_8_BIT; - index_shift = 0; - restart_index = 0xff; - break; - default: - unreachable("invalid VkIndexType"); + tu_cs_sanity_check(draw_cs); } - /* initialize/update the restart index */ - if (cmd->state.index_size != index_size) - tu_cs_emit_regs(&cmd->draw_cs, A6XX_PC_RESTART_INDEX(restart_index)); - - assert(buf->vk.size >= offset); + /* track the BO */ + if (cmd->state.index_buffer != buf) + tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ); - cmd->state.index_va = buf->iova + offset; - cmd->state.max_index_count = (buf->vk.size - offset) >> index_shift; - cmd->state.index_size = index_size; + cmd->state.index_buffer = buf; + cmd->state.index_offset = offset; + cmd->state.index_type = indexType; } -VKAPI_ATTR void VKAPI_CALL +void tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, VkPipelineLayout _layout, @@ -2058,404 +1673,9 @@ tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, uint32_t dynamicOffsetCount, const uint32_t *pDynamicOffsets) { - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - TU_FROM_HANDLE(tu_pipeline_layout, layout, _layout); - unsigned dyn_idx = 0; - - struct tu_descriptor_state *descriptors_state = - tu_get_descriptors_state(cmd, pipelineBindPoint); - - descriptors_state->max_sets_bound = - MAX2(descriptors_state->max_sets_bound, firstSet + descriptorSetCount); - - for (unsigned i = 0; i < descriptorSetCount; ++i) { - unsigned idx = i + firstSet; - TU_FROM_HANDLE(tu_descriptor_set, set, pDescriptorSets[i]); - - descriptors_state->sets[idx] = set; - - if (!set) - continue; - - if (!set->layout->dynamic_offset_size) - continue; - - uint32_t *src = set->dynamic_descriptors; - uint32_t *dst = descriptors_state->dynamic_descriptors + - layout->set[idx].dynamic_offset_start / 4; - for (unsigned j = 0; j < set->layout->binding_count; 
j++) { - struct tu_descriptor_set_binding_layout *binding = - &set->layout->binding[j]; - if (binding->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC || - binding->type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) { - for (unsigned k = 0; k < binding->array_size; k++, dyn_idx++) { - assert(dyn_idx < dynamicOffsetCount); - uint32_t offset = pDynamicOffsets[dyn_idx]; - memcpy(dst, src, binding->size); - - if (binding->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) { - /* Note: we can assume here that the addition won't roll - * over and change the SIZE field. - */ - uint64_t va = src[0] | ((uint64_t)src[1] << 32); - va += offset; - dst[0] = va; - dst[1] = va >> 32; - } else { - uint32_t *dst_desc = dst; - for (unsigned i = 0; - i < binding->size / (4 * A6XX_TEX_CONST_DWORDS); - i++, dst_desc += A6XX_TEX_CONST_DWORDS) { - /* Note: A6XX_TEX_CONST_5_DEPTH is always 0 */ - uint64_t va = dst_desc[4] | ((uint64_t)dst_desc[5] << 32); - va += offset; - dst_desc[4] = va; - dst_desc[5] = va >> 32; - } - } - - dst += binding->size / 4; - src += binding->size / 4; - } - } - } - } - assert(dyn_idx == dynamicOffsetCount); - - uint32_t sp_bindless_base_reg, hlsq_bindless_base_reg, hlsq_invalidate_value; - uint64_t addr[MAX_SETS] = {}; - uint64_t dynamic_addr = 0; - struct tu_cs *cs, state_cs; - - for (uint32_t i = 0; i < descriptors_state->max_sets_bound; i++) { - struct tu_descriptor_set *set = descriptors_state->sets[i]; - if (set) - addr[i] = set->va | 3; - } - - if (layout->dynamic_offset_size) { - /* allocate and fill out dynamic descriptor set */ - struct tu_cs_memory dynamic_desc_set; - VkResult result = tu_cs_alloc(&cmd->sub_cs, - layout->dynamic_offset_size / (4 * A6XX_TEX_CONST_DWORDS), - A6XX_TEX_CONST_DWORDS, &dynamic_desc_set); - if (result != VK_SUCCESS) { - vk_command_buffer_set_error(&cmd->vk, result); - return; - } - - memcpy(dynamic_desc_set.map, descriptors_state->dynamic_descriptors, - layout->dynamic_offset_size); - dynamic_addr = dynamic_desc_set.iova | 3; - descriptors_state->dynamic_bound = true; - } - - if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) { - sp_bindless_base_reg = REG_A6XX_SP_BINDLESS_BASE(0); - hlsq_bindless_base_reg = REG_A6XX_HLSQ_BINDLESS_BASE(0); - hlsq_invalidate_value = A6XX_HLSQ_INVALIDATE_CMD_GFX_BINDLESS(0x1f); - - cmd->state.desc_sets = - tu_cs_draw_state(&cmd->sub_cs, &state_cs, - 4 + 4 * descriptors_state->max_sets_bound + - (descriptors_state->dynamic_bound ? 6 : 0)); - cmd->state.dirty |= TU_CMD_DIRTY_DESC_SETS_LOAD; - cs = &state_cs; - } else { - assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE); - - sp_bindless_base_reg = REG_A6XX_SP_CS_BINDLESS_BASE(0); - hlsq_bindless_base_reg = REG_A6XX_HLSQ_CS_BINDLESS_BASE(0); - hlsq_invalidate_value = A6XX_HLSQ_INVALIDATE_CMD_CS_BINDLESS(0x1f); - - cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD; - cs = &cmd->cs; - } - - tu_cs_emit_pkt4(cs, sp_bindless_base_reg, 2 * descriptors_state->max_sets_bound); - tu_cs_emit_array(cs, (const uint32_t*) addr, 2 * descriptors_state->max_sets_bound); - tu_cs_emit_pkt4(cs, hlsq_bindless_base_reg, 2 * descriptors_state->max_sets_bound); - tu_cs_emit_array(cs, (const uint32_t*) addr, 2 * descriptors_state->max_sets_bound); - - /* Dynamic descriptors get the last descriptor set. 
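A minimal standalone version of the dynamic-offset patch applied above (illustrative names, not driver API): the descriptor stores a 64-bit GPU address as two little-endian dwords, and the offset from pDynamicOffsets is folded into that address before the descriptor is copied into the dynamic set.

#include <stdint.h>

static void
patch_dynamic_buffer_address(uint32_t desc[2], uint32_t dynamic_offset)
{
   uint64_t va = desc[0] | ((uint64_t) desc[1] << 32);

   /* Assumed, as in the code above, that the addition cannot carry into
    * whatever fields share the upper dword (e.g. a SIZE field).
    */
   va += dynamic_offset;

   desc[0] = (uint32_t) va;
   desc[1] = (uint32_t) (va >> 32);
}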
*/ - if (descriptors_state->dynamic_bound) { - tu_cs_emit_pkt4(cs, sp_bindless_base_reg + 4 * 2, 2); - tu_cs_emit_qw(cs, dynamic_addr); - tu_cs_emit_pkt4(cs, hlsq_bindless_base_reg + 4 * 2, 2); - tu_cs_emit_qw(cs, dynamic_addr); - } - - tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(.dword = hlsq_invalidate_value)); - - if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) { - assert(cs->cur == cs->end); /* validate draw state size */ - /* note: this also avoids emitting draw states before renderpass clears, - * which may use the 3D clear path (for MSAA cases) - */ - if (!(cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) { - tu_cs_emit_pkt7(&cmd->draw_cs, CP_SET_DRAW_STATE, 3); - tu_cs_emit_draw_state(&cmd->draw_cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets); - } - } } -static enum VkResult -tu_push_descriptor_set_update_layout(struct tu_device *device, - struct tu_descriptor_set *set, - struct tu_descriptor_set_layout *layout) -{ - if (set->layout == layout) - return VK_SUCCESS; - - if (set->layout) - vk_descriptor_set_layout_unref(&device->vk, &set->layout->vk); - vk_descriptor_set_layout_ref(&layout->vk); - set->layout = layout; - - if (set->host_size < layout->size) { - void *new_buf = - vk_realloc(&device->vk.alloc, set->mapped_ptr, layout->size, 8, - VK_QUERY_SCOPE_COMMAND_BUFFER_KHR); - if (!new_buf) - return VK_ERROR_OUT_OF_HOST_MEMORY; - set->mapped_ptr = new_buf; - set->host_size = layout->size; - } - return VK_SUCCESS; -} - -VKAPI_ATTR void VKAPI_CALL -tu_CmdPushDescriptorSetKHR(VkCommandBuffer commandBuffer, - VkPipelineBindPoint pipelineBindPoint, - VkPipelineLayout _layout, - uint32_t _set, - uint32_t descriptorWriteCount, - const VkWriteDescriptorSet *pDescriptorWrites) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - TU_FROM_HANDLE(tu_pipeline_layout, pipe_layout, _layout); - struct tu_descriptor_set_layout *layout = pipe_layout->set[_set].layout; - struct tu_descriptor_set *set = - &tu_get_descriptors_state(cmd, pipelineBindPoint)->push_set; - - struct tu_cs_memory set_mem; - VkResult result = tu_cs_alloc(&cmd->sub_cs, - DIV_ROUND_UP(layout->size, A6XX_TEX_CONST_DWORDS * 4), - A6XX_TEX_CONST_DWORDS, &set_mem); - if (result != VK_SUCCESS) { - vk_command_buffer_set_error(&cmd->vk, result); - return; - } - - result = tu_push_descriptor_set_update_layout(cmd->device, set, layout); - if (result != VK_SUCCESS) { - vk_command_buffer_set_error(&cmd->vk, result); - return; - } - - tu_update_descriptor_sets(cmd->device, tu_descriptor_set_to_handle(set), - descriptorWriteCount, pDescriptorWrites, 0, NULL); - - memcpy(set_mem.map, set->mapped_ptr, layout->size); - set->va = set_mem.iova; - - tu_CmdBindDescriptorSets(commandBuffer, pipelineBindPoint, _layout, _set, - 1, (VkDescriptorSet[]) { tu_descriptor_set_to_handle(set) }, - 0, NULL); -} - -VKAPI_ATTR void VKAPI_CALL -tu_CmdPushDescriptorSetWithTemplateKHR(VkCommandBuffer commandBuffer, - VkDescriptorUpdateTemplate descriptorUpdateTemplate, - VkPipelineLayout _layout, - uint32_t _set, - const void* pData) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - TU_FROM_HANDLE(tu_pipeline_layout, pipe_layout, _layout); - TU_FROM_HANDLE(tu_descriptor_update_template, templ, descriptorUpdateTemplate); - struct tu_descriptor_set_layout *layout = pipe_layout->set[_set].layout; - struct tu_descriptor_set *set = - &tu_get_descriptors_state(cmd, templ->bind_point)->push_set; - - struct tu_cs_memory set_mem; - VkResult result = tu_cs_alloc(&cmd->sub_cs, - DIV_ROUND_UP(layout->size, A6XX_TEX_CONST_DWORDS * 4), - 
A6XX_TEX_CONST_DWORDS, &set_mem); - if (result != VK_SUCCESS) { - vk_command_buffer_set_error(&cmd->vk, result); - return; - } - - result = tu_push_descriptor_set_update_layout(cmd->device, set, layout); - if (result != VK_SUCCESS) { - vk_command_buffer_set_error(&cmd->vk, result); - return; - } - - tu_update_descriptor_set_with_template(cmd->device, set, descriptorUpdateTemplate, pData); - - memcpy(set_mem.map, set->mapped_ptr, layout->size); - set->va = set_mem.iova; - - tu_CmdBindDescriptorSets(commandBuffer, templ->bind_point, _layout, _set, - 1, (VkDescriptorSet[]) { tu_descriptor_set_to_handle(set) }, - 0, NULL); -} - -VKAPI_ATTR void VKAPI_CALL -tu_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer, - uint32_t firstBinding, - uint32_t bindingCount, - const VkBuffer *pBuffers, - const VkDeviceSize *pOffsets, - const VkDeviceSize *pSizes) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - struct tu_cs *cs = &cmd->draw_cs; - - /* using COND_REG_EXEC for xfb commands matches the blob behavior - * presumably there isn't any benefit using a draw state when the - * condition is (SYSMEM | BINNING) - */ - tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) | - CP_COND_REG_EXEC_0_SYSMEM | - CP_COND_REG_EXEC_0_BINNING); - - for (uint32_t i = 0; i < bindingCount; i++) { - TU_FROM_HANDLE(tu_buffer, buf, pBuffers[i]); - uint64_t iova = buf->iova + pOffsets[i]; - uint32_t size = buf->bo->size - (iova - buf->bo->iova); - uint32_t idx = i + firstBinding; - - if (pSizes && pSizes[i] != VK_WHOLE_SIZE) - size = pSizes[i]; - - /* BUFFER_BASE is 32-byte aligned, add remaining offset to BUFFER_OFFSET */ - uint32_t offset = iova & 0x1f; - iova &= ~(uint64_t) 0x1f; - - tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUFFER_BASE(idx), 3); - tu_cs_emit_qw(cs, iova); - tu_cs_emit(cs, size + offset); - - cmd->state.streamout_offset[idx] = offset; - } - - tu_cond_exec_end(cs); -} - -VKAPI_ATTR void VKAPI_CALL -tu_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer, - uint32_t firstCounterBuffer, - uint32_t counterBufferCount, - const VkBuffer *pCounterBuffers, - const VkDeviceSize *pCounterBufferOffsets) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - struct tu_cs *cs = &cmd->draw_cs; - - tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) | - CP_COND_REG_EXEC_0_SYSMEM | - CP_COND_REG_EXEC_0_BINNING); - - tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(false)); - - /* TODO: only update offset for active buffers */ - for (uint32_t i = 0; i < IR3_MAX_SO_BUFFERS; i++) - tu_cs_emit_regs(cs, A6XX_VPC_SO_BUFFER_OFFSET(i, cmd->state.streamout_offset[i])); - - for (uint32_t i = 0; i < (pCounterBuffers ? counterBufferCount : 0); i++) { - uint32_t idx = firstCounterBuffer + i; - uint32_t offset = cmd->state.streamout_offset[idx]; - uint64_t counter_buffer_offset = pCounterBufferOffsets ? 
pCounterBufferOffsets[i] : 0u; - - if (!pCounterBuffers[i]) - continue; - - TU_FROM_HANDLE(tu_buffer, buf, pCounterBuffers[i]); - - tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3); - tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(idx)) | - CP_MEM_TO_REG_0_UNK31 | - CP_MEM_TO_REG_0_CNT(1)); - tu_cs_emit_qw(cs, buf->iova + counter_buffer_offset); - - if (offset) { - tu_cs_emit_pkt7(cs, CP_REG_RMW, 3); - tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(idx)) | - CP_REG_RMW_0_SRC1_ADD); - tu_cs_emit(cs, 0xffffffff); - tu_cs_emit(cs, offset); - } - } - - tu_cond_exec_end(cs); -} - -VKAPI_ATTR void VKAPI_CALL -tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, - uint32_t firstCounterBuffer, - uint32_t counterBufferCount, - const VkBuffer *pCounterBuffers, - const VkDeviceSize *pCounterBufferOffsets) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - struct tu_cs *cs = &cmd->draw_cs; - - tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) | - CP_COND_REG_EXEC_0_SYSMEM | - CP_COND_REG_EXEC_0_BINNING); - - tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(true)); - - /* TODO: only flush buffers that need to be flushed */ - for (uint32_t i = 0; i < IR3_MAX_SO_BUFFERS; i++) { - /* note: FLUSH_BASE is always the same, so it could go in init_hw()? */ - tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_FLUSH_BASE(i), 2); - tu_cs_emit_qw(cs, global_iova(cmd, flush_base[i])); - tu6_emit_event_write(cmd, cs, FLUSH_SO_0 + i); - } - - for (uint32_t i = 0; i < (pCounterBuffers ? counterBufferCount : 0); i++) { - uint32_t idx = firstCounterBuffer + i; - uint32_t offset = cmd->state.streamout_offset[idx]; - uint64_t counter_buffer_offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0u; - - if (!pCounterBuffers[i]) - continue; - - TU_FROM_HANDLE(tu_buffer, buf, pCounterBuffers[i]); - - /* VPC_SO_FLUSH_BASE has dwords counter, but counter should be in bytes */ - tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3); - tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_CP_SCRATCH_REG(0)) | - CP_MEM_TO_REG_0_SHIFT_BY_2 | - 0x40000 | /* ??? */ - CP_MEM_TO_REG_0_UNK31 | - CP_MEM_TO_REG_0_CNT(1)); - tu_cs_emit_qw(cs, global_iova(cmd, flush_base[idx])); - - if (offset) { - tu_cs_emit_pkt7(cs, CP_REG_RMW, 3); - tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(REG_A6XX_CP_SCRATCH_REG(0)) | - CP_REG_RMW_0_SRC1_ADD); - tu_cs_emit(cs, 0xffffffff); - tu_cs_emit(cs, -offset); - } - - tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); - tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_SCRATCH_REG(0)) | - CP_REG_TO_MEM_0_CNT(1)); - tu_cs_emit_qw(cs, buf->iova + counter_buffer_offset); - } - - tu_cond_exec_end(cs); - - cmd->state.rp.xfb_used = true; -} - -VKAPI_ATTR void VKAPI_CALL +void tu_CmdPushConstants(VkCommandBuffer commandBuffer, VkPipelineLayout layout, VkShaderStageFlags stageFlags, @@ -2463,63 +1683,38 @@ tu_CmdPushConstants(VkCommandBuffer commandBuffer, uint32_t size, const void *pValues) { - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - memcpy((void*) cmd->push_constants + offset, pValues, size); - cmd->state.dirty |= TU_CMD_DIRTY_SHADER_CONSTS; } -/* Flush everything which has been made available but we haven't actually - * flushed yet. 
- */ -static void -tu_flush_all_pending(struct tu_cache_state *cache) -{ - cache->flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH; - cache->pending_flush_bits &= ~TU_CMD_FLAG_ALL_FLUSH; -} - -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_EndCommandBuffer(VkCommandBuffer commandBuffer) { TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); - /* We currently flush CCU at the end of the command buffer, like - * what the blob does. There's implicit synchronization around every - * vkQueueSubmit, but the kernel only flushes the UCHE, and we don't - * know yet if this command buffer will be the last in the submit so we - * have to defensively flush everything else. - * - * TODO: We could definitely do better than this, since these flushes - * aren't required by Vulkan, but we'd need kernel support to do that. - * Ideally, we'd like the kernel to flush everything afterwards, so that we - * wouldn't have to do any flushes here, and when submitting multiple - * command buffers there wouldn't be any unnecessary flushes in between. - */ - if (cmd_buffer->state.pass) { - tu_flush_all_pending(&cmd_buffer->state.renderpass_cache); - tu_emit_cache_flush_renderpass(cmd_buffer, &cmd_buffer->draw_cs); + if (cmd_buffer->scratch_seqno) { + tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->scratch_bo, + MSM_SUBMIT_BO_WRITE); + } - trace_end_cmd_buffer(&cmd_buffer->trace, &cmd_buffer->draw_cs, cmd_buffer); - } else { - tu_flush_all_pending(&cmd_buffer->state.cache); - cmd_buffer->state.cache.flush_bits |= - TU_CMD_FLAG_CCU_FLUSH_COLOR | - TU_CMD_FLAG_CCU_FLUSH_DEPTH; - tu_emit_cache_flush(cmd_buffer, &cmd_buffer->cs); + for (uint32_t i = 0; i < cmd_buffer->draw_cs.bo_count; i++) { + tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->draw_cs.bos[i], + MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP); + } - trace_end_cmd_buffer(&cmd_buffer->trace, &cmd_buffer->cs, cmd_buffer); + for (uint32_t i = 0; i < cmd_buffer->tile_cs.bo_count; i++) { + tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->tile_cs.bos[i], + MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP); } tu_cs_end(&cmd_buffer->cs); - tu_cs_end(&cmd_buffer->draw_cs); - tu_cs_end(&cmd_buffer->draw_epilogue_cs); + + assert(!cmd_buffer->state.attachments); cmd_buffer->status = TU_CMD_BUFFER_STATUS_EXECUTABLE; - return vk_command_buffer_get_record_result(&cmd_buffer->vk); + return cmd_buffer->record_result; } -VKAPI_ATTR void VKAPI_CALL +void tu_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, VkPipeline _pipeline) @@ -2527,2300 +1722,654 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer, TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); TU_FROM_HANDLE(tu_pipeline, pipeline, _pipeline); - if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE) { - cmd->state.compute_pipeline = pipeline; - tu_cs_emit_state_ib(&cmd->cs, pipeline->program.state); - return; - } - - assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS); - - cmd->state.pipeline = pipeline; - cmd->state.dirty |= TU_CMD_DIRTY_DESC_SETS_LOAD | TU_CMD_DIRTY_SHADER_CONSTS | - TU_CMD_DIRTY_LRZ | TU_CMD_DIRTY_VS_PARAMS; - - if (pipeline->output.feedback_loop_may_involve_textures && - !cmd->state.rp.disable_gmem) { - /* VK_EXT_attachment_feedback_loop_layout allows feedback loop to involve - * not only input attachments but also sampled images or image resources. - * But we cannot just patch gmem for image in the descriptors. 
- * - * At the moment, in context of DXVK, it is expected that only a few - * drawcalls in a frame would use feedback loop and they would be wrapped - * in their own renderpasses, so it should be ok to force sysmem. - * - * However, there are two further possible optimizations if need would - * arise for other translation layer: - * - Tiling could be enabled if we ensure that there is no barrier in - * the renderpass; - * - Check that both pipeline and attachments agree that feedback loop - * is needed. - */ - perf_debug( - cmd->device, - "Disabling gmem due to VK_EXT_attachment_feedback_loop_layout"); - cmd->state.rp.disable_gmem = true; - } - - if (pipeline->prim_order.sysmem_single_prim_mode && - !cmd->state.rp.sysmem_single_prim_mode) { - if (pipeline->output.subpass_feedback_loop_color || - pipeline->output.subpass_feedback_loop_ds) { - perf_debug(cmd->device, "single_prim_mode due to feedback loop"); - } else { - perf_debug(cmd->device, "single_prim_mode due to rast order access"); - } - cmd->state.rp.sysmem_single_prim_mode = true; - } - - struct tu_cs *cs = &cmd->draw_cs; - - /* note: this also avoids emitting draw states before renderpass clears, - * which may use the 3D clear path (for MSAA cases) - */ - if (!(cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) { - uint32_t mask = ~pipeline->dynamic_state_mask & BITFIELD_MASK(TU_DYNAMIC_STATE_COUNT); - - tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (6 + util_bitcount(mask))); - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_CONFIG, pipeline->program.config_state); - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state); - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state); - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_RAST, pipeline->rast.state); - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_SYSMEM, pipeline->prim_order.state_sysmem); - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_GMEM, pipeline->prim_order.state_gmem); - - u_foreach_bit(i, mask) - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + i, pipeline->dynamic_state[i]); - } - - if (pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) { - cmd->state.rp.has_tess = true; - - if (!(pipeline->dynamic_state_mask & - BIT(TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS))) { - cmd->state.patch_control_points = pipeline->tess.patch_control_points; - cmd->state.dirty &= ~TU_CMD_DIRTY_PATCH_CONTROL_POINTS; - } else { - cmd->state.dirty |= TU_CMD_DIRTY_PATCH_CONTROL_POINTS; - } - } - - cmd->state.line_mode = pipeline->rast.line_mode; - if (!(pipeline->dynamic_state_mask & - BIT(TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY))) - cmd->state.primtype = pipeline->ia.primtype; - - tu6_update_msaa(cmd, pipeline->output.samples); - - if ((pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_VIEWPORT)) && - (pipeline->viewport.z_negative_one_to_one != cmd->state.z_negative_one_to_one)) { - cmd->state.z_negative_one_to_one = pipeline->viewport.z_negative_one_to_one; - cmd->state.dirty |= TU_CMD_DIRTY_VIEWPORTS; - } - - if (!(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_VERTEX_INPUT))) - tu_update_num_vbs(cmd, pipeline->vi.num_vbs); - -#define UPDATE_REG(group, X, Y) { \ - /* note: would be better to have pipeline bits already masked */ \ - uint32_t pipeline_bits = pipeline->group.X & pipeline->group.X##_mask; \ - if ((cmd->state.X & pipeline->group.X##_mask) != pipeline_bits) { \ - cmd->state.X &= ~pipeline->group.X##_mask; \ - cmd->state.X |= pipeline_bits; \ - cmd->state.dirty |= TU_CMD_DIRTY_##Y; \ - } \ - if 
(!(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_##Y))) \ - cmd->state.dirty &= ~TU_CMD_DIRTY_##Y; \ -} - - /* these registers can have bits set from both pipeline and dynamic state - * this updates the bits set by the pipeline - * if the pipeline doesn't use a dynamic state for the register, then - * the relevant dirty bit is cleared to avoid overriding the non-dynamic - * state with a dynamic state the next draw. - */ - UPDATE_REG(rast, gras_su_cntl, GRAS_SU_CNTL); - UPDATE_REG(rast_ds, rb_depth_cntl, RB_DEPTH_CNTL); - UPDATE_REG(ds, rb_stencil_cntl, RB_STENCIL_CNTL); - UPDATE_REG(rast, pc_raster_cntl, RASTERIZER_DISCARD); - UPDATE_REG(rast, vpc_unknown_9107, RASTERIZER_DISCARD); - UPDATE_REG(blend, sp_blend_cntl, BLEND); - UPDATE_REG(blend, rb_blend_cntl, BLEND); - - for (unsigned i = 0; i < pipeline->blend.num_rts; i++) { - if ((cmd->state.rb_mrt_control[i] & pipeline->blend.rb_mrt_control_mask) != - pipeline->blend.rb_mrt_control[i]) { - cmd->state.rb_mrt_control[i] &= ~pipeline->blend.rb_mrt_control_mask; - cmd->state.rb_mrt_control[i] |= pipeline->blend.rb_mrt_control[i]; - cmd->state.dirty |= TU_CMD_DIRTY_BLEND; - } - - if (cmd->state.rb_mrt_blend_control[i] != pipeline->blend.rb_mrt_blend_control[i]) { - cmd->state.rb_mrt_blend_control[i] = pipeline->blend.rb_mrt_blend_control[i]; - cmd->state.dirty |= TU_CMD_DIRTY_BLEND; - } - } -#undef UPDATE_REG - - if (cmd->state.pipeline_color_write_enable != pipeline->blend.color_write_enable) { - cmd->state.pipeline_color_write_enable = pipeline->blend.color_write_enable; - cmd->state.dirty |= TU_CMD_DIRTY_BLEND; - } - if (cmd->state.pipeline_blend_enable != pipeline->blend.blend_enable) { - cmd->state.pipeline_blend_enable = pipeline->blend.blend_enable; - cmd->state.dirty |= TU_CMD_DIRTY_BLEND; - } - if (cmd->state.logic_op_enabled != pipeline->blend.logic_op_enabled) { - cmd->state.logic_op_enabled = pipeline->blend.logic_op_enabled; - cmd->state.dirty |= TU_CMD_DIRTY_BLEND; - } - if (!(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_LOGIC_OP)) && - cmd->state.rop_reads_dst != pipeline->blend.rop_reads_dst) { - cmd->state.rop_reads_dst = pipeline->blend.rop_reads_dst; - cmd->state.dirty |= TU_CMD_DIRTY_BLEND; - } - if (cmd->state.dynamic_state[TU_DYNAMIC_STATE_BLEND].size != pipeline->blend.num_rts * 3 + 4) { - cmd->state.dirty |= TU_CMD_DIRTY_BLEND; - } - if (!(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_BLEND))) { - cmd->state.dirty &= ~TU_CMD_DIRTY_BLEND; + switch (pipelineBindPoint) { + case VK_PIPELINE_BIND_POINT_GRAPHICS: + cmd->state.pipeline = pipeline; + cmd->state.dirty |= TU_CMD_DIRTY_PIPELINE; + break; + case VK_PIPELINE_BIND_POINT_COMPUTE: + tu_finishme("binding compute pipeline"); + break; + default: + unreachable("unrecognized pipeline bind point"); + break; } - - if (pipeline->output.rb_depth_cntl_disable) - cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL; } -VKAPI_ATTR void VKAPI_CALL +void tu_CmdSetViewport(VkCommandBuffer commandBuffer, uint32_t firstViewport, uint32_t viewportCount, const VkViewport *pViewports) { TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + struct tu_cs *draw_cs = &cmd->draw_cs; - memcpy(&cmd->state.viewport[firstViewport], pViewports, viewportCount * sizeof(*pViewports)); - cmd->state.max_viewport = MAX2(cmd->state.max_viewport, firstViewport + viewportCount); + VkResult result = tu_cs_reserve_space(cmd->device, draw_cs, 12); + if (result != VK_SUCCESS) { + cmd->record_result = result; + return; + } - /* With VK_EXT_depth_clip_control we have to take into account - * 
negativeOneToOne property of the pipeline, so the viewport calculations - * are deferred until it is known. - */ - cmd->state.dirty |= TU_CMD_DIRTY_VIEWPORTS; + assert(firstViewport == 0 && viewportCount == 1); + tu6_emit_viewport(draw_cs, pViewports); + + tu_cs_sanity_check(draw_cs); } -VKAPI_ATTR void VKAPI_CALL +void tu_CmdSetScissor(VkCommandBuffer commandBuffer, uint32_t firstScissor, uint32_t scissorCount, const VkRect2D *pScissors) { TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - struct tu_cs cs; + struct tu_cs *draw_cs = &cmd->draw_cs; + + VkResult result = tu_cs_reserve_space(cmd->device, draw_cs, 3); + if (result != VK_SUCCESS) { + cmd->record_result = result; + return; + } - memcpy(&cmd->state.scissor[firstScissor], pScissors, scissorCount * sizeof(*pScissors)); - cmd->state.max_scissor = MAX2(cmd->state.max_scissor, firstScissor + scissorCount); + assert(firstScissor == 0 && scissorCount == 1); + tu6_emit_scissor(draw_cs, pScissors); - cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_SCISSOR, 1 + 2 * cmd->state.max_scissor); - tu6_emit_scissor(&cs, cmd->state.scissor, cmd->state.max_scissor); + tu_cs_sanity_check(draw_cs); } -VKAPI_ATTR void VKAPI_CALL +void tu_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth) { TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - cmd->state.gras_su_cntl &= ~A6XX_GRAS_SU_CNTL_LINEHALFWIDTH__MASK; - cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_LINEHALFWIDTH(lineWidth / 2.0f); + cmd->state.dynamic.line_width = lineWidth; - cmd->state.dirty |= TU_CMD_DIRTY_GRAS_SU_CNTL; + /* line width depends on VkPipelineRasterizationStateCreateInfo */ + cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH; } -VKAPI_ATTR void VKAPI_CALL +void tu_CmdSetDepthBias(VkCommandBuffer commandBuffer, float depthBiasConstantFactor, float depthBiasClamp, float depthBiasSlopeFactor) { TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_DEPTH_BIAS, 4); + struct tu_cs *draw_cs = &cmd->draw_cs; + + VkResult result = tu_cs_reserve_space(cmd->device, draw_cs, 4); + if (result != VK_SUCCESS) { + cmd->record_result = result; + return; + } + + tu6_emit_depth_bias(draw_cs, depthBiasConstantFactor, depthBiasClamp, + depthBiasSlopeFactor); - tu6_emit_depth_bias(&cs, depthBiasConstantFactor, depthBiasClamp, depthBiasSlopeFactor); + tu_cs_sanity_check(draw_cs); } -VKAPI_ATTR void VKAPI_CALL +void tu_CmdSetBlendConstants(VkCommandBuffer commandBuffer, const float blendConstants[4]) { TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_BLEND_CONSTANTS, 5); + struct tu_cs *draw_cs = &cmd->draw_cs; - tu_cs_emit_pkt4(&cs, REG_A6XX_RB_BLEND_RED_F32, 4); - tu_cs_emit_array(&cs, (const uint32_t *) blendConstants, 4); + VkResult result = tu_cs_reserve_space(cmd->device, draw_cs, 5); + if (result != VK_SUCCESS) { + cmd->record_result = result; + return; + } + + tu6_emit_blend_constants(draw_cs, blendConstants); + + tu_cs_sanity_check(draw_cs); } -VKAPI_ATTR void VKAPI_CALL +void tu_CmdSetDepthBounds(VkCommandBuffer commandBuffer, float minDepthBounds, float maxDepthBounds) { - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_DEPTH_BOUNDS, 3); - - tu_cs_emit_regs(&cs, - A6XX_RB_Z_BOUNDS_MIN(minDepthBounds), - A6XX_RB_Z_BOUNDS_MAX(maxDepthBounds)); } void -update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask) -{ - if (face & VK_STENCIL_FACE_FRONT_BIT) - *value = (*value & 
0xff00) | (mask & 0xff); - if (face & VK_STENCIL_FACE_BACK_BIT) - *value = (*value & 0xff) | (mask & 0xff) << 8; -} - -VKAPI_ATTR void VKAPI_CALL tu_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, uint32_t compareMask) { TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, 2); - update_stencil_mask(&cmd->state.dynamic_stencil_mask, faceMask, compareMask); + if (faceMask & VK_STENCIL_FACE_FRONT_BIT) + cmd->state.dynamic.stencil_compare_mask.front = compareMask; + if (faceMask & VK_STENCIL_FACE_BACK_BIT) + cmd->state.dynamic.stencil_compare_mask.back = compareMask; - tu_cs_emit_regs(&cs, A6XX_RB_STENCILMASK(.dword = cmd->state.dynamic_stencil_mask)); + /* the front/back compare masks must be updated together */ + cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK; } -VKAPI_ATTR void VKAPI_CALL +void tu_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, uint32_t writeMask) { TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, 2); - - update_stencil_mask(&cmd->state.dynamic_stencil_wrmask, faceMask, writeMask); - tu_cs_emit_regs(&cs, A6XX_RB_STENCILWRMASK(.dword = cmd->state.dynamic_stencil_wrmask)); + if (faceMask & VK_STENCIL_FACE_FRONT_BIT) + cmd->state.dynamic.stencil_write_mask.front = writeMask; + if (faceMask & VK_STENCIL_FACE_BACK_BIT) + cmd->state.dynamic.stencil_write_mask.back = writeMask; - cmd->state.dirty |= TU_CMD_DIRTY_LRZ; + /* the front/back write masks must be updated together */ + cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK; } -VKAPI_ATTR void VKAPI_CALL +void tu_CmdSetStencilReference(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, uint32_t reference) { TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_STENCIL_REFERENCE, 2); - - update_stencil_mask(&cmd->state.dynamic_stencil_ref, faceMask, reference); - - tu_cs_emit_regs(&cs, A6XX_RB_STENCILREF(.dword = cmd->state.dynamic_stencil_ref)); -} - -VKAPI_ATTR void VKAPI_CALL -tu_CmdSetSampleLocationsEXT(VkCommandBuffer commandBuffer, - const VkSampleLocationsInfoEXT* pSampleLocationsInfo) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_SAMPLE_LOCATIONS, 9); - assert(pSampleLocationsInfo); + if (faceMask & VK_STENCIL_FACE_FRONT_BIT) + cmd->state.dynamic.stencil_reference.front = reference; + if (faceMask & VK_STENCIL_FACE_BACK_BIT) + cmd->state.dynamic.stencil_reference.back = reference; - tu6_emit_sample_locations(&cs, pSampleLocationsInfo); + /* the front/back references must be updated together */ + cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE; } -VKAPI_ATTR void VKAPI_CALL -tu_CmdSetCullModeEXT(VkCommandBuffer commandBuffer, VkCullModeFlags cullMode) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - - cmd->state.gras_su_cntl &= - ~(A6XX_GRAS_SU_CNTL_CULL_FRONT | A6XX_GRAS_SU_CNTL_CULL_BACK); - - if (cullMode & VK_CULL_MODE_FRONT_BIT) - cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_FRONT; - if (cullMode & VK_CULL_MODE_BACK_BIT) - cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_BACK; - - cmd->state.dirty |= TU_CMD_DIRTY_GRAS_SU_CNTL; -} - -VKAPI_ATTR void VKAPI_CALL -tu_CmdSetFrontFaceEXT(VkCommandBuffer commandBuffer, VkFrontFace frontFace) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - 
- cmd->state.gras_su_cntl &= ~A6XX_GRAS_SU_CNTL_FRONT_CW; - - if (frontFace == VK_FRONT_FACE_CLOCKWISE) - cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_FRONT_CW; - - cmd->state.dirty |= TU_CMD_DIRTY_GRAS_SU_CNTL; -} - -VKAPI_ATTR void VKAPI_CALL -tu_CmdSetPrimitiveTopologyEXT(VkCommandBuffer commandBuffer, - VkPrimitiveTopology primitiveTopology) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - - cmd->state.primtype = tu6_primtype(primitiveTopology); - tu6_update_msaa(cmd, cmd->state.samples); -} - -VKAPI_ATTR void VKAPI_CALL -tu_CmdSetViewportWithCountEXT(VkCommandBuffer commandBuffer, - uint32_t viewportCount, - const VkViewport* pViewports) -{ - tu_CmdSetViewport(commandBuffer, 0, viewportCount, pViewports); -} - -VKAPI_ATTR void VKAPI_CALL -tu_CmdSetScissorWithCountEXT(VkCommandBuffer commandBuffer, - uint32_t scissorCount, - const VkRect2D* pScissors) -{ - tu_CmdSetScissor(commandBuffer, 0, scissorCount, pScissors); -} - -VKAPI_ATTR void VKAPI_CALL -tu_CmdSetDepthTestEnableEXT(VkCommandBuffer commandBuffer, - VkBool32 depthTestEnable) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - - cmd->state.rb_depth_cntl &= ~A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE; - - if (depthTestEnable) - cmd->state.rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE; - - cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL; -} - -VKAPI_ATTR void VKAPI_CALL -tu_CmdSetDepthWriteEnableEXT(VkCommandBuffer commandBuffer, - VkBool32 depthWriteEnable) +void +tu_CmdExecuteCommands(VkCommandBuffer commandBuffer, + uint32_t commandBufferCount, + const VkCommandBuffer *pCmdBuffers) { - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - - cmd->state.rb_depth_cntl &= ~A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE; - - if (depthWriteEnable) - cmd->state.rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE; - - cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL; } -VKAPI_ATTR void VKAPI_CALL -tu_CmdSetDepthCompareOpEXT(VkCommandBuffer commandBuffer, - VkCompareOp depthCompareOp) +VkResult +tu_CreateCommandPool(VkDevice _device, + const VkCommandPoolCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkCommandPool *pCmdPool) { - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - - cmd->state.rb_depth_cntl &= ~A6XX_RB_DEPTH_CNTL_ZFUNC__MASK; + TU_FROM_HANDLE(tu_device, device, _device); + struct tu_cmd_pool *pool; - cmd->state.rb_depth_cntl |= - A6XX_RB_DEPTH_CNTL_ZFUNC(tu6_compare_func(depthCompareOp)); + pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (pool == NULL) + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL; -} + if (pAllocator) + pool->alloc = *pAllocator; + else + pool->alloc = device->alloc; -VKAPI_ATTR void VKAPI_CALL -tu_CmdSetDepthBoundsTestEnableEXT(VkCommandBuffer commandBuffer, - VkBool32 depthBoundsTestEnable) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + list_inithead(&pool->cmd_buffers); + list_inithead(&pool->free_cmd_buffers); - cmd->state.rb_depth_cntl &= ~A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE; + pool->queue_family_index = pCreateInfo->queueFamilyIndex; - if (depthBoundsTestEnable) - cmd->state.rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE; + *pCmdPool = tu_cmd_pool_to_handle(pool); - cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL; + return VK_SUCCESS; } -VKAPI_ATTR void VKAPI_CALL -tu_CmdSetStencilTestEnableEXT(VkCommandBuffer commandBuffer, - VkBool32 stencilTestEnable) +void +tu_DestroyCommandPool(VkDevice _device, + VkCommandPool commandPool, + const 
VkAllocationCallbacks *pAllocator) { - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - - cmd->state.rb_stencil_cntl &= ~( - A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE | - A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF | - A6XX_RB_STENCIL_CONTROL_STENCIL_READ); - - if (stencilTestEnable) { - cmd->state.rb_stencil_cntl |= - A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE | - A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF | - A6XX_RB_STENCIL_CONTROL_STENCIL_READ; - } + TU_FROM_HANDLE(tu_device, device, _device); + TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool); - cmd->state.dirty |= TU_CMD_DIRTY_RB_STENCIL_CNTL; -} - -VKAPI_ATTR void VKAPI_CALL -tu_CmdSetStencilOpEXT(VkCommandBuffer commandBuffer, - VkStencilFaceFlags faceMask, - VkStencilOp failOp, - VkStencilOp passOp, - VkStencilOp depthFailOp, - VkCompareOp compareOp) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + if (!pool) + return; - if (faceMask & VK_STENCIL_FACE_FRONT_BIT) { - cmd->state.rb_stencil_cntl &= ~( - A6XX_RB_STENCIL_CONTROL_FUNC__MASK | - A6XX_RB_STENCIL_CONTROL_FAIL__MASK | - A6XX_RB_STENCIL_CONTROL_ZPASS__MASK | - A6XX_RB_STENCIL_CONTROL_ZFAIL__MASK); - - cmd->state.rb_stencil_cntl |= - A6XX_RB_STENCIL_CONTROL_FUNC(tu6_compare_func(compareOp)) | - A6XX_RB_STENCIL_CONTROL_FAIL(tu6_stencil_op(failOp)) | - A6XX_RB_STENCIL_CONTROL_ZPASS(tu6_stencil_op(passOp)) | - A6XX_RB_STENCIL_CONTROL_ZFAIL(tu6_stencil_op(depthFailOp)); + list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer, + &pool->cmd_buffers, pool_link) + { + tu_cmd_buffer_destroy(cmd_buffer); } - if (faceMask & VK_STENCIL_FACE_BACK_BIT) { - cmd->state.rb_stencil_cntl &= ~( - A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK | - A6XX_RB_STENCIL_CONTROL_FAIL_BF__MASK | - A6XX_RB_STENCIL_CONTROL_ZPASS_BF__MASK | - A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__MASK); - - cmd->state.rb_stencil_cntl |= - A6XX_RB_STENCIL_CONTROL_FUNC_BF(tu6_compare_func(compareOp)) | - A6XX_RB_STENCIL_CONTROL_FAIL_BF(tu6_stencil_op(failOp)) | - A6XX_RB_STENCIL_CONTROL_ZPASS_BF(tu6_stencil_op(passOp)) | - A6XX_RB_STENCIL_CONTROL_ZFAIL_BF(tu6_stencil_op(depthFailOp)); + list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer, + &pool->free_cmd_buffers, pool_link) + { + tu_cmd_buffer_destroy(cmd_buffer); } - cmd->state.dirty |= TU_CMD_DIRTY_RB_STENCIL_CNTL; + vk_free2(&device->alloc, pAllocator, pool); } -VKAPI_ATTR void VKAPI_CALL -tu_CmdSetDepthBiasEnableEXT(VkCommandBuffer commandBuffer, - VkBool32 depthBiasEnable) +VkResult +tu_ResetCommandPool(VkDevice device, + VkCommandPool commandPool, + VkCommandPoolResetFlags flags) { - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool); + VkResult result; - cmd->state.gras_su_cntl &= ~A6XX_GRAS_SU_CNTL_POLY_OFFSET; - if (depthBiasEnable) - cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_POLY_OFFSET; + list_for_each_entry(struct tu_cmd_buffer, cmd_buffer, &pool->cmd_buffers, + pool_link) + { + result = tu_reset_cmd_buffer(cmd_buffer); + if (result != VK_SUCCESS) + return result; + } - cmd->state.dirty |= TU_CMD_DIRTY_GRAS_SU_CNTL; + return VK_SUCCESS; } -VKAPI_ATTR void VKAPI_CALL -tu_CmdSetPrimitiveRestartEnableEXT(VkCommandBuffer commandBuffer, - VkBool32 primitiveRestartEnable) +void +tu_TrimCommandPool(VkDevice device, + VkCommandPool commandPool, + VkCommandPoolTrimFlags flags) { - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool); - cmd->state.primitive_restart_enable = primitiveRestartEnable; -} - -VKAPI_ATTR void VKAPI_CALL 
-tu_CmdSetRasterizerDiscardEnableEXT(VkCommandBuffer commandBuffer, - VkBool32 rasterizerDiscardEnable) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + if (!pool) + return; - cmd->state.pc_raster_cntl &= ~A6XX_PC_RASTER_CNTL_DISCARD; - cmd->state.vpc_unknown_9107 &= ~A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD; - if (rasterizerDiscardEnable) { - cmd->state.pc_raster_cntl |= A6XX_PC_RASTER_CNTL_DISCARD; - cmd->state.vpc_unknown_9107 |= A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD; + list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer, + &pool->free_cmd_buffers, pool_link) + { + tu_cmd_buffer_destroy(cmd_buffer); } - - cmd->state.dirty |= TU_CMD_DIRTY_RASTERIZER_DISCARD; } -VKAPI_ATTR void VKAPI_CALL -tu_CmdSetLogicOpEXT(VkCommandBuffer commandBuffer, - VkLogicOp logicOp) +void +tu_CmdBeginRenderPass(VkCommandBuffer commandBuffer, + const VkRenderPassBeginInfo *pRenderPassBegin, + VkSubpassContents contents) { - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - - cmd->state.rb_mrt_control_rop = - tu6_rb_mrt_control_rop(logicOp, &cmd->state.rop_reads_dst); + TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); + TU_FROM_HANDLE(tu_render_pass, pass, pRenderPassBegin->renderPass); + TU_FROM_HANDLE(tu_framebuffer, framebuffer, pRenderPassBegin->framebuffer); + VkResult result; - cmd->state.dirty |= TU_CMD_DIRTY_BLEND; -} + cmd_buffer->state.pass = pass; + cmd_buffer->state.subpass = pass->subpasses; + cmd_buffer->state.framebuffer = framebuffer; -VKAPI_ATTR void VKAPI_CALL -tu_CmdSetPatchControlPointsEXT(VkCommandBuffer commandBuffer, - uint32_t patchControlPoints) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + result = tu_cmd_state_setup_attachments(cmd_buffer, pRenderPassBegin); + if (result != VK_SUCCESS) + return; - cmd->state.patch_control_points = patchControlPoints; + tu_cmd_update_tiling_config(cmd_buffer, &pRenderPassBegin->renderArea); + tu_cmd_prepare_tile_load_ib(cmd_buffer); + tu_cmd_prepare_tile_store_ib(cmd_buffer); - cmd->state.dirty |= TU_CMD_DIRTY_PATCH_CONTROL_POINTS; + /* draw_cs should contain entries only for this render pass */ + assert(!cmd_buffer->draw_cs.entry_count); + tu_cs_begin(&cmd_buffer->draw_cs); } -VKAPI_ATTR void VKAPI_CALL -tu_CmdSetLineStippleEXT(VkCommandBuffer commandBuffer, - uint32_t lineStippleFactor, - uint16_t lineStipplePattern) +void +tu_CmdBeginRenderPass2KHR(VkCommandBuffer commandBuffer, + const VkRenderPassBeginInfo *pRenderPassBeginInfo, + const VkSubpassBeginInfoKHR *pSubpassBeginInfo) { - tu_stub(); + tu_CmdBeginRenderPass(commandBuffer, pRenderPassBeginInfo, + pSubpassBeginInfo->contents); } -VKAPI_ATTR void VKAPI_CALL -tu_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer, uint32_t attachmentCount, - const VkBool32 *pColorWriteEnables) +void +tu_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents) { TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - uint32_t color_write_enable = 0; - - for (unsigned i = 0; i < attachmentCount; i++) { - if (pColorWriteEnables[i]) - color_write_enable |= BIT(i); - } - - cmd->state.color_write_enable = color_write_enable; - cmd->state.dirty |= TU_CMD_DIRTY_BLEND; -} - -static void -tu_flush_for_access(struct tu_cache_state *cache, - enum tu_cmd_access_mask src_mask, - enum tu_cmd_access_mask dst_mask) -{ - enum tu_cmd_flush_bits flush_bits = 0; - - if (src_mask & TU_ACCESS_SYSMEM_WRITE) { - cache->pending_flush_bits |= TU_CMD_FLAG_ALL_INVALIDATE; - } - - if (src_mask & TU_ACCESS_CP_WRITE) { - /* Flush the CP write queue. 
- */ - cache->pending_flush_bits |= - TU_CMD_FLAG_WAIT_MEM_WRITES | - TU_CMD_FLAG_ALL_INVALIDATE; - } - -#define SRC_FLUSH(domain, flush, invalidate) \ - if (src_mask & TU_ACCESS_##domain##_WRITE) { \ - cache->pending_flush_bits |= TU_CMD_FLAG_##flush | \ - (TU_CMD_FLAG_ALL_INVALIDATE & ~TU_CMD_FLAG_##invalidate); \ - } - SRC_FLUSH(UCHE, CACHE_FLUSH, CACHE_INVALIDATE) - SRC_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR) - SRC_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH) + tu_cmd_render_tiles(cmd); -#undef SRC_FLUSH + cmd->state.subpass++; -#define SRC_INCOHERENT_FLUSH(domain, flush, invalidate) \ - if (src_mask & TU_ACCESS_##domain##_INCOHERENT_WRITE) { \ - flush_bits |= TU_CMD_FLAG_##flush; \ - cache->pending_flush_bits |= \ - (TU_CMD_FLAG_ALL_INVALIDATE & ~TU_CMD_FLAG_##invalidate); \ - } - - SRC_INCOHERENT_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR) - SRC_INCOHERENT_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH) - -#undef SRC_INCOHERENT_FLUSH - - /* Treat host & sysmem write accesses the same, since the kernel implicitly - * drains the queue before signalling completion to the host. - */ - if (dst_mask & (TU_ACCESS_SYSMEM_READ | TU_ACCESS_SYSMEM_WRITE)) { - flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH; - } - -#define DST_FLUSH(domain, flush, invalidate) \ - if (dst_mask & (TU_ACCESS_##domain##_READ | \ - TU_ACCESS_##domain##_WRITE)) { \ - flush_bits |= cache->pending_flush_bits & \ - (TU_CMD_FLAG_##invalidate | \ - (TU_CMD_FLAG_ALL_FLUSH & ~TU_CMD_FLAG_##flush)); \ - } - - DST_FLUSH(UCHE, CACHE_FLUSH, CACHE_INVALIDATE) - DST_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR) - DST_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH) - -#undef DST_FLUSH - -#define DST_INCOHERENT_FLUSH(domain, flush, invalidate) \ - if (dst_mask & (TU_ACCESS_##domain##_INCOHERENT_READ | \ - TU_ACCESS_##domain##_INCOHERENT_WRITE)) { \ - flush_bits |= TU_CMD_FLAG_##invalidate | \ - (cache->pending_flush_bits & \ - (TU_CMD_FLAG_ALL_FLUSH & ~TU_CMD_FLAG_##flush)); \ - } - - DST_INCOHERENT_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR) - DST_INCOHERENT_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH) - -#undef DST_INCOHERENT_FLUSH - - cache->flush_bits |= flush_bits; - cache->pending_flush_bits &= ~flush_bits; -} - -/* When translating Vulkan access flags to which cache is accessed - * (CCU/UCHE/sysmem), we should take into account both the access flags and - * the stage so that accesses with MEMORY_READ_BIT/MEMORY_WRITE_BIT + a - * specific stage return something sensible. The specification for - * VK_KHR_synchronization2 says that we should do this: - * - * Additionally, scoping the pipeline stages into the barrier structs - * allows the use of the MEMORY_READ and MEMORY_WRITE flags without - * sacrificing precision. The per-stage access flags should be used to - * disambiguate specific accesses in a given stage or set of stages - for - * instance, between uniform reads and sampling operations. - * - * Note that while in all known cases the stage is actually enough, we should - * still narrow things down based on the access flags to handle "old-style" - * barriers that may specify a wider range of stages but more precise access - * flags. These helpers allow us to do both. 
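A caller-side sketch of the synchronization2 usage described above (handles assumed; requires a Vulkan 1.3 or VK_KHR_synchronization2 header): the coarse MEMORY_READ/MEMORY_WRITE access flags are made precise by the stage masks, which is exactly the combination the filtering helpers that follow have to interpret.

#include <vulkan/vulkan.h>

static void
color_write_to_fragment_read_barrier(VkCommandBuffer cmd)
{
   const VkMemoryBarrier2 barrier = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
      .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
      .srcAccessMask = VK_ACCESS_2_MEMORY_WRITE_BIT,
      .dstStageMask = VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT,
      .dstAccessMask = VK_ACCESS_2_MEMORY_READ_BIT,
   };
   const VkDependencyInfo dep = {
      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
      .memoryBarrierCount = 1,
      .pMemoryBarriers = &barrier,
   };

   vkCmdPipelineBarrier2(cmd, &dep);
}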
- */ - -static bool -filter_read_access(VkAccessFlags2 flags, VkPipelineStageFlags2 stages, - VkAccessFlags2 tu_flags, VkPipelineStageFlags2 tu_stages) -{ - return (flags & (tu_flags | VK_ACCESS_2_MEMORY_READ_BIT)) && - (stages & (tu_stages | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)); -} - -static bool -filter_write_access(VkAccessFlags2 flags, VkPipelineStageFlags2 stages, - VkAccessFlags2 tu_flags, VkPipelineStageFlags2 tu_stages) -{ - return (flags & (tu_flags | VK_ACCESS_2_MEMORY_WRITE_BIT)) && - (stages & (tu_stages | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)); + tu_cmd_update_tiling_config(cmd, NULL); + tu_cmd_prepare_tile_load_ib(cmd); + tu_cmd_prepare_tile_store_ib(cmd); } -static bool -gfx_read_access(VkAccessFlags2 flags, VkPipelineStageFlags2 stages, - VkAccessFlags2 tu_flags, VkPipelineStageFlags2 tu_stages) +void +tu_CmdNextSubpass2KHR(VkCommandBuffer commandBuffer, + const VkSubpassBeginInfoKHR *pSubpassBeginInfo, + const VkSubpassEndInfoKHR *pSubpassEndInfo) { - return filter_read_access(flags, stages, tu_flags, - tu_stages | VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT); + tu_CmdNextSubpass(commandBuffer, pSubpassBeginInfo->contents); } -static bool -gfx_write_access(VkAccessFlags2 flags, VkPipelineStageFlags2 stages, - VkAccessFlags2 tu_flags, VkPipelineStageFlags2 tu_stages) -{ - return filter_write_access(flags, stages, tu_flags, - tu_stages | VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT); -} -static enum tu_cmd_access_mask -vk2tu_access(VkAccessFlags2 flags, VkPipelineStageFlags2 stages, bool image_only, bool gmem) +struct tu_draw_info { - enum tu_cmd_access_mask mask = 0; - - if (gfx_read_access(flags, stages, - VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT | - VK_ACCESS_2_CONDITIONAL_RENDERING_READ_BIT_EXT | - VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT | - VK_ACCESS_2_HOST_READ_BIT, - VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | - VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT | - VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT | - VK_PIPELINE_STAGE_2_HOST_BIT)) - mask |= TU_ACCESS_SYSMEM_READ; - - if (gfx_write_access(flags, stages, - VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT, - VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT)) - mask |= TU_ACCESS_CP_WRITE; - - if (gfx_write_access(flags, stages, - VK_ACCESS_2_HOST_WRITE_BIT, - VK_PIPELINE_STAGE_2_HOST_BIT)) - mask |= TU_ACCESS_SYSMEM_WRITE; - -#define SHADER_STAGES \ - (VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT | \ - VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT | \ - VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT | \ - VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT | \ - VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT | \ - VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT | \ - VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT) - - - if (gfx_read_access(flags, stages, - VK_ACCESS_2_INDEX_READ_BIT | - VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT | - VK_ACCESS_2_UNIFORM_READ_BIT | - VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT | - VK_ACCESS_2_SHADER_READ_BIT, - VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT | - VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT | - VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT | - SHADER_STAGES)) - mask |= TU_ACCESS_UCHE_READ; - - if (gfx_write_access(flags, stages, - VK_ACCESS_2_SHADER_WRITE_BIT | - VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT, - VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT | - SHADER_STAGES)) - mask |= TU_ACCESS_UCHE_WRITE; - - /* When using GMEM, the CCU is always flushed automatically to GMEM, and - * then GMEM is flushed to sysmem. 
Furthermore, we already had to flush any - * previous writes in sysmem mode when transitioning to GMEM. Therefore we - * can ignore CCU and pretend that color attachments and transfers use - * sysmem directly. + /** + * Number of vertices. */ + uint32_t count; - if (gfx_read_access(flags, stages, - VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT | - VK_ACCESS_2_COLOR_ATTACHMENT_READ_NONCOHERENT_BIT_EXT, - VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT)) { - if (gmem) - mask |= TU_ACCESS_SYSMEM_READ; - else - mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_READ; - } - - if (gfx_read_access(flags, stages, - VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT, - VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT | - VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT)) { - if (gmem) - mask |= TU_ACCESS_SYSMEM_READ; - else - mask |= TU_ACCESS_CCU_DEPTH_INCOHERENT_READ; - } - - if (gfx_write_access(flags, stages, - VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT, - VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT)) { - if (gmem) { - mask |= TU_ACCESS_SYSMEM_WRITE; - } else { - mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE; - } - } - - if (gfx_write_access(flags, stages, - VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, - VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT | - VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT)) { - if (gmem) { - mask |= TU_ACCESS_SYSMEM_WRITE; - } else { - mask |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE; - } - } - - if (filter_write_access(flags, stages, - VK_ACCESS_2_TRANSFER_WRITE_BIT, - VK_PIPELINE_STAGE_2_COPY_BIT | - VK_PIPELINE_STAGE_2_BLIT_BIT | - VK_PIPELINE_STAGE_2_CLEAR_BIT | - VK_PIPELINE_STAGE_2_RESOLVE_BIT | - VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT)) { - if (gmem) { - mask |= TU_ACCESS_SYSMEM_WRITE; - } else if (image_only) { - /* Because we always split up blits/copies of images involving - * multiple layers, we always access each layer in the same way, with - * the same base address, same format, etc. This means we can avoid - * flushing between multiple writes to the same image. This elides - * flushes between e.g. multiple blits to the same image. - */ - mask |= TU_ACCESS_CCU_COLOR_WRITE; - } else { - mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE; - } - } - - if (filter_read_access(flags, stages, - VK_ACCESS_2_TRANSFER_READ_BIT, - VK_PIPELINE_STAGE_2_COPY_BIT | - VK_PIPELINE_STAGE_2_BLIT_BIT | - VK_PIPELINE_STAGE_2_RESOLVE_BIT | - VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT)) { - mask |= TU_ACCESS_UCHE_READ; - } - - return mask; -} - -/* These helpers deal with legacy BOTTOM_OF_PIPE/TOP_OF_PIPE stages. - */ - -static VkPipelineStageFlags2 -sanitize_src_stage(VkPipelineStageFlags2 stage_mask) -{ - /* From the Vulkan spec: - * - * VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT is ... equivalent to - * VK_PIPELINE_STAGE_2_NONE in the first scope. - * - * VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT is equivalent to - * VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT with VkAccessFlags2 set to 0 - * when specified in the first synchronization scope, ... + /** + * Index of the first vertex. */ - if (stage_mask & VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT) - return VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT; + int32_t vertex_offset; - return stage_mask & ~VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT; -} - -static VkPipelineStageFlags2 -sanitize_dst_stage(VkPipelineStageFlags2 stage_mask) -{ - /* From the Vulkan spec: - * - * VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT is equivalent to - * VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT with VkAccessFlags2 set to 0 - * when specified in the second synchronization scope, ... - * - * VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT is ... 
equivalent to - * VK_PIPELINE_STAGE_2_NONE in the second scope. - * + /** + * First instance id. */ - if (stage_mask & VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT) - return VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT; - - return stage_mask & ~VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT; -} - -static enum tu_stage -vk2tu_single_stage(VkPipelineStageFlags2 vk_stage, bool dst) -{ - if (vk_stage == VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT || - vk_stage == VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT) - return TU_STAGE_CP; - - if (vk_stage == VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT || - vk_stage == VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT || - vk_stage == VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT) - return TU_STAGE_FE; - - if (vk_stage == VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT || - vk_stage == VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT || - vk_stage == VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT || - vk_stage == VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT || - vk_stage == VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT) - return TU_STAGE_SP_VS; - - if (vk_stage == VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT || - vk_stage == VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT) - return TU_STAGE_SP_PS; - - if (vk_stage == VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT || /* Yes, really */ - /* See comment in TU_STAGE_GRAS about early fragment tests */ - vk_stage == VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT || - vk_stage == VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT || - vk_stage == VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT) - - return TU_STAGE_PS; - - if (vk_stage == VK_PIPELINE_STAGE_2_COPY_BIT || - vk_stage == VK_PIPELINE_STAGE_2_BLIT_BIT || - vk_stage == VK_PIPELINE_STAGE_2_RESOLVE_BIT || - vk_stage == VK_PIPELINE_STAGE_2_CLEAR_BIT || - vk_stage == VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT) - /* Blits read in SP_PS and write in PS, in both 2d and 3d cases */ - return dst ? TU_STAGE_SP_PS : TU_STAGE_PS; - - if (vk_stage == VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT || - vk_stage == VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT) - /* Be conservative */ - return dst ? TU_STAGE_CP : TU_STAGE_PS; - - if (vk_stage == VK_PIPELINE_STAGE_2_HOST_BIT) - return dst ? TU_STAGE_PS : TU_STAGE_CP; - - unreachable("unknown pipeline stage"); -} + uint32_t first_instance; -static enum tu_stage -vk2tu_src_stage(VkPipelineStageFlags vk_stages) -{ - enum tu_stage stage = TU_STAGE_CP; - u_foreach_bit (bit, vk_stages) { - enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, false); - stage = MAX2(stage, new_stage); - } - - return stage; -} - -static enum tu_stage -vk2tu_dst_stage(VkPipelineStageFlags vk_stages) -{ - enum tu_stage stage = TU_STAGE_PS; - u_foreach_bit (bit, vk_stages) { - enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, true); - stage = MIN2(stage, new_stage); - } - - return stage; -} - -static void -tu_flush_for_stage(struct tu_cache_state *cache, - enum tu_stage src_stage, enum tu_stage dst_stage) -{ - /* As far as we know, flushes take place in the last stage so if there are - * any pending flushes then we have to move down the source stage, because - * the data only becomes available when the flush finishes. In particular - * this can matter when the CP writes something and we need to invalidate - * UCHE to read it. - */ - if (cache->flush_bits & (TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_ALL_INVALIDATE)) - src_stage = TU_STAGE_PS; - - /* Note: if the destination stage is the CP, then the CP also has to wait - * for any WFI's to finish. 
This is already done for draw calls, including - * before indirect param reads, for the most part, so we just need to WFI. - * - * However, some indirect draw opcodes, depending on firmware, don't have - * implicit CP_WAIT_FOR_ME so we have to handle it manually. - * - * Transform feedback counters are read via CP_MEM_TO_REG, which implicitly - * does CP_WAIT_FOR_ME, but we still need a WFI if the GPU writes it. - * - * Currently we read the draw predicate using CP_MEM_TO_MEM, which - * also implicitly does CP_WAIT_FOR_ME. However CP_DRAW_PRED_SET does *not* - * implicitly do CP_WAIT_FOR_ME, it seems to only wait for counters to - * complete since it's written for DX11 where you can only predicate on the - * result of a query object. So if we implement 64-bit comparisons in the - * future, or if CP_DRAW_PRED_SET grows the capability to do 32-bit - * comparisons, then this will have to be dealt with. + /** + * Number of instances. */ - if (src_stage > dst_stage) { - cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE; - if (dst_stage == TU_STAGE_CP) - cache->pending_flush_bits |= TU_CMD_FLAG_WAIT_FOR_ME; - } -} + uint32_t instance_count; -void -tu_render_pass_state_merge(struct tu_render_pass_state *dst, - const struct tu_render_pass_state *src) -{ - dst->xfb_used |= src->xfb_used; - dst->has_tess |= src->has_tess; - dst->has_prim_generated_query_in_rp |= src->has_prim_generated_query_in_rp; - dst->disable_gmem |= src->disable_gmem; - dst->sysmem_single_prim_mode |= src->sysmem_single_prim_mode; - dst->draw_cs_writes_to_cond_pred |= src->draw_cs_writes_to_cond_pred; - - dst->drawcall_count += src->drawcall_count; - dst->drawcall_bandwidth_per_sample_sum += - src->drawcall_bandwidth_per_sample_sum; -} - -void -tu_restore_suspended_pass(struct tu_cmd_buffer *cmd, - struct tu_cmd_buffer *suspended) -{ - cmd->state.pass = suspended->state.suspended_pass.pass; - cmd->state.subpass = suspended->state.suspended_pass.subpass; - cmd->state.framebuffer = suspended->state.suspended_pass.framebuffer; - cmd->state.attachments = suspended->state.suspended_pass.attachments; - cmd->state.render_area = suspended->state.suspended_pass.render_area; - cmd->state.gmem_layout = suspended->state.suspended_pass.gmem_layout; - cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout]; - cmd->state.lrz = suspended->state.suspended_pass.lrz; -} - -/* Take the saved pre-chain in "secondary" and copy its commands to "cmd", - * appending it after any saved-up commands in "cmd". - */ -void -tu_append_pre_chain(struct tu_cmd_buffer *cmd, - struct tu_cmd_buffer *secondary) -{ - tu_cs_add_entries(&cmd->draw_cs, &secondary->pre_chain.draw_cs); - tu_cs_add_entries(&cmd->draw_epilogue_cs, - &secondary->pre_chain.draw_epilogue_cs); - - tu_render_pass_state_merge(&cmd->state.rp, - &secondary->pre_chain.state); - tu_clone_trace_range(cmd, &cmd->draw_cs, secondary->pre_chain.trace_renderpass_start, - secondary->pre_chain.trace_renderpass_end); -} - -/* Take the saved post-chain in "secondary" and copy it to "cmd". - */ -void -tu_append_post_chain(struct tu_cmd_buffer *cmd, - struct tu_cmd_buffer *secondary) -{ - tu_cs_add_entries(&cmd->draw_cs, &secondary->draw_cs); - tu_cs_add_entries(&cmd->draw_epilogue_cs, &secondary->draw_epilogue_cs); - - tu_clone_trace_range(cmd, &cmd->draw_cs, secondary->trace_renderpass_start, - secondary->trace_renderpass_end); - cmd->state.rp = secondary->state.rp; -} - -/* Assuming "secondary" is just a sequence of suspended and resuming passes, - * copy its state to "cmd". 
This also works instead of tu_append_post_chain(), - * but it's a bit slower because we don't assume that the chain begins in - * "secondary" and therefore have to care about the command buffer's - * renderpass state. - */ -void -tu_append_pre_post_chain(struct tu_cmd_buffer *cmd, - struct tu_cmd_buffer *secondary) -{ - tu_cs_add_entries(&cmd->draw_cs, &secondary->draw_cs); - tu_cs_add_entries(&cmd->draw_epilogue_cs, &secondary->draw_epilogue_cs); - - tu_clone_trace_range(cmd, &cmd->draw_cs, secondary->trace_renderpass_start, - secondary->trace_renderpass_end); - tu_render_pass_state_merge(&cmd->state.rp, - &secondary->state.rp); -} - -/* Take the current render pass state and save it to "pre_chain" to be - * combined later. - */ -static void -tu_save_pre_chain(struct tu_cmd_buffer *cmd) -{ - tu_cs_add_entries(&cmd->pre_chain.draw_cs, - &cmd->draw_cs); - tu_cs_add_entries(&cmd->pre_chain.draw_epilogue_cs, - &cmd->draw_epilogue_cs); - cmd->pre_chain.trace_renderpass_start = - cmd->trace_renderpass_start; - cmd->pre_chain.trace_renderpass_end = - cmd->trace_renderpass_end; - cmd->pre_chain.state = cmd->state.rp; -} - -VKAPI_ATTR void VKAPI_CALL -tu_CmdExecuteCommands(VkCommandBuffer commandBuffer, - uint32_t commandBufferCount, - const VkCommandBuffer *pCmdBuffers) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - VkResult result; - - assert(commandBufferCount > 0); - - /* Emit any pending flushes. */ - if (cmd->state.pass) { - tu_flush_all_pending(&cmd->state.renderpass_cache); - tu_emit_cache_flush_renderpass(cmd, &cmd->draw_cs); - } else { - tu_flush_all_pending(&cmd->state.cache); - tu_emit_cache_flush(cmd, &cmd->cs); - } - - for (uint32_t i = 0; i < commandBufferCount; i++) { - TU_FROM_HANDLE(tu_cmd_buffer, secondary, pCmdBuffers[i]); - - if (secondary->usage_flags & - VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) { - assert(tu_cs_is_empty(&secondary->cs)); - - result = tu_cs_add_entries(&cmd->draw_cs, &secondary->draw_cs); - if (result != VK_SUCCESS) { - vk_command_buffer_set_error(&cmd->vk, result); - break; - } - - result = tu_cs_add_entries(&cmd->draw_epilogue_cs, - &secondary->draw_epilogue_cs); - if (result != VK_SUCCESS) { - vk_command_buffer_set_error(&cmd->vk, result); - break; - } - - /* If LRZ was made invalid in secondary - we should disable - * LRZ retroactively for the whole renderpass. - */ - if (!secondary->state.lrz.valid) - cmd->state.lrz.valid = false; - - tu_clone_trace(cmd, &cmd->draw_cs, &secondary->trace); - tu_render_pass_state_merge(&cmd->state.rp, &secondary->state.rp); - } else { - switch (secondary->state.suspend_resume) { - case SR_NONE: - assert(tu_cs_is_empty(&secondary->draw_cs)); - assert(tu_cs_is_empty(&secondary->draw_epilogue_cs)); - tu_cs_add_entries(&cmd->cs, &secondary->cs); - tu_clone_trace(cmd, &cmd->cs, &secondary->trace); - break; - - case SR_IN_PRE_CHAIN: - /* cmd may be empty, which means that the chain begins before cmd - * in which case we have to update its state. - */ - if (cmd->state.suspend_resume == SR_NONE) { - cmd->state.suspend_resume = SR_IN_PRE_CHAIN; - cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace); - } - - /* The secondary is just a continuous suspend/resume chain so we - * just have to append it to the the command buffer. 
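             * For example, with dynamic rendering such a secondary contains
             * only render passes begun with both VK_RENDERING_RESUMING_BIT
             * and VK_RENDERING_SUSPENDING_BIT: the chain was started by a
             * suspending pass recorded before this secondary and will be
             * finished by a resuming pass recorded after it.
             *
             *    before:     [ ... suspend ]
             *    secondary:  [ resume ... suspend ] x N
             *    after:      [ resume ... ]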
- */ - assert(tu_cs_is_empty(&secondary->cs)); - tu_append_pre_post_chain(cmd, secondary); - break; - - case SR_AFTER_PRE_CHAIN: - case SR_IN_CHAIN: - case SR_IN_CHAIN_AFTER_PRE_CHAIN: - if (secondary->state.suspend_resume == SR_AFTER_PRE_CHAIN || - secondary->state.suspend_resume == SR_IN_CHAIN_AFTER_PRE_CHAIN) { - /* In thse cases there is a `pre_chain` in the secondary which - * ends that we need to append to the primary. - */ - - if (cmd->state.suspend_resume == SR_NONE) - cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace); - - tu_append_pre_chain(cmd, secondary); - - /* We're about to render, so we need to end the command stream - * in case there were any extra commands generated by copying - * the trace. - */ - tu_cs_end(&cmd->draw_cs); - tu_cs_end(&cmd->draw_epilogue_cs); - - switch (cmd->state.suspend_resume) { - case SR_NONE: - case SR_IN_PRE_CHAIN: - /* The renderpass chain ends in the secondary but isn't - * started in the primary, so we have to move the state to - * `pre_chain`. - */ - cmd->trace_renderpass_end = u_trace_end_iterator(&cmd->trace); - tu_save_pre_chain(cmd); - cmd->state.suspend_resume = SR_AFTER_PRE_CHAIN; - break; - case SR_IN_CHAIN: - case SR_IN_CHAIN_AFTER_PRE_CHAIN: - /* The renderpass ends in the secondary and starts somewhere - * earlier in this primary. Since the last render pass in - * the chain is in the secondary, we are technically outside - * of a render pass. Fix that here by reusing the dynamic - * render pass that was setup for the last suspended render - * pass before the secondary. - */ - tu_restore_suspended_pass(cmd, cmd); - - tu_cmd_render(cmd); - if (cmd->state.suspend_resume == SR_IN_CHAIN) - cmd->state.suspend_resume = SR_NONE; - else - cmd->state.suspend_resume = SR_AFTER_PRE_CHAIN; - break; - case SR_AFTER_PRE_CHAIN: - unreachable("resuming render pass is not preceded by suspending one"); - } - - tu_reset_render_pass(cmd); - } - - tu_cs_add_entries(&cmd->cs, &secondary->cs); - - if (secondary->state.suspend_resume == SR_IN_CHAIN_AFTER_PRE_CHAIN || - secondary->state.suspend_resume == SR_IN_CHAIN) { - /* The secondary ends in a "post-chain" (the opposite of a - * pre-chain) that we need to copy into the current command - * buffer. - */ - cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace); - tu_append_post_chain(cmd, secondary); - cmd->trace_renderpass_end = u_trace_end_iterator(&cmd->trace); - cmd->state.suspended_pass = secondary->state.suspended_pass; - - switch (cmd->state.suspend_resume) { - case SR_NONE: - cmd->state.suspend_resume = SR_IN_CHAIN; - break; - case SR_AFTER_PRE_CHAIN: - cmd->state.suspend_resume = SR_IN_CHAIN_AFTER_PRE_CHAIN; - break; - default: - unreachable("suspending render pass is followed by a not resuming one"); - } - } - } - } - - cmd->state.index_size = secondary->state.index_size; /* for restart index update */ - } - cmd->state.dirty = ~0u; /* TODO: set dirty only what needs to be */ - - if (!cmd->state.lrz.gpu_dir_tracking && cmd->state.pass) { - /* After a secondary command buffer is executed, LRZ is not valid - * until it is cleared again. - */ - cmd->state.lrz.valid = false; - } - - /* After executing secondary command buffers, there may have been arbitrary - * flushes executed, so when we encounter a pipeline barrier with a - * srcMask, we have to assume that we need to invalidate. Therefore we need - * to re-initialize the cache with all pending invalidate bits set. + /** + * First index (indexed draws only). 
*/ - if (cmd->state.pass) { - tu_cache_init(&cmd->state.renderpass_cache); - } else { - tu_cache_init(&cmd->state.cache); - } -} + uint32_t first_index; -static void -tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer, - const struct tu_subpass_barrier *barrier, - bool external) -{ - /* Note: we don't know until the end of the subpass whether we'll use - * sysmem, so assume sysmem here to be safe. + /** + * Whether it's an indexed draw. */ - struct tu_cache_state *cache = - external ? &cmd_buffer->state.cache : &cmd_buffer->state.renderpass_cache; - VkPipelineStageFlags2 src_stage_vk = - sanitize_src_stage(barrier->src_stage_mask); - VkPipelineStageFlags2 dst_stage_vk = - sanitize_dst_stage(barrier->dst_stage_mask); - enum tu_cmd_access_mask src_flags = - vk2tu_access(barrier->src_access_mask, src_stage_vk, false, false); - enum tu_cmd_access_mask dst_flags = - vk2tu_access(barrier->dst_access_mask, dst_stage_vk, false, false); - - if (barrier->incoherent_ccu_color) - src_flags |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE; - if (barrier->incoherent_ccu_depth) - src_flags |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE; - - tu_flush_for_access(cache, src_flags, dst_flags); - - enum tu_stage src_stage = vk2tu_src_stage(src_stage_vk); - enum tu_stage dst_stage = vk2tu_dst_stage(dst_stage_vk); - tu_flush_for_stage(cache, src_stage, dst_stage); -} - -/* emit mrt/zs/msaa/ubwc state for the subpass that is starting (either at - * vkCmdBeginRenderPass2() or vkCmdNextSubpass2()) - */ -static void -tu_emit_subpass_begin(struct tu_cmd_buffer *cmd) -{ - tu6_emit_zs(cmd, cmd->state.subpass, &cmd->draw_cs); - tu6_emit_mrt(cmd, cmd->state.subpass, &cmd->draw_cs); - if (cmd->state.subpass->samples != 0) - tu6_update_msaa(cmd, cmd->state.subpass->samples); - tu6_emit_render_cntl(cmd, cmd->state.subpass, &cmd->draw_cs, false); + bool indexed; - tu_set_input_attachments(cmd, cmd->state.subpass); -} - -VKAPI_ATTR void VKAPI_CALL -tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer, - const VkRenderPassBeginInfo *pRenderPassBegin, - const VkSubpassBeginInfo *pSubpassBeginInfo) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - - if (unlikely(cmd->device->instance->debug_flags & TU_DEBUG_DYNAMIC)) { - vk_common_CmdBeginRenderPass2(commandBuffer, pRenderPassBegin, - pSubpassBeginInfo); - return; - } - - TU_FROM_HANDLE(tu_render_pass, pass, pRenderPassBegin->renderPass); - TU_FROM_HANDLE(tu_framebuffer, fb, pRenderPassBegin->framebuffer); - - const struct VkRenderPassAttachmentBeginInfo *pAttachmentInfo = - vk_find_struct_const(pRenderPassBegin->pNext, - RENDER_PASS_ATTACHMENT_BEGIN_INFO); - - cmd->state.pass = pass; - cmd->state.subpass = pass->subpasses; - cmd->state.framebuffer = fb; - cmd->state.render_area = pRenderPassBegin->renderArea; - - cmd->state.attachments = - vk_alloc(&cmd->vk.pool->alloc, pass->attachment_count * - sizeof(cmd->state.attachments[0]), 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - - if (!cmd->state.attachments) { - vk_command_buffer_set_error(&cmd->vk, VK_ERROR_OUT_OF_HOST_MEMORY); - return; - } - - for (unsigned i = 0; i < pass->attachment_count; i++) { - cmd->state.attachments[i] = pAttachmentInfo ? - tu_image_view_from_handle(pAttachmentInfo->pAttachments[i]) : - cmd->state.framebuffer->attachments[i].attachment; - } - tu_choose_gmem_layout(cmd); - - trace_start_render_pass(&cmd->trace, &cmd->cs); - - /* Note: because this is external, any flushes will happen before draw_cs - * gets called. However deferred flushes could have to happen later as part - * of the subpass. 
+ /** + * Indirect draw parameters resource. */ - tu_subpass_barrier(cmd, &pass->subpasses[0].start_barrier, true); - cmd->state.renderpass_cache.pending_flush_bits = - cmd->state.cache.pending_flush_bits; - cmd->state.renderpass_cache.flush_bits = 0; - - if (pass->subpasses[0].feedback_invalidate) - cmd->state.renderpass_cache.flush_bits |= TU_CMD_FLAG_CACHE_INVALIDATE; - - tu_lrz_begin_renderpass(cmd, pRenderPassBegin->pClearValues); - - cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace); - - tu_emit_renderpass_begin(cmd, pRenderPassBegin->pClearValues); - tu_emit_subpass_begin(cmd); -} - -VKAPI_ATTR void VKAPI_CALL -tu_CmdBeginRendering(VkCommandBuffer commandBuffer, - const VkRenderingInfo *pRenderingInfo) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - VkClearValue clear_values[2 * (MAX_RTS + 1)]; - - tu_setup_dynamic_render_pass(cmd, pRenderingInfo); - tu_setup_dynamic_framebuffer(cmd, pRenderingInfo); - - cmd->state.pass = &cmd->dynamic_pass; - cmd->state.subpass = &cmd->dynamic_subpass; - cmd->state.framebuffer = &cmd->dynamic_framebuffer; - cmd->state.render_area = pRenderingInfo->renderArea; - - cmd->state.attachments = cmd->dynamic_attachments; - - for (unsigned i = 0; i < pRenderingInfo->colorAttachmentCount; i++) { - uint32_t a = cmd->dynamic_subpass.color_attachments[i].attachment; - if (!pRenderingInfo->pColorAttachments[i].imageView) - continue; - - TU_FROM_HANDLE(tu_image_view, view, - pRenderingInfo->pColorAttachments[i].imageView); - cmd->state.attachments[a] = view; - clear_values[a] = pRenderingInfo->pColorAttachments[i].clearValue; - - a = cmd->dynamic_subpass.resolve_attachments[i].attachment; - if (a != VK_ATTACHMENT_UNUSED) { - TU_FROM_HANDLE(tu_image_view, resolve_view, - pRenderingInfo->pColorAttachments[i].resolveImageView); - cmd->state.attachments[a] = resolve_view; - } - } - - uint32_t a = cmd->dynamic_subpass.depth_stencil_attachment.attachment; - if (pRenderingInfo->pDepthAttachment || pRenderingInfo->pStencilAttachment) { - const struct VkRenderingAttachmentInfo *common_info = - (pRenderingInfo->pDepthAttachment && - pRenderingInfo->pDepthAttachment->imageView != VK_NULL_HANDLE) ? 
- pRenderingInfo->pDepthAttachment : - pRenderingInfo->pStencilAttachment; - if (common_info && common_info->imageView != VK_NULL_HANDLE) { - TU_FROM_HANDLE(tu_image_view, view, common_info->imageView); - cmd->state.attachments[a] = view; - if (pRenderingInfo->pDepthAttachment) { - clear_values[a].depthStencil.depth = - pRenderingInfo->pDepthAttachment->clearValue.depthStencil.depth; - } - - if (pRenderingInfo->pStencilAttachment) { - clear_values[a].depthStencil.stencil = - pRenderingInfo->pStencilAttachment->clearValue.depthStencil.stencil; - } - - if (cmd->dynamic_subpass.resolve_count > - cmd->dynamic_subpass.color_count) { - TU_FROM_HANDLE(tu_image_view, resolve_view, - common_info->resolveImageView); - a = cmd->dynamic_subpass.resolve_attachments[cmd->dynamic_subpass.color_count].attachment; - cmd->state.attachments[a] = resolve_view; - } - } - } - - if (unlikely(cmd->device->instance->debug_flags & TU_DEBUG_DYNAMIC)) { - const VkRenderingSelfDependencyInfoMESA *self_dependency = - vk_find_struct_const(pRenderingInfo->pNext, RENDERING_SELF_DEPENDENCY_INFO_MESA); - if (self_dependency && - (self_dependency->colorSelfDependencies || - self_dependency->depthSelfDependency || - self_dependency->stencilSelfDependency)) { - /* Mesa's renderpass emulation requires us to use normal attachments - * for input attachments, and currently doesn't try to keep track of - * which color/depth attachment an input attachment corresponds to. - * So when there's a self-dependency, we have to use sysmem. - */ - cmd->state.rp.disable_gmem = true; - } - } - - tu_choose_gmem_layout(cmd); - - cmd->state.renderpass_cache.pending_flush_bits = - cmd->state.cache.pending_flush_bits; - cmd->state.renderpass_cache.flush_bits = 0; - - bool resuming = pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT; - bool suspending = pRenderingInfo->flags & VK_RENDERING_SUSPENDING_BIT; - cmd->state.suspending = suspending; - cmd->state.resuming = resuming; + struct tu_buffer *indirect; + uint64_t indirect_offset; + uint32_t stride; - /* We can't track LRZ across command buffer boundaries, so we have to - * disable LRZ when resuming/suspending unless we can track on the GPU. + /** + * Draw count parameters resource. 
*/ - if ((resuming || suspending) && - !cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking) { - cmd->state.lrz.valid = false; - } else { - if (resuming) - tu_lrz_begin_resumed_renderpass(cmd, clear_values); - else - tu_lrz_begin_renderpass(cmd, clear_values); - } - - - if (suspending) { - cmd->state.suspended_pass.pass = cmd->state.pass; - cmd->state.suspended_pass.subpass = cmd->state.subpass; - cmd->state.suspended_pass.framebuffer = cmd->state.framebuffer; - cmd->state.suspended_pass.render_area = cmd->state.render_area; - cmd->state.suspended_pass.attachments = cmd->state.attachments; - cmd->state.suspended_pass.gmem_layout = cmd->state.gmem_layout; - } - - if (!resuming) { - trace_start_render_pass(&cmd->trace, &cmd->cs); - } - - if (!resuming || cmd->state.suspend_resume == SR_NONE) { - cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace); - } + struct tu_buffer *count_buffer; + uint64_t count_buffer_offset; +}; - if (!resuming) { - tu_emit_renderpass_begin(cmd, clear_values); - tu_emit_subpass_begin(cmd); - } +enum tu_draw_state_group_id +{ + TU_DRAW_STATE_PROGRAM, + TU_DRAW_STATE_PROGRAM_BINNING, + TU_DRAW_STATE_VI, + TU_DRAW_STATE_VI_BINNING, + TU_DRAW_STATE_VP, + TU_DRAW_STATE_RAST, + TU_DRAW_STATE_DS, + TU_DRAW_STATE_BLEND, - if (suspending && !resuming) { - /* entering a chain */ - switch (cmd->state.suspend_resume) { - case SR_NONE: - cmd->state.suspend_resume = SR_IN_CHAIN; - break; - case SR_AFTER_PRE_CHAIN: - cmd->state.suspend_resume = SR_IN_CHAIN_AFTER_PRE_CHAIN; - break; - case SR_IN_PRE_CHAIN: - case SR_IN_CHAIN: - case SR_IN_CHAIN_AFTER_PRE_CHAIN: - unreachable("suspending render pass not followed by resuming pass"); - break; - } - } + TU_DRAW_STATE_COUNT, +}; - if (resuming && cmd->state.suspend_resume == SR_NONE) - cmd->state.suspend_resume = SR_IN_PRE_CHAIN; -} +struct tu_draw_state_group +{ + enum tu_draw_state_group_id id; + uint32_t enable_mask; + const struct tu_cs_entry *ib; +}; -VKAPI_ATTR void VKAPI_CALL -tu_CmdNextSubpass2(VkCommandBuffer commandBuffer, - const VkSubpassBeginInfo *pSubpassBeginInfo, - const VkSubpassEndInfo *pSubpassEndInfo) +static void +tu6_bind_draw_states(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + const struct tu_draw_info *draw) { - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + const struct tu_pipeline *pipeline = cmd->state.pipeline; + const struct tu_dynamic_state *dynamic = &cmd->state.dynamic; + struct tu_draw_state_group draw_state_groups[TU_DRAW_STATE_COUNT]; + uint32_t draw_state_group_count = 0; - if (unlikely(cmd->device->instance->debug_flags & TU_DEBUG_DYNAMIC)) { - vk_common_CmdNextSubpass2(commandBuffer, pSubpassBeginInfo, - pSubpassEndInfo); + VkResult result = tu_cs_reserve_space(cmd->device, cs, 256); + if (result != VK_SUCCESS) { + cmd->record_result = result; return; } - const struct tu_render_pass *pass = cmd->state.pass; - struct tu_cs *cs = &cmd->draw_cs; - const struct tu_subpass *last_subpass = cmd->state.subpass; - - const struct tu_subpass *subpass = cmd->state.subpass++; - - /* Track LRZ valid state - * - * TODO: Improve this tracking for keeping the state of the past depth/stencil images, - * so if they become active again, we reuse its old state. 
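    * For example, if this subpass switches to a different depth attachment
    * and a later subpass switches back to the original one, LRZ is
    * invalidated at both transitions below even though the original
    * buffer's contents may still be usable.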
- */ - if (last_subpass->depth_stencil_attachment.attachment != subpass->depth_stencil_attachment.attachment) { - cmd->state.lrz.valid = false; - cmd->state.dirty |= TU_CMD_DIRTY_LRZ; - } - - tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM); + /* TODO lrz */ - if (subpass->resolve_attachments) { - tu6_emit_blit_scissor(cmd, cs, true); + uint32_t pc_primitive_cntl = 0; + if (pipeline->ia.primitive_restart && draw->indexed) + pc_primitive_cntl |= A6XX_PC_PRIMITIVE_CNTL_0_PRIMITIVE_RESTART; - for (unsigned i = 0; i < subpass->resolve_count; i++) { - uint32_t a = subpass->resolve_attachments[i].attachment; - if (a == VK_ATTACHMENT_UNUSED) - continue; + tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9806, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9990, 0); + tu_cs_emit_write_reg(cs, REG_A6XX_VFD_UNKNOWN_A008, 0); - uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i); + tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_0, 1); + tu_cs_emit(cs, pc_primitive_cntl); - tu_store_gmem_attachment(cmd, cs, a, gmem_a, false); - - if (!pass->attachments[a].gmem) - continue; - - /* check if the resolved attachment is needed by later subpasses, - * if it is, should be doing a GMEM->GMEM resolve instead of GMEM->MEM->GMEM.. - */ - perf_debug(cmd->device, "TODO: missing GMEM->GMEM resolve path\n"); - tu_load_gmem_attachment(cmd, cs, a, false, true); - } + if (cmd->state.dirty & + (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH) && + (pipeline->dynamic_state.mask & TU_DYNAMIC_LINE_WIDTH)) { + tu6_emit_gras_su_cntl(cs, pipeline->rast.gras_su_cntl, + dynamic->line_width); } - tu_cond_exec_end(cs); - - tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM); - - tu6_emit_sysmem_resolves(cmd, cs, subpass); - - tu_cond_exec_end(cs); - - /* Handle dependencies for the next subpass */ - tu_subpass_barrier(cmd, &cmd->state.subpass->start_barrier, false); - - if (cmd->state.subpass->feedback_invalidate) - cmd->state.renderpass_cache.flush_bits |= TU_CMD_FLAG_CACHE_INVALIDATE; - - tu_emit_subpass_begin(cmd); -} - -static uint32_t -tu6_user_consts_size(const struct tu_pipeline *pipeline, - gl_shader_stage type) -{ - const struct tu_program_descriptor_linkage *link = - &pipeline->program.link[type]; - uint32_t dwords = 0; - - if (link->tu_const_state.push_consts.dwords > 0) { - unsigned num_units = link->tu_const_state.push_consts.dwords; - dwords += 4 + num_units; + if ((cmd->state.dirty & TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK) && + (pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_COMPARE_MASK)) { + tu6_emit_stencil_compare_mask(cs, dynamic->stencil_compare_mask.front, + dynamic->stencil_compare_mask.back); } - return dwords; -} - -static void -tu6_emit_user_consts(struct tu_cs *cs, - const struct tu_pipeline *pipeline, - gl_shader_stage type, - uint32_t *push_constants) -{ - const struct tu_program_descriptor_linkage *link = - &pipeline->program.link[type]; - - if (link->tu_const_state.push_consts.dwords > 0) { - unsigned num_units = link->tu_const_state.push_consts.dwords; - unsigned offset = link->tu_const_state.push_consts.lo; - - /* DST_OFF and NUM_UNIT requires vec4 units */ - tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_units); - tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset / 4) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) | - CP_LOAD_STATE6_0_NUM_UNIT(num_units / 4)); - tu_cs_emit(cs, 0); - tu_cs_emit(cs, 0); - for (unsigned i = 0; i < num_units; i++) - 
tu_cs_emit(cs, push_constants[i + offset]); + if ((cmd->state.dirty & TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK) && + (pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_WRITE_MASK)) { + tu6_emit_stencil_write_mask(cs, dynamic->stencil_write_mask.front, + dynamic->stencil_write_mask.back); } -} -static void -tu6_emit_shared_consts(struct tu_cs *cs, - const struct tu_pipeline *pipeline, - uint32_t *push_constants, - bool compute) -{ - if (pipeline->shared_consts.dwords > 0) { - /* Offset and num_units for shared consts are in units of dwords. */ - unsigned num_units = pipeline->shared_consts.dwords; - unsigned offset = pipeline->shared_consts.lo; - - enum a6xx_state_type st = compute ? ST6_UBO : ST6_CONSTANTS; - uint32_t cp_load_state = compute ? CP_LOAD_STATE6_FRAG : CP_LOAD_STATE6; - - tu_cs_emit_pkt7(cs, cp_load_state, 3 + num_units); - tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) | - CP_LOAD_STATE6_0_STATE_TYPE(st) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(SB6_IBO) | - CP_LOAD_STATE6_0_NUM_UNIT(num_units)); - tu_cs_emit(cs, 0); - tu_cs_emit(cs, 0); - - for (unsigned i = 0; i < num_units; i++) - tu_cs_emit(cs, push_constants[i + offset]); + if ((cmd->state.dirty & TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE) && + (pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_REFERENCE)) { + tu6_emit_stencil_reference(cs, dynamic->stencil_reference.front, + dynamic->stencil_reference.back); } -} -static uint32_t -tu6_const_size(struct tu_cmd_buffer *cmd, - const struct tu_pipeline *pipeline, - bool compute) -{ - uint32_t dwords = 0; + if (cmd->state.dirty & + (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_VERTEX_BUFFERS)) { + for (uint32_t i = 0; i < pipeline->vi.count; i++) { + const uint32_t binding = pipeline->vi.bindings[i]; + const uint32_t stride = pipeline->vi.strides[i]; + const struct tu_buffer *buf = cmd->state.vb.buffers[binding]; + const VkDeviceSize offset = buf->bo_offset + + cmd->state.vb.offsets[binding] + + pipeline->vi.offsets[i]; + const VkDeviceSize size = + offset < buf->bo->size ? 
buf->bo->size - offset : 0; - if (pipeline->shared_consts.dwords > 0) { - dwords = pipeline->shared_consts.dwords + 4; - } else { - if (compute) { - dwords = tu6_user_consts_size(pipeline, MESA_SHADER_COMPUTE); - } else { - for (uint32_t type = MESA_SHADER_VERTEX; type <= MESA_SHADER_FRAGMENT; type++) - dwords += tu6_user_consts_size(pipeline, type); + tu_cs_emit_pkt4(cs, REG_A6XX_VFD_FETCH(i), 4); + tu_cs_emit_qw(cs, buf->bo->iova + offset); + tu_cs_emit(cs, size); + tu_cs_emit(cs, stride); } } - return dwords; -} - -static struct tu_draw_state -tu6_emit_consts(struct tu_cmd_buffer *cmd, - const struct tu_pipeline *pipeline, - bool compute) -{ - uint32_t dwords = 0; - - dwords = tu6_const_size(cmd, pipeline, compute); - - if (dwords == 0) - return (struct tu_draw_state) {}; - - struct tu_cs cs; - tu_cs_begin_sub_stream(&cmd->sub_cs, dwords, &cs); - - if (pipeline->shared_consts.dwords > 0) { - tu6_emit_shared_consts(&cs, pipeline, cmd->push_constants, compute); - - for (uint32_t i = 0; i < ARRAY_SIZE(pipeline->program.link); i++) { - const struct tu_program_descriptor_linkage *link = - &pipeline->program.link[i]; - assert(!link->tu_const_state.push_consts.dwords); - } - } else { - if (compute) { - tu6_emit_user_consts(&cs, pipeline, MESA_SHADER_COMPUTE, cmd->push_constants); + /* TODO shader consts */ + + if (cmd->state.dirty & TU_CMD_DIRTY_PIPELINE) { + draw_state_groups[draw_state_group_count++] = + (struct tu_draw_state_group) { + .id = TU_DRAW_STATE_PROGRAM, + .enable_mask = 0x6, + .ib = &pipeline->program.state_ib, + }; + draw_state_groups[draw_state_group_count++] = + (struct tu_draw_state_group) { + .id = TU_DRAW_STATE_PROGRAM_BINNING, + .enable_mask = 0x1, + .ib = &pipeline->program.binning_state_ib, + }; + draw_state_groups[draw_state_group_count++] = + (struct tu_draw_state_group) { + .id = TU_DRAW_STATE_VI, + .enable_mask = 0x6, + .ib = &pipeline->vi.state_ib, + }; + draw_state_groups[draw_state_group_count++] = + (struct tu_draw_state_group) { + .id = TU_DRAW_STATE_VI_BINNING, + .enable_mask = 0x1, + .ib = &pipeline->vi.binning_state_ib, + }; + draw_state_groups[draw_state_group_count++] = + (struct tu_draw_state_group) { + .id = TU_DRAW_STATE_VP, + .enable_mask = 0x7, + .ib = &pipeline->vp.state_ib, + }; + draw_state_groups[draw_state_group_count++] = + (struct tu_draw_state_group) { + .id = TU_DRAW_STATE_RAST, + .enable_mask = 0x7, + .ib = &pipeline->rast.state_ib, + }; + draw_state_groups[draw_state_group_count++] = + (struct tu_draw_state_group) { + .id = TU_DRAW_STATE_DS, + .enable_mask = 0x7, + .ib = &pipeline->ds.state_ib, + }; + draw_state_groups[draw_state_group_count++] = + (struct tu_draw_state_group) { + .id = TU_DRAW_STATE_BLEND, + .enable_mask = 0x7, + .ib = &pipeline->blend.state_ib, + }; + } + + tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * draw_state_group_count); + for (uint32_t i = 0; i < draw_state_group_count; i++) { + const struct tu_draw_state_group *group = &draw_state_groups[i]; + + uint32_t cp_set_draw_state = + CP_SET_DRAW_STATE__0_COUNT(group->ib->size / 4) | + CP_SET_DRAW_STATE__0_ENABLE_MASK(group->enable_mask) | + CP_SET_DRAW_STATE__0_GROUP_ID(group->id); + uint64_t iova; + if (group->ib->size) { + iova = group->ib->bo->iova + group->ib->offset; } else { - for (uint32_t type = MESA_SHADER_VERTEX; type <= MESA_SHADER_FRAGMENT; type++) - tu6_emit_user_consts(&cs, pipeline, type, cmd->push_constants); + cp_set_draw_state |= CP_SET_DRAW_STATE__0_DISABLE; + iova = 0; } - } - return tu_cs_end_draw_state(&cmd->sub_cs, &cs); -} - -/* Various frontends 
(ANGLE, zink at least) will enable stencil testing with - * what works out to be no-op writes. Simplify what they give us into flags - * that LRZ can use. - */ -static void -tu6_update_simplified_stencil_state(struct tu_cmd_buffer *cmd) -{ - bool stencil_test_enable = - cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE; - - if (!stencil_test_enable) { - cmd->state.stencil_front_write = false; - cmd->state.stencil_back_write = false; - return; - } - - bool stencil_front_writemask = - (cmd->state.pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ? - (cmd->state.dynamic_stencil_wrmask & 0xff) : - (cmd->state.pipeline->ds.stencil_wrmask & 0xff); - - bool stencil_back_writemask = - (cmd->state.pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ? - ((cmd->state.dynamic_stencil_wrmask & 0xff00) >> 8) : - (cmd->state.pipeline->ds.stencil_wrmask & 0xff00) >> 8; - - VkStencilOp front_fail_op = - (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FAIL__MASK) >> A6XX_RB_STENCIL_CONTROL_FAIL__SHIFT; - VkStencilOp front_pass_op = - (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_ZPASS__MASK) >> A6XX_RB_STENCIL_CONTROL_ZPASS__SHIFT; - VkStencilOp front_depth_fail_op = - (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_ZFAIL__MASK) >> A6XX_RB_STENCIL_CONTROL_ZFAIL__SHIFT; - VkStencilOp back_fail_op = - (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FAIL_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_FAIL_BF__SHIFT; - VkStencilOp back_pass_op = - (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_ZPASS_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_ZPASS_BF__SHIFT; - VkStencilOp back_depth_fail_op = - (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__SHIFT; - - bool stencil_front_op_writes = - front_pass_op != VK_STENCIL_OP_KEEP || - front_fail_op != VK_STENCIL_OP_KEEP || - front_depth_fail_op != VK_STENCIL_OP_KEEP; - - bool stencil_back_op_writes = - back_pass_op != VK_STENCIL_OP_KEEP || - back_fail_op != VK_STENCIL_OP_KEEP || - back_depth_fail_op != VK_STENCIL_OP_KEEP; - - cmd->state.stencil_front_write = - stencil_front_op_writes && stencil_front_writemask; - cmd->state.stencil_back_write = - stencil_back_op_writes && stencil_back_writemask; -} - -static bool -tu6_writes_depth(struct tu_cmd_buffer *cmd, bool depth_test_enable) -{ - bool depth_write_enable = - cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE; - - VkCompareOp depth_compare_op = - (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_ZFUNC__MASK) >> A6XX_RB_DEPTH_CNTL_ZFUNC__SHIFT; - - bool depth_compare_op_writes = depth_compare_op != VK_COMPARE_OP_NEVER; - - return depth_test_enable && depth_write_enable && depth_compare_op_writes; -} - -static bool -tu6_writes_stencil(struct tu_cmd_buffer *cmd) -{ - return cmd->state.stencil_front_write || cmd->state.stencil_back_write; -} - -static void -tu6_build_depth_plane_z_mode(struct tu_cmd_buffer *cmd, struct tu_cs *cs) -{ - enum a6xx_ztest_mode zmode = A6XX_EARLY_Z; - bool depth_test_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE; - bool depth_write = tu6_writes_depth(cmd, depth_test_enable); - bool stencil_write = tu6_writes_stencil(cmd); - - if ((cmd->state.pipeline->lrz.fs.has_kill || - cmd->state.pipeline->output.subpass_feedback_loop_ds) && - (depth_write || stencil_write)) { - zmode = (cmd->state.lrz.valid && cmd->state.lrz.enabled) - ? 
A6XX_EARLY_LRZ_LATE_Z - : A6XX_LATE_Z; + tu_cs_emit(cs, cp_set_draw_state); + tu_cs_emit_qw(cs, iova); } - if ((cmd->state.pipeline->lrz.force_late_z && - !cmd->state.pipeline->lrz.fs.force_early_z) || !depth_test_enable) - zmode = A6XX_LATE_Z; - - /* User defined early tests take precedence above all else */ - if (cmd->state.pipeline->lrz.fs.early_fragment_tests) - zmode = A6XX_EARLY_Z; - - tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_DEPTH_PLANE_CNTL, 1); - tu_cs_emit(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL_Z_MODE(zmode)); - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_PLANE_CNTL, 1); - tu_cs_emit(cs, A6XX_RB_DEPTH_PLANE_CNTL_Z_MODE(zmode)); -} + tu_cs_sanity_check(cs); -static void -tu6_emit_blend(struct tu_cs *cs, struct tu_cmd_buffer *cmd) -{ - struct tu_pipeline *pipeline = cmd->state.pipeline; - uint32_t color_write_enable = cmd->state.pipeline_color_write_enable; - - if (pipeline->dynamic_state_mask & - BIT(TU_DYNAMIC_STATE_COLOR_WRITE_ENABLE)) - color_write_enable &= cmd->state.color_write_enable; - - for (unsigned i = 0; i < pipeline->blend.num_rts; i++) { - tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_CONTROL(i), 2); - if (color_write_enable & BIT(i)) { - tu_cs_emit(cs, cmd->state.rb_mrt_control[i] | - ((cmd->state.logic_op_enabled ? - cmd->state.rb_mrt_control_rop : 0) & - ~pipeline->blend.rb_mrt_control_mask)); - tu_cs_emit(cs, cmd->state.rb_mrt_blend_control[i]); - } else { - tu_cs_emit(cs, 0); - tu_cs_emit(cs, 0); + /* track BOs */ + if (cmd->state.dirty & TU_CMD_DIRTY_PIPELINE) { + tu_bo_list_add(&cmd->bo_list, &pipeline->program.binary_bo, + MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP); + for (uint32_t i = 0; i < pipeline->cs.bo_count; i++) { + tu_bo_list_add(&cmd->bo_list, pipeline->cs.bos[i], + MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP); } } - - uint32_t blend_enable_mask = color_write_enable; - if (!(cmd->state.logic_op_enabled && cmd->state.rop_reads_dst)) - blend_enable_mask &= cmd->state.pipeline_blend_enable; - - tu_cs_emit_regs(cs, A6XX_SP_FS_OUTPUT_CNTL1(.mrt = pipeline->blend.num_rts)); - tu_cs_emit_regs(cs, A6XX_RB_FS_OUTPUT_CNTL1(.mrt = pipeline->blend.num_rts)); - tu_cs_emit_pkt4(cs, REG_A6XX_SP_BLEND_CNTL, 1); - tu_cs_emit(cs, cmd->state.sp_blend_cntl | - (A6XX_SP_BLEND_CNTL_ENABLE_BLEND(blend_enable_mask) & - ~pipeline->blend.sp_blend_cntl_mask)); - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLEND_CNTL, 1); - tu_cs_emit(cs, cmd->state.rb_blend_cntl | - (A6XX_RB_BLEND_CNTL_ENABLE_BLEND(blend_enable_mask) & - ~pipeline->blend.rb_blend_cntl_mask)); -} - -static VkResult -tu6_draw_common(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, - bool indexed, - /* note: draw_count is 0 for indirect */ - uint32_t draw_count) -{ - const struct tu_pipeline *pipeline = cmd->state.pipeline; - struct tu_render_pass_state *rp = &cmd->state.rp; - - /* Fill draw stats for autotuner */ - rp->drawcall_count++; - - rp->drawcall_bandwidth_per_sample_sum += - pipeline->output.color_bandwidth_per_sample; - - /* add depth memory bandwidth cost */ - const uint32_t depth_bandwidth = pipeline->output.depth_cpp_per_sample; - if (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE) - rp->drawcall_bandwidth_per_sample_sum += depth_bandwidth; - if (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE) - rp->drawcall_bandwidth_per_sample_sum += depth_bandwidth; - - /* add stencil memory bandwidth cost */ - const uint32_t stencil_bandwidth = pipeline->output.stencil_cpp_per_sample; - if (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE) - rp->drawcall_bandwidth_per_sample_sum += stencil_bandwidth * 2; - 
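   /* Worked example with assumed per-sample costs (color = 4, depth = 4,
    * stencil = 1): a draw with depth test + depth write and stencil enabled
    * adds
    *
    *    4 (color) + 4 (Z write) + 4 (Z test) + 2 * 1 (stencil) = 14
    *
    * to drawcall_bandwidth_per_sample_sum, following the accounting above.
    */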
- tu_emit_cache_flush_renderpass(cmd, cs); - - bool primitive_restart_enabled = pipeline->ia.primitive_restart; - if (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE)) - primitive_restart_enabled = cmd->state.primitive_restart_enable; - - bool primitive_restart = primitive_restart_enabled && indexed; - bool provoking_vtx_last = pipeline->rast.provoking_vertex_last; - bool tess_upper_left_domain_origin = - pipeline->tess.upper_left_domain_origin; - - struct tu_primitive_params* prim_params = &cmd->state.last_prim_params; - - if (!prim_params->valid || - prim_params->primitive_restart != primitive_restart || - prim_params->provoking_vtx_last != provoking_vtx_last || - prim_params->tess_upper_left_domain_origin != - tess_upper_left_domain_origin) { - tu_cs_emit_regs( - cs, - A6XX_PC_PRIMITIVE_CNTL_0(.primitive_restart = primitive_restart, - .provoking_vtx_last = provoking_vtx_last, - .tess_upper_left_domain_origin = - tess_upper_left_domain_origin)); - prim_params->valid = true; - prim_params->primitive_restart = primitive_restart; - prim_params->provoking_vtx_last = provoking_vtx_last; - prim_params->tess_upper_left_domain_origin = tess_upper_left_domain_origin; - } - - /* Early exit if there is nothing to emit, saves CPU cycles */ - uint32_t dirty = cmd->state.dirty; - if (!(dirty & ~TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD)) - return VK_SUCCESS; - - bool dirty_lrz = - dirty & (TU_CMD_DIRTY_LRZ | TU_CMD_DIRTY_RB_DEPTH_CNTL | - TU_CMD_DIRTY_RB_STENCIL_CNTL | TU_CMD_DIRTY_BLEND); - - if (dirty_lrz) { - struct tu_cs cs; - uint32_t size = cmd->device->physical_device->info->a6xx.lrz_track_quirk ? 10 : 8; - - cmd->state.lrz_and_depth_plane_state = - tu_cs_draw_state(&cmd->sub_cs, &cs, size); - tu6_update_simplified_stencil_state(cmd); - tu6_emit_lrz(cmd, &cs); - tu6_build_depth_plane_z_mode(cmd, &cs); - } - - if (dirty & TU_CMD_DIRTY_RASTERIZER_DISCARD) { - struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_RASTERIZER_DISCARD, 4); - tu_cs_emit_regs(&cs, A6XX_PC_RASTER_CNTL(.dword = cmd->state.pc_raster_cntl)); - tu_cs_emit_regs(&cs, A6XX_VPC_UNKNOWN_9107(.dword = cmd->state.vpc_unknown_9107)); - } - - if (dirty & TU_CMD_DIRTY_GRAS_SU_CNTL) { - struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_GRAS_SU_CNTL, 2); - tu_cs_emit_regs(&cs, A6XX_GRAS_SU_CNTL(.dword = cmd->state.gras_su_cntl)); - } - - if (dirty & TU_CMD_DIRTY_RB_DEPTH_CNTL) { - struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_RB_DEPTH_CNTL, 2); - uint32_t rb_depth_cntl = cmd->state.rb_depth_cntl; - - if ((rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE) || - (rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE)) - rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE; - - if ((rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE) && - !(rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE)) - tu6_apply_depth_bounds_workaround(cmd->device, &rb_depth_cntl); - - if (pipeline->output.rb_depth_cntl_disable) - rb_depth_cntl = 0; - - tu_cs_emit_regs(&cs, A6XX_RB_DEPTH_CNTL(.dword = rb_depth_cntl)); - } - - if (dirty & TU_CMD_DIRTY_RB_STENCIL_CNTL) { - struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_RB_STENCIL_CNTL, 2); - tu_cs_emit_regs(&cs, A6XX_RB_STENCIL_CONTROL(.dword = cmd->state.rb_stencil_cntl)); - } - - if (dirty & TU_CMD_DIRTY_SHADER_CONSTS) - cmd->state.shader_const = tu6_emit_consts(cmd, pipeline, false); - - if (dirty & TU_CMD_DIRTY_VIEWPORTS) { - struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_VIEWPORT, 8 + 10 * cmd->state.max_viewport); - 
tu6_emit_viewport(&cs, cmd->state.viewport, cmd->state.max_viewport, - pipeline->viewport.z_negative_one_to_one); - } - - if (dirty & TU_CMD_DIRTY_BLEND) { - struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_BLEND, - 8 + 3 * cmd->state.pipeline->blend.num_rts); - tu6_emit_blend(&cs, cmd); - } - - if (dirty & TU_CMD_DIRTY_PATCH_CONTROL_POINTS) { - bool tess = cmd->state.pipeline->active_stages & - VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT; - uint32_t state_size = TU6_EMIT_PATCH_CONTROL_POINTS_DWORDS( - pipeline->program.hs_param_dwords); - struct tu_cs cs = tu_cmd_dynamic_state( - cmd, TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS, tess ? state_size : 0); - tu6_emit_patch_control_points(&cs, cmd->state.pipeline, - cmd->state.patch_control_points); - } - - /* for the first draw in a renderpass, re-emit all the draw states - * - * and if a draw-state disabling path (CmdClearAttachments 3D fallback) was - * used, then draw states must be re-emitted. note however this only happens - * in the sysmem path, so this can be skipped this for the gmem path (TODO) - * - * the two input attachment states are excluded because secondary command - * buffer doesn't have a state ib to restore it, and not re-emitting them - * is OK since CmdClearAttachments won't disable/overwrite them - */ - if (dirty & TU_CMD_DIRTY_DRAW_STATE) { - tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2)); - - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_CONFIG, pipeline->program.config_state); - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state); - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state); - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_RAST, pipeline->rast.state); - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_SYSMEM, pipeline->prim_order.state_sysmem); - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_GMEM, pipeline->prim_order.state_gmem); - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_CONST, cmd->state.shader_const); - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets); - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS_LOAD, pipeline->load_state); - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VB, cmd->state.vertex_buffers); - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params); - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE, cmd->state.lrz_and_depth_plane_state); - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_MSAA, cmd->state.msaa); - - for (uint32_t i = 0; i < ARRAY_SIZE(cmd->state.dynamic_state); i++) { - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + i, - ((pipeline->dynamic_state_mask & BIT(i)) ? - cmd->state.dynamic_state[i] : - pipeline->dynamic_state[i])); - } - } else { - /* emit draw states that were just updated - * note we eventually don't want to have to emit anything here - */ - bool emit_binding_stride = false, emit_blend = false, - emit_patch_control_points = false; - uint32_t draw_state_count = - ((dirty & TU_CMD_DIRTY_SHADER_CONSTS) ? 1 : 0) + - ((dirty & TU_CMD_DIRTY_DESC_SETS_LOAD) ? 1 : 0) + - ((dirty & TU_CMD_DIRTY_VERTEX_BUFFERS) ? 1 : 0) + - ((dirty & TU_CMD_DIRTY_VS_PARAMS) ? 1 : 0) + - (dirty_lrz ? 
1 : 0); - - if ((dirty & TU_CMD_DIRTY_VB_STRIDE) && - (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_VB_STRIDE))) { - emit_binding_stride = true; - draw_state_count += 1; - } - - if ((dirty & TU_CMD_DIRTY_BLEND) && - (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_BLEND))) { - emit_blend = true; - draw_state_count += 1; - } - - if ((dirty & TU_CMD_DIRTY_PATCH_CONTROL_POINTS) && - (pipeline->dynamic_state_mask & - BIT(TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS))) { - emit_patch_control_points = true; - draw_state_count += 1; - } - - if (draw_state_count > 0) - tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * draw_state_count); - - if (dirty & TU_CMD_DIRTY_SHADER_CONSTS) - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_CONST, cmd->state.shader_const); - if (dirty & TU_CMD_DIRTY_DESC_SETS_LOAD) - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS_LOAD, pipeline->load_state); - if (dirty & TU_CMD_DIRTY_VERTEX_BUFFERS) - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VB, cmd->state.vertex_buffers); - if (emit_binding_stride) { - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_VB_STRIDE, - cmd->state.dynamic_state[TU_DYNAMIC_STATE_VB_STRIDE]); - } - if (emit_blend) { - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_BLEND, - cmd->state.dynamic_state[TU_DYNAMIC_STATE_BLEND]); - } - if (emit_patch_control_points) { - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS, - cmd->state.dynamic_state[TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS]); - } - if (dirty & TU_CMD_DIRTY_VS_PARAMS) - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params); - - if (dirty_lrz) { - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE, cmd->state.lrz_and_depth_plane_state); + if (cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS) { + for (uint32_t i = 0; i < MAX_VBS; i++) { + const struct tu_buffer *buf = cmd->state.vb.buffers[i]; + if (buf) + tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ); } } - tu_cs_sanity_check(cs); - - /* There are too many graphics dirty bits to list here, so just list the - * bits to preserve instead. The only things not emitted here are - * compute-related state. 
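    * Concretely, the only bit that can survive this mask is
    * TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD.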
- */ - cmd->state.dirty &= TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD; - return VK_SUCCESS; + cmd->state.dirty = 0; } -static uint32_t -tu_draw_initiator(struct tu_cmd_buffer *cmd, enum pc_di_src_sel src_sel) +static void +tu6_emit_draw_direct(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + const struct tu_draw_info *draw) { - const struct tu_pipeline *pipeline = cmd->state.pipeline; - enum pc_di_primtype primtype = cmd->state.primtype; - if (primtype == DI_PT_PATCHES0) - primtype += cmd->state.patch_control_points; + const enum pc_di_primtype primtype = cmd->state.pipeline->ia.primtype; - uint32_t initiator = - CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) | - CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(src_sel) | - CP_DRAW_INDX_OFFSET_0_INDEX_SIZE(cmd->state.index_size) | - CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY); + tu_cs_emit_pkt4(cs, REG_A6XX_VFD_INDEX_OFFSET, 2); + tu_cs_emit(cs, draw->vertex_offset); + tu_cs_emit(cs, draw->first_instance); - if (pipeline->active_stages & VK_SHADER_STAGE_GEOMETRY_BIT) - initiator |= CP_DRAW_INDX_OFFSET_0_GS_ENABLE; - - switch (pipeline->tess.patch_type) { - case IR3_TESS_TRIANGLES: - initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_TRIANGLES) | - CP_DRAW_INDX_OFFSET_0_TESS_ENABLE; - break; - case IR3_TESS_ISOLINES: - initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_ISOLINES) | - CP_DRAW_INDX_OFFSET_0_TESS_ENABLE; - break; - case IR3_TESS_NONE: - initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_QUADS); - break; - case IR3_TESS_QUADS: - initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_QUADS) | - CP_DRAW_INDX_OFFSET_0_TESS_ENABLE; - break; - } - return initiator; -} + /* TODO hw binning */ + if (draw->indexed) { + const enum a4xx_index_size index_size = + tu6_index_size(cmd->state.index_type); + const uint32_t index_bytes = + (cmd->state.index_type == VK_INDEX_TYPE_UINT32) ? 
4 : 2; + const struct tu_buffer *buf = cmd->state.index_buffer; + const VkDeviceSize offset = buf->bo_offset + cmd->state.index_offset + + index_bytes * draw->first_index; + const uint32_t size = index_bytes * draw->count; + const uint32_t cp_draw_indx = + CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) | + CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_DMA) | + CP_DRAW_INDX_OFFSET_0_INDEX_SIZE(index_size) | + CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY) | 0x2000; -static uint32_t -vs_params_offset(struct tu_cmd_buffer *cmd) -{ - const struct tu_program_descriptor_linkage *link = - &cmd->state.pipeline->program.link[MESA_SHADER_VERTEX]; - const struct ir3_const_state *const_state = &link->const_state; - - if (const_state->offsets.driver_param >= link->constlen) - return 0; - - /* this layout is required by CP_DRAW_INDIRECT_MULTI */ - STATIC_ASSERT(IR3_DP_DRAWID == 0); - STATIC_ASSERT(IR3_DP_VTXID_BASE == 1); - STATIC_ASSERT(IR3_DP_INSTID_BASE == 2); - - /* 0 means disabled for CP_DRAW_INDIRECT_MULTI */ - assert(const_state->offsets.driver_param != 0); - - return const_state->offsets.driver_param; -} + tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 7); + tu_cs_emit(cs, cp_draw_indx); + tu_cs_emit(cs, draw->instance_count); + tu_cs_emit(cs, draw->count); + tu_cs_emit(cs, 0x0); /* XXX */ + tu_cs_emit_qw(cs, buf->bo->iova + offset); + tu_cs_emit(cs, size); + } else { + const uint32_t cp_draw_indx = + CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) | + CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) | + CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY) | 0x2000; -static void -tu6_emit_empty_vs_params(struct tu_cmd_buffer *cmd) -{ - if (cmd->state.vs_params.iova) { - cmd->state.vs_params = (struct tu_draw_state) {}; - cmd->state.dirty |= TU_CMD_DIRTY_VS_PARAMS; + tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3); + tu_cs_emit(cs, cp_draw_indx); + tu_cs_emit(cs, draw->instance_count); + tu_cs_emit(cs, draw->count); } } static void -tu6_emit_vs_params(struct tu_cmd_buffer *cmd, - uint32_t draw_id, - uint32_t vertex_offset, - uint32_t first_instance) +tu_draw(struct tu_cmd_buffer *cmd, const struct tu_draw_info *draw) { - uint32_t offset = vs_params_offset(cmd); + struct tu_cs *cs = &cmd->draw_cs; - /* Beside re-emitting params when they are changed, we should re-emit - * them after constants are invalidated via HLSQ_INVALIDATE_CMD. - */ - if (!(cmd->state.dirty & (TU_CMD_DIRTY_DRAW_STATE | TU_CMD_DIRTY_VS_PARAMS)) && - (offset == 0 || draw_id == cmd->state.last_vs_params.draw_id) && - vertex_offset == cmd->state.last_vs_params.vertex_offset && - first_instance == cmd->state.last_vs_params.first_instance) { - return; - } + tu6_bind_draw_states(cmd, cs, draw); - struct tu_cs cs; - VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 3 + (offset ? 
8 : 0), &cs); + VkResult result = tu_cs_reserve_space(cmd->device, cs, 32); if (result != VK_SUCCESS) { - vk_command_buffer_set_error(&cmd->vk, result); + cmd->record_result = result; return; } - tu_cs_emit_regs(&cs, - A6XX_VFD_INDEX_OFFSET(vertex_offset), - A6XX_VFD_INSTANCE_START_OFFSET(first_instance)); - - if (offset) { - tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4); - tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(offset) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) | - CP_LOAD_STATE6_0_NUM_UNIT(1)); - tu_cs_emit(&cs, 0); - tu_cs_emit(&cs, 0); - - tu_cs_emit(&cs, draw_id); - tu_cs_emit(&cs, vertex_offset); - tu_cs_emit(&cs, first_instance); - tu_cs_emit(&cs, 0); + if (draw->indirect) { + tu_finishme("indirect draw"); + return; } - cmd->state.last_vs_params.vertex_offset = vertex_offset; - cmd->state.last_vs_params.first_instance = first_instance; - cmd->state.last_vs_params.draw_id = draw_id; + /* TODO tu6_emit_marker should pick different regs depending on cs */ + tu6_emit_marker(cmd, cs); + tu6_emit_draw_direct(cmd, cs, draw); + tu6_emit_marker(cmd, cs); - struct tu_cs_entry entry = tu_cs_end_sub_stream(&cmd->sub_cs, &cs); - cmd->state.vs_params = (struct tu_draw_state) {entry.bo->iova + entry.offset, entry.size / 4}; + cmd->wait_for_idle = true; - cmd->state.dirty |= TU_CMD_DIRTY_VS_PARAMS; + tu_cs_sanity_check(cs); } -VKAPI_ATTR void VKAPI_CALL +void tu_CmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount, uint32_t instanceCount, uint32_t firstVertex, uint32_t firstInstance) { - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - struct tu_cs *cs = &cmd->draw_cs; - - tu6_emit_vs_params(cmd, 0, firstVertex, firstInstance); - - tu6_draw_common(cmd, cs, false, vertexCount); - - tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3); - tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX)); - tu_cs_emit(cs, instanceCount); - tu_cs_emit(cs, vertexCount); -} - -VKAPI_ATTR void VKAPI_CALL -tu_CmdDrawMultiEXT(VkCommandBuffer commandBuffer, - uint32_t drawCount, - const VkMultiDrawInfoEXT *pVertexInfo, - uint32_t instanceCount, - uint32_t firstInstance, - uint32_t stride) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - struct tu_cs *cs = &cmd->draw_cs; - - if (!drawCount) - return; - - bool has_tess = - cmd->state.pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT; - - uint32_t max_vertex_count = 0; - if (has_tess) { - uint32_t i = 0; - vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) { - max_vertex_count = MAX2(max_vertex_count, draw->vertexCount); - } - } - - uint32_t i = 0; - vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) { - tu6_emit_vs_params(cmd, i, draw->firstVertex, firstInstance); - - if (i == 0) - tu6_draw_common(cmd, cs, false, max_vertex_count); + TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); + struct tu_draw_info info = {}; - if (cmd->state.dirty & TU_CMD_DIRTY_VS_PARAMS) { - tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3); - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params); - cmd->state.dirty &= ~TU_CMD_DIRTY_VS_PARAMS; - } + info.count = vertexCount; + info.instance_count = instanceCount; + info.first_instance = firstInstance; + info.vertex_offset = firstVertex; - tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3); - tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX)); - tu_cs_emit(cs, instanceCount); - tu_cs_emit(cs, draw->vertexCount); - } + tu_draw(cmd_buffer, &info); } -VKAPI_ATTR void 
VKAPI_CALL +void tu_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount, @@ -4828,239 +2377,56 @@ tu_CmdDrawIndexed(VkCommandBuffer commandBuffer, int32_t vertexOffset, uint32_t firstInstance) { - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - struct tu_cs *cs = &cmd->draw_cs; - - tu6_emit_vs_params(cmd, 0, vertexOffset, firstInstance); - - tu6_draw_common(cmd, cs, true, indexCount); - - tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 7); - tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA)); - tu_cs_emit(cs, instanceCount); - tu_cs_emit(cs, indexCount); - tu_cs_emit(cs, firstIndex); - tu_cs_emit_qw(cs, cmd->state.index_va); - tu_cs_emit(cs, cmd->state.max_index_count); -} - -VKAPI_ATTR void VKAPI_CALL -tu_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer, - uint32_t drawCount, - const VkMultiDrawIndexedInfoEXT *pIndexInfo, - uint32_t instanceCount, - uint32_t firstInstance, - uint32_t stride, - const int32_t *pVertexOffset) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - struct tu_cs *cs = &cmd->draw_cs; - - if (!drawCount) - return; - - bool has_tess = - cmd->state.pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT; - - uint32_t max_index_count = 0; - if (has_tess) { - uint32_t i = 0; - vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) { - max_index_count = MAX2(max_index_count, draw->indexCount); - } - } - - uint32_t i = 0; - vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) { - int32_t vertexOffset = pVertexOffset ? *pVertexOffset : draw->vertexOffset; - tu6_emit_vs_params(cmd, i, vertexOffset, firstInstance); - - if (i == 0) - tu6_draw_common(cmd, cs, true, max_index_count); - - if (cmd->state.dirty & TU_CMD_DIRTY_VS_PARAMS) { - tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3); - tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params); - cmd->state.dirty &= ~TU_CMD_DIRTY_VS_PARAMS; - } + TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); + struct tu_draw_info info = {}; - tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 7); - tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA)); - tu_cs_emit(cs, instanceCount); - tu_cs_emit(cs, draw->indexCount); - tu_cs_emit(cs, draw->firstIndex); - tu_cs_emit_qw(cs, cmd->state.index_va); - tu_cs_emit(cs, cmd->state.max_index_count); - } -} + info.indexed = true; + info.count = indexCount; + info.instance_count = instanceCount; + info.first_index = firstIndex; + info.vertex_offset = vertexOffset; + info.first_instance = firstInstance; -/* Various firmware bugs/inconsistencies mean that some indirect draw opcodes - * do not wait for WFI's to complete before executing. Add a WAIT_FOR_ME if - * pending for these opcodes. This may result in a few extra WAIT_FOR_ME's - * with these opcodes, but the alternative would add unnecessary WAIT_FOR_ME's - * before draw opcodes that don't need it. 
- */ -static void -draw_wfm(struct tu_cmd_buffer *cmd) -{ - cmd->state.renderpass_cache.flush_bits |= - cmd->state.renderpass_cache.pending_flush_bits & TU_CMD_FLAG_WAIT_FOR_ME; - cmd->state.renderpass_cache.pending_flush_bits &= ~TU_CMD_FLAG_WAIT_FOR_ME; + tu_draw(cmd_buffer, &info); } -VKAPI_ATTR void VKAPI_CALL +void tu_CmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, uint32_t drawCount, uint32_t stride) { - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - TU_FROM_HANDLE(tu_buffer, buf, _buffer); - struct tu_cs *cs = &cmd->draw_cs; - - tu6_emit_empty_vs_params(cmd); - - if (cmd->device->physical_device->info->a6xx.indirect_draw_wfm_quirk) - draw_wfm(cmd); + TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); + TU_FROM_HANDLE(tu_buffer, buffer, _buffer); + struct tu_draw_info info = {}; - tu6_draw_common(cmd, cs, false, 0); + info.count = drawCount; + info.indirect = buffer; + info.indirect_offset = offset; + info.stride = stride; - tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 6); - tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX)); - tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_NORMAL) | - A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd))); - tu_cs_emit(cs, drawCount); - tu_cs_emit_qw(cs, buf->iova + offset); - tu_cs_emit(cs, stride); + tu_draw(cmd_buffer, &info); } -VKAPI_ATTR void VKAPI_CALL +void tu_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, uint32_t drawCount, uint32_t stride) { - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - TU_FROM_HANDLE(tu_buffer, buf, _buffer); - struct tu_cs *cs = &cmd->draw_cs; - - tu6_emit_empty_vs_params(cmd); - - if (cmd->device->physical_device->info->a6xx.indirect_draw_wfm_quirk) - draw_wfm(cmd); - - tu6_draw_common(cmd, cs, true, 0); - - tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 9); - tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA)); - tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDEXED) | - A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd))); - tu_cs_emit(cs, drawCount); - tu_cs_emit_qw(cs, cmd->state.index_va); - tu_cs_emit(cs, cmd->state.max_index_count); - tu_cs_emit_qw(cs, buf->iova + offset); - tu_cs_emit(cs, stride); -} - -VKAPI_ATTR void VKAPI_CALL -tu_CmdDrawIndirectCount(VkCommandBuffer commandBuffer, - VkBuffer _buffer, - VkDeviceSize offset, - VkBuffer countBuffer, - VkDeviceSize countBufferOffset, - uint32_t drawCount, - uint32_t stride) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - TU_FROM_HANDLE(tu_buffer, buf, _buffer); - TU_FROM_HANDLE(tu_buffer, count_buf, countBuffer); - struct tu_cs *cs = &cmd->draw_cs; - - tu6_emit_empty_vs_params(cmd); - - /* It turns out that the firmware we have for a650 only partially fixed the - * problem with CP_DRAW_INDIRECT_MULTI not waiting for WFI's to complete - * before reading indirect parameters. It waits for WFI's before reading - * the draw parameters, but after reading the indirect count :(. 
- */ - draw_wfm(cmd); - - tu6_draw_common(cmd, cs, false, 0); - - tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 8); - tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX)); - tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDIRECT_COUNT) | - A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd))); - tu_cs_emit(cs, drawCount); - tu_cs_emit_qw(cs, buf->iova + offset); - tu_cs_emit_qw(cs, count_buf->iova + countBufferOffset); - tu_cs_emit(cs, stride); -} - -VKAPI_ATTR void VKAPI_CALL -tu_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer, - VkBuffer _buffer, - VkDeviceSize offset, - VkBuffer countBuffer, - VkDeviceSize countBufferOffset, - uint32_t drawCount, - uint32_t stride) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - TU_FROM_HANDLE(tu_buffer, buf, _buffer); - TU_FROM_HANDLE(tu_buffer, count_buf, countBuffer); - struct tu_cs *cs = &cmd->draw_cs; - - tu6_emit_empty_vs_params(cmd); - - draw_wfm(cmd); - - tu6_draw_common(cmd, cs, true, 0); - - tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 11); - tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA)); - tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDIRECT_COUNT_INDEXED) | - A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd))); - tu_cs_emit(cs, drawCount); - tu_cs_emit_qw(cs, cmd->state.index_va); - tu_cs_emit(cs, cmd->state.max_index_count); - tu_cs_emit_qw(cs, buf->iova + offset); - tu_cs_emit_qw(cs, count_buf->iova + countBufferOffset); - tu_cs_emit(cs, stride); -} - -VKAPI_ATTR void VKAPI_CALL -tu_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer, - uint32_t instanceCount, - uint32_t firstInstance, - VkBuffer _counterBuffer, - VkDeviceSize counterBufferOffset, - uint32_t counterOffset, - uint32_t vertexStride) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - TU_FROM_HANDLE(tu_buffer, buf, _counterBuffer); - struct tu_cs *cs = &cmd->draw_cs; - - /* All known firmware versions do not wait for WFI's with CP_DRAW_AUTO. - * Plus, for the common case where the counter buffer is written by - * vkCmdEndTransformFeedback, we need to wait for the CP_WAIT_MEM_WRITES to - * complete which means we need a WAIT_FOR_ME anyway. 
- */ - draw_wfm(cmd); - - tu6_emit_vs_params(cmd, 0, 0, firstInstance); + TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); + TU_FROM_HANDLE(tu_buffer, buffer, _buffer); + struct tu_draw_info info = {}; - tu6_draw_common(cmd, cs, false, 0); + info.indexed = true; + info.count = drawCount; + info.indirect = buffer; + info.indirect_offset = offset; + info.stride = stride; - tu_cs_emit_pkt7(cs, CP_DRAW_AUTO, 6); - tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_XFB)); - tu_cs_emit(cs, instanceCount); - tu_cs_emit_qw(cs, buf->iova + counterBufferOffset); - tu_cs_emit(cs, counterOffset); - tu_cs_emit(cs, vertexStride); + tu_draw(cmd_buffer, &info); } struct tu_dispatch_info @@ -5088,221 +2454,12 @@ struct tu_dispatch_info }; static void -tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd, - struct tu_cs *cs, struct tu_pipeline *pipeline, - const struct tu_dispatch_info *info) -{ - gl_shader_stage type = MESA_SHADER_COMPUTE; - const struct tu_program_descriptor_linkage *link = - &pipeline->program.link[type]; - const struct ir3_const_state *const_state = &link->const_state; - uint32_t offset = const_state->offsets.driver_param; - unsigned subgroup_size = pipeline->compute.subgroup_size; - unsigned subgroup_shift = util_logbase2(subgroup_size); - - if (link->constlen <= offset) - return; - - uint32_t num_consts = MIN2(const_state->num_driver_params, - (link->constlen - offset) * 4); - - if (!info->indirect) { - uint32_t driver_params[12] = { - [IR3_DP_NUM_WORK_GROUPS_X] = info->blocks[0], - [IR3_DP_NUM_WORK_GROUPS_Y] = info->blocks[1], - [IR3_DP_NUM_WORK_GROUPS_Z] = info->blocks[2], - [IR3_DP_BASE_GROUP_X] = info->offsets[0], - [IR3_DP_BASE_GROUP_Y] = info->offsets[1], - [IR3_DP_BASE_GROUP_Z] = info->offsets[2], - [IR3_DP_CS_SUBGROUP_SIZE] = subgroup_size, - [IR3_DP_SUBGROUP_ID_SHIFT] = subgroup_shift, - }; - - assert(num_consts <= ARRAY_SIZE(driver_params)); - - /* push constants */ - tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_consts); - tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) | - CP_LOAD_STATE6_0_NUM_UNIT(num_consts / 4)); - tu_cs_emit(cs, 0); - tu_cs_emit(cs, 0); - uint32_t i; - for (i = 0; i < num_consts; i++) - tu_cs_emit(cs, driver_params[i]); - } else if (!(info->indirect_offset & 0xf)) { - tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3); - tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) | - CP_LOAD_STATE6_0_NUM_UNIT(1)); - tu_cs_emit_qw(cs, info->indirect->iova + info->indirect_offset); - } else { - /* Vulkan guarantees only 4 byte alignment for indirect_offset. - * However, CP_LOAD_STATE.EXT_SRC_ADDR needs 16 byte alignment. 
- */ - - uint64_t indirect_iova = info->indirect->iova + info->indirect_offset; - - for (uint32_t i = 0; i < 3; i++) { - tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5); - tu_cs_emit(cs, 0); - tu_cs_emit_qw(cs, global_iova(cmd, cs_indirect_xyz[i])); - tu_cs_emit_qw(cs, indirect_iova + i * 4); - } - - tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); - tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); - - tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3); - tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) | - CP_LOAD_STATE6_0_NUM_UNIT(1)); - tu_cs_emit_qw(cs, global_iova(cmd, cs_indirect_xyz[0])); - } - - /* Fill out IR3_DP_CS_SUBGROUP_SIZE and IR3_DP_SUBGROUP_ID_SHIFT for - * indirect dispatch. - */ - if (info->indirect && num_consts > IR3_DP_BASE_GROUP_X) { - bool emit_local = num_consts > IR3_DP_LOCAL_GROUP_SIZE_X; - tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 7 + (emit_local ? 4 : 0)); - tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset + (IR3_DP_BASE_GROUP_X / 4)) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) | - CP_LOAD_STATE6_0_NUM_UNIT((num_consts - IR3_DP_BASE_GROUP_X) / 4)); - tu_cs_emit_qw(cs, 0); - tu_cs_emit(cs, 0); /* BASE_GROUP_X */ - tu_cs_emit(cs, 0); /* BASE_GROUP_Y */ - tu_cs_emit(cs, 0); /* BASE_GROUP_Z */ - tu_cs_emit(cs, subgroup_size); - if (emit_local) { - assert(num_consts == align(IR3_DP_SUBGROUP_ID_SHIFT, 4)); - tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_X */ - tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Y */ - tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Z */ - tu_cs_emit(cs, subgroup_shift); - } - } -} - -static void -tu_dispatch(struct tu_cmd_buffer *cmd, +tu_dispatch(struct tu_cmd_buffer *cmd_buffer, const struct tu_dispatch_info *info) { - if (!info->indirect && - (info->blocks[0] == 0 || info->blocks[1] == 0 || info->blocks[2] == 0)) - return; - - struct tu_cs *cs = &cmd->cs; - struct tu_pipeline *pipeline = cmd->state.compute_pipeline; - - bool emit_instrlen_workaround = - pipeline->program.cs_instrlen > - cmd->device->physical_device->info->a6xx.instr_cache_size; - - /* There appears to be a HW bug where in some rare circumstances it appears - * to accidentally use the FS instrlen instead of the CS instrlen, which - * affects all known gens. Based on various experiments it appears that the - * issue is that when prefetching a branch destination and there is a cache - * miss, when fetching from memory the HW bounds-checks the fetch against - * SP_CS_INSTRLEN, except when one of the two register contexts is active - * it accidentally fetches SP_FS_INSTRLEN from the other (inactive) - * context. To workaround it we set the FS instrlen here and do a dummy - * event to roll the context (because it fetches SP_FS_INSTRLEN from the - * "wrong" context). Because the bug seems to involve cache misses, we - * don't emit this if the entire CS program fits in cache, which will - * hopefully be the majority of cases. - * - * See https://gitlab.freedesktop.org/mesa/mesa/-/issues/5892 - */ - if (emit_instrlen_workaround) { - tu_cs_emit_regs(cs, A6XX_SP_FS_INSTRLEN(pipeline->program.cs_instrlen)); - tu6_emit_event_write(cmd, cs, LABEL); - } - - /* TODO: We could probably flush less if we add a compute_flush_bits - * bitfield. 
- */ - tu_emit_cache_flush(cmd, cs); - - /* note: no reason to have this in a separate IB */ - tu_cs_emit_state_ib(cs, tu6_emit_consts(cmd, pipeline, true)); - - tu_emit_compute_driver_params(cmd, cs, pipeline, info); - - if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD) - tu_cs_emit_state_ib(cs, pipeline->load_state); - - cmd->state.dirty &= ~TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD; - - tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); - tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE)); - - const uint32_t *local_size = pipeline->compute.local_size; - const uint32_t *num_groups = info->blocks; - tu_cs_emit_regs(cs, - A6XX_HLSQ_CS_NDRANGE_0(.kerneldim = 3, - .localsizex = local_size[0] - 1, - .localsizey = local_size[1] - 1, - .localsizez = local_size[2] - 1), - A6XX_HLSQ_CS_NDRANGE_1(.globalsize_x = local_size[0] * num_groups[0]), - A6XX_HLSQ_CS_NDRANGE_2(.globaloff_x = 0), - A6XX_HLSQ_CS_NDRANGE_3(.globalsize_y = local_size[1] * num_groups[1]), - A6XX_HLSQ_CS_NDRANGE_4(.globaloff_y = 0), - A6XX_HLSQ_CS_NDRANGE_5(.globalsize_z = local_size[2] * num_groups[2]), - A6XX_HLSQ_CS_NDRANGE_6(.globaloff_z = 0)); - - tu_cs_emit_regs(cs, - A6XX_HLSQ_CS_KERNEL_GROUP_X(1), - A6XX_HLSQ_CS_KERNEL_GROUP_Y(1), - A6XX_HLSQ_CS_KERNEL_GROUP_Z(1)); - - trace_start_compute(&cmd->trace, cs); - - if (info->indirect) { - uint64_t iova = info->indirect->iova + info->indirect_offset; - - tu_cs_emit_pkt7(cs, CP_EXEC_CS_INDIRECT, 4); - tu_cs_emit(cs, 0x00000000); - tu_cs_emit_qw(cs, iova); - tu_cs_emit(cs, - A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) | - A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) | - A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1)); - } else { - tu_cs_emit_pkt7(cs, CP_EXEC_CS, 4); - tu_cs_emit(cs, 0x00000000); - tu_cs_emit(cs, CP_EXEC_CS_1_NGROUPS_X(info->blocks[0])); - tu_cs_emit(cs, CP_EXEC_CS_2_NGROUPS_Y(info->blocks[1])); - tu_cs_emit(cs, CP_EXEC_CS_3_NGROUPS_Z(info->blocks[2])); - } - - trace_end_compute(&cmd->trace, cs, - info->indirect != NULL, - local_size[0], local_size[1], local_size[2], - info->blocks[0], info->blocks[1], info->blocks[2]); - - /* For the workaround above, because it's using the "wrong" context for - * SP_FS_INSTRLEN we should emit another dummy event write to avoid a - * potential race between writing the register and the CP_EXEC_CS we just - * did. We don't need to reset the register because it will be re-emitted - * anyway when the next renderpass starts. 
- */ - if (emit_instrlen_workaround) { - tu6_emit_event_write(cmd, cs, LABEL); - } - - tu_cs_emit_wfi(cs); } -VKAPI_ATTR void VKAPI_CALL +void tu_CmdDispatchBase(VkCommandBuffer commandBuffer, uint32_t base_x, uint32_t base_y, @@ -5324,7 +2481,7 @@ tu_CmdDispatchBase(VkCommandBuffer commandBuffer, tu_dispatch(cmd_buffer, &info); } -VKAPI_ATTR void VKAPI_CALL +void tu_CmdDispatch(VkCommandBuffer commandBuffer, uint32_t x, uint32_t y, @@ -5333,7 +2490,7 @@ tu_CmdDispatch(VkCommandBuffer commandBuffer, tu_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z); } -VKAPI_ATTR void VKAPI_CALL +void tu_CmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset) @@ -5348,410 +2505,133 @@ tu_CmdDispatchIndirect(VkCommandBuffer commandBuffer, tu_dispatch(cmd_buffer, &info); } -VKAPI_ATTR void VKAPI_CALL -tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer, - const VkSubpassEndInfo *pSubpassEndInfo) +void +tu_CmdEndRenderPass(VkCommandBuffer commandBuffer) { TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); - if (unlikely(cmd_buffer->device->instance->debug_flags & TU_DEBUG_DYNAMIC)) { - vk_common_CmdEndRenderPass2(commandBuffer, pSubpassEndInfo); - return; - } - tu_cs_end(&cmd_buffer->draw_cs); - tu_cs_end(&cmd_buffer->draw_epilogue_cs); - tu_cmd_render(cmd_buffer); - - cmd_buffer->state.cache.pending_flush_bits |= - cmd_buffer->state.renderpass_cache.pending_flush_bits; - tu_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier, true); - - vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer->state.attachments); - - tu_reset_render_pass(cmd_buffer); -} - -VKAPI_ATTR void VKAPI_CALL -tu_CmdEndRendering(VkCommandBuffer commandBuffer) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); - - if (cmd_buffer->state.suspending) - cmd_buffer->state.suspended_pass.lrz = cmd_buffer->state.lrz; - if (!cmd_buffer->state.suspending) { - tu_cs_end(&cmd_buffer->draw_cs); - tu_cs_end(&cmd_buffer->draw_epilogue_cs); + tu_cmd_render_tiles(cmd_buffer); - if (cmd_buffer->state.suspend_resume == SR_IN_PRE_CHAIN) { - cmd_buffer->trace_renderpass_end = u_trace_end_iterator(&cmd_buffer->trace); - tu_save_pre_chain(cmd_buffer); - - /* Even we don't call tu_cmd_render here, renderpass is finished - * and draw states should be disabled. 
- */ - tu_disable_draw_states(cmd_buffer, &cmd_buffer->cs); - } else { - tu_cmd_render(cmd_buffer); - } + /* discard draw_cs entries now that the tiles are rendered */ + tu_cs_discard_entries(&cmd_buffer->draw_cs); - tu_reset_render_pass(cmd_buffer); - } + vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments); + cmd_buffer->state.attachments = NULL; - if (cmd_buffer->state.resuming && !cmd_buffer->state.suspending) { - /* exiting suspend/resume chain */ - switch (cmd_buffer->state.suspend_resume) { - case SR_IN_CHAIN: - cmd_buffer->state.suspend_resume = SR_NONE; - break; - case SR_IN_PRE_CHAIN: - case SR_IN_CHAIN_AFTER_PRE_CHAIN: - cmd_buffer->state.suspend_resume = SR_AFTER_PRE_CHAIN; - break; - default: - unreachable("suspending render pass not followed by resuming pass"); - } - } + cmd_buffer->state.pass = NULL; + cmd_buffer->state.subpass = NULL; + cmd_buffer->state.framebuffer = NULL; } -static void -tu_barrier(struct tu_cmd_buffer *cmd, - const VkDependencyInfo *dep_info) +void +tu_CmdEndRenderPass2KHR(VkCommandBuffer commandBuffer, + const VkSubpassEndInfoKHR *pSubpassEndInfo) { - VkPipelineStageFlags2 srcStage = 0; - VkPipelineStageFlags2 dstStage = 0; - enum tu_cmd_access_mask src_flags = 0; - enum tu_cmd_access_mask dst_flags = 0; - - /* Inside a renderpass, we don't know yet whether we'll be using sysmem - * so we have to use the sysmem flushes. - */ - bool gmem = cmd->state.ccu_state == TU_CMD_CCU_GMEM && - !cmd->state.pass; - - - for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) { - VkPipelineStageFlags2 sanitized_src_stage = - sanitize_src_stage(dep_info->pMemoryBarriers[i].srcStageMask); - VkPipelineStageFlags2 sanitized_dst_stage = - sanitize_dst_stage(dep_info->pMemoryBarriers[i].dstStageMask); - src_flags |= vk2tu_access(dep_info->pMemoryBarriers[i].srcAccessMask, - sanitized_src_stage, false, gmem); - dst_flags |= vk2tu_access(dep_info->pMemoryBarriers[i].dstAccessMask, - sanitized_dst_stage, false, gmem); - srcStage |= sanitized_src_stage; - dstStage |= sanitized_dst_stage; - } - - for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) { - VkPipelineStageFlags2 sanitized_src_stage = - sanitize_src_stage(dep_info->pBufferMemoryBarriers[i].srcStageMask); - VkPipelineStageFlags2 sanitized_dst_stage = - sanitize_dst_stage(dep_info->pBufferMemoryBarriers[i].dstStageMask); - src_flags |= vk2tu_access(dep_info->pBufferMemoryBarriers[i].srcAccessMask, - sanitized_src_stage, false, gmem); - dst_flags |= vk2tu_access(dep_info->pBufferMemoryBarriers[i].dstAccessMask, - sanitized_dst_stage, false, gmem); - srcStage |= sanitized_src_stage; - dstStage |= sanitized_dst_stage; - } - - for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) { - VkImageLayout old_layout = dep_info->pImageMemoryBarriers[i].oldLayout; - if (old_layout == VK_IMAGE_LAYOUT_UNDEFINED) { - /* The underlying memory for this image may have been used earlier - * within the same queue submission for a different image, which - * means that there may be old, stale cache entries which are in the - * "wrong" location, which could cause problems later after writing - * to the image. We don't want these entries being flushed later and - * overwriting the actual image, so we need to flush the CCU. 
- */ - TU_FROM_HANDLE(tu_image, image, dep_info->pImageMemoryBarriers[i].image); - - if (vk_format_is_depth_or_stencil(image->vk.format)) { - src_flags |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE; - } else { - src_flags |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE; - } - } - VkPipelineStageFlags2 sanitized_src_stage = - sanitize_src_stage(dep_info->pImageMemoryBarriers[i].srcStageMask); - VkPipelineStageFlags2 sanitized_dst_stage = - sanitize_dst_stage(dep_info->pImageMemoryBarriers[i].dstStageMask); - src_flags |= vk2tu_access(dep_info->pImageMemoryBarriers[i].srcAccessMask, - sanitized_src_stage, true, gmem); - dst_flags |= vk2tu_access(dep_info->pImageMemoryBarriers[i].dstAccessMask, - sanitized_dst_stage, true, gmem); - srcStage |= sanitized_src_stage; - dstStage |= sanitized_dst_stage; - } - - if (cmd->state.pass) { - const VkPipelineStageFlags framebuffer_space_stages = - VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | - VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | - VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT | - VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; - - /* We cannot have non-by-region "fb-space to fb-space" barriers. - * - * From the Vulkan 1.2.185 spec, section 7.6.1 "Subpass Self-dependency": - * - * If the source and destination stage masks both include - * framebuffer-space stages, then dependencyFlags must include - * VK_DEPENDENCY_BY_REGION_BIT. - * [...] - * Each of the synchronization scopes and access scopes of a - * vkCmdPipelineBarrier2 or vkCmdPipelineBarrier command inside - * a render pass instance must be a subset of the scopes of one of - * the self-dependencies for the current subpass. - * - * If the self-dependency has VK_DEPENDENCY_BY_REGION_BIT or - * VK_DEPENDENCY_VIEW_LOCAL_BIT set, then so must the pipeline barrier. - * - * By-region barriers are ok for gmem. All other barriers would involve - * vtx stages which are NOT ok for gmem rendering. - * See dep_invalid_for_gmem(). - */ - if ((srcStage & ~framebuffer_space_stages) || - (dstStage & ~framebuffer_space_stages)) { - cmd->state.rp.disable_gmem = true; - } - } - - struct tu_cache_state *cache = - cmd->state.pass ? &cmd->state.renderpass_cache : &cmd->state.cache; - tu_flush_for_access(cache, src_flags, dst_flags); - - enum tu_stage src_stage = vk2tu_src_stage(srcStage); - enum tu_stage dst_stage = vk2tu_dst_stage(dstStage); - tu_flush_for_stage(cache, src_stage, dst_stage); + tu_CmdEndRenderPass(commandBuffer); } -VKAPI_ATTR void VKAPI_CALL -tu_CmdPipelineBarrier2(VkCommandBuffer commandBuffer, - const VkDependencyInfo *pDependencyInfo) +struct tu_barrier_info { - TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); - - tu_barrier(cmd_buffer, pDependencyInfo); -} + uint32_t eventCount; + const VkEvent *pEvents; + VkPipelineStageFlags srcStageMask; +}; static void -write_event(struct tu_cmd_buffer *cmd, struct tu_event *event, - VkPipelineStageFlags2 stageMask, unsigned value) +tu_barrier(struct tu_cmd_buffer *cmd_buffer, + uint32_t memoryBarrierCount, + const VkMemoryBarrier *pMemoryBarriers, + uint32_t bufferMemoryBarrierCount, + const VkBufferMemoryBarrier *pBufferMemoryBarriers, + uint32_t imageMemoryBarrierCount, + const VkImageMemoryBarrier *pImageMemoryBarriers, + const struct tu_barrier_info *info) { - struct tu_cs *cs = &cmd->cs; - - /* vkCmdSetEvent/vkCmdResetEvent cannot be called inside a render pass */ - assert(!cmd->state.pass); - - tu_emit_cache_flush(cmd, cs); - - /* Flags that only require a top-of-pipe event. 
DrawIndirect parameters are - * read by the CP, so the draw indirect stage counts as top-of-pipe too. - */ - VkPipelineStageFlags2 top_of_pipe_flags = - VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT | - VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT; - - if (!(stageMask & ~top_of_pipe_flags)) { - tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3); - tu_cs_emit_qw(cs, event->bo->iova); /* ADDR_LO/HI */ - tu_cs_emit(cs, value); - } else { - /* Use a RB_DONE_TS event to wait for everything to complete. */ - tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 4); - tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS)); - tu_cs_emit_qw(cs, event->bo->iova); - tu_cs_emit(cs, value); - } } -VKAPI_ATTR void VKAPI_CALL -tu_CmdSetEvent2(VkCommandBuffer commandBuffer, - VkEvent _event, - const VkDependencyInfo *pDependencyInfo) +void +tu_CmdPipelineBarrier(VkCommandBuffer commandBuffer, + VkPipelineStageFlags srcStageMask, + VkPipelineStageFlags destStageMask, + VkBool32 byRegion, + uint32_t memoryBarrierCount, + const VkMemoryBarrier *pMemoryBarriers, + uint32_t bufferMemoryBarrierCount, + const VkBufferMemoryBarrier *pBufferMemoryBarriers, + uint32_t imageMemoryBarrierCount, + const VkImageMemoryBarrier *pImageMemoryBarriers) { - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - TU_FROM_HANDLE(tu_event, event, _event); - VkPipelineStageFlags2 src_stage_mask = 0; + TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); + struct tu_barrier_info info; - for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++) - src_stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask; - for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++) - src_stage_mask |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask; - for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++) - src_stage_mask |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask; + info.eventCount = 0; + info.pEvents = NULL; + info.srcStageMask = srcStageMask; - write_event(cmd, event, src_stage_mask, 1); + tu_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers, + bufferMemoryBarrierCount, pBufferMemoryBarriers, + imageMemoryBarrierCount, pImageMemoryBarriers, &info); } -VKAPI_ATTR void VKAPI_CALL -tu_CmdResetEvent2(VkCommandBuffer commandBuffer, - VkEvent _event, - VkPipelineStageFlags2 stageMask) +static void +write_event(struct tu_cmd_buffer *cmd_buffer, + struct tu_event *event, + VkPipelineStageFlags stageMask, + unsigned value) { - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - TU_FROM_HANDLE(tu_event, event, _event); - - write_event(cmd, event, stageMask, 0); } -VKAPI_ATTR void VKAPI_CALL -tu_CmdWaitEvents2(VkCommandBuffer commandBuffer, - uint32_t eventCount, - const VkEvent *pEvents, - const VkDependencyInfo* pDependencyInfos) +void +tu_CmdSetEvent(VkCommandBuffer commandBuffer, + VkEvent _event, + VkPipelineStageFlags stageMask) { - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - struct tu_cs *cs = cmd->state.pass ? 
&cmd->draw_cs : &cmd->cs; - - for (uint32_t i = 0; i < eventCount; i++) { - TU_FROM_HANDLE(tu_event, event, pEvents[i]); - - tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6); - tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) | - CP_WAIT_REG_MEM_0_POLL_MEMORY); - tu_cs_emit_qw(cs, event->bo->iova); /* POLL_ADDR_LO/HI */ - tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(1)); - tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0u)); - tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(20)); - } - - tu_barrier(cmd, pDependencyInfos); -} + TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); + TU_FROM_HANDLE(tu_event, event, _event); -VKAPI_ATTR void VKAPI_CALL -tu_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask) -{ - /* No-op */ + write_event(cmd_buffer, event, stageMask, 1); } - -VKAPI_ATTR void VKAPI_CALL -tu_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer, - const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin) +void +tu_CmdResetEvent(VkCommandBuffer commandBuffer, + VkEvent _event, + VkPipelineStageFlags stageMask) { - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - - cmd->state.predication_active = true; - - struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs; - - tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_GLOBAL, 1); - tu_cs_emit(cs, 1); - - /* Wait for any writes to the predicate to land */ - if (cmd->state.pass) - tu_emit_cache_flush_renderpass(cmd, cs); - else - tu_emit_cache_flush(cmd, cs); - - TU_FROM_HANDLE(tu_buffer, buf, pConditionalRenderingBegin->buffer); - uint64_t iova = buf->iova + pConditionalRenderingBegin->offset; - - /* qcom doesn't support 32-bit reference values, only 64-bit, but Vulkan - * mandates 32-bit comparisons. Our workaround is to copy the the reference - * value to the low 32-bits of a location where the high 32 bits are known - * to be 0 and then compare that. - */ - tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5); - tu_cs_emit(cs, 0); - tu_cs_emit_qw(cs, global_iova(cmd, predicate)); - tu_cs_emit_qw(cs, iova); - - tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); - tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0); + TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); + TU_FROM_HANDLE(tu_event, event, _event); - bool inv = pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT; - tu_cs_emit_pkt7(cs, CP_DRAW_PRED_SET, 3); - tu_cs_emit(cs, CP_DRAW_PRED_SET_0_SRC(PRED_SRC_MEM) | - CP_DRAW_PRED_SET_0_TEST(inv ? EQ_0_PASS : NE_0_PASS)); - tu_cs_emit_qw(cs, global_iova(cmd, predicate)); + write_event(cmd_buffer, event, stageMask, 0); } -VKAPI_ATTR void VKAPI_CALL -tu_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer) +void +tu_CmdWaitEvents(VkCommandBuffer commandBuffer, + uint32_t eventCount, + const VkEvent *pEvents, + VkPipelineStageFlags srcStageMask, + VkPipelineStageFlags dstStageMask, + uint32_t memoryBarrierCount, + const VkMemoryBarrier *pMemoryBarriers, + uint32_t bufferMemoryBarrierCount, + const VkBufferMemoryBarrier *pBufferMemoryBarriers, + uint32_t imageMemoryBarrierCount, + const VkImageMemoryBarrier *pImageMemoryBarriers) { - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - - cmd->state.predication_active = false; + TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); + struct tu_barrier_info info; - struct tu_cs *cs = cmd->state.pass ? 
&cmd->draw_cs : &cmd->cs; + info.eventCount = eventCount; + info.pEvents = pEvents; + info.srcStageMask = 0; - tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_GLOBAL, 1); - tu_cs_emit(cs, 0); + tu_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers, + bufferMemoryBarrierCount, pBufferMemoryBarriers, + imageMemoryBarrierCount, pImageMemoryBarriers, &info); } void -tu_CmdWriteBufferMarker2AMD(VkCommandBuffer commandBuffer, - VkPipelineStageFlagBits2 pipelineStage, - VkBuffer dstBuffer, - VkDeviceSize dstOffset, - uint32_t marker) +tu_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask) { - /* Almost the same as write_event, but also allowed in renderpass */ - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer); - - uint64_t va = buffer->bo->iova + dstOffset; - - struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs; - struct tu_cache_state *cache = - cmd->state.pass ? &cmd->state.renderpass_cache : &cmd->state.cache; - - /* From the Vulkan 1.2.203 spec: - * - * The access scope for buffer marker writes falls under - * the VK_ACCESS_TRANSFER_WRITE_BIT, and the pipeline stages for - * identifying the synchronization scope must include both pipelineStage - * and VK_PIPELINE_STAGE_TRANSFER_BIT. - * - * Transfer operations use CCU however here we write via CP. - * Flush CCU in order to make the results of previous transfer - * operation visible to CP. - */ - tu_flush_for_access(cache, 0, TU_ACCESS_SYSMEM_WRITE); - - /* Flags that only require a top-of-pipe event. DrawIndirect parameters are - * read by the CP, so the draw indirect stage counts as top-of-pipe too. - */ - VkPipelineStageFlags2 top_of_pipe_flags = - VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT | - VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT; - - bool is_top_of_pipe = !(pipelineStage & ~top_of_pipe_flags); - - /* We have to WFI only if we flushed CCU here and are using CP_MEM_WRITE. - * Otherwise: - * - We do CP_EVENT_WRITE(RB_DONE_TS) which should wait for flushes; - * - There was a barrier to synchronize other writes with WriteBufferMarkerAMD - * and they had to include our pipelineStage which forces the WFI. - */ - if (cache->flush_bits != 0 && is_top_of_pipe) { - cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE; - } - - if (cmd->state.pass) { - tu_emit_cache_flush_renderpass(cmd, cs); - } else { - tu_emit_cache_flush(cmd, cs); - } - - if (is_top_of_pipe) { - tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3); - tu_cs_emit_qw(cs, va); /* ADDR_LO/HI */ - tu_cs_emit(cs, marker); - } else { - /* Use a RB_DONE_TS event to wait for everything to complete. */ - tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 4); - tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS)); - tu_cs_emit_qw(cs, va); - tu_cs_emit(cs, marker); - } - - /* Make sure the result of this write is visible to others. 
*/ - tu_flush_for_access(cache, TU_ACCESS_CP_WRITE, 0); + /* No-op */ } diff --git a/lib/mesa/src/freedreno/vulkan/tu_cs.c b/lib/mesa/src/freedreno/vulkan/tu_cs.c index 2e6f215f4..48242f813 100644 --- a/lib/mesa/src/freedreno/vulkan/tu_cs.c +++ b/lib/mesa/src/freedreno/vulkan/tu_cs.c @@ -1,101 +1,108 @@ /* * Copyright © 2019 Google LLC - * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. */ #include "tu_cs.h" -#include "tu_suballoc.h" - /** * Initialize a command stream. */ void -tu_cs_init(struct tu_cs *cs, - struct tu_device *device, - enum tu_cs_mode mode, - uint32_t initial_size, const char *name) +tu_cs_init(struct tu_cs *cs, enum tu_cs_mode mode, uint32_t initial_size) { assert(mode != TU_CS_MODE_EXTERNAL); memset(cs, 0, sizeof(*cs)); - cs->device = device; cs->mode = mode; cs->next_bo_size = initial_size; - cs->name = name; } /** * Initialize a command stream as a wrapper to an external buffer. */ void -tu_cs_init_external(struct tu_cs *cs, struct tu_device *device, - uint32_t *start, uint32_t *end) +tu_cs_init_external(struct tu_cs *cs, uint32_t *start, uint32_t *end) { memset(cs, 0, sizeof(*cs)); - cs->device = device; cs->mode = TU_CS_MODE_EXTERNAL; cs->start = cs->reserved_end = cs->cur = start; cs->end = end; } /** - * Initialize a sub-command stream as a wrapper to an externally sub-allocated - * buffer. + * Finish and release all resources owned by a command stream. */ void -tu_cs_init_suballoc(struct tu_cs *cs, struct tu_device *device, - struct tu_suballoc_bo *suballoc_bo) +tu_cs_finish(struct tu_device *dev, struct tu_cs *cs) { - uint32_t *start = tu_suballoc_bo_map(suballoc_bo); - uint32_t *end = start + (suballoc_bo->size >> 2); + for (uint32_t i = 0; i < cs->bo_count; ++i) { + tu_bo_finish(dev, cs->bos[i]); + free(cs->bos[i]); + } - memset(cs, 0, sizeof(*cs)); - cs->device = device; - cs->mode = TU_CS_MODE_SUB_STREAM; - cs->start = cs->reserved_end = cs->cur = start; - cs->end = end; - cs->refcount_bo = tu_bo_get_ref(suballoc_bo->bo); + free(cs->entries); + free(cs->bos); } /** - * Finish and release all resources owned by a command stream. + * Get the offset of the command packets emitted since the last call to + * tu_cs_add_entry. 
*/ -void -tu_cs_finish(struct tu_cs *cs) +static uint32_t +tu_cs_get_offset(const struct tu_cs *cs) { - for (uint32_t i = 0; i < cs->bo_count; ++i) { - tu_bo_finish(cs->device, cs->bos[i]); - } - - if (cs->refcount_bo) - tu_bo_finish(cs->device, cs->refcount_bo); + assert(cs->bo_count); + return cs->start - (uint32_t *) cs->bos[cs->bo_count - 1]->map; +} - free(cs->entries); - free(cs->bos); +/** + * Get the size of the command packets emitted since the last call to + * tu_cs_add_entry. + */ +static uint32_t +tu_cs_get_size(const struct tu_cs *cs) +{ + return cs->cur - cs->start; } -static struct tu_bo * -tu_cs_current_bo(const struct tu_cs *cs) +/** + * Get the size of the remaining space in the current BO. + */ +static uint32_t +tu_cs_get_space(const struct tu_cs *cs) { - if (cs->refcount_bo) { - return cs->refcount_bo; - } else { - assert(cs->bo_count); - return cs->bos[cs->bo_count - 1]; - } + return cs->end - cs->cur; } /** - * Get the offset of the command packets emitted since the last call to + * Return true if there is no command packet emitted since the last call to * tu_cs_add_entry. */ static uint32_t -tu_cs_get_offset(const struct tu_cs *cs) +tu_cs_is_empty(const struct tu_cs *cs) { - return cs->start - (uint32_t *) tu_cs_current_bo(cs)->map; + return tu_cs_get_size(cs) == 0; } /* @@ -103,12 +110,10 @@ tu_cs_get_offset(const struct tu_cs *cs) * be emitted to the new BO. */ static VkResult -tu_cs_add_bo(struct tu_cs *cs, uint32_t size) +tu_cs_add_bo(struct tu_device *dev, struct tu_cs *cs, uint32_t size) { /* no BO for TU_CS_MODE_EXTERNAL */ assert(cs->mode != TU_CS_MODE_EXTERNAL); - /* No adding more BOs if suballocating from a suballoc_bo. */ - assert(!cs->refcount_bo); /* no dangling command packet */ assert(tu_cs_is_empty(cs)); @@ -125,18 +130,20 @@ tu_cs_add_bo(struct tu_cs *cs, uint32_t size) cs->bos = new_bos; } - struct tu_bo *new_bo; + struct tu_bo *new_bo = malloc(sizeof(struct tu_bo)); + if (!new_bo) + return VK_ERROR_OUT_OF_HOST_MEMORY; - VkResult result = - tu_bo_init_new(cs->device, &new_bo, size * sizeof(uint32_t), - TU_BO_ALLOC_GPU_READ_ONLY | TU_BO_ALLOC_ALLOW_DUMP, cs->name); + VkResult result = tu_bo_init_new(dev, new_bo, size * sizeof(uint32_t)); if (result != VK_SUCCESS) { + free(new_bo); return result; } - result = tu_bo_map(cs->device, new_bo); + result = tu_bo_map(dev, new_bo); if (result != VK_SUCCESS) { - tu_bo_finish(cs->device, new_bo); + tu_bo_finish(dev, new_bo); + free(new_bo); return result; } @@ -152,7 +159,7 @@ tu_cs_add_bo(struct tu_cs *cs, uint32_t size) * Reserve an IB entry. 
*/ static VkResult -tu_cs_reserve_entry(struct tu_cs *cs) +tu_cs_reserve_entry(struct tu_device *dev, struct tu_cs *cs) { /* entries are only for TU_CS_MODE_GROW */ assert(cs->mode == TU_CS_MODE_GROW); @@ -194,7 +201,7 @@ tu_cs_add_entry(struct tu_cs *cs) /* add an entry for [cs->start, cs->cur] */ cs->entries[cs->entry_count++] = (struct tu_cs_entry) { - .bo = tu_cs_current_bo(cs), + .bo = cs->bos[cs->bo_count - 1], .size = tu_cs_get_size(cs) * sizeof(uint32_t), .offset = tu_cs_get_offset(cs) * sizeof(uint32_t), }; @@ -203,30 +210,6 @@ tu_cs_add_entry(struct tu_cs *cs) } /** - * same behavior as tu_cs_emit_call but without the indirect - */ -VkResult -tu_cs_add_entries(struct tu_cs *cs, struct tu_cs *target) -{ - VkResult result; - - assert(cs->mode == TU_CS_MODE_GROW); - assert(target->mode == TU_CS_MODE_GROW); - - if (!tu_cs_is_empty(cs)) - tu_cs_add_entry(cs); - - for (unsigned i = 0; i < target->entry_count; i++) { - result = tu_cs_reserve_entry(cs); - if (result != VK_SUCCESS) - return result; - cs->entries[cs->entry_count++] = target->entries[i]; - } - - return VK_SUCCESS; -} - -/** * Begin (or continue) command packet emission. This does nothing but sanity * checks currently. \a cs must not be in TU_CS_MODE_SUB_STREAM mode. */ @@ -259,58 +242,27 @@ tu_cs_end(struct tu_cs *cs) * emission. */ VkResult -tu_cs_begin_sub_stream(struct tu_cs *cs, uint32_t size, struct tu_cs *sub_cs) +tu_cs_begin_sub_stream(struct tu_device *dev, + struct tu_cs *cs, + uint32_t size, + struct tu_cs *sub_cs) { assert(cs->mode == TU_CS_MODE_SUB_STREAM); assert(size); - VkResult result = tu_cs_reserve_space(cs, size); + VkResult result = tu_cs_reserve_space(dev, cs, size); if (result != VK_SUCCESS) return result; - tu_cs_init_external(sub_cs, cs->device, cs->cur, cs->reserved_end); + tu_cs_init_external(sub_cs, cs->cur, cs->reserved_end); tu_cs_begin(sub_cs); - result = tu_cs_reserve_space(sub_cs, size); + result = tu_cs_reserve_space(dev, sub_cs, size); assert(result == VK_SUCCESS); return VK_SUCCESS; } /** - * Allocate count*size dwords, aligned to size dwords. - * \a cs must be in TU_CS_MODE_SUB_STREAM mode. - * - */ -VkResult -tu_cs_alloc(struct tu_cs *cs, - uint32_t count, - uint32_t size, - struct tu_cs_memory *memory) -{ - assert(cs->mode == TU_CS_MODE_SUB_STREAM); - assert(size && size <= 1024); - - if (!count) - return VK_SUCCESS; - - /* TODO: smarter way to deal with alignment? */ - - VkResult result = tu_cs_reserve_space(cs, count * size + (size-1)); - if (result != VK_SUCCESS) - return result; - - struct tu_bo *bo = tu_cs_current_bo(cs); - size_t offset = align(tu_cs_get_offset(cs), size); - - memory->map = bo->map + offset * sizeof(uint32_t); - memory->iova = bo->iova + offset * sizeof(uint32_t); - - cs->start = cs->cur = (uint32_t*) bo->map + offset + count * size; - - return VK_SUCCESS; -} - -/** * End command packet emission to a sub-stream. \a sub_cs becomes invalid * after this call. 
* @@ -321,6 +273,7 @@ struct tu_cs_entry tu_cs_end_sub_stream(struct tu_cs *cs, struct tu_cs *sub_cs) { assert(cs->mode == TU_CS_MODE_SUB_STREAM); + assert(cs->bo_count); assert(sub_cs->start == cs->cur && sub_cs->end == cs->reserved_end); tu_cs_sanity_check(sub_cs); @@ -329,7 +282,7 @@ tu_cs_end_sub_stream(struct tu_cs *cs, struct tu_cs *sub_cs) cs->cur = sub_cs->cur; struct tu_cs_entry entry = { - .bo = tu_cs_current_bo(cs), + .bo = cs->bos[cs->bo_count - 1], .size = tu_cs_get_size(cs) * sizeof(uint32_t), .offset = tu_cs_get_offset(cs) * sizeof(uint32_t), }; @@ -344,7 +297,9 @@ tu_cs_end_sub_stream(struct tu_cs *cs, struct tu_cs *sub_cs) * This never fails when \a cs has mode TU_CS_MODE_EXTERNAL. */ VkResult -tu_cs_reserve_space(struct tu_cs *cs, uint32_t reserved_size) +tu_cs_reserve_space(struct tu_device *dev, + struct tu_cs *cs, + uint32_t reserved_size) { if (tu_cs_get_space(cs) < reserved_size) { if (cs->mode == TU_CS_MODE_EXTERNAL) { @@ -360,39 +315,14 @@ tu_cs_reserve_space(struct tu_cs *cs, uint32_t reserved_size) tu_cs_add_entry(cs); } - for (uint32_t i = 0; i < cs->cond_stack_depth; i++) { - /* Subtract one here to account for the DWORD field itself. */ - *cs->cond_dwords[i] = cs->cur - cs->cond_dwords[i] - 1; - - /* space for CP_COND_REG_EXEC in next bo */ - reserved_size += 3; - } - /* switch to a new BO */ uint32_t new_size = MAX2(cs->next_bo_size, reserved_size); - VkResult result = tu_cs_add_bo(cs, new_size); + VkResult result = tu_cs_add_bo(dev, cs, new_size); if (result != VK_SUCCESS) return result; - if (cs->cond_stack_depth) { - cs->reserved_end = cs->cur + reserved_size; - } - - /* Re-emit CP_COND_REG_EXECs */ - for (uint32_t i = 0; i < cs->cond_stack_depth; i++) { - tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2); - tu_cs_emit(cs, cs->cond_flags[i]); - - cs->cond_dwords[i] = cs->cur; - - /* Emit dummy DWORD field here */ - tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(0)); - } - - /* double the size for the next bo, also there is an upper - * bound on IB size, which appears to be 0x0fffff - */ - new_size = MIN2(new_size << 1, 0x0fffff); + /* double the size for the next bo */ + new_size <<= 1; if (cs->next_bo_size < new_size) cs->next_bo_size = new_size; } @@ -402,7 +332,7 @@ tu_cs_reserve_space(struct tu_cs *cs, uint32_t reserved_size) if (cs->mode == TU_CS_MODE_GROW) { /* reserve an entry for the next call to this function or tu_cs_end */ - return tu_cs_reserve_entry(cs); + return tu_cs_reserve_entry(dev, cs); } return VK_SUCCESS; @@ -413,16 +343,17 @@ tu_cs_reserve_space(struct tu_cs *cs, uint32_t reserved_size) * packets in \a cs, but does not necessarily release all resources. */ void -tu_cs_reset(struct tu_cs *cs) +tu_cs_reset(struct tu_device *dev, struct tu_cs *cs) { if (cs->mode == TU_CS_MODE_EXTERNAL) { - assert(!cs->bo_count && !cs->refcount_bo && !cs->entry_count); + assert(!cs->bo_count && !cs->entry_count); cs->reserved_end = cs->cur = cs->start; return; } for (uint32_t i = 0; i + 1 < cs->bo_count; ++i) { - tu_bo_finish(cs->device, cs->bos[i]); + tu_bo_finish(dev, cs->bos[i]); + free(cs->bos[i]); } if (cs->bo_count) { diff --git a/lib/mesa/src/freedreno/vulkan/tu_descriptor_set.c b/lib/mesa/src/freedreno/vulkan/tu_descriptor_set.c index 14d8b4b07..0f49d26e2 100644 --- a/lib/mesa/src/freedreno/vulkan/tu_descriptor_set.c +++ b/lib/mesa/src/freedreno/vulkan/tu_descriptor_set.c @@ -1,100 +1,67 @@ /* * Copyright © 2016 Red Hat. 
* Copyright © 2016 Bas Nieuwenhuizen - * SPDX-License-Identifier: MIT - */ - -/** - * @file * - * We use the bindless descriptor model, which maps fairly closely to how - * Vulkan descriptor sets work. The two exceptions are input attachments and - * dynamic descriptors, which have to be patched when recording command - * buffers. We reserve an extra descriptor set for these. This descriptor set - * contains all the input attachments in the pipeline, in order, and then all - * the dynamic descriptors. The dynamic descriptors are stored in the CPU-side - * datastructure for each tu_descriptor_set, and then combined into one big - * descriptor set at CmdBindDescriptors time/draw time. + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. */ +#include "tu_private.h" -#include "tu_descriptor_set.h" - +#include <assert.h> #include <fcntl.h> +#include <stdbool.h> +#include <string.h> +#include <unistd.h> #include "util/mesa-sha1.h" -#include "vk_descriptors.h" #include "vk_util.h" -#include "tu_device.h" -#include "tu_image.h" - -static inline uint8_t * -pool_base(struct tu_descriptor_pool *pool) +static int +binding_compare(const void *av, const void *bv) { - return pool->host_bo ?: pool->bo->map; -} + const VkDescriptorSetLayoutBinding *a = + (const VkDescriptorSetLayoutBinding *) av; + const VkDescriptorSetLayoutBinding *b = + (const VkDescriptorSetLayoutBinding *) bv; -static uint32_t -descriptor_size(struct tu_device *dev, - const VkDescriptorSetLayoutBinding *binding, - VkDescriptorType type) -{ - switch (type) { - case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: - if (unlikely(dev->instance->debug_flags & TU_DEBUG_DYNAMIC)) - return A6XX_TEX_CONST_DWORDS * 4; - - /* Input attachment doesn't use descriptor sets at all */ - return 0; - case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: - /* We make offsets and sizes all 16 dwords, to match how the hardware - * interprets indices passed to sample/load/store instructions in - * multiples of 16 dwords. This means that "normal" descriptors are all - * of size 16, with padding for smaller descriptors like uniform storage - * descriptors which are less than 16 dwords. However combined images - * and samplers are actually two descriptors, so they have size 2. - */ - return A6XX_TEX_CONST_DWORDS * 4 * 2; - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: - /* When we support 16-bit storage, we need an extra descriptor setup as - * a 32-bit array for isam to work. 
- */ - if (dev->physical_device->info->a6xx.storage_16bit) { - return A6XX_TEX_CONST_DWORDS * 4 * 2; - } else { - return A6XX_TEX_CONST_DWORDS * 4; - } - case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: - return A6XX_TEX_CONST_DWORDS * 4 + - ALIGN(binding->descriptorCount, A6XX_TEX_CONST_DWORDS * 4); - default: - return A6XX_TEX_CONST_DWORDS * 4; - } + return (a->binding < b->binding) ? -1 : (a->binding > b->binding) ? 1 : 0; } -static bool -is_dynamic(VkDescriptorType type) +static VkDescriptorSetLayoutBinding * +create_sorted_bindings(const VkDescriptorSetLayoutBinding *bindings, + unsigned count) { - return type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC || - type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC; -} + VkDescriptorSetLayoutBinding *sorted_bindings = + malloc(count * sizeof(VkDescriptorSetLayoutBinding)); + if (!sorted_bindings) + return NULL; -static uint32_t -mutable_descriptor_size(struct tu_device *dev, - const VkMutableDescriptorTypeListEXT *list) -{ - uint32_t max_size = 0; + memcpy(sorted_bindings, bindings, + count * sizeof(VkDescriptorSetLayoutBinding)); - for (uint32_t i = 0; i < list->descriptorTypeCount; i++) { - uint32_t size = descriptor_size(dev, NULL, list->pDescriptorTypes[i]); - max_size = MAX2(max_size, size); - } + qsort(sorted_bindings, count, sizeof(VkDescriptorSetLayoutBinding), + binding_compare); - return max_size; + return sorted_bindings; } -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_CreateDescriptorSetLayout( VkDevice _device, const VkDescriptorSetLayoutCreateInfo *pCreateInfo, @@ -106,191 +73,178 @@ tu_CreateDescriptorSetLayout( assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO); - const VkDescriptorSetLayoutBindingFlagsCreateInfo *variable_flags = - vk_find_struct_const( - pCreateInfo->pNext, - DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO); - const VkMutableDescriptorTypeCreateInfoEXT *mutable_info = + const VkDescriptorSetLayoutBindingFlagsCreateInfoEXT *variable_flags = vk_find_struct_const( pCreateInfo->pNext, - MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_EXT); + DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO_EXT); - uint32_t num_bindings = 0; + uint32_t max_binding = 0; uint32_t immutable_sampler_count = 0; - uint32_t ycbcr_sampler_count = 0; for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) { - num_bindings = MAX2(num_bindings, pCreateInfo->pBindings[j].binding + 1); - if ((pCreateInfo->pBindings[j].descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER || - pCreateInfo->pBindings[j].descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER) && - pCreateInfo->pBindings[j].pImmutableSamplers) { + max_binding = MAX2(max_binding, pCreateInfo->pBindings[j].binding); + if (pCreateInfo->pBindings[j].pImmutableSamplers) immutable_sampler_count += pCreateInfo->pBindings[j].descriptorCount; - - bool has_ycbcr_sampler = false; - for (unsigned i = 0; i < pCreateInfo->pBindings[j].descriptorCount; ++i) { - if (tu_sampler_from_handle(pCreateInfo->pBindings[j].pImmutableSamplers[i])->ycbcr_sampler) - has_ycbcr_sampler = true; - } - - if (has_ycbcr_sampler) - ycbcr_sampler_count += pCreateInfo->pBindings[j].descriptorCount; - } } uint32_t samplers_offset = - offsetof(struct tu_descriptor_set_layout, binding[num_bindings]); - - /* note: only need to store TEX_SAMP_DWORDS for immutable samples, - * but using struct tu_sampler makes things simpler */ - uint32_t size = samplers_offset + - immutable_sampler_count * sizeof(struct tu_sampler) + - ycbcr_sampler_count * sizeof(struct tu_sampler_ycbcr_conversion); + sizeof(struct 
tu_descriptor_set_layout) + + (max_binding + 1) * sizeof(set_layout->binding[0]); + size_t size = + samplers_offset + immutable_sampler_count * 4 * sizeof(uint32_t); - set_layout = vk_descriptor_set_layout_zalloc(&device->vk, size); + set_layout = vk_alloc2(&device->alloc, pAllocator, size, 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!set_layout) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); set_layout->flags = pCreateInfo->flags; - /* We just allocate all the immutable samplers at the end of the struct */ - struct tu_sampler *samplers = (void*) &set_layout->binding[num_bindings]; - struct tu_sampler_ycbcr_conversion *ycbcr_samplers = - (void*) &samplers[immutable_sampler_count]; - - VkDescriptorSetLayoutBinding *bindings = NULL; - VkResult result = vk_create_sorted_bindings( - pCreateInfo->pBindings, pCreateInfo->bindingCount, &bindings); - if (result != VK_SUCCESS) { - vk_object_free(&device->vk, pAllocator, set_layout); - return vk_error(device, result); + /* We just allocate all the samplers at the end of the struct */ + uint32_t *samplers = (uint32_t *) &set_layout->binding[max_binding + 1]; + (void) samplers; /* TODO: Use me */ + + VkDescriptorSetLayoutBinding *bindings = create_sorted_bindings( + pCreateInfo->pBindings, pCreateInfo->bindingCount); + if (!bindings) { + vk_free2(&device->alloc, pAllocator, set_layout); + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); } - set_layout->binding_count = num_bindings; + set_layout->binding_count = max_binding + 1; set_layout->shader_stages = 0; + set_layout->dynamic_shader_stages = 0; set_layout->has_immutable_samplers = false; - set_layout->has_inline_uniforms = false; set_layout->size = 0; - uint32_t dynamic_offset_size = 0; + memset(set_layout->binding, 0, + size - sizeof(struct tu_descriptor_set_layout)); + + uint32_t buffer_count = 0; + uint32_t dynamic_offset_count = 0; for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) { const VkDescriptorSetLayoutBinding *binding = bindings + j; uint32_t b = binding->binding; + uint32_t alignment; + unsigned binding_buffer_count = 0; - set_layout->binding[b].type = binding->descriptorType; - set_layout->binding[b].array_size = - binding->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK ? - 1 : binding->descriptorCount; - set_layout->binding[b].offset = set_layout->size; - set_layout->binding[b].dynamic_offset_offset = dynamic_offset_size; - set_layout->binding[b].shader_stages = binding->stageFlags; - - if (binding->descriptorType == VK_DESCRIPTOR_TYPE_MUTABLE_EXT) { - /* For mutable descriptor types we must allocate a size that fits the - * largest descriptor type that the binding can mutate to. 
- */ - set_layout->binding[b].size = - mutable_descriptor_size(device, &mutable_info->pMutableDescriptorTypeLists[j]); - } else { - set_layout->binding[b].size = - descriptor_size(device, binding, binding->descriptorType); + switch (binding->descriptorType) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + assert(!(pCreateInfo->flags & + VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR)); + set_layout->binding[b].dynamic_offset_count = 1; + set_layout->dynamic_shader_stages |= binding->stageFlags; + set_layout->binding[b].size = 0; + binding_buffer_count = 1; + alignment = 1; + break; + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + set_layout->binding[b].size = 16; + binding_buffer_count = 1; + alignment = 16; + break; + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + /* main descriptor + fmask descriptor */ + set_layout->binding[b].size = 64; + binding_buffer_count = 1; + alignment = 32; + break; + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + /* main descriptor + fmask descriptor + sampler */ + set_layout->binding[b].size = 96; + binding_buffer_count = 1; + alignment = 32; + break; + case VK_DESCRIPTOR_TYPE_SAMPLER: + set_layout->binding[b].size = 16; + alignment = 16; + break; + default: + unreachable("unknown descriptor type\n"); + break; } - if (binding->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) - set_layout->has_inline_uniforms = true; + set_layout->size = align(set_layout->size, alignment); + set_layout->binding[b].type = binding->descriptorType; + set_layout->binding[b].array_size = binding->descriptorCount; + set_layout->binding[b].offset = set_layout->size; + set_layout->binding[b].buffer_offset = buffer_count; + set_layout->binding[b].dynamic_offset_offset = dynamic_offset_count; if (variable_flags && binding->binding < variable_flags->bindingCount && (variable_flags->pBindingFlags[binding->binding] & - VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT)) { + VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT_EXT)) { assert(!binding->pImmutableSamplers); /* Terribly ill defined how many samplers are valid */ - assert(binding->binding == num_bindings - 1); + assert(binding->binding == max_binding); set_layout->has_variable_descriptors = true; } - if ((binding->descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER || - binding->descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER) && - binding->pImmutableSamplers) { + if (binding->pImmutableSamplers) { set_layout->binding[b].immutable_samplers_offset = samplers_offset; set_layout->has_immutable_samplers = true; - - for (uint32_t i = 0; i < binding->descriptorCount; i++) - samplers[i] = *tu_sampler_from_handle(binding->pImmutableSamplers[i]); - - samplers += binding->descriptorCount; - samplers_offset += sizeof(struct tu_sampler) * binding->descriptorCount; - - bool has_ycbcr_sampler = false; - for (unsigned i = 0; i < pCreateInfo->pBindings[j].descriptorCount; ++i) { - if (tu_sampler_from_handle(binding->pImmutableSamplers[i])->ycbcr_sampler) - has_ycbcr_sampler = true; - } - - if (has_ycbcr_sampler) { - set_layout->binding[b].ycbcr_samplers_offset = - (const char*)ycbcr_samplers - (const char*)set_layout; - for (uint32_t i = 0; i < binding->descriptorCount; i++) { - struct tu_sampler *sampler = 
tu_sampler_from_handle(binding->pImmutableSamplers[i]); - if (sampler->ycbcr_sampler) - ycbcr_samplers[i] = *sampler->ycbcr_sampler; - else - ycbcr_samplers[i].ycbcr_model = VK_SAMPLER_YCBCR_MODEL_CONVERSION_RGB_IDENTITY; - } - ycbcr_samplers += binding->descriptorCount; - } else { - set_layout->binding[b].ycbcr_samplers_offset = 0; - } - } - - uint32_t size = - ALIGN_POT(set_layout->binding[b].array_size * set_layout->binding[b].size, 4 * A6XX_TEX_CONST_DWORDS); - if (is_dynamic(binding->descriptorType)) { - dynamic_offset_size += size; - } else { - set_layout->size += size; } + set_layout->size += + binding->descriptorCount * set_layout->binding[b].size; + buffer_count += binding->descriptorCount * binding_buffer_count; + dynamic_offset_count += binding->descriptorCount * + set_layout->binding[b].dynamic_offset_count; set_layout->shader_stages |= binding->stageFlags; } free(bindings); - set_layout->dynamic_offset_size = dynamic_offset_size; + set_layout->buffer_count = buffer_count; + set_layout->dynamic_offset_count = dynamic_offset_count; *pSetLayout = tu_descriptor_set_layout_to_handle(set_layout); return VK_SUCCESS; } -VKAPI_ATTR void VKAPI_CALL +void +tu_DestroyDescriptorSetLayout(VkDevice _device, + VkDescriptorSetLayout _set_layout, + const VkAllocationCallbacks *pAllocator) +{ + TU_FROM_HANDLE(tu_device, device, _device); + TU_FROM_HANDLE(tu_descriptor_set_layout, set_layout, _set_layout); + + if (!set_layout) + return; + + vk_free2(&device->alloc, pAllocator, set_layout); +} + +void tu_GetDescriptorSetLayoutSupport( - VkDevice _device, + VkDevice device, const VkDescriptorSetLayoutCreateInfo *pCreateInfo, VkDescriptorSetLayoutSupport *pSupport) { - TU_FROM_HANDLE(tu_device, device, _device); - - VkDescriptorSetLayoutBinding *bindings = NULL; - VkResult result = vk_create_sorted_bindings( - pCreateInfo->pBindings, pCreateInfo->bindingCount, &bindings); - if (result != VK_SUCCESS) { + VkDescriptorSetLayoutBinding *bindings = create_sorted_bindings( + pCreateInfo->pBindings, pCreateInfo->bindingCount); + if (!bindings) { pSupport->supported = false; return; } - const VkDescriptorSetLayoutBindingFlagsCreateInfo *variable_flags = + const VkDescriptorSetLayoutBindingFlagsCreateInfoEXT *variable_flags = vk_find_struct_const( pCreateInfo->pNext, - DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO); - VkDescriptorSetVariableDescriptorCountLayoutSupport *variable_count = + DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO_EXT); + VkDescriptorSetVariableDescriptorCountLayoutSupportEXT *variable_count = vk_find_struct( (void *) pCreateInfo->pNext, - DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_LAYOUT_SUPPORT); - const VkMutableDescriptorTypeCreateInfoEXT *mutable_info = - vk_find_struct_const( - pCreateInfo->pNext, - MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_EXT); - + DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_LAYOUT_SUPPORT_EXT); if (variable_count) { variable_count->maxVariableDescriptorCount = 0; } @@ -300,157 +254,71 @@ tu_GetDescriptorSetLayoutSupport( for (uint32_t i = 0; i < pCreateInfo->bindingCount; i++) { const VkDescriptorSetLayoutBinding *binding = bindings + i; - uint64_t descriptor_sz; - - if (is_dynamic(binding->descriptorType)) { - descriptor_sz = 0; - } else if (binding->descriptorType == VK_DESCRIPTOR_TYPE_MUTABLE_EXT) { - const VkMutableDescriptorTypeListEXT *list = - &mutable_info->pMutableDescriptorTypeLists[i]; - - for (uint32_t j = 0; j < list->descriptorTypeCount; j++) { - /* Don't support the input attachement and combined image sampler type - * for mutable descriptors */ - if 
(list->pDescriptorTypes[j] == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT || - list->pDescriptorTypes[j] == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER || - list->pDescriptorTypes[j] == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { - supported = false; - goto out; - } - } - - descriptor_sz = - mutable_descriptor_size(device, &mutable_info->pMutableDescriptorTypeLists[i]); - } else { - descriptor_sz = descriptor_size(device, binding, binding->descriptorType); + uint64_t descriptor_size = 0; + uint64_t descriptor_alignment = 1; + switch (binding->descriptorType) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + break; + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + descriptor_size = 16; + descriptor_alignment = 16; + break; + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + descriptor_size = 64; + descriptor_alignment = 32; + break; + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + descriptor_size = 96; + descriptor_alignment = 32; + break; + case VK_DESCRIPTOR_TYPE_SAMPLER: + descriptor_size = 16; + descriptor_alignment = 16; + break; + default: + unreachable("unknown descriptor type\n"); + break; } - uint64_t descriptor_alignment = 4 * A6XX_TEX_CONST_DWORDS; - if (size && !ALIGN_POT(size, descriptor_alignment)) { + if (size && !align_u64(size, descriptor_alignment)) { supported = false; } - size = ALIGN_POT(size, descriptor_alignment); - - uint64_t max_count = MAX_SET_SIZE; - unsigned descriptor_count = binding->descriptorCount; - if (binding->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { - max_count = MAX_SET_SIZE - size; - descriptor_count = descriptor_sz; - descriptor_sz = 1; - } else if (descriptor_sz) { - max_count = (MAX_SET_SIZE - size) / descriptor_sz; - } + size = align_u64(size, descriptor_alignment); - if (max_count < descriptor_count) { + uint64_t max_count = UINT64_MAX; + if (descriptor_size) + max_count = (UINT64_MAX - size) / descriptor_size; + + if (max_count < binding->descriptorCount) { supported = false; } - if (variable_flags && binding->binding < variable_flags->bindingCount && variable_count && (variable_flags->pBindingFlags[binding->binding] & - VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT)) { + VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT_EXT)) { variable_count->maxVariableDescriptorCount = MIN2(UINT32_MAX, max_count); } - size += descriptor_count * descriptor_sz; + size += binding->descriptorCount * descriptor_size; } -out: free(bindings); pSupport->supported = supported; } -/* Note: we must hash any values used in tu_lower_io(). 
*/ - -#define SHA1_UPDATE_VALUE(ctx, x) _mesa_sha1_update(ctx, &(x), sizeof(x)); - -static void -sha1_update_ycbcr_sampler(struct mesa_sha1 *ctx, - const struct tu_sampler_ycbcr_conversion *sampler) -{ - SHA1_UPDATE_VALUE(ctx, sampler->ycbcr_model); - SHA1_UPDATE_VALUE(ctx, sampler->ycbcr_range); - SHA1_UPDATE_VALUE(ctx, sampler->format); -} - -static void -sha1_update_descriptor_set_binding_layout(struct mesa_sha1 *ctx, - const struct tu_descriptor_set_binding_layout *layout, - const struct tu_descriptor_set_layout *set_layout) -{ - SHA1_UPDATE_VALUE(ctx, layout->type); - SHA1_UPDATE_VALUE(ctx, layout->offset); - SHA1_UPDATE_VALUE(ctx, layout->size); - SHA1_UPDATE_VALUE(ctx, layout->array_size); - SHA1_UPDATE_VALUE(ctx, layout->dynamic_offset_offset); - SHA1_UPDATE_VALUE(ctx, layout->immutable_samplers_offset); - - const struct tu_sampler_ycbcr_conversion *ycbcr_samplers = - tu_immutable_ycbcr_samplers(set_layout, layout); - - if (ycbcr_samplers) { - for (unsigned i = 0; i < layout->array_size; i++) - sha1_update_ycbcr_sampler(ctx, ycbcr_samplers + i); - } -} - - -static void -sha1_update_descriptor_set_layout(struct mesa_sha1 *ctx, - const struct tu_descriptor_set_layout *layout) -{ - for (uint16_t i = 0; i < layout->binding_count; i++) - sha1_update_descriptor_set_binding_layout(ctx, &layout->binding[i], - layout); -} - /* * Pipeline layouts. These have nothing to do with the pipeline. They are * just multiple descriptor set layouts pasted together. */ -void -tu_pipeline_layout_init(struct tu_pipeline_layout *layout) -{ - unsigned dynamic_offset_size = 0; - - for (uint32_t set = 0; set < layout->num_sets; set++) { - assert(set < MAX_SETS); - layout->set[set].dynamic_offset_start = dynamic_offset_size; - - if (layout->set[set].layout) - dynamic_offset_size += layout->set[set].layout->dynamic_offset_size; - } - - layout->dynamic_offset_size = dynamic_offset_size; - - /* We only care about INDEPENDENT_SETS for dynamic-offset descriptors, - * where all the descriptors from all the sets are combined into one set - * and we have to provide the dynamic_offset_start dynamically with fast - * linking. 
- */ - if (dynamic_offset_size == 0) { - layout->independent_sets = false; - } - - struct mesa_sha1 ctx; - _mesa_sha1_init(&ctx); - for (unsigned s = 0; s < layout->num_sets; s++) { - if (layout->set[s].layout) - sha1_update_descriptor_set_layout(&ctx, layout->set[s].layout); - _mesa_sha1_update(&ctx, &layout->set[s].dynamic_offset_start, - sizeof(layout->set[s].dynamic_offset_start)); - } - _mesa_sha1_update(&ctx, &layout->num_sets, sizeof(layout->num_sets)); - _mesa_sha1_update(&ctx, &layout->push_constant_size, - sizeof(layout->push_constant_size)); - _mesa_sha1_update(&ctx, &layout->independent_sets, - sizeof(layout->independent_sets)); - _mesa_sha1_final(&ctx, layout->sha1); -} - -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_CreatePipelineLayout(VkDevice _device, const VkPipelineLayoutCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, @@ -458,26 +326,42 @@ tu_CreatePipelineLayout(VkDevice _device, { TU_FROM_HANDLE(tu_device, device, _device); struct tu_pipeline_layout *layout; + struct mesa_sha1 ctx; assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO); - layout = vk_object_alloc(&device->vk, pAllocator, sizeof(*layout), - VK_OBJECT_TYPE_PIPELINE_LAYOUT); + layout = vk_alloc2(&device->alloc, pAllocator, sizeof(*layout), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (layout == NULL) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); layout->num_sets = pCreateInfo->setLayoutCount; + + unsigned dynamic_offset_count = 0; + + _mesa_sha1_init(&ctx); for (uint32_t set = 0; set < pCreateInfo->setLayoutCount; set++) { TU_FROM_HANDLE(tu_descriptor_set_layout, set_layout, pCreateInfo->pSetLayouts[set]); - - assert(set < MAX_SETS); layout->set[set].layout = set_layout; - if (set_layout) - vk_descriptor_set_layout_ref(&set_layout->vk); + + layout->set[set].dynamic_offset_start = dynamic_offset_count; + for (uint32_t b = 0; b < set_layout->binding_count; b++) { + dynamic_offset_count += set_layout->binding[b].array_size * + set_layout->binding[b].dynamic_offset_count; + if (set_layout->binding[b].immutable_samplers_offset) + _mesa_sha1_update( + &ctx, + tu_immutable_samplers(set_layout, set_layout->binding + b), + set_layout->binding[b].array_size * 4 * sizeof(uint32_t)); + } + _mesa_sha1_update( + &ctx, set_layout->binding, + sizeof(set_layout->binding[0]) * set_layout->binding_count); } + layout->dynamic_offset_count = dynamic_offset_count; layout->push_constant_size = 0; for (unsigned i = 0; i < pCreateInfo->pushConstantRangeCount; ++i) { @@ -487,17 +371,15 @@ tu_CreatePipelineLayout(VkDevice _device, } layout->push_constant_size = align(layout->push_constant_size, 16); - layout->independent_sets = - pCreateInfo->flags & VK_PIPELINE_LAYOUT_CREATE_INDEPENDENT_SETS_BIT_EXT; - - tu_pipeline_layout_init(layout); - + _mesa_sha1_update(&ctx, &layout->push_constant_size, + sizeof(layout->push_constant_size)); + _mesa_sha1_final(&ctx, layout->sha1); *pPipelineLayout = tu_pipeline_layout_to_handle(layout); return VK_SUCCESS; } -VKAPI_ATTR void VKAPI_CALL +void tu_DestroyPipelineLayout(VkDevice _device, VkPipelineLayout _pipelineLayout, const VkAllocationCallbacks *pAllocator) @@ -507,329 +389,31 @@ tu_DestroyPipelineLayout(VkDevice _device, if (!pipeline_layout) return; - - for (uint32_t i = 0; i < pipeline_layout->num_sets; i++) { - if (pipeline_layout->set[i].layout) - vk_descriptor_set_layout_unref(&device->vk, &pipeline_layout->set[i].layout->vk); - } - - vk_object_free(&device->vk, pAllocator, 
pipeline_layout); + vk_free2(&device->alloc, pAllocator, pipeline_layout); } #define EMPTY 1 -static VkResult -tu_descriptor_set_create(struct tu_device *device, - struct tu_descriptor_pool *pool, - struct tu_descriptor_set_layout *layout, - uint32_t variable_count, - struct tu_descriptor_set **out_set) -{ - struct tu_descriptor_set *set; - unsigned dynamic_offset = sizeof(struct tu_descriptor_set); - unsigned mem_size = dynamic_offset + layout->dynamic_offset_size; - - if (pool->host_memory_base) { - if (pool->host_memory_end - pool->host_memory_ptr < mem_size) - return vk_error(device, VK_ERROR_OUT_OF_POOL_MEMORY); - - set = (struct tu_descriptor_set*)pool->host_memory_ptr; - pool->host_memory_ptr += mem_size; - } else { - set = vk_alloc2(&device->vk.alloc, NULL, mem_size, 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - - if (!set) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - } - - memset(set, 0, mem_size); - vk_object_base_init(&device->vk, &set->base, VK_OBJECT_TYPE_DESCRIPTOR_SET); - - if (layout->dynamic_offset_size) { - set->dynamic_descriptors = (uint32_t *)((uint8_t*)set + dynamic_offset); - } - - set->layout = layout; - set->pool = pool; - uint32_t layout_size = layout->size; - if (layout->has_variable_descriptors) { - struct tu_descriptor_set_binding_layout *binding = - &layout->binding[layout->binding_count - 1]; - if (binding->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { - layout_size = binding->offset + A6XX_TEX_CONST_DWORDS * 4 + - ALIGN(variable_count, A6XX_TEX_CONST_DWORDS * 4); - } else { - uint32_t stride = binding->size; - layout_size = binding->offset + variable_count * stride; - } - } - - if (layout_size) { - set->size = layout_size; - - if (!pool->host_memory_base && pool->entry_count == pool->max_entry_count) { - vk_object_free(&device->vk, NULL, set); - return vk_error(device, VK_ERROR_OUT_OF_POOL_MEMORY); - } - - /* try to allocate linearly first, so that we don't spend - * time looking for gaps if the app only allocates & - * resets via the pool. */ - if (pool->current_offset + layout_size <= pool->size) { - set->mapped_ptr = (uint32_t*)(pool_base(pool) + pool->current_offset); - set->va = pool->host_bo ? 0 : pool->bo->iova + pool->current_offset; - - if (!pool->host_memory_base) { - pool->entries[pool->entry_count].offset = pool->current_offset; - pool->entries[pool->entry_count].size = layout_size; - pool->entries[pool->entry_count].set = set; - pool->entry_count++; - } - pool->current_offset += layout_size; - } else if (!pool->host_memory_base) { - uint64_t offset = 0; - int index; - - for (index = 0; index < pool->entry_count; ++index) { - if (pool->entries[index].offset - offset >= layout_size) - break; - offset = pool->entries[index].offset + pool->entries[index].size; - } - - if (pool->size - offset < layout_size) { - vk_object_free(&device->vk, NULL, set); - return vk_error(device, VK_ERROR_OUT_OF_POOL_MEMORY); - } - - set->mapped_ptr = (uint32_t*)(pool_base(pool) + offset); - set->va = pool->host_bo ? 
0 : pool->bo->iova + offset; - - memmove(&pool->entries[index + 1], &pool->entries[index], - sizeof(pool->entries[0]) * (pool->entry_count - index)); - pool->entries[index].offset = offset; - pool->entries[index].size = layout_size; - pool->entries[index].set = set; - pool->entry_count++; - } else - return vk_error(device, VK_ERROR_OUT_OF_POOL_MEMORY); - } - - if (layout->has_immutable_samplers) { - for (unsigned i = 0; i < layout->binding_count; ++i) { - if (!layout->binding[i].immutable_samplers_offset) - continue; - - unsigned offset = layout->binding[i].offset / 4; - if (layout->binding[i].type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) - offset += A6XX_TEX_CONST_DWORDS; - - const struct tu_sampler *samplers = - (const struct tu_sampler *)((const char *)layout + - layout->binding[i].immutable_samplers_offset); - for (unsigned j = 0; j < layout->binding[i].array_size; ++j) { - memcpy(set->mapped_ptr + offset, samplers[j].descriptor, - sizeof(samplers[j].descriptor)); - offset += layout->binding[i].size / 4; - } - } - } - - if (layout->has_inline_uniforms) { - for (unsigned i = 0; i < layout->binding_count; i++) { - if (layout->binding[i].type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) - continue; - - uint32_t *ptr = set->mapped_ptr + layout->binding[i].offset / 4; - uint64_t va = set->va + layout->binding[i].offset + - A6XX_TEX_CONST_DWORDS * 4; - uint32_t size = - (layout->has_variable_descriptors && i == layout->binding_count - 1) ? - variable_count : layout->binding[i].size - A6XX_TEX_CONST_DWORDS * 4; - size = ALIGN_POT(size, 16) / 16; - - ptr[0] = A6XX_UBO_0_BASE_LO(va); - ptr[1] = A6XX_UBO_1_BASE_HI(va >> 32) | A6XX_UBO_1_SIZE(size); - } - } - - vk_descriptor_set_layout_ref(&layout->vk); - list_addtail(&set->pool_link, &pool->desc_sets); - - *out_set = set; - return VK_SUCCESS; -} - -static void -tu_descriptor_set_destroy(struct tu_device *device, - struct tu_descriptor_pool *pool, - struct tu_descriptor_set *set, - bool free_bo) -{ - assert(!pool->host_memory_base); - - if (free_bo && set->size && !pool->host_memory_base) { - uint32_t offset = (uint8_t*)set->mapped_ptr - pool_base(pool); - - for (int i = 0; i < pool->entry_count; ++i) { - if (pool->entries[i].offset == offset) { - memmove(&pool->entries[i], &pool->entries[i+1], - sizeof(pool->entries[i]) * (pool->entry_count - i - 1)); - --pool->entry_count; - break; - } - } - } - - vk_object_free(&device->vk, NULL, set); -} - -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_CreateDescriptorPool(VkDevice _device, const VkDescriptorPoolCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkDescriptorPool *pDescriptorPool) { TU_FROM_HANDLE(tu_device, device, _device); - struct tu_descriptor_pool *pool; - uint64_t size = sizeof(struct tu_descriptor_pool); - uint64_t bo_size = 0, dynamic_size = 0; - VkResult ret; - - const VkMutableDescriptorTypeCreateInfoEXT *mutable_info = - vk_find_struct_const( pCreateInfo->pNext, - MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_EXT); - - const VkDescriptorPoolInlineUniformBlockCreateInfo *inline_info = - vk_find_struct_const(pCreateInfo->pNext, - DESCRIPTOR_POOL_INLINE_UNIFORM_BLOCK_CREATE_INFO); - - if (inline_info) { - /* In addition to the size of the descriptors, we have to factor in the - * padding for each binding. The sizes are 4 aligned but we have to - * align to a descriptor size, and in the worst case each inline - * binding has a size of 4 bytes and we have to pad each one out. 
- */ - bo_size += (2 * 4 * A6XX_TEX_CONST_DWORDS - 4) * - inline_info->maxInlineUniformBlockBindings; - } - - for (unsigned i = 0; i < pCreateInfo->poolSizeCount; ++i) { - const VkDescriptorPoolSize *pool_size = &pCreateInfo->pPoolSizes[i]; - - switch (pool_size->type) { - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: - dynamic_size += descriptor_size(device, NULL, pool_size->type) * - pool_size->descriptorCount; - break; - case VK_DESCRIPTOR_TYPE_MUTABLE_EXT: - if (mutable_info && i < mutable_info->mutableDescriptorTypeListCount && - mutable_info->pMutableDescriptorTypeLists[i].descriptorTypeCount > 0) { - bo_size += - mutable_descriptor_size(device, &mutable_info->pMutableDescriptorTypeLists[i]) * - pool_size->descriptorCount; - } else { - /* Allocate the maximum size possible. */ - bo_size += 2 * A6XX_TEX_CONST_DWORDS * 4 * - pool_size->descriptorCount; - } - break; - case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: - bo_size += pool_size->descriptorCount; - break; - default: - bo_size += descriptor_size(device, NULL, pool_size->type) * - pool_size->descriptorCount; - break; - } - } - - if (!(pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT)) { - uint64_t host_size = pCreateInfo->maxSets * sizeof(struct tu_descriptor_set); - host_size += dynamic_size; - size += host_size; - } else { - size += sizeof(struct tu_descriptor_pool_entry) * pCreateInfo->maxSets; - } - - pool = vk_object_zalloc(&device->vk, pAllocator, size, - VK_OBJECT_TYPE_DESCRIPTOR_POOL); - if (!pool) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - - if (!(pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT)) { - pool->host_memory_base = (uint8_t*)pool + sizeof(struct tu_descriptor_pool); - pool->host_memory_ptr = pool->host_memory_base; - pool->host_memory_end = (uint8_t*)pool + size; - } - - if (bo_size) { - if (!(pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_HOST_ONLY_BIT_EXT)) { - ret = tu_bo_init_new(device, &pool->bo, bo_size, TU_BO_ALLOC_ALLOW_DUMP, "descriptor pool"); - if (ret) - goto fail_alloc; - - ret = tu_bo_map(device, pool->bo); - if (ret) - goto fail_map; - } else { - pool->host_bo = vk_alloc2(&device->vk.alloc, pAllocator, bo_size, 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - if (!pool->host_bo) { - ret = VK_ERROR_OUT_OF_HOST_MEMORY; - goto fail_alloc; - } - } - } - pool->size = bo_size; - pool->max_entry_count = pCreateInfo->maxSets; - - list_inithead(&pool->desc_sets); - - *pDescriptorPool = tu_descriptor_pool_to_handle(pool); + tu_use_args(device); + tu_stub(); return VK_SUCCESS; - -fail_map: - tu_bo_finish(device, pool->bo); -fail_alloc: - vk_object_free(&device->vk, pAllocator, pool); - return ret; } -VKAPI_ATTR void VKAPI_CALL +void tu_DestroyDescriptorPool(VkDevice _device, VkDescriptorPool _pool, const VkAllocationCallbacks *pAllocator) { - TU_FROM_HANDLE(tu_device, device, _device); - TU_FROM_HANDLE(tu_descriptor_pool, pool, _pool); - - if (!pool) - return; - - list_for_each_entry_safe(struct tu_descriptor_set, set, - &pool->desc_sets, pool_link) { - vk_descriptor_set_layout_unref(&device->vk, &set->layout->vk); - } - - if (!pool->host_memory_base) { - for(int i = 0; i < pool->entry_count; ++i) { - tu_descriptor_set_destroy(device, pool, pool->entries[i].set, false); - } - } - - if (pool->size) { - if (pool->host_bo) - vk_free2(&device->vk.alloc, pAllocator, pool->host_bo); - else - tu_bo_finish(device, pool->bo); - } - - vk_object_free(&device->vk, pAllocator, pool); } -VKAPI_ATTR VkResult VKAPI_CALL 
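The removed tu_CreateDescriptorPool() above sizes one backing BO for the whole pool by summing a fixed per-descriptor footprint for each (type, count) pair: dynamic uniform/storage buffers are skipped because they live in a CPU-side array per set, combined image+sampler and worst-case mutable bindings reserve two descriptors, and inline uniform blocks add their byte size plus per-binding padding. A simplified sketch of that accounting is below; pool_bo_size(), struct pool_size_info and its flags are hypothetical, only the 16-dword (A6XX_TEX_CONST_DWORDS) unit and the two-descriptor doubling are taken from the code above, and the inline-uniform padding is left out.

#include <stdbool.h>
#include <stdint.h>

#define A6XX_TEX_CONST_DWORDS 16

/* Hypothetical, simplified version of the bo_size accounting in the removed
 * tu_CreateDescriptorPool(): every descriptor occupies 16 dwords in the pool
 * BO, bindings that really hold two descriptors occupy twice that, and
 * dynamic buffer descriptors are not counted because they are stored on the
 * CPU side. */
struct pool_size_info {
   bool is_dynamic;          /* *_BUFFER_DYNAMIC types                   */
   bool is_two_descriptors;  /* combined image+sampler, mutable fallback */
   uint32_t descriptor_count;
};

static uint64_t
pool_bo_size(const struct pool_size_info *sizes, unsigned count)
{
   uint64_t bo_size = 0;

   for (unsigned i = 0; i < count; i++) {
      if (sizes[i].is_dynamic)
         continue;

      uint64_t unit = A6XX_TEX_CONST_DWORDS * 4u *
                      (sizes[i].is_two_descriptors ? 2 : 1);
      bo_size += unit * sizes[i].descriptor_count;
   }

   return bo_size;
}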
+VkResult tu_ResetDescriptorPool(VkDevice _device, VkDescriptorPool descriptorPool, VkDescriptorPoolResetFlags flags) @@ -837,26 +421,12 @@ tu_ResetDescriptorPool(VkDevice _device, TU_FROM_HANDLE(tu_device, device, _device); TU_FROM_HANDLE(tu_descriptor_pool, pool, descriptorPool); - list_for_each_entry_safe(struct tu_descriptor_set, set, - &pool->desc_sets, pool_link) { - vk_descriptor_set_layout_unref(&device->vk, &set->layout->vk); - } - list_inithead(&pool->desc_sets); - - if (!pool->host_memory_base) { - for(int i = 0; i < pool->entry_count; ++i) { - tu_descriptor_set_destroy(device, pool, pool->entries[i].set, false); - } - pool->entry_count = 0; - } - - pool->current_offset = 0; - pool->host_memory_ptr = pool->host_memory_base; - + tu_use_args(device, pool); + tu_stub(); return VK_SUCCESS; } -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_AllocateDescriptorSets(VkDevice _device, const VkDescriptorSetAllocateInfo *pAllocateInfo, VkDescriptorSet *pDescriptorSets) @@ -864,42 +434,12 @@ tu_AllocateDescriptorSets(VkDevice _device, TU_FROM_HANDLE(tu_device, device, _device); TU_FROM_HANDLE(tu_descriptor_pool, pool, pAllocateInfo->descriptorPool); - VkResult result = VK_SUCCESS; - uint32_t i; - struct tu_descriptor_set *set = NULL; - - const VkDescriptorSetVariableDescriptorCountAllocateInfo *variable_counts = - vk_find_struct_const(pAllocateInfo->pNext, DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_ALLOCATE_INFO); - if (variable_counts && !variable_counts->descriptorSetCount) - variable_counts = NULL; - - /* allocate a set of buffers for each shader to contain descriptors */ - for (i = 0; i < pAllocateInfo->descriptorSetCount; i++) { - TU_FROM_HANDLE(tu_descriptor_set_layout, layout, - pAllocateInfo->pSetLayouts[i]); - - assert(!(layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR)); - - result = tu_descriptor_set_create( - device, pool, layout, - variable_counts ? 
variable_counts->pDescriptorCounts[i] : 0, &set); - if (result != VK_SUCCESS) - break; - - pDescriptorSets[i] = tu_descriptor_set_to_handle(set); - } - - if (result != VK_SUCCESS) { - tu_FreeDescriptorSets(_device, pAllocateInfo->descriptorPool, - i, pDescriptorSets); - for (i = 0; i < pAllocateInfo->descriptorSetCount; i++) { - pDescriptorSets[i] = VK_NULL_HANDLE; - } - } - return result; + tu_use_args(device, pool); + tu_stub(); + return VK_SUCCESS; } -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_FreeDescriptorSets(VkDevice _device, VkDescriptorPool descriptorPool, uint32_t count, @@ -908,338 +448,23 @@ tu_FreeDescriptorSets(VkDevice _device, TU_FROM_HANDLE(tu_device, device, _device); TU_FROM_HANDLE(tu_descriptor_pool, pool, descriptorPool); - for (uint32_t i = 0; i < count; i++) { - TU_FROM_HANDLE(tu_descriptor_set, set, pDescriptorSets[i]); - - if (set) { - vk_descriptor_set_layout_unref(&device->vk, &set->layout->vk); - list_del(&set->pool_link); - } - - if (set && !pool->host_memory_base) - tu_descriptor_set_destroy(device, pool, set, true); - } + tu_use_args(device, pool); + tu_stub(); return VK_SUCCESS; } -static void -write_texel_buffer_descriptor(uint32_t *dst, const VkBufferView buffer_view) -{ - if (buffer_view == VK_NULL_HANDLE) { - memset(dst, 0, A6XX_TEX_CONST_DWORDS * sizeof(uint32_t)); - } else { - TU_FROM_HANDLE(tu_buffer_view, view, buffer_view); - - memcpy(dst, view->descriptor, sizeof(view->descriptor)); - } -} - -static void -write_buffer_descriptor(const struct tu_device *device, - uint32_t *dst, - const VkDescriptorBufferInfo *buffer_info) -{ - bool storage_16bit = device->physical_device->info->a6xx.storage_16bit; - /* newer a6xx allows using 16-bit descriptor for both 16-bit and 32-bit - * access, but we need to keep a 32-bit descriptor for readonly access via - * isam. - */ - unsigned descriptors = storage_16bit ? 
2 : 1; - if (buffer_info->buffer == VK_NULL_HANDLE) { - memset(dst, 0, descriptors * A6XX_TEX_CONST_DWORDS * sizeof(uint32_t)); - return; - } - - TU_FROM_HANDLE(tu_buffer, buffer, buffer_info->buffer); - - assert((buffer_info->offset & 63) == 0); /* minStorageBufferOffsetAlignment */ - uint64_t va = buffer->iova + buffer_info->offset; - uint32_t range = vk_buffer_range(&buffer->vk, buffer_info->offset, buffer_info->range); - - for (unsigned i = 0; i < descriptors; i++) { - if (storage_16bit && i == 0) { - dst[0] = A6XX_TEX_CONST_0_TILE_MODE(TILE6_LINEAR) | A6XX_TEX_CONST_0_FMT(FMT6_16_UINT); - dst[1] = DIV_ROUND_UP(range, 2); - } else { - dst[0] = A6XX_TEX_CONST_0_TILE_MODE(TILE6_LINEAR) | A6XX_TEX_CONST_0_FMT(FMT6_32_UINT); - dst[1] = DIV_ROUND_UP(range, 4); - } - dst[2] = - A6XX_TEX_CONST_2_BUFFER | A6XX_TEX_CONST_2_TYPE(A6XX_TEX_BUFFER); - dst[3] = 0; - dst[4] = A6XX_TEX_CONST_4_BASE_LO(va); - dst[5] = A6XX_TEX_CONST_5_BASE_HI(va >> 32); - for (int j = 6; j < A6XX_TEX_CONST_DWORDS; j++) - dst[j] = 0; - dst += A6XX_TEX_CONST_DWORDS; - } -} - -static void -write_ubo_descriptor(uint32_t *dst, const VkDescriptorBufferInfo *buffer_info) -{ - if (buffer_info->buffer == VK_NULL_HANDLE) { - dst[0] = dst[1] = 0; - return; - } - - TU_FROM_HANDLE(tu_buffer, buffer, buffer_info->buffer); - - uint32_t range = vk_buffer_range(&buffer->vk, buffer_info->offset, buffer_info->range); - /* The HW range is in vec4 units */ - range = ALIGN_POT(range, 16) / 16; - uint64_t va = buffer->iova + buffer_info->offset; - - dst[0] = A6XX_UBO_0_BASE_LO(va); - dst[1] = A6XX_UBO_1_BASE_HI(va >> 32) | A6XX_UBO_1_SIZE(range); -} - -static void -write_image_descriptor(uint32_t *dst, - VkDescriptorType descriptor_type, - const VkDescriptorImageInfo *image_info) -{ - if (image_info->imageView == VK_NULL_HANDLE) { - memset(dst, 0, A6XX_TEX_CONST_DWORDS * sizeof(uint32_t)); - return; - } - - TU_FROM_HANDLE(tu_image_view, iview, image_info->imageView); - - if (descriptor_type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE) { - memcpy(dst, iview->view.storage_descriptor, sizeof(iview->view.storage_descriptor)); - } else { - memcpy(dst, iview->view.descriptor, sizeof(iview->view.descriptor)); - } -} - -static void -write_combined_image_sampler_descriptor(uint32_t *dst, - VkDescriptorType descriptor_type, - const VkDescriptorImageInfo *image_info, - bool has_sampler) -{ - write_image_descriptor(dst, descriptor_type, image_info); - /* copy over sampler state */ - if (has_sampler) { - TU_FROM_HANDLE(tu_sampler, sampler, image_info->sampler); - memcpy(dst + A6XX_TEX_CONST_DWORDS, sampler->descriptor, sizeof(sampler->descriptor)); - } -} - -static void -write_sampler_descriptor(uint32_t *dst, const VkDescriptorImageInfo *image_info) -{ - TU_FROM_HANDLE(tu_sampler, sampler, image_info->sampler); - - memcpy(dst, sampler->descriptor, sizeof(sampler->descriptor)); -} - -/* note: this is used with immutable samplers in push descriptors */ -static void -write_sampler_push(uint32_t *dst, const struct tu_sampler *sampler) -{ - memcpy(dst, sampler->descriptor, sizeof(sampler->descriptor)); -} - void -tu_update_descriptor_sets(const struct tu_device *device, +tu_update_descriptor_sets(struct tu_device *device, + struct tu_cmd_buffer *cmd_buffer, VkDescriptorSet dstSetOverride, uint32_t descriptorWriteCount, const VkWriteDescriptorSet *pDescriptorWrites, uint32_t descriptorCopyCount, const VkCopyDescriptorSet *pDescriptorCopies) { - uint32_t i, j; - for (i = 0; i < descriptorWriteCount; i++) { - const VkWriteDescriptorSet *writeset = 
&pDescriptorWrites[i]; - TU_FROM_HANDLE(tu_descriptor_set, set, dstSetOverride ?: writeset->dstSet); - const struct tu_descriptor_set_binding_layout *binding_layout = - set->layout->binding + writeset->dstBinding; - uint32_t *ptr = set->mapped_ptr; - if (writeset->descriptorType == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC || - writeset->descriptorType == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) { - ptr = set->dynamic_descriptors; - ptr += binding_layout->dynamic_offset_offset / 4; - } else { - ptr = set->mapped_ptr; - ptr += binding_layout->offset / 4; - } - - /* for immutable samplers with push descriptors: */ - const bool copy_immutable_samplers = - dstSetOverride && binding_layout->immutable_samplers_offset; - const struct tu_sampler *samplers = - tu_immutable_samplers(set->layout, binding_layout); - - if (writeset->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { - /* We need to respect this note: - * - * The same behavior applies to bindings with a descriptor type of - * VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK where descriptorCount - * specifies the number of bytes to update while dstArrayElement - * specifies the starting byte offset, thus in this case if the - * dstBinding has a smaller byte size than the sum of - * dstArrayElement and descriptorCount, then the remainder will be - * used to update the subsequent binding - dstBinding+1 starting - * at offset zero. This falls out as a special case of the above - * rule. - * - * This means we can't just do a straight memcpy, because due to - * alignment padding and the descriptor itself there are gaps between - * sequential bindings. We have to loop over each binding updated. - */ - const VkWriteDescriptorSetInlineUniformBlock *inline_write = - vk_find_struct_const(writeset->pNext, - WRITE_DESCRIPTOR_SET_INLINE_UNIFORM_BLOCK); - uint32_t remaining = inline_write->dataSize; - const uint8_t *src = inline_write->pData; - uint32_t dst_offset = writeset->dstArrayElement; - do { - uint8_t *dst = (uint8_t *)(ptr + A6XX_TEX_CONST_DWORDS) + dst_offset; - uint32_t binding_size = - binding_layout->size - A6XX_TEX_CONST_DWORDS * 4 - dst_offset; - uint32_t to_write = MIN2(remaining, binding_size); - memcpy(dst, src, to_write); - - binding_layout++; - ptr = set->mapped_ptr + binding_layout->offset / 4; - dst_offset = 0; - src += to_write; - remaining -= to_write; - } while (remaining > 0); - - continue; - } - - ptr += binding_layout->size / 4 * writeset->dstArrayElement; - for (j = 0; j < writeset->descriptorCount; ++j) { - switch(writeset->descriptorType) { - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: - write_ubo_descriptor(ptr, writeset->pBufferInfo + j); - break; - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: - write_buffer_descriptor(device, ptr, writeset->pBufferInfo + j); - break; - case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: - case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - write_texel_buffer_descriptor(ptr, writeset->pTexelBufferView[j]); - break; - case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: - case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: - write_image_descriptor(ptr, writeset->descriptorType, writeset->pImageInfo + j); - break; - case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: - write_combined_image_sampler_descriptor(ptr, - writeset->descriptorType, - writeset->pImageInfo + j, - !binding_layout->immutable_samplers_offset); - - if (copy_immutable_samplers) - write_sampler_push(ptr + A6XX_TEX_CONST_DWORDS, &samplers[writeset->dstArrayElement + 
j]); - break; - case VK_DESCRIPTOR_TYPE_SAMPLER: - if (!binding_layout->immutable_samplers_offset) - write_sampler_descriptor(ptr, writeset->pImageInfo + j); - else if (copy_immutable_samplers) - write_sampler_push(ptr, &samplers[writeset->dstArrayElement + j]); - break; - case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: - /* nothing in descriptor set - framebuffer state is used instead */ - if (unlikely(device->instance->debug_flags & TU_DEBUG_DYNAMIC)) - write_image_descriptor(ptr, writeset->descriptorType, writeset->pImageInfo + j); - break; - default: - unreachable("unimplemented descriptor type"); - break; - } - ptr += binding_layout->size / 4; - } - } - - for (i = 0; i < descriptorCopyCount; i++) { - const VkCopyDescriptorSet *copyset = &pDescriptorCopies[i]; - TU_FROM_HANDLE(tu_descriptor_set, src_set, - copyset->srcSet); - TU_FROM_HANDLE(tu_descriptor_set, dst_set, - copyset->dstSet); - const struct tu_descriptor_set_binding_layout *src_binding_layout = - src_set->layout->binding + copyset->srcBinding; - const struct tu_descriptor_set_binding_layout *dst_binding_layout = - dst_set->layout->binding + copyset->dstBinding; - uint32_t *src_ptr = src_set->mapped_ptr; - uint32_t *dst_ptr = dst_set->mapped_ptr; - if (src_binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC || - src_binding_layout->type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) { - src_ptr = src_set->dynamic_descriptors; - dst_ptr = dst_set->dynamic_descriptors; - src_ptr += src_binding_layout->dynamic_offset_offset / 4; - dst_ptr += dst_binding_layout->dynamic_offset_offset / 4; - } else { - src_ptr = src_set->mapped_ptr; - dst_ptr = dst_set->mapped_ptr; - src_ptr += src_binding_layout->offset / 4; - dst_ptr += dst_binding_layout->offset / 4; - } - - if (src_binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { - uint32_t remaining = copyset->descriptorCount; - uint32_t src_start = copyset->srcArrayElement; - uint32_t dst_start = copyset->dstArrayElement; - uint8_t *src = (uint8_t *)(src_ptr + A6XX_TEX_CONST_DWORDS) + src_start; - uint8_t *dst = (uint8_t *)(dst_ptr + A6XX_TEX_CONST_DWORDS) + dst_start; - uint32_t src_remaining = - src_binding_layout->size - src_start - 4 * A6XX_TEX_CONST_DWORDS; - uint32_t dst_remaining = - dst_binding_layout->size - dst_start - 4 * A6XX_TEX_CONST_DWORDS; - do { - uint32_t to_write = MIN3(remaining, src_remaining, dst_remaining); - memcpy(dst, src, to_write); - - src += to_write; - dst += to_write; - src_remaining -= to_write; - dst_remaining -= to_write; - remaining -= to_write; - - if (src_remaining == 0) { - src_binding_layout++; - src_ptr = src_set->mapped_ptr + src_binding_layout->offset / 4; - src = (uint8_t *)(src_ptr + A6XX_TEX_CONST_DWORDS); - src_remaining = src_binding_layout->size - 4 * A6XX_TEX_CONST_DWORDS; - } - - if (dst_remaining == 0) { - dst_binding_layout++; - dst_ptr = dst_set->mapped_ptr + dst_binding_layout->offset / 4; - dst = (uint8_t *)(dst_ptr + A6XX_TEX_CONST_DWORDS); - dst_remaining = dst_binding_layout->size - 4 * A6XX_TEX_CONST_DWORDS; - } - } while (remaining > 0); - - continue; - } - - src_ptr += src_binding_layout->size * copyset->srcArrayElement / 4; - dst_ptr += dst_binding_layout->size * copyset->dstArrayElement / 4; - - /* In case of copies between mutable descriptor types - * and non-mutable descriptor types. 
- */ - uint32_t copy_size = MIN2(src_binding_layout->size, dst_binding_layout->size); - - for (j = 0; j < copyset->descriptorCount; ++j) { - memcpy(dst_ptr, src_ptr, copy_size); - - src_ptr += src_binding_layout->size / 4; - dst_ptr += dst_binding_layout->size / 4; - } - } } -VKAPI_ATTR void VKAPI_CALL +void tu_UpdateDescriptorSets(VkDevice _device, uint32_t descriptorWriteCount, const VkWriteDescriptorSet *pDescriptorWrites, @@ -1247,12 +472,13 @@ tu_UpdateDescriptorSets(VkDevice _device, const VkCopyDescriptorSet *pDescriptorCopies) { TU_FROM_HANDLE(tu_device, device, _device); - tu_update_descriptor_sets(device, VK_NULL_HANDLE, + + tu_update_descriptor_sets(device, NULL, VK_NULL_HANDLE, descriptorWriteCount, pDescriptorWrites, descriptorCopyCount, pDescriptorCopies); } -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_CreateDescriptorUpdateTemplate( VkDevice _device, const VkDescriptorUpdateTemplateCreateInfo *pCreateInfo, @@ -1260,142 +486,28 @@ tu_CreateDescriptorUpdateTemplate( VkDescriptorUpdateTemplate *pDescriptorUpdateTemplate) { TU_FROM_HANDLE(tu_device, device, _device); - struct tu_descriptor_set_layout *set_layout = NULL; + TU_FROM_HANDLE(tu_descriptor_set_layout, set_layout, + pCreateInfo->descriptorSetLayout); const uint32_t entry_count = pCreateInfo->descriptorUpdateEntryCount; - uint32_t dst_entry_count = 0; - - if (pCreateInfo->templateType == VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR) { - TU_FROM_HANDLE(tu_pipeline_layout, pipeline_layout, pCreateInfo->pipelineLayout); - - /* descriptorSetLayout should be ignored for push descriptors - * and instead it refers to pipelineLayout and set. - */ - assert(pCreateInfo->set < MAX_SETS); - set_layout = pipeline_layout->set[pCreateInfo->set].layout; - } else { - TU_FROM_HANDLE(tu_descriptor_set_layout, _set_layout, - pCreateInfo->descriptorSetLayout); - set_layout = _set_layout; - } - - for (uint32_t i = 0; i < entry_count; i++) { - const VkDescriptorUpdateTemplateEntry *entry = &pCreateInfo->pDescriptorUpdateEntries[i]; - if (entry->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { - dst_entry_count++; - continue; - } - - /* Calculate how many bindings this update steps over, so we can split - * up the template entry. This lets the actual update be a simple - * memcpy. 
- */ - uint32_t remaining = entry->descriptorCount; - const struct tu_descriptor_set_binding_layout *binding_layout = - set_layout->binding + entry->dstBinding; - uint32_t dst_start = entry->dstArrayElement; - do { - uint32_t size = binding_layout->size - A6XX_TEX_CONST_DWORDS * 4; - uint32_t count = MIN2(remaining, size - dst_start); - remaining -= count; - binding_layout++; - dst_entry_count++; - dst_start = 0; - } while (remaining > 0); - } - const size_t size = sizeof(struct tu_descriptor_update_template) + - sizeof(struct tu_descriptor_update_template_entry) * dst_entry_count; + sizeof(struct tu_descriptor_update_template_entry) * entry_count; struct tu_descriptor_update_template *templ; - templ = vk_object_alloc(&device->vk, pAllocator, size, - VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE); + templ = vk_alloc2(&device->alloc, pAllocator, size, 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!templ) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - - templ->entry_count = dst_entry_count; - - if (pCreateInfo->templateType == VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR) { - templ->bind_point = pCreateInfo->pipelineBindPoint; - } - - uint32_t j = 0; - for (uint32_t i = 0; i < entry_count; i++) { - const VkDescriptorUpdateTemplateEntry *entry = &pCreateInfo->pDescriptorUpdateEntries[i]; - - const struct tu_descriptor_set_binding_layout *binding_layout = - set_layout->binding + entry->dstBinding; - uint32_t dst_offset, dst_stride; - const struct tu_sampler *immutable_samplers = NULL; - - /* dst_offset is an offset into dynamic_descriptors when the descriptor - * is dynamic, and an offset into mapped_ptr otherwise. - */ - switch (entry->descriptorType) { - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: - dst_offset = binding_layout->dynamic_offset_offset / 4; - break; - case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: { - uint32_t remaining = entry->descriptorCount; - uint32_t dst_start = entry->dstArrayElement; - uint32_t src_offset = entry->offset; - /* See comment in update_descriptor_sets() */ - do { - dst_offset = - binding_layout->offset + A6XX_TEX_CONST_DWORDS * 4 + dst_start; - uint32_t size = binding_layout->size - A6XX_TEX_CONST_DWORDS * 4; - uint32_t count = MIN2(remaining, size - dst_start); - templ->entry[j++] = (struct tu_descriptor_update_template_entry) { - .descriptor_type = entry->descriptorType, - .descriptor_count = count, - .src_offset = src_offset, - .dst_offset = dst_offset, - }; - remaining -= count; - src_offset += count; - binding_layout++; - dst_start = 0; - } while (remaining > 0); - - continue; - } - case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: - case VK_DESCRIPTOR_TYPE_SAMPLER: - if (pCreateInfo->templateType == VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR && - binding_layout->immutable_samplers_offset) { - immutable_samplers = - tu_immutable_samplers(set_layout, binding_layout) + entry->dstArrayElement; - } - FALLTHROUGH; - default: - dst_offset = binding_layout->offset / 4; - } - - dst_offset += (binding_layout->size * entry->dstArrayElement) / 4; - dst_stride = binding_layout->size / 4; - - templ->entry[j++] = (struct tu_descriptor_update_template_entry) { - .descriptor_type = entry->descriptorType, - .descriptor_count = entry->descriptorCount, - .src_offset = entry->offset, - .src_stride = entry->stride, - .dst_offset = dst_offset, - .dst_stride = dst_stride, - .has_sampler = !binding_layout->immutable_samplers_offset, - .immutable_samplers = immutable_samplers, - }; - } - - 
assert(j == dst_entry_count); + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); *pDescriptorUpdateTemplate = tu_descriptor_update_template_to_handle(templ); + tu_use_args(set_layout); + tu_stub(); return VK_SUCCESS; } -VKAPI_ATTR void VKAPI_CALL +void tu_DestroyDescriptorUpdateTemplate( VkDevice _device, VkDescriptorUpdateTemplate descriptorUpdateTemplate, @@ -1408,90 +520,23 @@ tu_DestroyDescriptorUpdateTemplate( if (!templ) return; - vk_object_free(&device->vk, pAllocator, templ); + vk_free2(&device->alloc, pAllocator, templ); } void tu_update_descriptor_set_with_template( - const struct tu_device *device, + struct tu_device *device, + struct tu_cmd_buffer *cmd_buffer, struct tu_descriptor_set *set, VkDescriptorUpdateTemplate descriptorUpdateTemplate, const void *pData) { TU_FROM_HANDLE(tu_descriptor_update_template, templ, descriptorUpdateTemplate); - - for (uint32_t i = 0; i < templ->entry_count; i++) { - uint32_t *ptr = set->mapped_ptr; - const void *src = ((const char *) pData) + templ->entry[i].src_offset; - const struct tu_sampler *samplers = templ->entry[i].immutable_samplers; - - if (templ->entry[i].descriptor_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { - memcpy(((uint8_t *) ptr) + templ->entry[i].dst_offset, src, - templ->entry[i].descriptor_count); - continue; - } - - ptr += templ->entry[i].dst_offset; - unsigned dst_offset = templ->entry[i].dst_offset; - for (unsigned j = 0; j < templ->entry[i].descriptor_count; ++j) { - switch(templ->entry[i].descriptor_type) { - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: { - assert(!(set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR)); - write_ubo_descriptor(set->dynamic_descriptors + dst_offset, src); - break; - } - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: - write_ubo_descriptor(ptr, src); - break; - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { - assert(!(set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR)); - write_buffer_descriptor(device, set->dynamic_descriptors + dst_offset, src); - break; - } - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: - write_buffer_descriptor(device, ptr, src); - break; - case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: - case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - write_texel_buffer_descriptor(ptr, *(VkBufferView *) src); - break; - case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: - case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: { - write_image_descriptor(ptr, templ->entry[i].descriptor_type, src); - break; - } - case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: - write_combined_image_sampler_descriptor(ptr, - templ->entry[i].descriptor_type, - src, - templ->entry[i].has_sampler); - if (samplers) - write_sampler_push(ptr + A6XX_TEX_CONST_DWORDS, &samplers[j]); - break; - case VK_DESCRIPTOR_TYPE_SAMPLER: - if (templ->entry[i].has_sampler) - write_sampler_descriptor(ptr, src); - else if (samplers) - write_sampler_push(ptr, &samplers[j]); - break; - case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: - /* nothing in descriptor set - framebuffer state is used instead */ - if (unlikely(device->instance->debug_flags & TU_DEBUG_DYNAMIC)) - write_image_descriptor(ptr, templ->entry[i].descriptor_type, src); - break; - default: - unreachable("unimplemented descriptor type"); - break; - } - src = (char *) src + templ->entry[i].src_stride; - ptr += templ->entry[i].dst_stride; - dst_offset += templ->entry[i].dst_stride; - } - } + tu_use_args(templ); } -VKAPI_ATTR void VKAPI_CALL +void tu_UpdateDescriptorSetWithTemplate( VkDevice _device, VkDescriptorSet descriptorSet, @@ 
-1501,46 +546,25 @@ tu_UpdateDescriptorSetWithTemplate( TU_FROM_HANDLE(tu_device, device, _device); TU_FROM_HANDLE(tu_descriptor_set, set, descriptorSet); - tu_update_descriptor_set_with_template(device, set, descriptorUpdateTemplate, pData); + tu_update_descriptor_set_with_template(device, NULL, set, + descriptorUpdateTemplate, pData); } -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_CreateSamplerYcbcrConversion( - VkDevice _device, + VkDevice device, const VkSamplerYcbcrConversionCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkSamplerYcbcrConversion *pYcbcrConversion) { - TU_FROM_HANDLE(tu_device, device, _device); - struct tu_sampler_ycbcr_conversion *conversion; - - conversion = vk_object_alloc(&device->vk, pAllocator, sizeof(*conversion), - VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION); - if (!conversion) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - - conversion->format = pCreateInfo->format; - conversion->ycbcr_model = pCreateInfo->ycbcrModel; - conversion->ycbcr_range = pCreateInfo->ycbcrRange; - conversion->components = pCreateInfo->components; - conversion->chroma_offsets[0] = pCreateInfo->xChromaOffset; - conversion->chroma_offsets[1] = pCreateInfo->yChromaOffset; - conversion->chroma_filter = pCreateInfo->chromaFilter; - - *pYcbcrConversion = tu_sampler_ycbcr_conversion_to_handle(conversion); + *pYcbcrConversion = VK_NULL_HANDLE; return VK_SUCCESS; } -VKAPI_ATTR void VKAPI_CALL -tu_DestroySamplerYcbcrConversion(VkDevice _device, +void +tu_DestroySamplerYcbcrConversion(VkDevice device, VkSamplerYcbcrConversion ycbcrConversion, const VkAllocationCallbacks *pAllocator) { - TU_FROM_HANDLE(tu_device, device, _device); - TU_FROM_HANDLE(tu_sampler_ycbcr_conversion, ycbcr_conversion, ycbcrConversion); - - if (!ycbcr_conversion) - return; - - vk_object_free(&device->vk, pAllocator, ycbcr_conversion); + /* Do nothing. */ } diff --git a/lib/mesa/src/freedreno/vulkan/tu_device.c b/lib/mesa/src/freedreno/vulkan/tu_device.c index 83f782635..901f02486 100644 --- a/lib/mesa/src/freedreno/vulkan/tu_device.c +++ b/lib/mesa/src/freedreno/vulkan/tu_device.c @@ -1,401 +1,358 @@ /* * Copyright © 2016 Red Hat. * Copyright © 2016 Bas Nieuwenhuizen - * SPDX-License-Identifier: MIT * * based in part on anv driver which is: * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
*/ -#include "tu_device.h" +#include "tu_private.h" #include <fcntl.h> -#include <poll.h> +#include <libsync.h> +#include <stdbool.h> +#include <string.h> +#include <sys/mman.h> #include <sys/sysinfo.h> +#include <unistd.h> +#include <xf86drm.h> -#include "git_sha1.h" -#include "util/u_debug.h" +#include "compiler/glsl_types.h" +#include "util/debug.h" #include "util/disk_cache.h" -#include "util/driconf.h" -#include "util/os_misc.h" -#include "vk_shader_module.h" -#include "vk_sampler.h" +#include "vk_format.h" #include "vk_util.h" -/* for fd_get_driver/device_uuid() */ -#include "freedreno/common/freedreno_uuid.h" - -#include "tu_clear_blit.h" -#include "tu_cmd_buffer.h" -#include "tu_cs.h" -#include "tu_descriptor_set.h" -#include "tu_dynamic_rendering.h" -#include "tu_image.h" -#include "tu_pass.h" -#include "tu_query.h" -#include "tu_tracepoints.h" -#include "tu_wsi.h" - -#if defined(VK_USE_PLATFORM_WAYLAND_KHR) || \ - defined(VK_USE_PLATFORM_XCB_KHR) || \ - defined(VK_USE_PLATFORM_XLIB_KHR) || \ - defined(VK_USE_PLATFORM_DISPLAY_KHR) -#define TU_HAS_SURFACE 1 -#else -#define TU_HAS_SURFACE 0 -#endif - +#include "drm-uapi/msm_drm.h" static int -tu_device_get_cache_uuid(struct tu_physical_device *device, void *uuid) -{ - struct mesa_sha1 ctx; - unsigned char sha1[20]; - /* Note: IR3_SHADER_DEBUG also affects compilation, but it's not - * initialized until after compiler creation so we have to add it to the - * shader hash instead, since the compiler is only created with the logical - * device. - */ - uint64_t driver_flags = device->instance->debug_flags & TU_DEBUG_NOMULTIPOS; - uint16_t family = fd_dev_gpu_id(&device->dev_id); - +tu_device_get_cache_uuid(uint16_t family, void *uuid) +{ + uint32_t mesa_timestamp; + uint16_t f = family; memset(uuid, 0, VK_UUID_SIZE); - _mesa_sha1_init(&ctx); - - if (!disk_cache_get_function_identifier(tu_device_get_cache_uuid, &ctx)) + if (!disk_cache_get_function_timestamp(tu_device_get_cache_uuid, + &mesa_timestamp)) return -1; - _mesa_sha1_update(&ctx, &family, sizeof(family)); - _mesa_sha1_update(&ctx, &driver_flags, sizeof(driver_flags)); - _mesa_sha1_final(&ctx, sha1); - - memcpy(uuid, sha1, VK_UUID_SIZE); + memcpy(uuid, &mesa_timestamp, 4); + memcpy((char *) uuid + 4, &f, 2); + snprintf((char *) uuid + 6, VK_UUID_SIZE - 10, "tu"); return 0; } -#define TU_API_VERSION VK_MAKE_VERSION(1, 3, VK_HEADER_VERSION) - -VKAPI_ATTR VkResult VKAPI_CALL -tu_EnumerateInstanceVersion(uint32_t *pApiVersion) -{ - *pApiVersion = TU_API_VERSION; - return VK_SUCCESS; -} - -static const struct vk_instance_extension_table tu_instance_extensions_supported = { - .KHR_device_group_creation = true, - .KHR_external_fence_capabilities = true, - .KHR_external_memory_capabilities = true, - .KHR_external_semaphore_capabilities = true, - .KHR_get_physical_device_properties2 = true, - .KHR_surface = TU_HAS_SURFACE, - .KHR_get_surface_capabilities2 = TU_HAS_SURFACE, - .EXT_debug_report = true, - .EXT_debug_utils = true, -#ifdef VK_USE_PLATFORM_WAYLAND_KHR - .KHR_wayland_surface = true, -#endif -#ifdef VK_USE_PLATFORM_XCB_KHR - .KHR_xcb_surface = true, -#endif -#ifdef VK_USE_PLATFORM_XLIB_KHR - .KHR_xlib_surface = true, -#endif -#ifdef VK_USE_PLATFORM_XLIB_XRANDR_EXT - .EXT_acquire_xlib_display = true, -#endif -#ifdef VK_USE_PLATFORM_DISPLAY_KHR - .KHR_display = true, - .KHR_get_display_properties2 = true, - .EXT_direct_mode_display = true, - .EXT_display_surface_counter = true, - .EXT_acquire_drm_display = true, -#endif -}; +static void +tu_get_driver_uuid(void *uuid) +{ + 
memset(uuid, 0, VK_UUID_SIZE); + snprintf(uuid, VK_UUID_SIZE, "freedreno"); +} static void -get_device_extensions(const struct tu_physical_device *device, - struct vk_device_extension_table *ext) -{ - *ext = (struct vk_device_extension_table) { - .KHR_16bit_storage = device->info->a6xx.storage_16bit, - .KHR_bind_memory2 = true, - .KHR_copy_commands2 = true, - .KHR_create_renderpass2 = true, - .KHR_dedicated_allocation = true, - .KHR_depth_stencil_resolve = true, - .KHR_descriptor_update_template = true, - .KHR_device_group = true, - .KHR_draw_indirect_count = true, - .KHR_external_fence = true, - .KHR_external_fence_fd = true, - .KHR_external_memory = true, - .KHR_external_memory_fd = true, - .KHR_external_semaphore = true, - .KHR_external_semaphore_fd = true, - .KHR_format_feature_flags2 = true, - .KHR_get_memory_requirements2 = true, - .KHR_global_priority = true, - .KHR_imageless_framebuffer = true, - .KHR_incremental_present = TU_HAS_SURFACE, - .KHR_image_format_list = true, - .KHR_maintenance1 = true, - .KHR_maintenance2 = true, - .KHR_maintenance3 = true, - .KHR_maintenance4 = true, - .KHR_multiview = true, - .KHR_performance_query = device->instance->debug_flags & TU_DEBUG_PERFC, - .KHR_pipeline_executable_properties = true, - .KHR_push_descriptor = true, - .KHR_relaxed_block_layout = true, - .KHR_sampler_mirror_clamp_to_edge = true, - .KHR_sampler_ycbcr_conversion = true, - .KHR_shader_draw_parameters = true, - .KHR_shader_float_controls = true, - .KHR_shader_float16_int8 = true, - .KHR_shader_subgroup_extended_types = true, - .KHR_shader_terminate_invocation = true, - .KHR_spirv_1_4 = true, - .KHR_storage_buffer_storage_class = true, - .KHR_swapchain = TU_HAS_SURFACE, - .KHR_swapchain_mutable_format = TU_HAS_SURFACE, - .KHR_uniform_buffer_standard_layout = true, - .KHR_variable_pointers = true, - .KHR_vulkan_memory_model = true, - .KHR_driver_properties = true, - .KHR_separate_depth_stencil_layouts = true, - .KHR_buffer_device_address = true, - .KHR_shader_integer_dot_product = true, - .KHR_zero_initialize_workgroup_memory = true, - .KHR_shader_non_semantic_info = true, - .KHR_synchronization2 = true, - .KHR_dynamic_rendering = true, -#ifndef TU_USE_KGSL - .KHR_timeline_semaphore = true, -#endif -#ifdef VK_USE_PLATFORM_DISPLAY_KHR - .EXT_display_control = true, -#endif - .EXT_external_memory_dma_buf = true, - .EXT_image_drm_format_modifier = true, - .EXT_sample_locations = device->info->a6xx.has_sample_locations, - .EXT_sampler_filter_minmax = true, - .EXT_transform_feedback = true, - .EXT_4444_formats = true, - .EXT_border_color_swizzle = true, - .EXT_conditional_rendering = true, - .EXT_custom_border_color = true, - .EXT_depth_clip_control = true, - .EXT_depth_clip_enable = true, - .EXT_descriptor_indexing = true, - .EXT_extended_dynamic_state = true, - .EXT_extended_dynamic_state2 = true, - .EXT_filter_cubic = device->info->a6xx.has_tex_filter_cubic, - .EXT_global_priority = true, - .EXT_global_priority_query = true, - .EXT_host_query_reset = true, - .EXT_index_type_uint8 = true, - .EXT_memory_budget = true, - .EXT_primitive_topology_list_restart = true, - .EXT_private_data = true, - .EXT_queue_family_foreign = true, - .EXT_robustness2 = true, - .EXT_scalar_block_layout = true, - .EXT_separate_stencil_usage = true, - .EXT_shader_demote_to_helper_invocation = true, - .EXT_shader_stencil_export = true, - .EXT_shader_viewport_index_layer = true, - .EXT_shader_module_identifier = true, - .EXT_texel_buffer_alignment = true, - .EXT_vertex_attribute_divisor = true, - 
.EXT_provoking_vertex = true, - .EXT_line_rasterization = true, - .EXT_subgroup_size_control = true, - .EXT_image_robustness = true, - .EXT_primitives_generated_query = true, - .EXT_image_view_min_lod = true, - .EXT_pipeline_creation_feedback = true, - .EXT_pipeline_creation_cache_control = true, - .EXT_vertex_input_dynamic_state = true, - .EXT_attachment_feedback_loop_layout = true, - .EXT_rasterization_order_attachment_access = true, - .EXT_multi_draw = true, -#ifndef TU_USE_KGSL - .EXT_physical_device_drm = true, -#endif - /* For Graphics Flight Recorder (GFR) */ - .AMD_buffer_marker = true, - .ARM_rasterization_order_attachment_access = true, -#ifdef ANDROID - .ANDROID_native_buffer = true, -#endif - .IMG_filter_cubic = device->info->a6xx.has_tex_filter_cubic, - .VALVE_mutable_descriptor_type = true, - .EXT_image_2d_view_of_3d = true, - .EXT_color_write_enable = true, - .EXT_load_store_op_none = true, - .EXT_non_seamless_cube_map = true, - .EXT_tooling_info = true, - .EXT_inline_uniform_block = true, - .EXT_mutable_descriptor_type = true, - .KHR_pipeline_library = true, - .EXT_graphics_pipeline_library = true, +tu_get_device_uuid(void *uuid) +{ + memset(uuid, 0, VK_UUID_SIZE); +} + +static VkResult +tu_bo_init(struct tu_device *dev, + struct tu_bo *bo, + uint32_t gem_handle, + uint64_t size) +{ + uint64_t iova = tu_gem_info_iova(dev, gem_handle); + if (!iova) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + + *bo = (struct tu_bo) { + .gem_handle = gem_handle, + .size = size, + .iova = iova, }; + + return VK_SUCCESS; } -static const struct vk_pipeline_cache_object_ops *const cache_import_ops[] = { - &tu_shaders_ops, - &tu_nir_shaders_ops, - NULL, -}; +VkResult +tu_bo_init_new(struct tu_device *dev, struct tu_bo *bo, uint64_t size) +{ + /* TODO: Choose better flags. As of 2018-11-12, freedreno/drm/msm_bo.c + * always sets `flags = MSM_BO_WC`, and we copy that behavior here. + */ + uint32_t gem_handle = tu_gem_new(dev, size, MSM_BO_WC); + if (!gem_handle) + return vk_error(dev->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY); + + VkResult result = tu_bo_init(dev, bo, gem_handle, size); + if (result != VK_SUCCESS) { + tu_gem_close(dev, gem_handle); + return vk_error(dev->instance, result); + } + + return VK_SUCCESS; +} + +VkResult +tu_bo_init_dmabuf(struct tu_device *dev, + struct tu_bo *bo, + uint64_t size, + int fd) +{ + uint32_t gem_handle = tu_gem_import_dmabuf(dev, fd, size); + if (!gem_handle) + return vk_error(dev->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE); + + VkResult result = tu_bo_init(dev, bo, gem_handle, size); + if (result != VK_SUCCESS) { + tu_gem_close(dev, gem_handle); + return vk_error(dev->instance, result); + } + + return VK_SUCCESS; +} + +int +tu_bo_export_dmabuf(struct tu_device *dev, struct tu_bo *bo) +{ + return tu_gem_export_dmabuf(dev, bo->gem_handle); +} VkResult +tu_bo_map(struct tu_device *dev, struct tu_bo *bo) +{ + if (bo->map) + return VK_SUCCESS; + + uint64_t offset = tu_gem_info_offset(dev, bo->gem_handle); + if (!offset) + return vk_error(dev->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY); + + /* TODO: Should we use the wrapper os_mmap() like Freedreno does? 
*/ + void *map = mmap(0, bo->size, PROT_READ | PROT_WRITE, MAP_SHARED, + dev->physical_device->local_fd, offset); + if (map == MAP_FAILED) + return vk_error(dev->instance, VK_ERROR_MEMORY_MAP_FAILED); + + bo->map = map; + return VK_SUCCESS; +} + +void +tu_bo_finish(struct tu_device *dev, struct tu_bo *bo) +{ + assert(bo->gem_handle); + + if (bo->map) + munmap(bo->map, bo->size); + + tu_gem_close(dev, bo->gem_handle); +} + +static VkResult tu_physical_device_init(struct tu_physical_device *device, - struct tu_instance *instance) + struct tu_instance *instance, + drmDevicePtr drm_device) { + const char *path = drm_device->nodes[DRM_NODE_RENDER]; VkResult result = VK_SUCCESS; + drmVersionPtr version; + int fd; + int master_fd = -1; + + fd = open(path, O_RDWR | O_CLOEXEC); + if (fd < 0) { + return vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, + "failed to open device %s", path); + } + + /* Version 1.3 added MSM_INFO_IOVA. */ + const int min_version_major = 1; + const int min_version_minor = 3; + + version = drmGetVersion(fd); + if (!version) { + close(fd); + return vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, + "failed to query kernel driver version for device %s", + path); + } + + if (strcmp(version->name, "msm")) { + drmFreeVersion(version); + close(fd); + return vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, + "device %s does not use the msm kernel driver", path); + } + + if (version->version_major != min_version_major || + version->version_minor < min_version_minor) { + result = vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, + "kernel driver for device %s has version %d.%d, " + "but Vulkan requires version >= %d.%d", + path, version->version_major, version->version_minor, + min_version_major, min_version_minor); + drmFreeVersion(version); + close(fd); + return result; + } - const char *fd_name = fd_dev_name(&device->dev_id); - if (strncmp(fd_name, "FD", 2) == 0) { - device->name = vk_asprintf(&instance->vk.alloc, - VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE, - "Turnip Adreno (TM) %s", &fd_name[2]); - } else { - device->name = vk_strdup(&instance->vk.alloc, fd_name, - VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + drmFreeVersion(version); + + if (instance->debug_flags & TU_DEBUG_STARTUP) + tu_logi("Found compatible device '%s'.", path); + device->_loader_data.loaderMagic = ICD_LOADER_MAGIC; + device->instance = instance; + assert(strlen(path) < ARRAY_SIZE(device->path)); + strncpy(device->path, path, ARRAY_SIZE(device->path)); + + if (instance->enabled_extensions.KHR_display) { + master_fd = + open(drm_device->nodes[DRM_NODE_PRIMARY], O_RDWR | O_CLOEXEC); + if (master_fd >= 0) { + /* TODO: free master_fd is accel is not working? 
*/ + } } - if (!device->name) { - return vk_startup_errorf(instance, VK_ERROR_OUT_OF_HOST_MEMORY, - "device name alloc fail"); + + device->master_fd = master_fd; + device->local_fd = fd; + + if (tu_drm_get_gpu_id(device, &device->gpu_id)) { + if (instance->debug_flags & TU_DEBUG_STARTUP) + tu_logi("Could not query the GPU ID"); + result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, + "could not get GPU ID"); + goto fail; } - const struct fd_dev_info *info = fd_dev_info(&device->dev_id); - if (!info) { - result = vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, - "device %s is unsupported", device->name); - goto fail_free_name; + if (tu_drm_get_gmem_size(device, &device->gmem_size)) { + if (instance->debug_flags & TU_DEBUG_STARTUP) + tu_logi("Could not query the GMEM size"); + result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, + "could not get GMEM size"); + goto fail; } - switch (fd_dev_gen(&device->dev_id)) { - case 6: - device->info = info; - device->ccu_offset_bypass = device->info->num_ccu * A6XX_CCU_DEPTH_SIZE; - device->ccu_offset_gmem = (device->gmem_size - - device->info->num_ccu * A6XX_CCU_GMEM_COLOR_SIZE); + + memset(device->name, 0, sizeof(device->name)); + sprintf(device->name, "FD%d", device->gpu_id); + + switch (device->gpu_id) { + case 630: + device->tile_align_w = 32; + device->tile_align_h = 32; break; default: - result = vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, - "device %s is unsupported", device->name); - goto fail_free_name; - } - if (tu_device_get_cache_uuid(device, device->cache_uuid)) { - result = vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, - "cannot generate UUID"); - goto fail_free_name; + result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, + "device %s is unsupported", device->name); + goto fail; } - - if (device->has_set_iova) { - mtx_init(&device->vma_mutex, mtx_plain); - util_vma_heap_init(&device->vma, device->va_start, - ROUND_DOWN_TO(device->va_size, 4096)); + if (tu_device_get_cache_uuid(device->gpu_id, device->cache_uuid)) { + result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, + "cannot generate UUID"); + goto fail; } - fd_get_driver_uuid(device->driver_uuid); - fd_get_device_uuid(device->device_uuid, &device->dev_id); + /* The gpu id is already embedded in the uuid so we just pass "tu" + * when creating the cache. 
+ */ + char buf[VK_UUID_SIZE * 2 + 1]; + disk_cache_format_hex_id(buf, device->cache_uuid, VK_UUID_SIZE * 2); + device->disk_cache = disk_cache_create(device->name, buf, 0); - struct vk_device_extension_table supported_extensions; - get_device_extensions(device, &supported_extensions); + fprintf(stderr, "WARNING: tu is not a conformant vulkan implementation, " + "testing use only.\n"); - struct vk_physical_device_dispatch_table dispatch_table; - vk_physical_device_dispatch_table_from_entrypoints( - &dispatch_table, &tu_physical_device_entrypoints, true); - vk_physical_device_dispatch_table_from_entrypoints( - &dispatch_table, &wsi_physical_device_entrypoints, false); + tu_get_driver_uuid(&device->device_uuid); + tu_get_device_uuid(&device->device_uuid); - result = vk_physical_device_init(&device->vk, &instance->vk, - &supported_extensions, - &dispatch_table); - if (result != VK_SUCCESS) - goto fail_free_vma; + tu_fill_device_extension_table(device, &device->supported_extensions); - device->vk.supported_sync_types = device->sync_types; + if (result != VK_SUCCESS) { + vk_error(instance, result); + goto fail; + } -#if TU_HAS_SURFACE result = tu_wsi_init(device); if (result != VK_SUCCESS) { - vk_startup_errorf(instance, result, "WSI init failure"); - vk_physical_device_finish(&device->vk); - goto fail_free_vma; + vk_error(instance, result); + goto fail; } -#endif - - /* The gpu id is already embedded in the uuid so we just pass "tu" - * when creating the cache. - */ - char buf[VK_UUID_SIZE * 2 + 1]; - disk_cache_format_hex_id(buf, device->cache_uuid, VK_UUID_SIZE * 2); - device->vk.disk_cache = disk_cache_create(device->name, buf, 0); - - device->vk.pipeline_cache_import_ops = cache_import_ops; return VK_SUCCESS; -fail_free_vma: - if (device->has_set_iova) - util_vma_heap_finish(&device->vma); -fail_free_name: - vk_free(&instance->vk.alloc, (void *)device->name); +fail: + close(fd); + if (master_fd != -1) + close(master_fd); return result; } static void tu_physical_device_finish(struct tu_physical_device *device) { -#if TU_HAS_SURFACE tu_wsi_finish(device); -#endif + disk_cache_destroy(device->disk_cache); close(device->local_fd); if (device->master_fd != -1) close(device->master_fd); +} - if (device->has_set_iova) - util_vma_heap_finish(&device->vma); - - vk_free(&device->instance->vk.alloc, (void *)device->name); +static void * +default_alloc_func(void *pUserData, + size_t size, + size_t align, + VkSystemAllocationScope allocationScope) +{ + return malloc(size); +} - vk_physical_device_finish(&device->vk); +static void * +default_realloc_func(void *pUserData, + void *pOriginal, + size_t size, + size_t align, + VkSystemAllocationScope allocationScope) +{ + return realloc(pOriginal, size); } static void -tu_destroy_physical_device(struct vk_physical_device *device) +default_free_func(void *pUserData, void *pMemory) { - tu_physical_device_finish((struct tu_physical_device *) device); - vk_free(&device->instance->alloc, device); + free(pMemory); } +static const VkAllocationCallbacks default_alloc = { + .pUserData = NULL, + .pfnAllocation = default_alloc_func, + .pfnReallocation = default_realloc_func, + .pfnFree = default_free_func, +}; + static const struct debug_control tu_debug_options[] = { { "startup", TU_DEBUG_STARTUP }, { "nir", TU_DEBUG_NIR }, - { "nobin", TU_DEBUG_NOBIN }, - { "sysmem", TU_DEBUG_SYSMEM }, - { "gmem", TU_DEBUG_GMEM }, - { "forcebin", TU_DEBUG_FORCEBIN }, - { "layout", TU_DEBUG_LAYOUT }, - { "noubwc", TU_DEBUG_NOUBWC }, - { "nomultipos", TU_DEBUG_NOMULTIPOS }, - { 
"nolrz", TU_DEBUG_NOLRZ }, - { "nolrzfc", TU_DEBUG_NOLRZFC }, - { "perf", TU_DEBUG_PERF }, - { "perfc", TU_DEBUG_PERFC }, - { "flushall", TU_DEBUG_FLUSHALL }, - { "syncdraw", TU_DEBUG_SYNCDRAW }, - { "dontcare_as_load", TU_DEBUG_DONT_CARE_AS_LOAD }, - { "rast_order", TU_DEBUG_RAST_ORDER }, - { "unaligned_store", TU_DEBUG_UNALIGNED_STORE }, - { "log_skip_gmem_ops", TU_DEBUG_LOG_SKIP_GMEM_OPS }, - { "dynamic", TU_DEBUG_DYNAMIC }, - { "bos", TU_DEBUG_BOS }, + { "ir3", TU_DEBUG_IR3 }, { NULL, 0 } }; @@ -406,34 +363,17 @@ tu_get_debug_option_name(int id) return tu_debug_options[id].string; } -static const driOptionDescription tu_dri_options[] = { - DRI_CONF_SECTION_PERFORMANCE - DRI_CONF_VK_X11_OVERRIDE_MIN_IMAGE_COUNT(0) - DRI_CONF_VK_X11_STRICT_IMAGE_COUNT(false) - DRI_CONF_VK_X11_ENSURE_MIN_IMAGE_COUNT(false) - DRI_CONF_VK_XWAYLAND_WAIT_READY(true) - DRI_CONF_SECTION_END - - DRI_CONF_SECTION_DEBUG - DRI_CONF_VK_WSI_FORCE_BGRA8_UNORM_FIRST(false) - DRI_CONF_VK_DONT_CARE_AS_LOAD(false) - DRI_CONF_SECTION_END -}; - -static void -tu_init_dri_options(struct tu_instance *instance) +static int +tu_get_instance_extension_index(const char *name) { - driParseOptionInfo(&instance->available_dri_options, tu_dri_options, - ARRAY_SIZE(tu_dri_options)); - driParseConfigFiles(&instance->dri_options, &instance->available_dri_options, 0, "turnip", NULL, NULL, - instance->vk.app_info.app_name, instance->vk.app_info.app_version, - instance->vk.app_info.engine_name, instance->vk.app_info.engine_version); - - if (driQueryOptionb(&instance->dri_options, "vk_dont_care_as_load")) - instance->debug_flags |= TU_DEBUG_DONT_CARE_AS_LOAD; + for (unsigned i = 0; i < TU_INSTANCE_EXTENSION_COUNT; ++i) { + if (strcmp(name, tu_instance_extensions[i].extensionName) == 0) + return i; + } + return -1; } -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_CreateInstance(const VkInstanceCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkInstance *pInstance) @@ -443,66 +383,63 @@ tu_CreateInstance(const VkInstanceCreateInfo *pCreateInfo, assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO); - if (pAllocator == NULL) - pAllocator = vk_default_allocator(); - - instance = vk_zalloc(pAllocator, sizeof(*instance), 8, - VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + uint32_t client_version; + if (pCreateInfo->pApplicationInfo && + pCreateInfo->pApplicationInfo->apiVersion != 0) { + client_version = pCreateInfo->pApplicationInfo->apiVersion; + } else { + tu_EnumerateInstanceVersion(&client_version); + } + instance = vk_zalloc2(&default_alloc, pAllocator, sizeof(*instance), 8, + VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); if (!instance) return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY); - struct vk_instance_dispatch_table dispatch_table; - vk_instance_dispatch_table_from_entrypoints( - &dispatch_table, &tu_instance_entrypoints, true); - vk_instance_dispatch_table_from_entrypoints( - &dispatch_table, &wsi_instance_entrypoints, false); + instance->_loader_data.loaderMagic = ICD_LOADER_MAGIC; - result = vk_instance_init(&instance->vk, - &tu_instance_extensions_supported, - &dispatch_table, - pCreateInfo, pAllocator); - if (result != VK_SUCCESS) { - vk_free(pAllocator, instance); - return vk_error(NULL, result); - } + if (pAllocator) + instance->alloc = *pAllocator; + else + instance->alloc = default_alloc; -#ifndef TU_USE_KGSL - instance->vk.physical_devices.try_create_for_drm = - tu_physical_device_try_create; -#else - instance->vk.physical_devices.enumerate = tu_enumerate_devices; -#endif - 
instance->vk.physical_devices.destroy = tu_destroy_physical_device; + instance->api_version = client_version; + instance->physical_device_count = -1; instance->debug_flags = - parse_debug_string(os_get_option("TU_DEBUG"), tu_debug_options); - -#ifdef DEBUG - /* Enable startup debugging by default on debug drivers. You almost always - * want to see your startup failures in that case, and it's hard to set - * this env var on android. - */ - instance->debug_flags |= TU_DEBUG_STARTUP; -#endif + parse_debug_string(getenv("TU_DEBUG"), tu_debug_options); if (instance->debug_flags & TU_DEBUG_STARTUP) - mesa_logi("Created an instance"); + tu_logi("Created an instance"); - VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false)); + for (uint32_t i = 0; i < pCreateInfo->enabledExtensionCount; i++) { + const char *ext_name = pCreateInfo->ppEnabledExtensionNames[i]; + int index = tu_get_instance_extension_index(ext_name); + + if (index < 0 || !tu_supported_instance_extensions.extensions[index]) { + vk_free2(&default_alloc, pAllocator, instance); + return vk_error(instance, VK_ERROR_EXTENSION_NOT_PRESENT); + } - tu_init_dri_options(instance); + instance->enabled_extensions.extensions[index] = true; + } - *pInstance = tu_instance_to_handle(instance); + result = vk_debug_report_instance_init(&instance->debug_report_callbacks); + if (result != VK_SUCCESS) { + vk_free2(&default_alloc, pAllocator, instance); + return vk_error(instance, result); + } + + glsl_type_singleton_init_or_ref(); -#ifdef HAVE_PERFETTO - tu_perfetto_init(); -#endif + VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false)); + + *pInstance = tu_instance_to_handle(instance); return VK_SUCCESS; } -VKAPI_ATTR void VKAPI_CALL +void tu_DestroyInstance(VkInstance _instance, const VkAllocationCallbacks *pAllocator) { @@ -511,623 +448,272 @@ tu_DestroyInstance(VkInstance _instance, if (!instance) return; + for (int i = 0; i < instance->physical_device_count; ++i) { + tu_physical_device_finish(instance->physical_devices + i); + } + VG(VALGRIND_DESTROY_MEMPOOL(instance)); - driDestroyOptionCache(&instance->dri_options); - driDestroyOptionInfo(&instance->available_dri_options); + glsl_type_singleton_decref(); - vk_instance_finish(&instance->vk); - vk_free(&instance->vk.alloc, instance); -} + vk_debug_report_instance_destroy(&instance->debug_report_callbacks); -static void -tu_get_physical_device_features_1_1(struct tu_physical_device *pdevice, - VkPhysicalDeviceVulkan11Features *features) -{ - features->storageBuffer16BitAccess = pdevice->info->a6xx.storage_16bit; - features->uniformAndStorageBuffer16BitAccess = false; - features->storagePushConstant16 = false; - features->storageInputOutput16 = false; - features->multiview = true; - features->multiviewGeometryShader = false; - features->multiviewTessellationShader = false; - features->variablePointersStorageBuffer = true; - features->variablePointers = true; - features->protectedMemory = false; - features->samplerYcbcrConversion = true; - features->shaderDrawParameters = true; + vk_free(&instance->alloc, instance); } -static void -tu_get_physical_device_features_1_2(struct tu_physical_device *pdevice, - VkPhysicalDeviceVulkan12Features *features) -{ - features->samplerMirrorClampToEdge = true; - features->drawIndirectCount = true; - features->storageBuffer8BitAccess = false; - features->uniformAndStorageBuffer8BitAccess = false; - features->storagePushConstant8 = false; - features->shaderBufferInt64Atomics = false; - features->shaderSharedInt64Atomics = false; - features->shaderFloat16 = true; - 
features->shaderInt8 = false; - - features->descriptorIndexing = true; - features->shaderInputAttachmentArrayDynamicIndexing = false; - features->shaderUniformTexelBufferArrayDynamicIndexing = true; - features->shaderStorageTexelBufferArrayDynamicIndexing = true; - features->shaderUniformBufferArrayNonUniformIndexing = true; - features->shaderSampledImageArrayNonUniformIndexing = true; - features->shaderStorageBufferArrayNonUniformIndexing = true; - features->shaderStorageImageArrayNonUniformIndexing = true; - features->shaderInputAttachmentArrayNonUniformIndexing = false; - features->shaderUniformTexelBufferArrayNonUniformIndexing = true; - features->shaderStorageTexelBufferArrayNonUniformIndexing = true; - features->descriptorBindingUniformBufferUpdateAfterBind = true; - features->descriptorBindingSampledImageUpdateAfterBind = true; - features->descriptorBindingStorageImageUpdateAfterBind = true; - features->descriptorBindingStorageBufferUpdateAfterBind = true; - features->descriptorBindingUniformTexelBufferUpdateAfterBind = true; - features->descriptorBindingStorageTexelBufferUpdateAfterBind = true; - features->descriptorBindingUpdateUnusedWhilePending = true; - features->descriptorBindingPartiallyBound = true; - features->descriptorBindingVariableDescriptorCount = true; - features->runtimeDescriptorArray = true; - - features->samplerFilterMinmax = true; - features->scalarBlockLayout = true; - features->imagelessFramebuffer = true; - features->uniformBufferStandardLayout = true; - features->shaderSubgroupExtendedTypes = true; - features->separateDepthStencilLayouts = true; - features->hostQueryReset = true; - features->timelineSemaphore = true; - features->bufferDeviceAddress = true; - features->bufferDeviceAddressCaptureReplay = pdevice->has_set_iova; - features->bufferDeviceAddressMultiDevice = false; - features->vulkanMemoryModel = true; - features->vulkanMemoryModelDeviceScope = true; - features->vulkanMemoryModelAvailabilityVisibilityChains = true; - features->shaderOutputViewportIndex = true; - features->shaderOutputLayer = true; - features->subgroupBroadcastDynamicId = true; +static VkResult +tu_enumerate_devices(struct tu_instance *instance) +{ + /* TODO: Check for more devices ? 
*/ + drmDevicePtr devices[8]; + VkResult result = VK_ERROR_INCOMPATIBLE_DRIVER; + int max_devices; + + instance->physical_device_count = 0; + + max_devices = drmGetDevices2(0, devices, ARRAY_SIZE(devices)); + + if (instance->debug_flags & TU_DEBUG_STARTUP) + tu_logi("Found %d drm nodes", max_devices); + + if (max_devices < 1) + return vk_error(instance, VK_ERROR_INCOMPATIBLE_DRIVER); + + for (unsigned i = 0; i < (unsigned) max_devices; i++) { + if (devices[i]->available_nodes & 1 << DRM_NODE_RENDER && + devices[i]->bustype == DRM_BUS_PLATFORM) { + + result = tu_physical_device_init( + instance->physical_devices + instance->physical_device_count, + instance, devices[i]); + if (result == VK_SUCCESS) + ++instance->physical_device_count; + else if (result != VK_ERROR_INCOMPATIBLE_DRIVER) + break; + } + } + drmFreeDevices(devices, max_devices); + + return result; } -static void -tu_get_physical_device_features_1_3(struct tu_physical_device *pdevice, - VkPhysicalDeviceVulkan13Features *features) -{ - features->robustImageAccess = true; - features->inlineUniformBlock = true; - features->descriptorBindingInlineUniformBlockUpdateAfterBind = true; - features->pipelineCreationCacheControl = true; - features->privateData = true; - features->shaderDemoteToHelperInvocation = true; - features->shaderTerminateInvocation = true; - features->subgroupSizeControl = true; - features->computeFullSubgroups = true; - features->synchronization2 = true; - features->textureCompressionASTC_HDR = false; - features->shaderZeroInitializeWorkgroupMemory = true; - features->dynamicRendering = true; - features->shaderIntegerDotProduct = true; - features->maintenance4 = true; +VkResult +tu_EnumeratePhysicalDevices(VkInstance _instance, + uint32_t *pPhysicalDeviceCount, + VkPhysicalDevice *pPhysicalDevices) +{ + TU_FROM_HANDLE(tu_instance, instance, _instance); + VK_OUTARRAY_MAKE(out, pPhysicalDevices, pPhysicalDeviceCount); + + VkResult result; + + if (instance->physical_device_count < 0) { + result = tu_enumerate_devices(instance); + if (result != VK_SUCCESS && result != VK_ERROR_INCOMPATIBLE_DRIVER) + return result; + } + + for (uint32_t i = 0; i < instance->physical_device_count; ++i) { + vk_outarray_append(&out, p) + { + *p = tu_physical_device_to_handle(instance->physical_devices + i); + } + } + + return vk_outarray_status(&out); } -void -tu_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice, - VkPhysicalDeviceFeatures2 *pFeatures) +VkResult +tu_EnumeratePhysicalDeviceGroups( + VkInstance _instance, + uint32_t *pPhysicalDeviceGroupCount, + VkPhysicalDeviceGroupProperties *pPhysicalDeviceGroupProperties) { - TU_FROM_HANDLE(tu_physical_device, pdevice, physicalDevice); + TU_FROM_HANDLE(tu_instance, instance, _instance); + VK_OUTARRAY_MAKE(out, pPhysicalDeviceGroupProperties, + pPhysicalDeviceGroupCount); + VkResult result; + + if (instance->physical_device_count < 0) { + result = tu_enumerate_devices(instance); + if (result != VK_SUCCESS && result != VK_ERROR_INCOMPATIBLE_DRIVER) + return result; + } - pFeatures->features = (VkPhysicalDeviceFeatures) { - .robustBufferAccess = true, - .fullDrawIndexUint32 = true, - .imageCubeArray = true, - .independentBlend = true, - .geometryShader = true, - .tessellationShader = true, - .sampleRateShading = true, - .dualSrcBlend = true, - .logicOp = true, - .multiDrawIndirect = true, - .drawIndirectFirstInstance = true, - .depthClamp = true, - .depthBiasClamp = true, - .fillModeNonSolid = true, - .depthBounds = true, + for (uint32_t i = 0; i < instance->physical_device_count; 
++i) { + vk_outarray_append(&out, p) + { + p->physicalDeviceCount = 1; + p->physicalDevices[0] = + tu_physical_device_to_handle(instance->physical_devices + i); + p->subsetAllocation = false; + } + } + + return vk_outarray_status(&out); +} + +void +tu_GetPhysicalDeviceFeatures(VkPhysicalDevice physicalDevice, + VkPhysicalDeviceFeatures *pFeatures) +{ + memset(pFeatures, 0, sizeof(*pFeatures)); + + *pFeatures = (VkPhysicalDeviceFeatures) { + .robustBufferAccess = false, + .fullDrawIndexUint32 = false, + .imageCubeArray = false, + .independentBlend = false, + .geometryShader = false, + .tessellationShader = false, + .sampleRateShading = false, + .dualSrcBlend = false, + .logicOp = false, + .multiDrawIndirect = false, + .drawIndirectFirstInstance = false, + .depthClamp = false, + .depthBiasClamp = false, + .fillModeNonSolid = false, + .depthBounds = false, .wideLines = false, - .largePoints = true, - .alphaToOne = true, - .multiViewport = true, - .samplerAnisotropy = true, - .textureCompressionETC2 = true, - .textureCompressionASTC_LDR = true, - .textureCompressionBC = true, - .occlusionQueryPrecise = true, - .pipelineStatisticsQuery = true, - .vertexPipelineStoresAndAtomics = true, - .fragmentStoresAndAtomics = true, - .shaderTessellationAndGeometryPointSize = true, - .shaderImageGatherExtended = true, - .shaderStorageImageExtendedFormats = true, + .largePoints = false, + .alphaToOne = false, + .multiViewport = false, + .samplerAnisotropy = false, + .textureCompressionETC2 = false, + .textureCompressionASTC_LDR = false, + .textureCompressionBC = false, + .occlusionQueryPrecise = false, + .pipelineStatisticsQuery = false, + .vertexPipelineStoresAndAtomics = false, + .fragmentStoresAndAtomics = false, + .shaderTessellationAndGeometryPointSize = false, + .shaderImageGatherExtended = false, + .shaderStorageImageExtendedFormats = false, .shaderStorageImageMultisample = false, - .shaderUniformBufferArrayDynamicIndexing = true, - .shaderSampledImageArrayDynamicIndexing = true, - .shaderStorageBufferArrayDynamicIndexing = true, - .shaderStorageImageArrayDynamicIndexing = true, - .shaderStorageImageReadWithoutFormat = true, - .shaderStorageImageWriteWithoutFormat = true, - .shaderClipDistance = true, - .shaderCullDistance = true, + .shaderUniformBufferArrayDynamicIndexing = false, + .shaderSampledImageArrayDynamicIndexing = false, + .shaderStorageBufferArrayDynamicIndexing = false, + .shaderStorageImageArrayDynamicIndexing = false, + .shaderStorageImageReadWithoutFormat = false, + .shaderStorageImageWriteWithoutFormat = false, + .shaderClipDistance = false, + .shaderCullDistance = false, .shaderFloat64 = false, .shaderInt64 = false, - .shaderInt16 = true, + .shaderInt16 = false, .sparseBinding = false, - .variableMultisampleRate = true, - .inheritedQueries = true, + .variableMultisampleRate = false, + .inheritedQueries = false, }; +} - VkPhysicalDeviceVulkan11Features core_1_1 = { - .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES, - }; - tu_get_physical_device_features_1_1(pdevice, &core_1_1); - - VkPhysicalDeviceVulkan12Features core_1_2 = { - .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES, - }; - tu_get_physical_device_features_1_2(pdevice, &core_1_2); - - VkPhysicalDeviceVulkan13Features core_1_3 = { - .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_FEATURES, - }; - tu_get_physical_device_features_1_3(pdevice, &core_1_3); - +void +tu_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice, + VkPhysicalDeviceFeatures2 *pFeatures) +{ vk_foreach_struct(ext, 
pFeatures->pNext) { - if (vk_get_physical_device_core_1_1_feature_ext(ext, &core_1_1)) - continue; - if (vk_get_physical_device_core_1_2_feature_ext(ext, &core_1_2)) - continue; - if (vk_get_physical_device_core_1_3_feature_ext(ext, &core_1_3)) - continue; - switch (ext->sType) { - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CONDITIONAL_RENDERING_FEATURES_EXT: { - VkPhysicalDeviceConditionalRenderingFeaturesEXT *features = - (VkPhysicalDeviceConditionalRenderingFeaturesEXT *) ext; - features->conditionalRendering = true; - features->inheritedConditionalRendering = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_FEATURES_EXT: { - VkPhysicalDeviceTransformFeedbackFeaturesEXT *features = - (VkPhysicalDeviceTransformFeedbackFeaturesEXT *) ext; - features->transformFeedback = true; - features->geometryStreams = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INDEX_TYPE_UINT8_FEATURES_EXT: { - VkPhysicalDeviceIndexTypeUint8FeaturesEXT *features = - (VkPhysicalDeviceIndexTypeUint8FeaturesEXT *)ext; - features->indexTypeUint8 = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_FEATURES_EXT: { - VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *features = - (VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *)ext; - features->vertexAttributeInstanceRateDivisor = true; - features->vertexAttributeInstanceRateZeroDivisor = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEPTH_CLIP_ENABLE_FEATURES_EXT: { - VkPhysicalDeviceDepthClipEnableFeaturesEXT *features = - (VkPhysicalDeviceDepthClipEnableFeaturesEXT *)ext; - features->depthClipEnable = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_4444_FORMATS_FEATURES_EXT: { - VkPhysicalDevice4444FormatsFeaturesEXT *features = (void *)ext; - features->formatA4R4G4B4 = true; - features->formatA4B4G4R4 = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BORDER_COLOR_SWIZZLE_FEATURES_EXT: { - VkPhysicalDeviceBorderColorSwizzleFeaturesEXT *features = (void *)ext; - features->borderColorSwizzle = true; - features->borderColorSwizzleFromImage = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_FEATURES_EXT: { - VkPhysicalDeviceCustomBorderColorFeaturesEXT *features = (void *) ext; - features->customBorderColors = true; - features->customBorderColorWithoutFormat = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTENDED_DYNAMIC_STATE_FEATURES_EXT: { - VkPhysicalDeviceExtendedDynamicStateFeaturesEXT *features = (void *)ext; - features->extendedDynamicState = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTENDED_DYNAMIC_STATE_2_FEATURES_EXT: { - VkPhysicalDeviceExtendedDynamicState2FeaturesEXT *features = - (VkPhysicalDeviceExtendedDynamicState2FeaturesEXT *)ext; - features->extendedDynamicState2 = true; - features->extendedDynamicState2LogicOp = true; - features->extendedDynamicState2PatchControlPoints = true; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTERS_FEATURES: { + VkPhysicalDeviceVariablePointersFeatures *features = (void *) ext; + features->variablePointersStorageBuffer = false; + features->variablePointers = false; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_FEATURES_KHR: { - VkPhysicalDevicePerformanceQueryFeaturesKHR *feature = - (VkPhysicalDevicePerformanceQueryFeaturesKHR *)ext; - feature->performanceCounterQueryPools = true; - feature->performanceCounterMultipleQueryPools = false; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_FEATURES: { + 
VkPhysicalDeviceMultiviewFeatures *features = + (VkPhysicalDeviceMultiviewFeatures *) ext; + features->multiview = false; + features->multiviewGeometryShader = false; + features->multiviewTessellationShader = false; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_EXECUTABLE_PROPERTIES_FEATURES_KHR: { - VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR *features = - (VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR *)ext; - features->pipelineExecutableInfo = true; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETERS_FEATURES: { + VkPhysicalDeviceShaderDrawParametersFeatures *features = + (VkPhysicalDeviceShaderDrawParametersFeatures *) ext; + features->shaderDrawParameters = false; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES: { - VkPhysicalDeviceShaderFloat16Int8Features *features = - (VkPhysicalDeviceShaderFloat16Int8Features *) ext; - features->shaderFloat16 = true; - features->shaderInt8 = false; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROTECTED_MEMORY_FEATURES: { + VkPhysicalDeviceProtectedMemoryFeatures *features = + (VkPhysicalDeviceProtectedMemoryFeatures *) ext; + features->protectedMemory = false; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SCALAR_BLOCK_LAYOUT_FEATURES: { - VkPhysicalDeviceScalarBlockLayoutFeatures *features = (void *)ext; - features->scalarBlockLayout = true; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES: { + VkPhysicalDevice16BitStorageFeatures *features = + (VkPhysicalDevice16BitStorageFeatures *) ext; + features->storageBuffer16BitAccess = false; + features->uniformAndStorageBuffer16BitAccess = false; + features->storagePushConstant16 = false; + features->storageInputOutput16 = false; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_FEATURES_EXT: { - VkPhysicalDeviceRobustness2FeaturesEXT *features = (void *)ext; - features->robustBufferAccess2 = true; - features->robustImageAccess2 = true; - features->nullDescriptor = true; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES: { + VkPhysicalDeviceSamplerYcbcrConversionFeatures *features = + (VkPhysicalDeviceSamplerYcbcrConversionFeatures *) ext; + features->samplerYcbcrConversion = false; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES: { - VkPhysicalDeviceTimelineSemaphoreFeatures *features = - (VkPhysicalDeviceTimelineSemaphoreFeatures *) ext; - features->timelineSemaphore = true; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_FEATURES_EXT: { + VkPhysicalDeviceDescriptorIndexingFeaturesEXT *features = + (VkPhysicalDeviceDescriptorIndexingFeaturesEXT *) ext; + features->shaderInputAttachmentArrayDynamicIndexing = false; + features->shaderUniformTexelBufferArrayDynamicIndexing = false; + features->shaderStorageTexelBufferArrayDynamicIndexing = false; + features->shaderUniformBufferArrayNonUniformIndexing = false; + features->shaderSampledImageArrayNonUniformIndexing = false; + features->shaderStorageBufferArrayNonUniformIndexing = false; + features->shaderStorageImageArrayNonUniformIndexing = false; + features->shaderInputAttachmentArrayNonUniformIndexing = false; + features->shaderUniformTexelBufferArrayNonUniformIndexing = false; + features->shaderStorageTexelBufferArrayNonUniformIndexing = false; + features->descriptorBindingUniformBufferUpdateAfterBind = false; + features->descriptorBindingSampledImageUpdateAfterBind = false; + features->descriptorBindingStorageImageUpdateAfterBind = false; + 
features->descriptorBindingStorageBufferUpdateAfterBind = false; + features->descriptorBindingUniformTexelBufferUpdateAfterBind = false; + features->descriptorBindingStorageTexelBufferUpdateAfterBind = false; + features->descriptorBindingUpdateUnusedWhilePending = false; + features->descriptorBindingPartiallyBound = false; + features->descriptorBindingVariableDescriptorCount = false; + features->runtimeDescriptorArray = false; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROVOKING_VERTEX_FEATURES_EXT: { - VkPhysicalDeviceProvokingVertexFeaturesEXT *features = - (VkPhysicalDeviceProvokingVertexFeaturesEXT *)ext; - features->provokingVertexLast = true; - features->transformFeedbackPreservesProvokingVertex = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MUTABLE_DESCRIPTOR_TYPE_FEATURES_EXT: { - VkPhysicalDeviceMutableDescriptorTypeFeaturesEXT *features = - (VkPhysicalDeviceMutableDescriptorTypeFeaturesEXT *)ext; - features->mutableDescriptorType = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LINE_RASTERIZATION_FEATURES_EXT: { - VkPhysicalDeviceLineRasterizationFeaturesEXT *features = - (VkPhysicalDeviceLineRasterizationFeaturesEXT *)ext; - features->rectangularLines = true; - features->bresenhamLines = true; - features->smoothLines = false; - features->stippledRectangularLines = false; - features->stippledBresenhamLines = false; - features->stippledSmoothLines = false; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRIMITIVE_TOPOLOGY_LIST_RESTART_FEATURES_EXT: { - VkPhysicalDevicePrimitiveTopologyListRestartFeaturesEXT *features = - (VkPhysicalDevicePrimitiveTopologyListRestartFeaturesEXT *)ext; - features->primitiveTopologyListRestart = true; - features->primitiveTopologyPatchListRestart = false; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RASTERIZATION_ORDER_ATTACHMENT_ACCESS_FEATURES_EXT: { - VkPhysicalDeviceRasterizationOrderAttachmentAccessFeaturesEXT *features = - (VkPhysicalDeviceRasterizationOrderAttachmentAccessFeaturesEXT *)ext; - features->rasterizationOrderColorAttachmentAccess = true; - features->rasterizationOrderDepthAttachmentAccess = true; - features->rasterizationOrderStencilAttachmentAccess = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEPTH_CLIP_CONTROL_FEATURES_EXT: { - VkPhysicalDeviceDepthClipControlFeaturesEXT *features = - (VkPhysicalDeviceDepthClipControlFeaturesEXT *)ext; - features->depthClipControl = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_FEATURES_EXT: { - VkPhysicalDeviceTexelBufferAlignmentFeaturesEXT *features = - (VkPhysicalDeviceTexelBufferAlignmentFeaturesEXT *)ext; - features->texelBufferAlignment = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRIMITIVES_GENERATED_QUERY_FEATURES_EXT: { - VkPhysicalDevicePrimitivesGeneratedQueryFeaturesEXT *features = - (VkPhysicalDevicePrimitivesGeneratedQueryFeaturesEXT *)ext; - features->primitivesGeneratedQuery = true; - features->primitivesGeneratedQueryWithRasterizerDiscard = false; - features->primitivesGeneratedQueryWithNonZeroStreams = false; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_VIEW_MIN_LOD_FEATURES_EXT: { - VkPhysicalDeviceImageViewMinLodFeaturesEXT *features = - (VkPhysicalDeviceImageViewMinLodFeaturesEXT *)ext; - features->minLod = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_2D_VIEW_OF_3D_FEATURES_EXT: { - VkPhysicalDeviceImage2DViewOf3DFeaturesEXT *features = - (VkPhysicalDeviceImage2DViewOf3DFeaturesEXT *)ext; - features->image2DViewOf3D = 
true; - features->sampler2DViewOf3D = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COLOR_WRITE_ENABLE_FEATURES_EXT: { - VkPhysicalDeviceColorWriteEnableFeaturesEXT *features = - (VkPhysicalDeviceColorWriteEnableFeaturesEXT *)ext; - features->colorWriteEnable = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_MODULE_IDENTIFIER_FEATURES_EXT: { - VkPhysicalDeviceShaderModuleIdentifierFeaturesEXT *features = - (VkPhysicalDeviceShaderModuleIdentifierFeaturesEXT *)ext; - features->shaderModuleIdentifier = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_INPUT_DYNAMIC_STATE_FEATURES_EXT: { - VkPhysicalDeviceVertexInputDynamicStateFeaturesEXT *features = - (VkPhysicalDeviceVertexInputDynamicStateFeaturesEXT *)ext; - features->vertexInputDynamicState = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_NON_SEAMLESS_CUBE_MAP_FEATURES_EXT: { - VkPhysicalDeviceNonSeamlessCubeMapFeaturesEXT *features = - (VkPhysicalDeviceNonSeamlessCubeMapFeaturesEXT *)ext; - features->nonSeamlessCubeMap = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ATTACHMENT_FEEDBACK_LOOP_LAYOUT_FEATURES_EXT: { - VkPhysicalDeviceAttachmentFeedbackLoopLayoutFeaturesEXT *features = - (VkPhysicalDeviceAttachmentFeedbackLoopLayoutFeaturesEXT*)ext; - features->attachmentFeedbackLoopLayout = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_GLOBAL_PRIORITY_QUERY_FEATURES_KHR: { - VkPhysicalDeviceGlobalPriorityQueryFeaturesKHR *features = - (VkPhysicalDeviceGlobalPriorityQueryFeaturesKHR*)ext; - features->globalPriorityQuery = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTI_DRAW_FEATURES_EXT: { - VkPhysicalDeviceMultiDrawFeaturesEXT *features = - (VkPhysicalDeviceMultiDrawFeaturesEXT *)ext; - features->multiDraw = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_GRAPHICS_PIPELINE_LIBRARY_FEATURES_EXT: { - VkPhysicalDeviceGraphicsPipelineLibraryFeaturesEXT *features = - (VkPhysicalDeviceGraphicsPipelineLibraryFeaturesEXT *)ext; - features->graphicsPipelineLibrary = true; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CONDITIONAL_RENDERING_FEATURES_EXT: { + VkPhysicalDeviceConditionalRenderingFeaturesEXT *features = + (VkPhysicalDeviceConditionalRenderingFeaturesEXT *) ext; + features->conditionalRendering = false; + features->inheritedConditionalRendering = false; break; } - default: break; } } + return tu_GetPhysicalDeviceFeatures(physicalDevice, &pFeatures->features); } - -static void -tu_get_physical_device_properties_1_1(struct tu_physical_device *pdevice, - VkPhysicalDeviceVulkan11Properties *p) -{ - assert(p->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES); - - memcpy(p->deviceUUID, pdevice->device_uuid, VK_UUID_SIZE); - memcpy(p->driverUUID, pdevice->driver_uuid, VK_UUID_SIZE); - memset(p->deviceLUID, 0, VK_LUID_SIZE); - p->deviceNodeMask = 0; - p->deviceLUIDValid = false; - - p->subgroupSize = 128; - p->subgroupSupportedStages = VK_SHADER_STAGE_COMPUTE_BIT; - p->subgroupSupportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT | - VK_SUBGROUP_FEATURE_VOTE_BIT | - VK_SUBGROUP_FEATURE_BALLOT_BIT | - VK_SUBGROUP_FEATURE_SHUFFLE_BIT | - VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT | - VK_SUBGROUP_FEATURE_ARITHMETIC_BIT; - if (pdevice->info->a6xx.has_getfiberid) { - p->subgroupSupportedStages |= VK_SHADER_STAGE_ALL_GRAPHICS; - p->subgroupSupportedOperations |= VK_SUBGROUP_FEATURE_QUAD_BIT; - } - - p->subgroupQuadOperationsInAllStages = false; - - p->pointClippingBehavior = VK_POINT_CLIPPING_BEHAVIOR_ALL_CLIP_PLANES; - 
p->maxMultiviewViewCount = MAX_VIEWS; - p->maxMultiviewInstanceIndex = INT_MAX; - p->protectedNoFault = false; - /* Our largest descriptors are 2 texture descriptors, or a texture and - * sampler descriptor. - */ - p->maxPerSetDescriptors = MAX_SET_SIZE / (2 * A6XX_TEX_CONST_DWORDS * 4); - /* Our buffer size fields allow only this much */ - p->maxMemoryAllocationSize = 0xFFFFFFFFull; - -} - - -static const size_t max_descriptor_set_size = MAX_SET_SIZE / (4 * A6XX_TEX_CONST_DWORDS); -static const VkSampleCountFlags sample_counts = - VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT; - -static void -tu_get_physical_device_properties_1_2(struct tu_physical_device *pdevice, - VkPhysicalDeviceVulkan12Properties *p) -{ - assert(p->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_PROPERTIES); - - p->driverID = VK_DRIVER_ID_MESA_TURNIP; - memset(p->driverName, 0, sizeof(p->driverName)); - snprintf(p->driverName, VK_MAX_DRIVER_NAME_SIZE, - "turnip Mesa driver"); - memset(p->driverInfo, 0, sizeof(p->driverInfo)); - snprintf(p->driverInfo, VK_MAX_DRIVER_INFO_SIZE, - "Mesa " PACKAGE_VERSION MESA_GIT_SHA1); - p->conformanceVersion = (VkConformanceVersion) { - .major = 1, - .minor = 2, - .subminor = 7, - .patch = 1, - }; - - p->denormBehaviorIndependence = - VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL; - p->roundingModeIndependence = - VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL; - - p->shaderDenormFlushToZeroFloat16 = true; - p->shaderDenormPreserveFloat16 = false; - p->shaderRoundingModeRTEFloat16 = true; - p->shaderRoundingModeRTZFloat16 = false; - p->shaderSignedZeroInfNanPreserveFloat16 = true; - - p->shaderDenormFlushToZeroFloat32 = true; - p->shaderDenormPreserveFloat32 = false; - p->shaderRoundingModeRTEFloat32 = true; - p->shaderRoundingModeRTZFloat32 = false; - p->shaderSignedZeroInfNanPreserveFloat32 = true; - - p->shaderDenormFlushToZeroFloat64 = false; - p->shaderDenormPreserveFloat64 = false; - p->shaderRoundingModeRTEFloat64 = false; - p->shaderRoundingModeRTZFloat64 = false; - p->shaderSignedZeroInfNanPreserveFloat64 = false; - - p->shaderUniformBufferArrayNonUniformIndexingNative = true; - p->shaderSampledImageArrayNonUniformIndexingNative = true; - p->shaderStorageBufferArrayNonUniformIndexingNative = true; - p->shaderStorageImageArrayNonUniformIndexingNative = true; - p->shaderInputAttachmentArrayNonUniformIndexingNative = false; - p->robustBufferAccessUpdateAfterBind = false; - p->quadDivergentImplicitLod = false; - - p->maxUpdateAfterBindDescriptorsInAllPools = max_descriptor_set_size; - p->maxPerStageDescriptorUpdateAfterBindSamplers = max_descriptor_set_size; - p->maxPerStageDescriptorUpdateAfterBindUniformBuffers = max_descriptor_set_size; - p->maxPerStageDescriptorUpdateAfterBindStorageBuffers = max_descriptor_set_size; - p->maxPerStageDescriptorUpdateAfterBindSampledImages = max_descriptor_set_size; - p->maxPerStageDescriptorUpdateAfterBindStorageImages = max_descriptor_set_size; - p->maxPerStageDescriptorUpdateAfterBindInputAttachments = MAX_RTS; - p->maxPerStageUpdateAfterBindResources = max_descriptor_set_size; - p->maxDescriptorSetUpdateAfterBindSamplers = max_descriptor_set_size; - p->maxDescriptorSetUpdateAfterBindUniformBuffers = max_descriptor_set_size; - p->maxDescriptorSetUpdateAfterBindUniformBuffersDynamic = MAX_DYNAMIC_UNIFORM_BUFFERS; - p->maxDescriptorSetUpdateAfterBindStorageBuffers = max_descriptor_set_size; - p->maxDescriptorSetUpdateAfterBindStorageBuffersDynamic = MAX_DYNAMIC_STORAGE_BUFFERS; - 
p->maxDescriptorSetUpdateAfterBindSampledImages = max_descriptor_set_size; - p->maxDescriptorSetUpdateAfterBindStorageImages = max_descriptor_set_size; - p->maxDescriptorSetUpdateAfterBindInputAttachments = MAX_RTS; - - p->supportedDepthResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT; - p->supportedStencilResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT; - p->independentResolveNone = false; - p->independentResolve = false; - - p->filterMinmaxSingleComponentFormats = true; - p->filterMinmaxImageComponentMapping = true; - - p->maxTimelineSemaphoreValueDifference = UINT64_MAX; - - p->framebufferIntegerColorSampleCounts = sample_counts; -} - -static void -tu_get_physical_device_properties_1_3(struct tu_physical_device *pdevice, - VkPhysicalDeviceVulkan13Properties *p) -{ - /* TODO move threadsize_base and max_waves to fd_dev_info and use them here */ - p->minSubgroupSize = 64; /* threadsize_base */ - p->maxSubgroupSize = 128; /* threadsize_base * 2 */ - p->maxComputeWorkgroupSubgroups = 16; /* max_waves */ - p->requiredSubgroupSizeStages = VK_SHADER_STAGE_ALL; - - /* Inline uniform buffers are just normal UBOs */ - p->maxInlineUniformBlockSize = MAX_UNIFORM_BUFFER_RANGE; - - /* Halve the normal limit on the number of descriptors, see below. */ - p->maxPerStageDescriptorInlineUniformBlocks = max_descriptor_set_size / 2; - p->maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks = max_descriptor_set_size / 2; - p->maxDescriptorSetInlineUniformBlocks = max_descriptor_set_size / 2; - p->maxDescriptorSetUpdateAfterBindInlineUniformBlocks = max_descriptor_set_size / 2; - /* Because we halve the normal limit on the number of descriptors, in the - * worst case each descriptor takes up half the space, leaving the rest for - * the actual data. - */ - p->maxInlineUniformTotalSize = MAX_SET_SIZE / 2; - - p->integerDotProduct8BitUnsignedAccelerated = false; - p->integerDotProduct8BitSignedAccelerated = false; - p->integerDotProduct8BitMixedSignednessAccelerated = false; - p->integerDotProduct4x8BitPackedUnsignedAccelerated = - pdevice->info->a6xx.has_dp2acc; - /* TODO: we should be able to emulate 4x8BitPackedSigned fast enough */ - p->integerDotProduct4x8BitPackedSignedAccelerated = false; - p->integerDotProduct4x8BitPackedMixedSignednessAccelerated = - pdevice->info->a6xx.has_dp2acc; - p->integerDotProduct16BitUnsignedAccelerated = false; - p->integerDotProduct16BitSignedAccelerated = false; - p->integerDotProduct16BitMixedSignednessAccelerated = false; - p->integerDotProduct32BitUnsignedAccelerated = false; - p->integerDotProduct32BitSignedAccelerated = false; - p->integerDotProduct32BitMixedSignednessAccelerated = false; - p->integerDotProduct64BitUnsignedAccelerated = false; - p->integerDotProduct64BitSignedAccelerated = false; - p->integerDotProduct64BitMixedSignednessAccelerated = false; - p->integerDotProductAccumulatingSaturating8BitUnsignedAccelerated = false; - p->integerDotProductAccumulatingSaturating8BitSignedAccelerated = false; - p->integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated = false; - p->integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated = - pdevice->info->a6xx.has_dp2acc; - /* TODO: we should be able to emulate Saturating4x8BitPackedSigned fast enough */ - p->integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated = false; - p->integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated = - pdevice->info->a6xx.has_dp2acc; - p->integerDotProductAccumulatingSaturating16BitUnsignedAccelerated = false; - 
p->integerDotProductAccumulatingSaturating16BitSignedAccelerated = false; - p->integerDotProductAccumulatingSaturating16BitMixedSignednessAccelerated = false; - p->integerDotProductAccumulatingSaturating32BitUnsignedAccelerated = false; - p->integerDotProductAccumulatingSaturating32BitSignedAccelerated = false; - p->integerDotProductAccumulatingSaturating32BitMixedSignednessAccelerated = false; - p->integerDotProductAccumulatingSaturating64BitUnsignedAccelerated = false; - p->integerDotProductAccumulatingSaturating64BitSignedAccelerated = false; - p->integerDotProductAccumulatingSaturating64BitMixedSignednessAccelerated = false; - - p->storageTexelBufferOffsetAlignmentBytes = 64; - p->storageTexelBufferOffsetSingleTexelAlignment = false; - p->uniformTexelBufferOffsetAlignmentBytes = 64; - p->uniformTexelBufferOffsetSingleTexelAlignment = false; - - /* The address space is 4GB for current kernels, so there's no point - * allowing a larger buffer. Our buffer sizes are 64-bit though, so - * GetBufferDeviceRequirements won't fall over if someone actually creates - * a 4GB buffer. - */ - p->maxBufferSize = 1ull << 32; -} - -VKAPI_ATTR void VKAPI_CALL -tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, - VkPhysicalDeviceProperties2 *pProperties) +void +tu_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, + VkPhysicalDeviceProperties *pProperties) { TU_FROM_HANDLE(tu_physical_device, pdevice, physicalDevice); + VkSampleCountFlags sample_counts = 0xf; + + /* make sure that the entire descriptor set is addressable with a signed + * 32-bit int. So the sum of all limits scaled by descriptor size has to + * be at most 2 GiB. the combined image & samples object count as one of + * both. This limit is for the pipeline layout, not for the set layout, but + * there is no set limit, so we just set a pipeline limit. I don't think + * any app is going to hit this soon. 
*/ + size_t max_descriptor_set_size = + ((1ull << 31) - 16 * MAX_DYNAMIC_BUFFERS) / + (32 /* uniform buffer, 32 due to potential space wasted on alignment */ + + 32 /* storage buffer, 32 due to potential space wasted on alignment */ + + 32 /* sampler, largest when combined with image */ + + 64 /* sampled image */ + 64 /* storage image */); VkPhysicalDeviceLimits limits = { .maxImageDimension1D = (1 << 14), @@ -1136,20 +722,20 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, .maxImageDimensionCube = (1 << 14), .maxImageArrayLayers = (1 << 11), .maxTexelBufferElements = 128 * 1024 * 1024, - .maxUniformBufferRange = MAX_UNIFORM_BUFFER_RANGE, - .maxStorageBufferRange = MAX_STORAGE_BUFFER_RANGE, + .maxUniformBufferRange = UINT32_MAX, + .maxStorageBufferRange = UINT32_MAX, .maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE, .maxMemoryAllocationCount = UINT32_MAX, .maxSamplerAllocationCount = 64 * 1024, .bufferImageGranularity = 64, /* A cache line */ - .sparseAddressSpaceSize = 0, + .sparseAddressSpaceSize = 0xffffffffu, /* buffer max size */ .maxBoundDescriptorSets = MAX_SETS, .maxPerStageDescriptorSamplers = max_descriptor_set_size, .maxPerStageDescriptorUniformBuffers = max_descriptor_set_size, .maxPerStageDescriptorStorageBuffers = max_descriptor_set_size, .maxPerStageDescriptorSampledImages = max_descriptor_set_size, .maxPerStageDescriptorStorageImages = max_descriptor_set_size, - .maxPerStageDescriptorInputAttachments = MAX_RTS, + .maxPerStageDescriptorInputAttachments = max_descriptor_set_size, .maxPerStageResources = max_descriptor_set_size, .maxDescriptorSetSamplers = max_descriptor_set_size, .maxDescriptorSetUniformBuffers = max_descriptor_set_size, @@ -1158,10 +744,10 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, .maxDescriptorSetStorageBuffersDynamic = MAX_DYNAMIC_STORAGE_BUFFERS, .maxDescriptorSetSampledImages = max_descriptor_set_size, .maxDescriptorSetStorageImages = max_descriptor_set_size, - .maxDescriptorSetInputAttachments = MAX_RTS, + .maxDescriptorSetInputAttachments = max_descriptor_set_size, .maxVertexInputAttributes = 32, .maxVertexInputBindings = 32, - .maxVertexInputAttributeOffset = 4095, + .maxVertexInputAttributeOffset = 2047, .maxVertexInputBindingStride = 2048, .maxVertexOutputComponents = 128, .maxTessellationGenerationLevel = 64, @@ -1172,41 +758,41 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, .maxTessellationControlTotalOutputComponents = 4096, .maxTessellationEvaluationInputComponents = 128, .maxTessellationEvaluationOutputComponents = 128, - .maxGeometryShaderInvocations = 32, + .maxGeometryShaderInvocations = 127, .maxGeometryInputComponents = 64, .maxGeometryOutputComponents = 128, .maxGeometryOutputVertices = 256, .maxGeometryTotalOutputComponents = 1024, - .maxFragmentInputComponents = 124, + .maxFragmentInputComponents = 128, .maxFragmentOutputAttachments = 8, .maxFragmentDualSrcAttachments = 1, - .maxFragmentCombinedOutputResources = MAX_RTS + max_descriptor_set_size * 2, + .maxFragmentCombinedOutputResources = 8, .maxComputeSharedMemorySize = 32768, .maxComputeWorkGroupCount = { 65535, 65535, 65535 }, .maxComputeWorkGroupInvocations = 2048, - .maxComputeWorkGroupSize = { 1024, 1024, 1024 }, - .subPixelPrecisionBits = 8, - .subTexelPrecisionBits = 8, - .mipmapPrecisionBits = 8, + .maxComputeWorkGroupSize = { 2048, 2048, 2048 }, + .subPixelPrecisionBits = 4 /* FIXME */, + .subTexelPrecisionBits = 4 /* FIXME */, + .mipmapPrecisionBits = 4 /* FIXME */, .maxDrawIndexedIndexValue = UINT32_MAX, 
.maxDrawIndirectCount = UINT32_MAX, - .maxSamplerLodBias = 4095.0 / 256.0, /* [-16, 15.99609375] */ + .maxSamplerLodBias = 16, .maxSamplerAnisotropy = 16, .maxViewports = MAX_VIEWPORTS, - .maxViewportDimensions = { MAX_VIEWPORT_SIZE, MAX_VIEWPORT_SIZE }, + .maxViewportDimensions = { (1 << 14), (1 << 14) }, .viewportBoundsRange = { INT16_MIN, INT16_MAX }, .viewportSubPixelBits = 8, .minMemoryMapAlignment = 4096, /* A page */ - .minTexelBufferOffsetAlignment = 64, - .minUniformBufferOffsetAlignment = 64, - .minStorageBufferOffsetAlignment = 64, - .minTexelOffset = -16, - .maxTexelOffset = 15, + .minTexelBufferOffsetAlignment = 1, + .minUniformBufferOffsetAlignment = 4, + .minStorageBufferOffsetAlignment = 4, + .minTexelOffset = -32, + .maxTexelOffset = 31, .minTexelGatherOffset = -32, .maxTexelGatherOffset = 31, - .minInterpolationOffset = -0.5, - .maxInterpolationOffset = 0.4375, - .subPixelInterpolationOffsetBits = 4, + .minInterpolationOffset = -2, + .maxInterpolationOffset = 2, + .subPixelInterpolationOffsetBits = 8, .maxFramebufferWidth = (1 << 14), .maxFramebufferHeight = (1 << 14), .maxFramebufferLayers = (1 << 10), @@ -1216,65 +802,51 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, .framebufferNoAttachmentsSampleCounts = sample_counts, .maxColorAttachments = MAX_RTS, .sampledImageColorSampleCounts = sample_counts, - .sampledImageIntegerSampleCounts = sample_counts, + .sampledImageIntegerSampleCounts = VK_SAMPLE_COUNT_1_BIT, .sampledImageDepthSampleCounts = sample_counts, .sampledImageStencilSampleCounts = sample_counts, .storageImageSampleCounts = VK_SAMPLE_COUNT_1_BIT, .maxSampleMaskWords = 1, .timestampComputeAndGraphics = true, - .timestampPeriod = 1000000000.0 / 19200000.0, /* CP_ALWAYS_ON_COUNTER is fixed 19.2MHz */ + .timestampPeriod = 1, .maxClipDistances = 8, .maxCullDistances = 8, .maxCombinedClipAndCullDistances = 8, - .discreteQueuePriorities = 2, - .pointSizeRange = { 1, 4092 }, - .lineWidthRange = { 1.0, 1.0 }, - .pointSizeGranularity = 0.0625, - .lineWidthGranularity = 0.0, - .strictLines = true, + .discreteQueuePriorities = 1, + .pointSizeRange = { 0.125, 255.875 }, + .lineWidthRange = { 0.0, 7.9921875 }, + .pointSizeGranularity = (1.0 / 8.0), + .lineWidthGranularity = (1.0 / 128.0), + .strictLines = false, /* FINISHME */ .standardSampleLocations = true, .optimalBufferCopyOffsetAlignment = 128, .optimalBufferCopyRowPitchAlignment = 128, .nonCoherentAtomSize = 64, }; - pProperties->properties = (VkPhysicalDeviceProperties) { - .apiVersion = TU_API_VERSION, + *pProperties = (VkPhysicalDeviceProperties) { + .apiVersion = tu_physical_device_api_version(pdevice), .driverVersion = vk_get_driver_version(), - .vendorID = 0x5143, - .deviceID = pdevice->dev_id.chip_id, + .vendorID = 0, /* TODO */ + .deviceID = 0, .deviceType = VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU, .limits = limits, .sparseProperties = { 0 }, }; - strcpy(pProperties->properties.deviceName, pdevice->name); - memcpy(pProperties->properties.pipelineCacheUUID, pdevice->cache_uuid, VK_UUID_SIZE); - - VkPhysicalDeviceVulkan11Properties core_1_1 = { - .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES, - }; - tu_get_physical_device_properties_1_1(pdevice, &core_1_1); - - VkPhysicalDeviceVulkan12Properties core_1_2 = { - .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_PROPERTIES, - }; - tu_get_physical_device_properties_1_2(pdevice, &core_1_2); + strcpy(pProperties->deviceName, pdevice->name); + memcpy(pProperties->pipelineCacheUUID, pdevice->cache_uuid, VK_UUID_SIZE); +} - 
VkPhysicalDeviceVulkan13Properties core_1_3 = { - .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_PROPERTIES, - }; - tu_get_physical_device_properties_1_3(pdevice, &core_1_3); +void +tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, + VkPhysicalDeviceProperties2 *pProperties) +{ + TU_FROM_HANDLE(tu_physical_device, pdevice, physicalDevice); + tu_GetPhysicalDeviceProperties(physicalDevice, &pProperties->properties); vk_foreach_struct(ext, pProperties->pNext) { - if (vk_get_physical_device_core_1_1_property_ext(ext, &core_1_1)) - continue; - if (vk_get_physical_device_core_1_2_property_ext(ext, &core_1_2)) - continue; - if (vk_get_physical_device_core_1_3_property_ext(ext, &core_1_3)) - continue; - switch (ext->sType) { case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PUSH_DESCRIPTOR_PROPERTIES_KHR: { VkPhysicalDevicePushDescriptorPropertiesKHR *properties = @@ -1282,109 +854,36 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, properties->maxPushDescriptors = MAX_PUSH_DESCRIPTORS; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_PROPERTIES_EXT: { - VkPhysicalDeviceTransformFeedbackPropertiesEXT *properties = - (VkPhysicalDeviceTransformFeedbackPropertiesEXT *)ext; - - properties->maxTransformFeedbackStreams = IR3_MAX_SO_STREAMS; - properties->maxTransformFeedbackBuffers = IR3_MAX_SO_BUFFERS; - properties->maxTransformFeedbackBufferSize = UINT32_MAX; - properties->maxTransformFeedbackStreamDataSize = 512; - properties->maxTransformFeedbackBufferDataSize = 512; - properties->maxTransformFeedbackBufferDataStride = 512; - properties->transformFeedbackQueries = true; - properties->transformFeedbackStreamsLinesTriangles = true; - properties->transformFeedbackRasterizationStreamSelect = true; - properties->transformFeedbackDraw = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLE_LOCATIONS_PROPERTIES_EXT: { - VkPhysicalDeviceSampleLocationsPropertiesEXT *properties = - (VkPhysicalDeviceSampleLocationsPropertiesEXT *)ext; - properties->sampleLocationSampleCounts = 0; - if (pdevice->vk.supported_extensions.EXT_sample_locations) { - properties->sampleLocationSampleCounts = - VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT; - } - properties->maxSampleLocationGridSize = (VkExtent2D) { 1 , 1 }; - properties->sampleLocationCoordinateRange[0] = 0.0f; - properties->sampleLocationCoordinateRange[1] = 0.9375f; - properties->sampleLocationSubPixelBits = 4; - properties->variableSampleLocations = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_PROPERTIES_EXT: { - VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *props = - (VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *)ext; - props->maxVertexAttribDivisor = UINT32_MAX; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES: { + VkPhysicalDeviceIDProperties *properties = + (VkPhysicalDeviceIDProperties *) ext; + memcpy(properties->driverUUID, pdevice->driver_uuid, VK_UUID_SIZE); + memcpy(properties->deviceUUID, pdevice->device_uuid, VK_UUID_SIZE); + properties->deviceLUIDValid = false; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_PROPERTIES_EXT: { - VkPhysicalDeviceCustomBorderColorPropertiesEXT *props = (void *)ext; - props->maxCustomBorderColorSamplers = TU_BORDER_COLOR_COUNT; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_PROPERTIES: { + VkPhysicalDeviceMultiviewProperties *properties = + (VkPhysicalDeviceMultiviewProperties *) ext; + properties->maxMultiviewViewCount = MAX_VIEWS; + 
properties->maxMultiviewInstanceIndex = INT_MAX; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_PROPERTIES_KHR: { - VkPhysicalDevicePerformanceQueryPropertiesKHR *properties = - (VkPhysicalDevicePerformanceQueryPropertiesKHR *)ext; - properties->allowCommandBufferQueryCopies = false; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_POINT_CLIPPING_PROPERTIES: { + VkPhysicalDevicePointClippingProperties *properties = + (VkPhysicalDevicePointClippingProperties *) ext; + properties->pointClippingBehavior = + VK_POINT_CLIPPING_BEHAVIOR_ALL_CLIP_PLANES; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_PROPERTIES_EXT: { - VkPhysicalDeviceRobustness2PropertiesEXT *props = (void *)ext; - /* see write_buffer_descriptor() */ - props->robustStorageBufferAccessSizeAlignment = 4; - /* see write_ubo_descriptor() */ - props->robustUniformBufferAccessSizeAlignment = 16; - break; - } - - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROVOKING_VERTEX_PROPERTIES_EXT: { - VkPhysicalDeviceProvokingVertexPropertiesEXT *properties = - (VkPhysicalDeviceProvokingVertexPropertiesEXT *)ext; - properties->provokingVertexModePerPipeline = true; - properties->transformFeedbackPreservesTriangleFanProvokingVertex = false; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LINE_RASTERIZATION_PROPERTIES_EXT: { - VkPhysicalDeviceLineRasterizationPropertiesEXT *props = - (VkPhysicalDeviceLineRasterizationPropertiesEXT *)ext; - props->lineSubPixelPrecisionBits = 8; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRM_PROPERTIES_EXT: { - VkPhysicalDeviceDrmPropertiesEXT *props = - (VkPhysicalDeviceDrmPropertiesEXT *)ext; - props->hasPrimary = pdevice->has_master; - props->primaryMajor = pdevice->master_major; - props->primaryMinor = pdevice->master_minor; - - props->hasRender = pdevice->has_local; - props->renderMajor = pdevice->local_major; - props->renderMinor = pdevice->local_minor; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_MODULE_IDENTIFIER_PROPERTIES_EXT: { - VkPhysicalDeviceShaderModuleIdentifierPropertiesEXT *props = - (VkPhysicalDeviceShaderModuleIdentifierPropertiesEXT *)ext; - STATIC_ASSERT(sizeof(vk_shaderModuleIdentifierAlgorithmUUID) == - sizeof(props->shaderModuleIdentifierAlgorithmUUID)); - memcpy(props->shaderModuleIdentifierAlgorithmUUID, - vk_shaderModuleIdentifierAlgorithmUUID, - sizeof(props->shaderModuleIdentifierAlgorithmUUID)); - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTI_DRAW_PROPERTIES_EXT: { - VkPhysicalDeviceMultiDrawPropertiesEXT *properties = - (VkPhysicalDeviceMultiDrawPropertiesEXT *)ext; - properties->maxMultiDrawCount = 2048; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_GRAPHICS_PIPELINE_LIBRARY_PROPERTIES_EXT: { - VkPhysicalDeviceGraphicsPipelineLibraryPropertiesEXT *props = - (VkPhysicalDeviceGraphicsPipelineLibraryPropertiesEXT *)ext; - props->graphicsPipelineLibraryFastLinking = true; - props->graphicsPipelineLibraryIndependentInterpolationDecoration = true; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES: { + VkPhysicalDeviceMaintenance3Properties *properties = + (VkPhysicalDeviceMaintenance3Properties *) ext; + /* Make sure everything is addressable by a signed 32-bit int, and + * our largest descriptors are 96 bytes. 
*/ + properties->maxPerSetDescriptors = (1ull << 31) / 96; + /* Our buffer size fields allow only this much */ + properties->maxMemoryAllocationSize = 0xFFFFFFFFull; break; } default: @@ -1397,99 +896,36 @@ static const VkQueueFamilyProperties tu_queue_family_properties = { .queueFlags = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT, .queueCount = 1, - .timestampValidBits = 48, + .timestampValidBits = 64, .minImageTransferGranularity = { 1, 1, 1 }, }; -static void -tu_physical_device_get_global_priority_properties(const struct tu_physical_device *pdevice, - VkQueueFamilyGlobalPriorityPropertiesKHR *props) +void +tu_GetPhysicalDeviceQueueFamilyProperties( + VkPhysicalDevice physicalDevice, + uint32_t *pQueueFamilyPropertyCount, + VkQueueFamilyProperties *pQueueFamilyProperties) { - props->priorityCount = MIN2(pdevice->submitqueue_priority_count, 3); - switch (props->priorityCount) { - case 1: - props->priorities[0] = VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR; - break; - case 2: - props->priorities[0] = VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR; - props->priorities[1] = VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR; - break; - case 3: - props->priorities[0] = VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR; - props->priorities[1] = VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR; - props->priorities[2] = VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR; - break; - default: - unreachable("unexpected priority count"); - break; - } -} + VK_OUTARRAY_MAKE(out, pQueueFamilyProperties, pQueueFamilyPropertyCount); -static int -tu_physical_device_get_submitqueue_priority(const struct tu_physical_device *pdevice, - VkQueueGlobalPriorityKHR global_priority, - bool global_priority_query) -{ - if (global_priority_query) { - VkQueueFamilyGlobalPriorityPropertiesKHR props; - tu_physical_device_get_global_priority_properties(pdevice, &props); - - bool valid = false; - for (uint32_t i = 0; i < props.priorityCount; i++) { - if (props.priorities[i] == global_priority) { - valid = true; - break; - } - } - - if (!valid) - return -1; - } - - /* Valid values are from 0 to (pdevice->submitqueue_priority_count - 1), - * with 0 being the highest priority. This matches what freedreno does. 
- */ - int priority; - if (global_priority == VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR) - priority = pdevice->submitqueue_priority_count / 2; - else if (global_priority < VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR) - priority = pdevice->submitqueue_priority_count - 1; - else - priority = 0; - - return priority; + vk_outarray_append(&out, p) { *p = tu_queue_family_properties; } } -VKAPI_ATTR void VKAPI_CALL +void tu_GetPhysicalDeviceQueueFamilyProperties2( VkPhysicalDevice physicalDevice, uint32_t *pQueueFamilyPropertyCount, VkQueueFamilyProperties2 *pQueueFamilyProperties) { - TU_FROM_HANDLE(tu_physical_device, pdevice, physicalDevice); + VK_OUTARRAY_MAKE(out, pQueueFamilyProperties, pQueueFamilyPropertyCount); - VK_OUTARRAY_MAKE_TYPED(VkQueueFamilyProperties2, out, - pQueueFamilyProperties, pQueueFamilyPropertyCount); - - vk_outarray_append_typed(VkQueueFamilyProperties2, &out, p) + vk_outarray_append(&out, p) { p->queueFamilyProperties = tu_queue_family_properties; - - vk_foreach_struct(ext, p->pNext) { - switch (ext->sType) { - case VK_STRUCTURE_TYPE_QUEUE_FAMILY_GLOBAL_PRIORITY_PROPERTIES_KHR: { - VkQueueFamilyGlobalPriorityPropertiesKHR *props = (void *)ext; - tu_physical_device_get_global_priority_properties(pdevice, props); - break; - } - default: - break; - } - } } } -uint64_t +static uint64_t tu_get_system_heap_size() { struct sysinfo info; @@ -1509,101 +945,50 @@ tu_get_system_heap_size() return available_ram; } -static VkDeviceSize -tu_get_budget_memory(struct tu_physical_device *physical_device) -{ - uint64_t heap_size = physical_device->heap.size; - uint64_t heap_used = physical_device->heap.used; - uint64_t sys_available; - ASSERTED bool has_available_memory = - os_get_available_system_memory(&sys_available); - assert(has_available_memory); - - /* - * Let's not incite the app to starve the system: report at most 90% of - * available system memory. 
- */ - uint64_t heap_available = sys_available * 9 / 10; - return MIN2(heap_size, heap_used + heap_available); -} - -VKAPI_ATTR void VKAPI_CALL -tu_GetPhysicalDeviceMemoryProperties2(VkPhysicalDevice pdev, - VkPhysicalDeviceMemoryProperties2 *props2) +void +tu_GetPhysicalDeviceMemoryProperties( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceMemoryProperties *pMemoryProperties) { - TU_FROM_HANDLE(tu_physical_device, physical_device, pdev); - - VkPhysicalDeviceMemoryProperties *props = &props2->memoryProperties; - props->memoryHeapCount = 1; - props->memoryHeaps[0].size = physical_device->heap.size; - props->memoryHeaps[0].flags = physical_device->heap.flags; + pMemoryProperties->memoryHeapCount = 1; + pMemoryProperties->memoryHeaps[0].size = tu_get_system_heap_size(); + pMemoryProperties->memoryHeaps[0].flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT; - props->memoryTypeCount = 1; - props->memoryTypes[0].propertyFlags = + pMemoryProperties->memoryTypeCount = 1; + pMemoryProperties->memoryTypes[0].propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; - props->memoryTypes[0].heapIndex = 0; + pMemoryProperties->memoryTypes[0].heapIndex = 0; +} - vk_foreach_struct(ext, props2->pNext) - { - switch (ext->sType) { - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT: { - VkPhysicalDeviceMemoryBudgetPropertiesEXT *memory_budget_props = - (VkPhysicalDeviceMemoryBudgetPropertiesEXT *) ext; - memory_budget_props->heapUsage[0] = physical_device->heap.used; - memory_budget_props->heapBudget[0] = tu_get_budget_memory(physical_device); - - /* The heapBudget and heapUsage values must be zero for array elements - * greater than or equal to VkPhysicalDeviceMemoryProperties::memoryHeapCount - */ - for (unsigned i = 1; i < VK_MAX_MEMORY_HEAPS; i++) { - memory_budget_props->heapBudget[i] = 0u; - memory_budget_props->heapUsage[i] = 0u; - } - break; - } - default: - break; - } - } +void +tu_GetPhysicalDeviceMemoryProperties2( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceMemoryProperties2 *pMemoryProperties) +{ + return tu_GetPhysicalDeviceMemoryProperties( + physicalDevice, &pMemoryProperties->memoryProperties); } static VkResult tu_queue_init(struct tu_device *device, struct tu_queue *queue, + uint32_t queue_family_index, int idx, - const VkDeviceQueueCreateInfo *create_info, - bool global_priority_query) -{ - const VkDeviceQueueGlobalPriorityCreateInfoKHR *priority_info = - vk_find_struct_const(create_info->pNext, - DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR); - const enum VkQueueGlobalPriorityKHR global_priority = priority_info ? 
- priority_info->globalPriority : VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR; - - const int priority = tu_physical_device_get_submitqueue_priority( - device->physical_device, global_priority, global_priority_query); - if (priority < 0) { - return vk_startup_errorf(device->instance, VK_ERROR_INITIALIZATION_FAILED, - "invalid global priority"); - } - - VkResult result = vk_queue_init(&queue->vk, &device->vk, create_info, idx); - if (result != VK_SUCCESS) - return result; - + VkDeviceQueueCreateFlags flags) +{ + queue->_loader_data.loaderMagic = ICD_LOADER_MAGIC; queue->device = device; -#ifndef TU_USE_KGSL - queue->vk.driver_submit = tu_queue_submit; -#endif + queue->queue_family_index = queue_family_index; + queue->queue_idx = idx; + queue->flags = flags; - int ret = tu_drm_submitqueue_new(device, priority, &queue->msm_queue_id); + int ret = tu_drm_submitqueue_new(device, 0, &queue->msm_queue_id); if (ret) - return vk_startup_errorf(device->instance, VK_ERROR_INITIALIZATION_FAILED, - "submitqueue create failed"); + return VK_ERROR_INITIALIZATION_FAILED; - queue->fence = -1; + tu_fence_init(&queue->submit_fence, false); return VK_SUCCESS; } @@ -1611,271 +996,21 @@ tu_queue_init(struct tu_device *device, static void tu_queue_finish(struct tu_queue *queue) { - vk_queue_finish(&queue->vk); - if (queue->fence >= 0) - close(queue->fence); + tu_fence_finish(&queue->submit_fence); tu_drm_submitqueue_close(queue->device, queue->msm_queue_id); } -uint64_t -tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts) -{ - /* This is based on the 19.2MHz always-on rbbm timer. - * - * TODO we should probably query this value from kernel.. - */ - return ts * (1000000000 / 19200000); -} - -static void* -tu_trace_create_ts_buffer(struct u_trace_context *utctx, uint32_t size) -{ - struct tu_device *device = - container_of(utctx, struct tu_device, trace_context); - - struct tu_bo *bo; - tu_bo_init_new(device, &bo, size, false, "trace"); - - return bo; -} - -static void -tu_trace_destroy_ts_buffer(struct u_trace_context *utctx, void *timestamps) -{ - struct tu_device *device = - container_of(utctx, struct tu_device, trace_context); - struct tu_bo *bo = timestamps; - - tu_bo_finish(device, bo); -} - -static void -tu_trace_record_ts(struct u_trace *ut, void *cs, void *timestamps, - unsigned idx, bool end_of_pipe) -{ - struct tu_bo *bo = timestamps; - struct tu_cs *ts_cs = cs; - - unsigned ts_offset = idx * sizeof(uint64_t); - tu_cs_emit_pkt7(ts_cs, CP_EVENT_WRITE, 4); - tu_cs_emit(ts_cs, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS) | CP_EVENT_WRITE_0_TIMESTAMP); - tu_cs_emit_qw(ts_cs, bo->iova + ts_offset); - tu_cs_emit(ts_cs, 0x00000000); -} - -static uint64_t -tu_trace_read_ts(struct u_trace_context *utctx, - void *timestamps, unsigned idx, void *flush_data) -{ - struct tu_device *device = - container_of(utctx, struct tu_device, trace_context); - struct tu_bo *bo = timestamps; - struct tu_u_trace_submission_data *submission_data = flush_data; - - /* Only need to stall on results for the first entry: */ - if (idx == 0) { - tu_device_wait_u_trace(device, submission_data->syncobj); - } - - if (tu_bo_map(device, bo) != VK_SUCCESS) { - return U_TRACE_NO_TIMESTAMP; - } - - uint64_t *ts = bo->map; - - /* Don't translate the no-timestamp marker: */ - if (ts[idx] == U_TRACE_NO_TIMESTAMP) - return U_TRACE_NO_TIMESTAMP; - - return tu_device_ticks_to_ns(device, ts[idx]); -} - -static void -tu_trace_delete_flush_data(struct u_trace_context *utctx, void *flush_data) -{ - struct tu_device *device = - container_of(utctx, struct tu_device, 
trace_context); - struct tu_u_trace_submission_data *submission_data = flush_data; - - tu_u_trace_submission_data_finish(device, submission_data); -} - -void -tu_copy_timestamp_buffer(struct u_trace_context *utctx, void *cmdstream, - void *ts_from, uint32_t from_offset, - void *ts_to, uint32_t to_offset, - uint32_t count) -{ - struct tu_cs *cs = cmdstream; - struct tu_bo *bo_from = ts_from; - struct tu_bo *bo_to = ts_to; - - tu_cs_emit_pkt7(cs, CP_MEMCPY, 5); - tu_cs_emit(cs, count * sizeof(uint64_t) / sizeof(uint32_t)); - tu_cs_emit_qw(cs, bo_from->iova + from_offset * sizeof(uint64_t)); - tu_cs_emit_qw(cs, bo_to->iova + to_offset * sizeof(uint64_t)); -} - -/* Special helpers instead of u_trace_begin_iterator()/u_trace_end_iterator() - * that ignore tracepoints at the beginning/end that are part of a - * suspend/resume chain. - */ -static struct u_trace_iterator -tu_cmd_begin_iterator(struct tu_cmd_buffer *cmdbuf) -{ - switch (cmdbuf->state.suspend_resume) { - case SR_IN_PRE_CHAIN: - return cmdbuf->trace_renderpass_end; - case SR_AFTER_PRE_CHAIN: - case SR_IN_CHAIN_AFTER_PRE_CHAIN: - return cmdbuf->pre_chain.trace_renderpass_end; - default: - return u_trace_begin_iterator(&cmdbuf->trace); - } -} - -static struct u_trace_iterator -tu_cmd_end_iterator(struct tu_cmd_buffer *cmdbuf) -{ - switch (cmdbuf->state.suspend_resume) { - case SR_IN_PRE_CHAIN: - return cmdbuf->trace_renderpass_end; - case SR_IN_CHAIN: - case SR_IN_CHAIN_AFTER_PRE_CHAIN: - return cmdbuf->trace_renderpass_start; - default: - return u_trace_end_iterator(&cmdbuf->trace); - } -} -VkResult -tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs** cs, - struct u_trace **trace_copy) +static int +tu_get_device_extension_index(const char *name) { - *cs = vk_zalloc(&cmdbuf->device->vk.alloc, sizeof(struct tu_cs), 8, - VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); - - if (*cs == NULL) { - return VK_ERROR_OUT_OF_HOST_MEMORY; - } - - tu_cs_init(*cs, cmdbuf->device, TU_CS_MODE_GROW, - list_length(&cmdbuf->trace.trace_chunks) * 6 + 3, "trace copy timestamp cs"); - - tu_cs_begin(*cs); - - tu_cs_emit_wfi(*cs); - tu_cs_emit_pkt7(*cs, CP_WAIT_FOR_ME, 0); - - *trace_copy = vk_zalloc(&cmdbuf->device->vk.alloc, sizeof(struct u_trace), 8, - VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); - - if (*trace_copy == NULL) { - return VK_ERROR_OUT_OF_HOST_MEMORY; + for (unsigned i = 0; i < TU_DEVICE_EXTENSION_COUNT; ++i) { + if (strcmp(name, tu_device_extensions[i].extensionName) == 0) + return i; } - - u_trace_init(*trace_copy, cmdbuf->trace.utctx); - u_trace_clone_append(tu_cmd_begin_iterator(cmdbuf), - tu_cmd_end_iterator(cmdbuf), - *trace_copy, *cs, - tu_copy_timestamp_buffer); - - tu_cs_emit_wfi(*cs); - - tu_cs_end(*cs); - - return VK_SUCCESS; + return -1; } VkResult -tu_u_trace_submission_data_create( - struct tu_device *device, - struct tu_cmd_buffer **cmd_buffers, - uint32_t cmd_buffer_count, - struct tu_u_trace_submission_data **submission_data) -{ - *submission_data = - vk_zalloc(&device->vk.alloc, - sizeof(struct tu_u_trace_submission_data), 8, - VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); - - if (!(*submission_data)) { - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - } - - struct tu_u_trace_submission_data *data = *submission_data; - - data->cmd_trace_data = - vk_zalloc(&device->vk.alloc, - cmd_buffer_count * sizeof(struct tu_u_trace_cmd_data), 8, - VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); - - if (!data->cmd_trace_data) { - goto fail; - } - - data->cmd_buffer_count = cmd_buffer_count; - data->last_buffer_with_tracepoints = -1; - - for 
(uint32_t i = 0; i < cmd_buffer_count; ++i) { - struct tu_cmd_buffer *cmdbuf = cmd_buffers[i]; - - if (!u_trace_has_points(&cmdbuf->trace)) - continue; - - data->last_buffer_with_tracepoints = i; - - if (!(cmdbuf->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT)) { - /* A single command buffer could be submitted several times, but we - * already baked timestamp iova addresses and trace points are - * single-use. Therefor we have to copy trace points and create - * a new timestamp buffer on every submit of reusable command buffer. - */ - if (tu_create_copy_timestamp_cs(cmdbuf, - &data->cmd_trace_data[i].timestamp_copy_cs, - &data->cmd_trace_data[i].trace) != VK_SUCCESS) { - goto fail; - } - - assert(data->cmd_trace_data[i].timestamp_copy_cs->entry_count == 1); - } else { - data->cmd_trace_data[i].trace = &cmdbuf->trace; - } - } - - assert(data->last_buffer_with_tracepoints != -1); - - return VK_SUCCESS; - -fail: - tu_u_trace_submission_data_finish(device, data); - *submission_data = NULL; - - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); -} - -void -tu_u_trace_submission_data_finish( - struct tu_device *device, - struct tu_u_trace_submission_data *submission_data) -{ - for (uint32_t i = 0; i < submission_data->cmd_buffer_count; ++i) { - /* Only if we had to create a copy of trace we should free it */ - struct tu_u_trace_cmd_data *cmd_data = &submission_data->cmd_trace_data[i]; - if (cmd_data->timestamp_copy_cs) { - tu_cs_finish(cmd_data->timestamp_copy_cs); - vk_free(&device->vk.alloc, cmd_data->timestamp_copy_cs); - - u_trace_fini(cmd_data->trace); - vk_free(&device->vk.alloc, cmd_data->trace); - } - } - - vk_free(&device->vk.alloc, submission_data->cmd_trace_data); - vk_free(&device->vk.alloc, submission_data->syncobj); - vk_free(&device->vk.alloc, submission_data); -} - -VKAPI_ATTR VkResult VKAPI_CALL tu_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, @@ -1884,92 +1019,59 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, TU_FROM_HANDLE(tu_physical_device, physical_device, physicalDevice); VkResult result; struct tu_device *device; - bool custom_border_colors = false; - bool perf_query_pools = false; - bool robust_buffer_access2 = false; - bool border_color_without_format = false; - bool global_priority_query = false; - vk_foreach_struct_const(ext, pCreateInfo->pNext) { - switch (ext->sType) { - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_FEATURES_EXT: { - const VkPhysicalDeviceCustomBorderColorFeaturesEXT *border_color_features = (const void *)ext; - custom_border_colors = border_color_features->customBorderColors; - border_color_without_format = - border_color_features->customBorderColorWithoutFormat; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_FEATURES_KHR: { - const VkPhysicalDevicePerformanceQueryFeaturesKHR *feature = - (VkPhysicalDevicePerformanceQueryFeaturesKHR *)ext; - perf_query_pools = feature->performanceCounterQueryPools; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_FEATURES_EXT: { - VkPhysicalDeviceRobustness2FeaturesEXT *features = (void *)ext; - robust_buffer_access2 = features->robustBufferAccess2; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_GLOBAL_PRIORITY_QUERY_FEATURES_KHR: { - VkPhysicalDeviceGlobalPriorityQueryFeaturesKHR *features = (void *)ext; - global_priority_query = features->globalPriorityQuery; - break; - } - default: - break; + /* Check enabled features */ + if 
(pCreateInfo->pEnabledFeatures) { + VkPhysicalDeviceFeatures supported_features; + tu_GetPhysicalDeviceFeatures(physicalDevice, &supported_features); + VkBool32 *supported_feature = (VkBool32 *) &supported_features; + VkBool32 *enabled_feature = (VkBool32 *) pCreateInfo->pEnabledFeatures; + unsigned num_features = + sizeof(VkPhysicalDeviceFeatures) / sizeof(VkBool32); + for (uint32_t i = 0; i < num_features; i++) { + if (enabled_feature[i] && !supported_feature[i]) + return vk_error(physical_device->instance, + VK_ERROR_FEATURE_NOT_PRESENT); } } - device = vk_zalloc2(&physical_device->instance->vk.alloc, pAllocator, + device = vk_zalloc2(&physical_device->instance->alloc, pAllocator, sizeof(*device), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); if (!device) - return vk_startup_errorf(physical_device->instance, VK_ERROR_OUT_OF_HOST_MEMORY, "OOM"); - - struct vk_device_dispatch_table dispatch_table; - vk_device_dispatch_table_from_entrypoints( - &dispatch_table, &tu_device_entrypoints, true); - vk_device_dispatch_table_from_entrypoints( - &dispatch_table, &wsi_device_entrypoints, false); - - result = vk_device_init(&device->vk, &physical_device->vk, - &dispatch_table, pCreateInfo, pAllocator); - if (result != VK_SUCCESS) { - vk_free(&device->vk.alloc, device); - return vk_startup_errorf(physical_device->instance, result, - "vk_device_init failed"); - } + return vk_error(physical_device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + device->_loader_data.loaderMagic = ICD_LOADER_MAGIC; device->instance = physical_device->instance; device->physical_device = physical_device; - device->fd = physical_device->local_fd; - device->vk.command_buffer_ops = &tu_cmd_buffer_ops; - device->vk.check_status = tu_device_check_status; - mtx_init(&device->bo_mutex, mtx_plain); - mtx_init(&device->pipeline_mutex, mtx_plain); - mtx_init(&device->autotune_mutex, mtx_plain); - u_rwlock_init(&device->dma_bo_lock); - pthread_mutex_init(&device->submit_mutex, NULL); + if (pAllocator) + device->alloc = *pAllocator; + else + device->alloc = physical_device->instance->alloc; - if (device->instance->debug_flags & TU_DEBUG_BOS) - device->bo_sizes = _mesa_hash_table_create(NULL, _mesa_hash_string, _mesa_key_string_equal); + for (uint32_t i = 0; i < pCreateInfo->enabledExtensionCount; i++) { + const char *ext_name = pCreateInfo->ppEnabledExtensionNames[i]; + int index = tu_get_device_extension_index(ext_name); + if (index < 0 || + !physical_device->supported_extensions.extensions[index]) { + vk_free(&device->alloc, device); + return vk_error(physical_device->instance, + VK_ERROR_EXTENSION_NOT_PRESENT); + } -#ifndef TU_USE_KGSL - vk_device_set_drm_fd(&device->vk, device->fd); -#endif + device->enabled_extensions.extensions[index] = true; + } for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) { const VkDeviceQueueCreateInfo *queue_create = &pCreateInfo->pQueueCreateInfos[i]; uint32_t qfi = queue_create->queueFamilyIndex; device->queues[qfi] = vk_alloc( - &device->vk.alloc, queue_create->queueCount * sizeof(struct tu_queue), + &device->alloc, queue_create->queueCount * sizeof(struct tu_queue), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); if (!device->queues[qfi]) { - result = vk_startup_errorf(physical_device->instance, - VK_ERROR_OUT_OF_HOST_MEMORY, - "OOM"); - goto fail_queues; + result = VK_ERROR_OUT_OF_HOST_MEMORY; + goto fail; } memset(device->queues[qfi], 0, @@ -1978,221 +1080,50 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, device->queue_count[qfi] = queue_create->queueCount; for (unsigned q = 0; q < 
queue_create->queueCount; q++) { - result = tu_queue_init(device, &device->queues[qfi][q], q, - queue_create, global_priority_query); - if (result != VK_SUCCESS) { - device->queue_count[qfi] = q; - goto fail_queues; - } - } - } - - device->compiler = - ir3_compiler_create(NULL, &physical_device->dev_id, - &(struct ir3_compiler_options) { - .robust_buffer_access2 = robust_buffer_access2, - .push_ubo_with_preamble = true, - .disable_cache = true, - }); - if (!device->compiler) { - result = vk_startup_errorf(physical_device->instance, - VK_ERROR_INITIALIZATION_FAILED, - "failed to initialize ir3 compiler"); - goto fail_queues; - } - - /* Initialize sparse array for refcounting imported BOs */ - util_sparse_array_init(&device->bo_map, sizeof(struct tu_bo), 512); - - /* initial sizes, these will increase if there is overflow */ - device->vsc_draw_strm_pitch = 0x1000 + VSC_PAD; - device->vsc_prim_strm_pitch = 0x4000 + VSC_PAD; - - uint32_t global_size = sizeof(struct tu6_global); - if (custom_border_colors) - global_size += TU_BORDER_COLOR_COUNT * sizeof(struct bcolor_entry); - - tu_bo_suballocator_init(&device->pipeline_suballoc, device, - 128 * 1024, TU_BO_ALLOC_GPU_READ_ONLY | TU_BO_ALLOC_ALLOW_DUMP); - tu_bo_suballocator_init(&device->autotune_suballoc, device, - 128 * 1024, 0); - - result = tu_bo_init_new(device, &device->global_bo, global_size, - TU_BO_ALLOC_ALLOW_DUMP, "global"); - if (result != VK_SUCCESS) { - vk_startup_errorf(device->instance, result, "BO init"); - goto fail_global_bo; - } - - result = tu_bo_map(device, device->global_bo); - if (result != VK_SUCCESS) { - vk_startup_errorf(device->instance, result, "BO map"); - goto fail_global_bo_map; - } - - struct tu6_global *global = device->global_bo->map; - tu_init_clear_blit_shaders(device); - global->predicate = 0; - global->vtx_stats_query_not_running = 1; - global->dbg_one = (uint32_t)-1; - global->dbg_gmem_total_loads = 0; - global->dbg_gmem_taken_loads = 0; - global->dbg_gmem_total_stores = 0; - global->dbg_gmem_taken_stores = 0; - for (int i = 0; i < TU_BORDER_COLOR_BUILTIN; i++) { - VkClearColorValue border_color = vk_border_color_value(i); - tu6_pack_border_color(&global->bcolor_builtin[i], &border_color, - vk_border_color_is_int(i)); - } - - /* initialize to ones so ffs can be used to find unused slots */ - BITSET_ONES(device->custom_border_color); - - result = tu_init_dynamic_rendering(device); - if (result != VK_SUCCESS) { - vk_startup_errorf(device->instance, result, "dynamic rendering"); - goto fail_dynamic_rendering; - } - - struct vk_pipeline_cache_create_info pcc_info = { }; - device->mem_cache = vk_pipeline_cache_create(&device->vk, &pcc_info, - false); - if (!device->mem_cache) { - result = VK_ERROR_OUT_OF_HOST_MEMORY; - vk_startup_errorf(device->instance, result, "create pipeline cache failed"); - goto fail_pipeline_cache; - } - - if (perf_query_pools) { - /* Prepare command streams setting pass index to the PERF_CNTRS_REG - * from 0 to 31. One of these will be picked up at cmd submit time - * when the perf query is executed. 
- */ - struct tu_cs *cs; - - if (!(device->perfcntrs_pass_cs = calloc(1, sizeof(struct tu_cs)))) { - result = vk_startup_errorf(device->instance, - VK_ERROR_OUT_OF_HOST_MEMORY, "OOM"); - goto fail_perfcntrs_pass_alloc; - } - - device->perfcntrs_pass_cs_entries = calloc(32, sizeof(struct tu_cs_entry)); - if (!device->perfcntrs_pass_cs_entries) { - result = vk_startup_errorf(device->instance, - VK_ERROR_OUT_OF_HOST_MEMORY, "OOM"); - goto fail_perfcntrs_pass_entries_alloc; - } - - cs = device->perfcntrs_pass_cs; - tu_cs_init(cs, device, TU_CS_MODE_SUB_STREAM, 96, "perfcntrs cs"); - - for (unsigned i = 0; i < 32; i++) { - struct tu_cs sub_cs; - - result = tu_cs_begin_sub_stream(cs, 3, &sub_cs); - if (result != VK_SUCCESS) { - vk_startup_errorf(device->instance, result, - "failed to allocate commands streams"); - goto fail_prepare_perfcntrs_pass_cs; - } - - tu_cs_emit_regs(&sub_cs, A6XX_CP_SCRATCH_REG(PERF_CNTRS_REG, 1 << i)); - tu_cs_emit_pkt7(&sub_cs, CP_WAIT_FOR_ME, 0); - - device->perfcntrs_pass_cs_entries[i] = tu_cs_end_sub_stream(cs, &sub_cs); + result = tu_queue_init(device, &device->queues[qfi][q], qfi, q, + queue_create->flags); + if (result != VK_SUCCESS) + goto fail; } } - /* Initialize a condition variable for timeline semaphore */ - pthread_condattr_t condattr; - if (pthread_condattr_init(&condattr) != 0) { - result = vk_startup_errorf(physical_device->instance, - VK_ERROR_INITIALIZATION_FAILED, - "pthread condattr init"); - goto fail_timeline_cond; - } - if (pthread_condattr_setclock(&condattr, CLOCK_MONOTONIC) != 0) { - pthread_condattr_destroy(&condattr); - result = vk_startup_errorf(physical_device->instance, - VK_ERROR_INITIALIZATION_FAILED, - "pthread condattr clock setup"); - goto fail_timeline_cond; - } - if (pthread_cond_init(&device->timeline_cond, &condattr) != 0) { - pthread_condattr_destroy(&condattr); - result = vk_startup_errorf(physical_device->instance, - VK_ERROR_INITIALIZATION_FAILED, - "pthread cond init"); - goto fail_timeline_cond; - } - pthread_condattr_destroy(&condattr); - - result = tu_autotune_init(&device->autotune, device); - if (result != VK_SUCCESS) { - goto fail_timeline_cond; - } - - for (unsigned i = 0; i < ARRAY_SIZE(device->scratch_bos); i++) - mtx_init(&device->scratch_bos[i].construct_mtx, mtx_plain); - - mtx_init(&device->fiber_pvtmem_bo.mtx, mtx_plain); - mtx_init(&device->wave_pvtmem_bo.mtx, mtx_plain); - - mtx_init(&device->mutex, mtx_plain); - - device->use_z24uint_s8uint = - physical_device->info->a6xx.has_z24uint_s8uint && - !border_color_without_format; - - tu_gpu_tracepoint_config_variable(); + device->compiler = ir3_compiler_create(NULL, physical_device->gpu_id); + if (!device->compiler) + goto fail; - device->submit_count = 0; - u_trace_context_init(&device->trace_context, device, - tu_trace_create_ts_buffer, - tu_trace_destroy_ts_buffer, - tu_trace_record_ts, - tu_trace_read_ts, - tu_trace_delete_flush_data); + VkPipelineCacheCreateInfo ci; + ci.sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO; + ci.pNext = NULL; + ci.flags = 0; + ci.pInitialData = NULL; + ci.initialDataSize = 0; + VkPipelineCache pc; + result = + tu_CreatePipelineCache(tu_device_to_handle(device), &ci, NULL, &pc); + if (result != VK_SUCCESS) + goto fail; - tu_breadcrumbs_init(device); + device->mem_cache = tu_pipeline_cache_from_handle(pc); *pDevice = tu_device_to_handle(device); return VK_SUCCESS; -fail_timeline_cond: -fail_prepare_perfcntrs_pass_cs: - free(device->perfcntrs_pass_cs_entries); - tu_cs_finish(device->perfcntrs_pass_cs); 
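The timeline-semaphore condition variable created a few lines above follows the usual POSIX recipe for waiting against a monotonic clock, so timed waits are unaffected by wall-clock adjustments. Below is a minimal standalone sketch of that pattern only; it is an illustration and does not use the tu_device fields from the driver code.

#include <pthread.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
   pthread_cond_t cond;
   pthread_condattr_t attr;

   /* Request CLOCK_MONOTONIC so pthread_cond_timedwait() deadlines are not
    * affected by system time changes. */
   if (pthread_condattr_init(&attr) != 0)
      return 1;
   if (pthread_condattr_setclock(&attr, CLOCK_MONOTONIC) != 0) {
      pthread_condattr_destroy(&attr);
      return 1;
   }
   if (pthread_cond_init(&cond, &attr) != 0) {
      pthread_condattr_destroy(&attr);
      return 1;
   }
   pthread_condattr_destroy(&attr);

   /* ... pthread_cond_timedwait(&cond, &mutex, &abstime) would now interpret
    * its absolute timeout against CLOCK_MONOTONIC ... */

   pthread_cond_destroy(&cond);
   printf("condvar initialized against CLOCK_MONOTONIC\n");
   return 0;
}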
-fail_perfcntrs_pass_entries_alloc: - free(device->perfcntrs_pass_cs); -fail_perfcntrs_pass_alloc: - vk_pipeline_cache_destroy(device->mem_cache, &device->vk.alloc); -fail_pipeline_cache: - tu_destroy_dynamic_rendering(device); -fail_dynamic_rendering: - tu_destroy_clear_blit_shaders(device); -fail_global_bo_map: - tu_bo_finish(device, device->global_bo); - vk_free(&device->vk.alloc, device->bo_list); -fail_global_bo: - ir3_compiler_destroy(device->compiler); - util_sparse_array_finish(&device->bo_map); - -fail_queues: +fail: for (unsigned i = 0; i < TU_MAX_QUEUE_FAMILIES; i++) { for (unsigned q = 0; q < device->queue_count[i]; q++) tu_queue_finish(&device->queues[i][q]); - if (device->queues[i]) - vk_free(&device->vk.alloc, device->queues[i]); + if (device->queue_count[i]) + vk_free(&device->alloc, device->queues[i]); } - u_rwlock_destroy(&device->dma_bo_lock); - vk_device_finish(&device->vk); - vk_free(&device->vk.alloc, device); + if (device->compiler) + ralloc_free(device->compiler); + + vk_free(&device->alloc, device); return result; } -VKAPI_ATTR void VKAPI_CALL +void tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator) { TU_FROM_HANDLE(tu_device, device, _device); @@ -2200,158 +1131,231 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator) if (!device) return; - tu_breadcrumbs_finish(device); - - u_trace_context_fini(&device->trace_context); - for (unsigned i = 0; i < TU_MAX_QUEUE_FAMILIES; i++) { for (unsigned q = 0; q < device->queue_count[i]; q++) tu_queue_finish(&device->queues[i][q]); if (device->queue_count[i]) - vk_free(&device->vk.alloc, device->queues[i]); + vk_free(&device->alloc, device->queues[i]); } - for (unsigned i = 0; i < ARRAY_SIZE(device->scratch_bos); i++) { - if (device->scratch_bos[i].initialized) - tu_bo_finish(device, device->scratch_bos[i].bo); - } + /* the compiler does not use pAllocator */ + ralloc_free(device->compiler); - if (device->fiber_pvtmem_bo.bo) - tu_bo_finish(device, device->fiber_pvtmem_bo.bo); - - if (device->wave_pvtmem_bo.bo) - tu_bo_finish(device, device->wave_pvtmem_bo.bo); + VkPipelineCache pc = tu_pipeline_cache_to_handle(device->mem_cache); + tu_DestroyPipelineCache(tu_device_to_handle(device), pc, NULL); - tu_destroy_clear_blit_shaders(device); - - tu_destroy_dynamic_rendering(device); + vk_free(&device->alloc, device); +} - ir3_compiler_destroy(device->compiler); +VkResult +tu_EnumerateInstanceLayerProperties(uint32_t *pPropertyCount, + VkLayerProperties *pProperties) +{ + *pPropertyCount = 0; + return VK_SUCCESS; +} - vk_pipeline_cache_destroy(device->mem_cache, &device->vk.alloc); +VkResult +tu_EnumerateDeviceLayerProperties(VkPhysicalDevice physicalDevice, + uint32_t *pPropertyCount, + VkLayerProperties *pProperties) +{ + *pPropertyCount = 0; + return VK_SUCCESS; +} - if (device->perfcntrs_pass_cs) { - free(device->perfcntrs_pass_cs_entries); - tu_cs_finish(device->perfcntrs_pass_cs); - free(device->perfcntrs_pass_cs); +void +tu_GetDeviceQueue2(VkDevice _device, + const VkDeviceQueueInfo2 *pQueueInfo, + VkQueue *pQueue) +{ + TU_FROM_HANDLE(tu_device, device, _device); + struct tu_queue *queue; + + queue = + &device->queues[pQueueInfo->queueFamilyIndex][pQueueInfo->queueIndex]; + if (pQueueInfo->flags != queue->flags) { + /* From the Vulkan 1.1.70 spec: + * + * "The queue returned by vkGetDeviceQueue2 must have the same + * flags value from this structure as that used at device + * creation time in a VkDeviceQueueCreateInfo instance. 
If no + * matching flags were specified at device creation time then + * pQueue will return VK_NULL_HANDLE." + */ + *pQueue = VK_NULL_HANDLE; + return; } - tu_autotune_fini(&device->autotune, device); - - tu_bo_suballocator_finish(&device->pipeline_suballoc); - tu_bo_suballocator_finish(&device->autotune_suballoc); + *pQueue = tu_queue_to_handle(queue); +} - util_sparse_array_finish(&device->bo_map); - u_rwlock_destroy(&device->dma_bo_lock); +void +tu_GetDeviceQueue(VkDevice _device, + uint32_t queueFamilyIndex, + uint32_t queueIndex, + VkQueue *pQueue) +{ + const VkDeviceQueueInfo2 info = + (VkDeviceQueueInfo2) { .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_INFO_2, + .queueFamilyIndex = queueFamilyIndex, + .queueIndex = queueIndex }; - pthread_cond_destroy(&device->timeline_cond); - _mesa_hash_table_destroy(device->bo_sizes, NULL); - vk_free(&device->vk.alloc, device->bo_list); - vk_device_finish(&device->vk); - vk_free(&device->vk.alloc, device); + tu_GetDeviceQueue2(_device, &info, pQueue); } VkResult -tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo) +tu_QueueSubmit(VkQueue _queue, + uint32_t submitCount, + const VkSubmitInfo *pSubmits, + VkFence _fence) { - unsigned size_log2 = MAX2(util_logbase2_ceil64(size), MIN_SCRATCH_BO_SIZE_LOG2); - unsigned index = size_log2 - MIN_SCRATCH_BO_SIZE_LOG2; - assert(index < ARRAY_SIZE(dev->scratch_bos)); + TU_FROM_HANDLE(tu_queue, queue, _queue); + + for (uint32_t i = 0; i < submitCount; ++i) { + const VkSubmitInfo *submit = pSubmits + i; + const bool last_submit = (i == submitCount - 1); + struct tu_bo_list bo_list; + tu_bo_list_init(&bo_list); + + uint32_t entry_count = 0; + for (uint32_t j = 0; j < submit->commandBufferCount; ++j) { + TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, submit->pCommandBuffers[j]); + entry_count += cmdbuf->cs.entry_count; + } + + struct drm_msm_gem_submit_cmd cmds[entry_count]; + uint32_t entry_idx = 0; + for (uint32_t j = 0; j < submit->commandBufferCount; ++j) { + TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, submit->pCommandBuffers[j]); + struct tu_cs *cs = &cmdbuf->cs; + for (unsigned i = 0; i < cs->entry_count; ++i, ++entry_idx) { + cmds[entry_idx].type = MSM_SUBMIT_CMD_BUF; + cmds[entry_idx].submit_idx = + tu_bo_list_add(&bo_list, cs->entries[i].bo, + MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP); + cmds[entry_idx].submit_offset = cs->entries[i].offset; + cmds[entry_idx].size = cs->entries[i].size; + cmds[entry_idx].pad = 0; + cmds[entry_idx].nr_relocs = 0; + cmds[entry_idx].relocs = 0; + } - for (unsigned i = index; i < ARRAY_SIZE(dev->scratch_bos); i++) { - if (p_atomic_read(&dev->scratch_bos[i].initialized)) { - /* Fast path: just return the already-allocated BO. */ - *bo = dev->scratch_bos[i].bo; - return VK_SUCCESS; + tu_bo_list_merge(&bo_list, &cmdbuf->bo_list); } - } - /* Slow path: actually allocate the BO. We take a lock because the process - * of allocating it is slow, and we don't want to block the CPU while it - * finishes. - */ - mtx_lock(&dev->scratch_bos[index].construct_mtx); + uint32_t flags = MSM_PIPE_3D0; + if (last_submit) { + flags |= MSM_SUBMIT_FENCE_FD_OUT; + } - /* Another thread may have allocated it already while we were waiting on - * the lock. We need to check this in order to avoid double-allocating. 
- */ - if (dev->scratch_bos[index].initialized) { - mtx_unlock(&dev->scratch_bos[index].construct_mtx); - *bo = dev->scratch_bos[index].bo; - return VK_SUCCESS; - } + struct drm_msm_gem_submit req = { + .flags = flags, + .queueid = queue->msm_queue_id, + .bos = (uint64_t)(uintptr_t) bo_list.bo_infos, + .nr_bos = bo_list.count, + .cmds = (uint64_t)(uintptr_t)cmds, + .nr_cmds = entry_count, + }; - unsigned bo_size = 1ull << size_log2; - VkResult result = tu_bo_init_new(dev, &dev->scratch_bos[index].bo, bo_size, - TU_BO_ALLOC_NO_FLAGS, "scratch"); - if (result != VK_SUCCESS) { - mtx_unlock(&dev->scratch_bos[index].construct_mtx); - return result; - } + int ret = drmCommandWriteRead(queue->device->physical_device->local_fd, + DRM_MSM_GEM_SUBMIT, + &req, sizeof(req)); + if (ret) { + fprintf(stderr, "submit failed: %s\n", strerror(errno)); + abort(); + } - p_atomic_set(&dev->scratch_bos[index].initialized, true); + tu_bo_list_destroy(&bo_list); - mtx_unlock(&dev->scratch_bos[index].construct_mtx); + if (last_submit) { + /* no need to merge fences as queue execution is serialized */ + tu_fence_update_fd(&queue->submit_fence, req.fence_fd); + } + } - *bo = dev->scratch_bos[index].bo; - return VK_SUCCESS; -} + if (_fence != VK_NULL_HANDLE) { + TU_FROM_HANDLE(tu_fence, fence, _fence); + tu_fence_copy(fence, &queue->submit_fence); + } -VKAPI_ATTR VkResult VKAPI_CALL -tu_EnumerateInstanceLayerProperties(uint32_t *pPropertyCount, - VkLayerProperties *pProperties) -{ - *pPropertyCount = 0; return VK_SUCCESS; } -/* Only used for kgsl since drm started using common implementation */ -#ifdef TU_USE_KGSL -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_QueueWaitIdle(VkQueue _queue) { TU_FROM_HANDLE(tu_queue, queue, _queue); - if (vk_device_is_lost(&queue->device->vk)) - return VK_ERROR_DEVICE_LOST; + tu_fence_wait_idle(&queue->submit_fence); - if (queue->fence < 0) - return VK_SUCCESS; - - struct pollfd fds = { .fd = queue->fence, .events = POLLIN }; - int ret; - do { - ret = poll(&fds, 1, -1); - } while (ret == -1 && (errno == EINTR || errno == EAGAIN)); + return VK_SUCCESS; +} - /* TODO: otherwise set device lost ? 
*/ - assert(ret == 1 && !(fds.revents & (POLLERR | POLLNVAL))); +VkResult +tu_DeviceWaitIdle(VkDevice _device) +{ + TU_FROM_HANDLE(tu_device, device, _device); - close(queue->fence); - queue->fence = -1; + for (unsigned i = 0; i < TU_MAX_QUEUE_FAMILIES; i++) { + for (unsigned q = 0; q < device->queue_count[i]; q++) { + tu_QueueWaitIdle(tu_queue_to_handle(&device->queues[i][q])); + } + } return VK_SUCCESS; } -#endif -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_EnumerateInstanceExtensionProperties(const char *pLayerName, uint32_t *pPropertyCount, VkExtensionProperties *pProperties) { + VK_OUTARRAY_MAKE(out, pProperties, pPropertyCount); + + /* We spport no lyaers */ if (pLayerName) return vk_error(NULL, VK_ERROR_LAYER_NOT_PRESENT); - return vk_enumerate_instance_extension_properties( - &tu_instance_extensions_supported, pPropertyCount, pProperties); + for (int i = 0; i < TU_INSTANCE_EXTENSION_COUNT; i++) { + if (tu_supported_instance_extensions.extensions[i]) { + vk_outarray_append(&out, prop) { *prop = tu_instance_extensions[i]; } + } + } + + return vk_outarray_status(&out); } -VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL +VkResult +tu_EnumerateDeviceExtensionProperties(VkPhysicalDevice physicalDevice, + const char *pLayerName, + uint32_t *pPropertyCount, + VkExtensionProperties *pProperties) +{ + /* We spport no lyaers */ + TU_FROM_HANDLE(tu_physical_device, device, physicalDevice); + VK_OUTARRAY_MAKE(out, pProperties, pPropertyCount); + + /* We spport no lyaers */ + if (pLayerName) + return vk_error(NULL, VK_ERROR_LAYER_NOT_PRESENT); + + for (int i = 0; i < TU_DEVICE_EXTENSION_COUNT; i++) { + if (device->supported_extensions.extensions[i]) { + vk_outarray_append(&out, prop) { *prop = tu_device_extensions[i]; } + } + } + + return vk_outarray_status(&out); +} + +PFN_vkVoidFunction tu_GetInstanceProcAddr(VkInstance _instance, const char *pName) { TU_FROM_HANDLE(tu_instance, instance, _instance); - return vk_instance_get_proc_addr(&instance->vk, - &tu_instance_entrypoints, - pName); + + return tu_lookup_entrypoint_checked( + pName, instance ? instance->api_version : 0, + instance ? 
&instance->enabled_extensions : NULL, NULL); } /* The loader wants us to expose a second GetInstanceProcAddr function @@ -2368,30 +1372,22 @@ vk_icdGetInstanceProcAddr(VkInstance instance, const char *pName) return tu_GetInstanceProcAddr(instance, pName); } -/* With version 4+ of the loader interface the ICD should expose - * vk_icdGetPhysicalDeviceProcAddr() - */ -PUBLIC -VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL -vk_icdGetPhysicalDeviceProcAddr(VkInstance _instance, - const char* pName); - PFN_vkVoidFunction -vk_icdGetPhysicalDeviceProcAddr(VkInstance _instance, - const char* pName) +tu_GetDeviceProcAddr(VkDevice _device, const char *pName) { - TU_FROM_HANDLE(tu_instance, instance, _instance); + TU_FROM_HANDLE(tu_device, device, _device); - return vk_instance_get_physical_device_proc_addr(&instance->vk, pName); + return tu_lookup_entrypoint_checked(pName, device->instance->api_version, + &device->instance->enabled_extensions, + &device->enabled_extensions); } -VKAPI_ATTR VkResult VKAPI_CALL -tu_AllocateMemory(VkDevice _device, - const VkMemoryAllocateInfo *pAllocateInfo, - const VkAllocationCallbacks *pAllocator, - VkDeviceMemory *pMem) +static VkResult +tu_alloc_memory(struct tu_device *device, + const VkMemoryAllocateInfo *pAllocateInfo, + const VkAllocationCallbacks *pAllocator, + VkDeviceMemory *pMem) { - TU_FROM_HANDLE(tu_device, device, _device); struct tu_device_memory *mem; VkResult result; @@ -2403,15 +1399,10 @@ tu_AllocateMemory(VkDevice _device, return VK_SUCCESS; } - struct tu_memory_heap *mem_heap = &device->physical_device->heap; - uint64_t mem_heap_used = p_atomic_read(&mem_heap->used); - if (mem_heap_used > mem_heap->size) - return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); - - mem = vk_object_alloc(&device->vk, pAllocator, sizeof(*mem), - VK_OBJECT_TYPE_DEVICE_MEMORY); + mem = vk_alloc2(&device->alloc, pAllocator, sizeof(*mem), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (mem == NULL) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); const VkImportMemoryFdInfoKHR *fd_info = vk_find_struct_const(pAllocateInfo->pNext, IMPORT_MEMORY_FD_INFO_KHR); @@ -2436,69 +1427,37 @@ tu_AllocateMemory(VkDevice _device, close(fd_info->fd); } } else { - uint64_t client_address = 0; - enum tu_bo_alloc_flags alloc_flags = TU_BO_ALLOC_NO_FLAGS; - - const VkMemoryOpaqueCaptureAddressAllocateInfo *replay_info = - vk_find_struct_const(pAllocateInfo->pNext, - MEMORY_OPAQUE_CAPTURE_ADDRESS_ALLOCATE_INFO); - if (replay_info && replay_info->opaqueCaptureAddress) { - client_address = replay_info->opaqueCaptureAddress; - alloc_flags |= TU_BO_ALLOC_REPLAYABLE; - } - - const VkMemoryAllocateFlagsInfo *flags_info = vk_find_struct_const( - pAllocateInfo->pNext, MEMORY_ALLOCATE_FLAGS_INFO); - if (flags_info && - (flags_info->flags & - VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT)) { - alloc_flags |= TU_BO_ALLOC_REPLAYABLE; - } - - char name[64] = "vkAllocateMemory()"; - if (device->bo_sizes) - snprintf(name, ARRAY_SIZE(name), "vkAllocateMemory(%ldkb)", - (long)DIV_ROUND_UP(pAllocateInfo->allocationSize, 1024)); - result = tu_bo_init_new_explicit_iova( - device, &mem->bo, pAllocateInfo->allocationSize, client_address, - alloc_flags, name); - } - - if (result == VK_SUCCESS) { - mem_heap_used = p_atomic_add_return(&mem_heap->used, mem->bo->size); - if (mem_heap_used > mem_heap->size) { - p_atomic_add(&mem_heap->used, -mem->bo->size); - tu_bo_finish(device, mem->bo); - result = vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY, 
- "Out of heap memory"); - } + result = + tu_bo_init_new(device, &mem->bo, pAllocateInfo->allocationSize); } if (result != VK_SUCCESS) { - vk_object_free(&device->vk, pAllocator, mem); + vk_free2(&device->alloc, pAllocator, mem); return result; } - /* Track in the device whether our BO list contains any implicit-sync BOs, so - * we can suppress implicit sync on non-WSI usage. - */ - const struct wsi_memory_allocate_info *wsi_info = - vk_find_struct_const(pAllocateInfo->pNext, WSI_MEMORY_ALLOCATE_INFO_MESA); - if (wsi_info && wsi_info->implicit_sync) { - mtx_lock(&device->bo_mutex); - if (!mem->bo->implicit_sync) { - mem->bo->implicit_sync = true; - device->implicit_sync_bo_count++; - } - mtx_unlock(&device->bo_mutex); - } + mem->size = pAllocateInfo->allocationSize; + mem->type_index = pAllocateInfo->memoryTypeIndex; + + mem->map = NULL; + mem->user_ptr = NULL; *pMem = tu_device_memory_to_handle(mem); return VK_SUCCESS; } -VKAPI_ATTR void VKAPI_CALL +VkResult +tu_AllocateMemory(VkDevice _device, + const VkMemoryAllocateInfo *pAllocateInfo, + const VkAllocationCallbacks *pAllocator, + VkDeviceMemory *pMem) +{ + TU_FROM_HANDLE(tu_device, device, _device); + return tu_alloc_memory(device, pAllocateInfo, pAllocator, pMem); +} + +void tu_FreeMemory(VkDevice _device, VkDeviceMemory _mem, const VkAllocationCallbacks *pAllocator) @@ -2509,12 +1468,11 @@ tu_FreeMemory(VkDevice _device, if (mem == NULL) return; - p_atomic_add(&device->physical_device->heap.used, -mem->bo->size); - tu_bo_finish(device, mem->bo); - vk_object_free(&device->vk, pAllocator, mem); + tu_bo_finish(device, &mem->bo); + vk_free2(&device->alloc, pAllocator, mem); } -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_MapMemory(VkDevice _device, VkDeviceMemory _memory, VkDeviceSize offset, @@ -2531,23 +1489,31 @@ tu_MapMemory(VkDevice _device, return VK_SUCCESS; } - if (!mem->bo->map) { - result = tu_bo_map(device, mem->bo); + if (mem->user_ptr) { + *ppData = mem->user_ptr; + } else if (!mem->map) { + result = tu_bo_map(device, &mem->bo); if (result != VK_SUCCESS) return result; + *ppData = mem->map = mem->bo.map; + } else + *ppData = mem->map; + + if (*ppData) { + *ppData += offset; + return VK_SUCCESS; } - *ppData = mem->bo->map + offset; - return VK_SUCCESS; + return vk_error(device->instance, VK_ERROR_MEMORY_MAP_FAILED); } -VKAPI_ATTR void VKAPI_CALL +void tu_UnmapMemory(VkDevice _device, VkDeviceMemory _memory) { - /* TODO: unmap here instead of waiting for FreeMemory */ + /* I do not see any unmapping done by the freedreno Gallium driver. 
*/ } -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_FlushMappedMemoryRanges(VkDevice _device, uint32_t memoryRangeCount, const VkMappedMemoryRange *pMemoryRanges) @@ -2555,7 +1521,7 @@ tu_FlushMappedMemoryRanges(VkDevice _device, return VK_SUCCESS; } -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_InvalidateMappedMemoryRanges(VkDevice _device, uint32_t memoryRangeCount, const VkMappedMemoryRange *pMemoryRanges) @@ -2563,52 +1529,71 @@ tu_InvalidateMappedMemoryRanges(VkDevice _device, return VK_SUCCESS; } -static void -tu_get_buffer_memory_requirements(uint64_t size, - VkMemoryRequirements2 *pMemoryRequirements) +void +tu_GetBufferMemoryRequirements(VkDevice _device, + VkBuffer _buffer, + VkMemoryRequirements *pMemoryRequirements) { - pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) { - .memoryTypeBits = 1, - .alignment = 64, - .size = MAX2(align64(size, 64), size), - }; + TU_FROM_HANDLE(tu_buffer, buffer, _buffer); - vk_foreach_struct(ext, pMemoryRequirements->pNext) { - switch (ext->sType) { - case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: { - VkMemoryDedicatedRequirements *req = - (VkMemoryDedicatedRequirements *) ext; - req->requiresDedicatedAllocation = false; - req->prefersDedicatedAllocation = req->requiresDedicatedAllocation; - break; - } - default: - break; - } - } + pMemoryRequirements->memoryTypeBits = 1; + pMemoryRequirements->alignment = 16; + pMemoryRequirements->size = + align64(buffer->size, pMemoryRequirements->alignment); } -VKAPI_ATTR void VKAPI_CALL +void tu_GetBufferMemoryRequirements2( VkDevice device, const VkBufferMemoryRequirementsInfo2 *pInfo, VkMemoryRequirements2 *pMemoryRequirements) { - TU_FROM_HANDLE(tu_buffer, buffer, pInfo->buffer); + tu_GetBufferMemoryRequirements(device, pInfo->buffer, + &pMemoryRequirements->memoryRequirements); +} + +void +tu_GetImageMemoryRequirements(VkDevice _device, + VkImage _image, + VkMemoryRequirements *pMemoryRequirements) +{ + TU_FROM_HANDLE(tu_image, image, _image); + + pMemoryRequirements->memoryTypeBits = 1; + pMemoryRequirements->size = image->size; + pMemoryRequirements->alignment = image->alignment; +} + +void +tu_GetImageMemoryRequirements2(VkDevice device, + const VkImageMemoryRequirementsInfo2 *pInfo, + VkMemoryRequirements2 *pMemoryRequirements) +{ + tu_GetImageMemoryRequirements(device, pInfo->image, + &pMemoryRequirements->memoryRequirements); +} - tu_get_buffer_memory_requirements(buffer->vk.size, pMemoryRequirements); +void +tu_GetImageSparseMemoryRequirements( + VkDevice device, + VkImage image, + uint32_t *pSparseMemoryRequirementCount, + VkSparseImageMemoryRequirements *pSparseMemoryRequirements) +{ + tu_stub(); } -VKAPI_ATTR void VKAPI_CALL -tu_GetDeviceBufferMemoryRequirements( +void +tu_GetImageSparseMemoryRequirements2( VkDevice device, - const VkDeviceBufferMemoryRequirements *pInfo, - VkMemoryRequirements2 *pMemoryRequirements) + const VkImageSparseMemoryRequirementsInfo2 *pInfo, + uint32_t *pSparseMemoryRequirementCount, + VkSparseImageMemoryRequirements2 *pSparseMemoryRequirements) { - tu_get_buffer_memory_requirements(pInfo->pCreateInfo->size, pMemoryRequirements); + tu_stub(); } -VKAPI_ATTR void VKAPI_CALL +void tu_GetDeviceMemoryCommitment(VkDevice device, VkDeviceMemory memory, VkDeviceSize *pCommittedMemoryInBytes) @@ -2616,7 +1601,7 @@ tu_GetDeviceMemoryCommitment(VkDevice device, *pCommittedMemoryInBytes = 0; } -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_BindBufferMemory2(VkDevice device, uint32_t bindInfoCount, const VkBindBufferMemoryInfo *pBindInfos) @@ -2626,8 +1611,8 @@ 
tu_BindBufferMemory2(VkDevice device, TU_FROM_HANDLE(tu_buffer, buffer, pBindInfos[i].buffer); if (mem) { - buffer->bo = mem->bo; - buffer->iova = mem->bo->iova + pBindInfos[i].memoryOffset; + buffer->bo = &mem->bo; + buffer->bo_offset = pBindInfos[i].memoryOffset; } else { buffer->bo = NULL; } @@ -2635,7 +1620,23 @@ tu_BindBufferMemory2(VkDevice device, return VK_SUCCESS; } -VKAPI_ATTR VkResult VKAPI_CALL +VkResult +tu_BindBufferMemory(VkDevice device, + VkBuffer buffer, + VkDeviceMemory memory, + VkDeviceSize memoryOffset) +{ + const VkBindBufferMemoryInfo info = { + .sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO, + .buffer = buffer, + .memory = memory, + .memoryOffset = memoryOffset + }; + + return tu_BindBufferMemory2(device, 1, &info); +} + +VkResult tu_BindImageMemory2(VkDevice device, uint32_t bindInfoCount, const VkBindImageMemoryInfo *pBindInfos) @@ -2645,18 +1646,34 @@ tu_BindImageMemory2(VkDevice device, TU_FROM_HANDLE(tu_device_memory, mem, pBindInfos[i].memory); if (mem) { - image->bo = mem->bo; - image->iova = mem->bo->iova + pBindInfos[i].memoryOffset; + image->bo = &mem->bo; + image->bo_offset = pBindInfos[i].memoryOffset; } else { image->bo = NULL; - image->iova = 0; + image->bo_offset = 0; } } return VK_SUCCESS; } -VKAPI_ATTR VkResult VKAPI_CALL +VkResult +tu_BindImageMemory(VkDevice device, + VkImage image, + VkDeviceMemory memory, + VkDeviceSize memoryOffset) +{ + const VkBindImageMemoryInfo info = { + .sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO, + .image = image, + .memory = memory, + .memoryOffset = memoryOffset + }; + + return tu_BindImageMemory2(device, 1, &info); +} + +VkResult tu_QueueBindSparse(VkQueue _queue, uint32_t bindInfoCount, const VkBindSparseInfo *pBindInfo, @@ -2665,41 +1682,59 @@ tu_QueueBindSparse(VkQueue _queue, return VK_SUCCESS; } -VKAPI_ATTR VkResult VKAPI_CALL +// Queue semaphore functions + +VkResult +tu_CreateSemaphore(VkDevice _device, + const VkSemaphoreCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkSemaphore *pSemaphore) +{ + TU_FROM_HANDLE(tu_device, device, _device); + + struct tu_semaphore *sem = + vk_alloc2(&device->alloc, pAllocator, sizeof(*sem), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!sem) + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + + *pSemaphore = tu_semaphore_to_handle(sem); + return VK_SUCCESS; +} + +void +tu_DestroySemaphore(VkDevice _device, + VkSemaphore _semaphore, + const VkAllocationCallbacks *pAllocator) +{ + TU_FROM_HANDLE(tu_device, device, _device); + TU_FROM_HANDLE(tu_semaphore, sem, _semaphore); + if (!_semaphore) + return; + + vk_free2(&device->alloc, pAllocator, sem); +} + +VkResult tu_CreateEvent(VkDevice _device, const VkEventCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkEvent *pEvent) { TU_FROM_HANDLE(tu_device, device, _device); - struct tu_event *event = - vk_object_alloc(&device->vk, pAllocator, sizeof(*event), - VK_OBJECT_TYPE_EVENT); - if (!event) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - - VkResult result = tu_bo_init_new(device, &event->bo, 0x1000, - TU_BO_ALLOC_NO_FLAGS, "event"); - if (result != VK_SUCCESS) - goto fail_alloc; + vk_alloc2(&device->alloc, pAllocator, sizeof(*event), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - result = tu_bo_map(device, event->bo); - if (result != VK_SUCCESS) - goto fail_map; + if (!event) + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); *pEvent = tu_event_to_handle(event); return VK_SUCCESS; - -fail_map: - tu_bo_finish(device, event->bo); -fail_alloc: - 
vk_object_free(&device->vk, pAllocator, event); - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } -VKAPI_ATTR void VKAPI_CALL +void tu_DestroyEvent(VkDevice _device, VkEvent _event, const VkAllocationCallbacks *pAllocator) @@ -2709,40 +1744,38 @@ tu_DestroyEvent(VkDevice _device, if (!event) return; - - tu_bo_finish(device, event->bo); - vk_object_free(&device->vk, pAllocator, event); + vk_free2(&device->alloc, pAllocator, event); } -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_GetEventStatus(VkDevice _device, VkEvent _event) { TU_FROM_HANDLE(tu_event, event, _event); - if (*(uint64_t*) event->bo->map == 1) + if (*event->map == 1) return VK_EVENT_SET; return VK_EVENT_RESET; } -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_SetEvent(VkDevice _device, VkEvent _event) { TU_FROM_HANDLE(tu_event, event, _event); - *(uint64_t*) event->bo->map = 1; + *event->map = 1; return VK_SUCCESS; } -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_ResetEvent(VkDevice _device, VkEvent _event) { TU_FROM_HANDLE(tu_event, event, _event); - *(uint64_t*) event->bo->map = 0; + *event->map = 0; return VK_SUCCESS; } -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_CreateBuffer(VkDevice _device, const VkBufferCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, @@ -2751,17 +1784,23 @@ tu_CreateBuffer(VkDevice _device, TU_FROM_HANDLE(tu_device, device, _device); struct tu_buffer *buffer; - buffer = vk_buffer_create(&device->vk, pCreateInfo, pAllocator, - sizeof(*buffer)); + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO); + + buffer = vk_alloc2(&device->alloc, pAllocator, sizeof(*buffer), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (buffer == NULL) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + + buffer->size = pCreateInfo->size; + buffer->usage = pCreateInfo->usage; + buffer->flags = pCreateInfo->flags; *pBuffer = tu_buffer_to_handle(buffer); return VK_SUCCESS; } -VKAPI_ATTR void VKAPI_CALL +void tu_DestroyBuffer(VkDevice _device, VkBuffer _buffer, const VkAllocationCallbacks *pAllocator) @@ -2772,90 +1811,65 @@ tu_DestroyBuffer(VkDevice _device, if (!buffer) return; - vk_buffer_destroy(&device->vk, pAllocator, &buffer->vk); + vk_free2(&device->alloc, pAllocator, buffer); } -VKAPI_ATTR VkResult VKAPI_CALL +static uint32_t +tu_surface_max_layer_count(struct tu_image_view *iview) +{ + return iview->type == VK_IMAGE_VIEW_TYPE_3D + ? 
iview->extent.depth + : (iview->base_layer + iview->layer_count); +} + +VkResult tu_CreateFramebuffer(VkDevice _device, const VkFramebufferCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkFramebuffer *pFramebuffer) { TU_FROM_HANDLE(tu_device, device, _device); - - if (unlikely(device->instance->debug_flags & TU_DEBUG_DYNAMIC)) - return vk_common_CreateFramebuffer(_device, pCreateInfo, pAllocator, - pFramebuffer); - - TU_FROM_HANDLE(tu_render_pass, pass, pCreateInfo->renderPass); struct tu_framebuffer *framebuffer; assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO); - bool imageless = pCreateInfo->flags & VK_FRAMEBUFFER_CREATE_IMAGELESS_BIT; - - size_t size = sizeof(*framebuffer); - if (!imageless) - size += sizeof(struct tu_attachment_info) * pCreateInfo->attachmentCount; - framebuffer = vk_object_alloc(&device->vk, pAllocator, size, - VK_OBJECT_TYPE_FRAMEBUFFER); + size_t size = sizeof(*framebuffer) + sizeof(struct tu_attachment_info) * + pCreateInfo->attachmentCount; + framebuffer = vk_alloc2(&device->alloc, pAllocator, size, 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (framebuffer == NULL) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); framebuffer->attachment_count = pCreateInfo->attachmentCount; framebuffer->width = pCreateInfo->width; framebuffer->height = pCreateInfo->height; framebuffer->layers = pCreateInfo->layers; + for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) { + VkImageView _iview = pCreateInfo->pAttachments[i]; + struct tu_image_view *iview = tu_image_view_from_handle(_iview); + framebuffer->attachments[i].attachment = iview; - if (!imageless) { - for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) { - VkImageView _iview = pCreateInfo->pAttachments[i]; - struct tu_image_view *iview = tu_image_view_from_handle(_iview); - framebuffer->attachments[i].attachment = iview; - } + framebuffer->width = MIN2(framebuffer->width, iview->extent.width); + framebuffer->height = MIN2(framebuffer->height, iview->extent.height); + framebuffer->layers = + MIN2(framebuffer->layers, tu_surface_max_layer_count(iview)); } - tu_framebuffer_tiling_config(framebuffer, device, pass); - *pFramebuffer = tu_framebuffer_to_handle(framebuffer); return VK_SUCCESS; } void -tu_setup_dynamic_framebuffer(struct tu_cmd_buffer *cmd_buffer, - const VkRenderingInfo *pRenderingInfo) -{ - struct tu_render_pass *pass = &cmd_buffer->dynamic_pass; - struct tu_framebuffer *framebuffer = &cmd_buffer->dynamic_framebuffer; - - framebuffer->attachment_count = pass->attachment_count; - framebuffer->width = pRenderingInfo->renderArea.offset.x + - pRenderingInfo->renderArea.extent.width; - framebuffer->height = pRenderingInfo->renderArea.offset.y + - pRenderingInfo->renderArea.extent.height; - framebuffer->layers = pRenderingInfo->layerCount; - - tu_framebuffer_tiling_config(framebuffer, cmd_buffer->device, pass); -} - -VKAPI_ATTR void VKAPI_CALL tu_DestroyFramebuffer(VkDevice _device, VkFramebuffer _fb, const VkAllocationCallbacks *pAllocator) { TU_FROM_HANDLE(tu_device, device, _device); - - if (unlikely(device->instance->debug_flags & TU_DEBUG_DYNAMIC)) { - vk_common_DestroyFramebuffer(_device, _fb, pAllocator); - return; - } - TU_FROM_HANDLE(tu_framebuffer, fb, _fb); if (!fb) return; - - vk_object_free(&device->vk, pAllocator, fb); + vk_free2(&device->alloc, pAllocator, fb); } static void @@ -2863,89 +1877,9 @@ tu_init_sampler(struct tu_device *device, struct tu_sampler *sampler, const 
VkSamplerCreateInfo *pCreateInfo) { - const struct VkSamplerReductionModeCreateInfo *reduction = - vk_find_struct_const(pCreateInfo->pNext, SAMPLER_REDUCTION_MODE_CREATE_INFO); - const struct VkSamplerYcbcrConversionInfo *ycbcr_conversion = - vk_find_struct_const(pCreateInfo->pNext, SAMPLER_YCBCR_CONVERSION_INFO); - const VkSamplerCustomBorderColorCreateInfoEXT *custom_border_color = - vk_find_struct_const(pCreateInfo->pNext, SAMPLER_CUSTOM_BORDER_COLOR_CREATE_INFO_EXT); - /* for non-custom border colors, the VK enum is translated directly to an offset in - * the border color buffer. custom border colors are located immediately after the - * builtin colors, and thus an offset of TU_BORDER_COLOR_BUILTIN is added. - */ - uint32_t border_color = (unsigned) pCreateInfo->borderColor; - if (pCreateInfo->borderColor == VK_BORDER_COLOR_FLOAT_CUSTOM_EXT || - pCreateInfo->borderColor == VK_BORDER_COLOR_INT_CUSTOM_EXT) { - mtx_lock(&device->mutex); - border_color = BITSET_FFS(device->custom_border_color) - 1; - assert(border_color < TU_BORDER_COLOR_COUNT); - BITSET_CLEAR(device->custom_border_color, border_color); - mtx_unlock(&device->mutex); - - VkClearColorValue color = custom_border_color->customBorderColor; - if (custom_border_color->format == VK_FORMAT_D24_UNORM_S8_UINT && - pCreateInfo->borderColor == VK_BORDER_COLOR_INT_CUSTOM_EXT && - device->use_z24uint_s8uint) { - /* When sampling stencil using the special Z24UINT_S8UINT format, the - * border color is in the second component. Note: if - * customBorderColorWithoutFormat is enabled, we may miss doing this - * here if the format isn't specified, which is why we don't use that - * format. - */ - color.uint32[1] = color.uint32[0]; - } - - tu6_pack_border_color(device->global_bo->map + gb_offset(bcolor[border_color]), - &color, - pCreateInfo->borderColor == VK_BORDER_COLOR_INT_CUSTOM_EXT); - border_color += TU_BORDER_COLOR_BUILTIN; - } - - unsigned aniso = pCreateInfo->anisotropyEnable ? - util_last_bit(MIN2((uint32_t)pCreateInfo->maxAnisotropy >> 1, 8)) : 0; - bool miplinear = (pCreateInfo->mipmapMode == VK_SAMPLER_MIPMAP_MODE_LINEAR); - float min_lod = CLAMP(pCreateInfo->minLod, 0.0f, 4095.0f / 256.0f); - float max_lod = CLAMP(pCreateInfo->maxLod, 0.0f, 4095.0f / 256.0f); - - sampler->descriptor[0] = - COND(miplinear, A6XX_TEX_SAMP_0_MIPFILTER_LINEAR_NEAR) | - A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(pCreateInfo->magFilter, aniso)) | - A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(pCreateInfo->minFilter, aniso)) | - A6XX_TEX_SAMP_0_ANISO(aniso) | - A6XX_TEX_SAMP_0_WRAP_S(tu6_tex_wrap(pCreateInfo->addressModeU)) | - A6XX_TEX_SAMP_0_WRAP_T(tu6_tex_wrap(pCreateInfo->addressModeV)) | - A6XX_TEX_SAMP_0_WRAP_R(tu6_tex_wrap(pCreateInfo->addressModeW)) | - A6XX_TEX_SAMP_0_LOD_BIAS(pCreateInfo->mipLodBias); - sampler->descriptor[1] = - COND(pCreateInfo->flags & VK_SAMPLER_CREATE_NON_SEAMLESS_CUBE_MAP_BIT_EXT, - A6XX_TEX_SAMP_1_CUBEMAPSEAMLESSFILTOFF) | - COND(pCreateInfo->unnormalizedCoordinates, A6XX_TEX_SAMP_1_UNNORM_COORDS) | - A6XX_TEX_SAMP_1_MIN_LOD(min_lod) | - A6XX_TEX_SAMP_1_MAX_LOD(max_lod) | - COND(pCreateInfo->compareEnable, - A6XX_TEX_SAMP_1_COMPARE_FUNC(tu6_compare_func(pCreateInfo->compareOp))); - sampler->descriptor[2] = A6XX_TEX_SAMP_2_BCOLOR(border_color); - sampler->descriptor[3] = 0; - - if (reduction) { - sampler->descriptor[2] |= A6XX_TEX_SAMP_2_REDUCTION_MODE( - tu6_reduction_mode(reduction->reductionMode)); - } - - sampler->ycbcr_sampler = ycbcr_conversion ? 
- tu_sampler_ycbcr_conversion_from_handle(ycbcr_conversion->conversion) : NULL; - - if (sampler->ycbcr_sampler && - sampler->ycbcr_sampler->chroma_filter == VK_FILTER_LINEAR) { - sampler->descriptor[2] |= A6XX_TEX_SAMP_2_CHROMA_LINEAR; - } - - /* TODO: - * A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR disables mipmapping, but vk has no NONE mipfilter? - */ } -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_CreateSampler(VkDevice _device, const VkSamplerCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, @@ -2956,10 +1890,10 @@ tu_CreateSampler(VkDevice _device, assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO); - sampler = vk_object_alloc(&device->vk, pAllocator, sizeof(*sampler), - VK_OBJECT_TYPE_SAMPLER); + sampler = vk_alloc2(&device->alloc, pAllocator, sizeof(*sampler), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!sampler) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); tu_init_sampler(device, sampler, pCreateInfo); *pSampler = tu_sampler_to_handle(sampler); @@ -2967,29 +1901,17 @@ tu_CreateSampler(VkDevice _device, return VK_SUCCESS; } -VKAPI_ATTR void VKAPI_CALL +void tu_DestroySampler(VkDevice _device, VkSampler _sampler, const VkAllocationCallbacks *pAllocator) { TU_FROM_HANDLE(tu_device, device, _device); TU_FROM_HANDLE(tu_sampler, sampler, _sampler); - uint32_t border_color; if (!sampler) return; - - border_color = (sampler->descriptor[2] & A6XX_TEX_SAMP_2_BCOLOR__MASK) >> A6XX_TEX_SAMP_2_BCOLOR__SHIFT; - if (border_color >= TU_BORDER_COLOR_BUILTIN) { - border_color -= TU_BORDER_COLOR_BUILTIN; - /* if the sampler had a custom border color, free it. TODO: no lock */ - mtx_lock(&device->mutex); - assert(!BITSET_TEST(device->custom_border_color, border_color)); - BITSET_SET(device->custom_border_color, border_color); - mtx_unlock(&device->mutex); - } - - vk_object_free(&device->vk, pAllocator, sampler); + vk_free2(&device->alloc, pAllocator, sampler); } /* vk_icd.h does not declare this function, so we declare it here to @@ -3032,21 +1954,12 @@ vk_icdNegotiateLoaderICDInterfaceVersion(uint32_t *pSupportedVersion) * - The ICD must implement vkCreate{PLATFORM}SurfaceKHR(), * vkDestroySurfaceKHR(), and other API which uses VKSurfaceKHR, * because the loader no longer does so. - * - * - Loader interface v4 differs from v3 in: - * - The ICD must implement vk_icdGetPhysicalDeviceProcAddr(). - * - * - Loader interface v5 differs from v4 in: - * - The ICD must support Vulkan API version 1.1 and must not return - * VK_ERROR_INCOMPATIBLE_DRIVER from vkCreateInstance() unless a - * Vulkan Loader with interface v4 or smaller is being used and the - * application provides an API version that is greater than 1.0. 
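/* A minimal sketch of the negotiation described above (not the driver's
 * code): the loader proposes the highest interface version it knows, and the
 * ICD clamps it to the highest version it actually implements before
 * returning.  ICD_MAX_LOADER_INTERFACE is a placeholder constant; the diff
 * itself moves between 3 and 5. */
#include <stdint.h>

#define ICD_MAX_LOADER_INTERFACE 5u

static void
negotiate_loader_interface(uint32_t *version_inout)
{
   if (*version_inout > ICD_MAX_LOADER_INTERFACE)
      *version_inout = ICD_MAX_LOADER_INTERFACE;
   /* anything the loader proposes below our maximum is accepted as-is */
}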
*/ - *pSupportedVersion = MIN2(*pSupportedVersion, 5u); + *pSupportedVersion = MIN2(*pSupportedVersion, 3u); return VK_SUCCESS; } -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_GetMemoryFdKHR(VkDevice _device, const VkMemoryGetFdInfoKHR *pGetFdInfo, int *pFd) @@ -3062,15 +1975,15 @@ tu_GetMemoryFdKHR(VkDevice _device, pGetFdInfo->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT); - int prime_fd = tu_bo_export_dmabuf(device, memory->bo); + int prime_fd = tu_bo_export_dmabuf(device, &memory->bo); if (prime_fd < 0) - return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); + return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY); *pFd = prime_fd; return VK_SUCCESS; } -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_GetMemoryFdPropertiesKHR(VkDevice _device, VkExternalMemoryHandleTypeFlagBits handleType, int fd, @@ -3081,160 +1994,78 @@ tu_GetMemoryFdPropertiesKHR(VkDevice _device, return VK_SUCCESS; } -VKAPI_ATTR void VKAPI_CALL -tu_GetDeviceGroupPeerMemoryFeatures( - VkDevice device, - uint32_t heapIndex, - uint32_t localDeviceIndex, - uint32_t remoteDeviceIndex, - VkPeerMemoryFeatureFlags *pPeerMemoryFeatures) -{ - assert(localDeviceIndex == remoteDeviceIndex); - - *pPeerMemoryFeatures = VK_PEER_MEMORY_FEATURE_COPY_SRC_BIT | - VK_PEER_MEMORY_FEATURE_COPY_DST_BIT | - VK_PEER_MEMORY_FEATURE_GENERIC_SRC_BIT | - VK_PEER_MEMORY_FEATURE_GENERIC_DST_BIT; -} - -VKAPI_ATTR void VKAPI_CALL -tu_GetPhysicalDeviceMultisamplePropertiesEXT( - VkPhysicalDevice physicalDevice, - VkSampleCountFlagBits samples, - VkMultisamplePropertiesEXT* pMultisampleProperties) -{ - TU_FROM_HANDLE(tu_physical_device, pdevice, physicalDevice); - - if (samples <= VK_SAMPLE_COUNT_4_BIT && pdevice->vk.supported_extensions.EXT_sample_locations) - pMultisampleProperties->maxSampleLocationGridSize = (VkExtent2D){ 1, 1 }; - else - pMultisampleProperties->maxSampleLocationGridSize = (VkExtent2D){ 0, 0 }; -} - -VkDeviceAddress -tu_GetBufferDeviceAddress(VkDevice _device, - const VkBufferDeviceAddressInfo* pInfo) -{ - TU_FROM_HANDLE(tu_buffer, buffer, pInfo->buffer); - - return buffer->iova; -} - -uint64_t tu_GetBufferOpaqueCaptureAddress( - VkDevice device, - const VkBufferDeviceAddressInfo* pInfo) +void +tu_GetPhysicalDeviceExternalSemaphoreProperties( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceExternalSemaphoreInfo *pExternalSemaphoreInfo, + VkExternalSemaphoreProperties *pExternalSemaphoreProperties) { - /* We care only about memory allocation opaque addresses */ - return 0; + pExternalSemaphoreProperties->exportFromImportedHandleTypes = 0; + pExternalSemaphoreProperties->compatibleHandleTypes = 0; + pExternalSemaphoreProperties->externalSemaphoreFeatures = 0; } -uint64_t tu_GetDeviceMemoryOpaqueCaptureAddress( - VkDevice device, - const VkDeviceMemoryOpaqueCaptureAddressInfo* pInfo) +void +tu_GetPhysicalDeviceExternalFenceProperties( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceExternalFenceInfo *pExternalFenceInfo, + VkExternalFenceProperties *pExternalFenceProperties) { - TU_FROM_HANDLE(tu_device_memory, mem, pInfo->memory); - return mem->bo->iova; + pExternalFenceProperties->exportFromImportedHandleTypes = 0; + pExternalFenceProperties->compatibleHandleTypes = 0; + pExternalFenceProperties->externalFenceFeatures = 0; } -struct tu_debug_bos_entry { - uint32_t count; - uint64_t size; - const char *name; -}; - -const char * -tu_debug_bos_add(struct tu_device *dev, uint64_t size, const char *name) +VkResult +tu_CreateDebugReportCallbackEXT( + VkInstance _instance, + const 
VkDebugReportCallbackCreateInfoEXT *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkDebugReportCallbackEXT *pCallback) { - assert(name); - - if (likely(!dev->bo_sizes)) - return NULL; - - mtx_lock(&dev->bo_mutex); - struct hash_entry *entry = _mesa_hash_table_search(dev->bo_sizes, name); - struct tu_debug_bos_entry *debug_bos; - - if (!entry) { - debug_bos = calloc(1, sizeof(struct tu_debug_bos_entry)); - debug_bos->name = strdup(name); - _mesa_hash_table_insert(dev->bo_sizes, debug_bos->name, debug_bos); - } else { - debug_bos = entry->data; - } - - debug_bos->count++; - debug_bos->size += align(size, 4096); - mtx_unlock(&dev->bo_mutex); - - return debug_bos->name; + TU_FROM_HANDLE(tu_instance, instance, _instance); + return vk_create_debug_report_callback(&instance->debug_report_callbacks, + pCreateInfo, pAllocator, + &instance->alloc, pCallback); } void -tu_debug_bos_del(struct tu_device *dev, struct tu_bo *bo) +tu_DestroyDebugReportCallbackEXT(VkInstance _instance, + VkDebugReportCallbackEXT _callback, + const VkAllocationCallbacks *pAllocator) { - if (likely(!dev->bo_sizes) || !bo->name) - return; - - mtx_lock(&dev->bo_mutex); - struct hash_entry *entry = - _mesa_hash_table_search(dev->bo_sizes, bo->name); - /* If we're finishing the BO, it should have been added already */ - assert(entry); - - struct tu_debug_bos_entry *debug_bos = entry->data; - debug_bos->count--; - debug_bos->size -= align(bo->size, 4096); - if (!debug_bos->count) { - _mesa_hash_table_remove(dev->bo_sizes, entry); - free((void *) debug_bos->name); - free(debug_bos); - } - mtx_unlock(&dev->bo_mutex); + TU_FROM_HANDLE(tu_instance, instance, _instance); + vk_destroy_debug_report_callback(&instance->debug_report_callbacks, + _callback, pAllocator, &instance->alloc); } -static int debug_bos_count_compare(const void *in_a, const void *in_b) +void +tu_DebugReportMessageEXT(VkInstance _instance, + VkDebugReportFlagsEXT flags, + VkDebugReportObjectTypeEXT objectType, + uint64_t object, + size_t location, + int32_t messageCode, + const char *pLayerPrefix, + const char *pMessage) { - struct tu_debug_bos_entry *a = *(struct tu_debug_bos_entry **)in_a; - struct tu_debug_bos_entry *b = *(struct tu_debug_bos_entry **)in_b; - return a->count - b->count; + TU_FROM_HANDLE(tu_instance, instance, _instance); + vk_debug_report(&instance->debug_report_callbacks, flags, objectType, + object, location, messageCode, pLayerPrefix, pMessage); } void -tu_debug_bos_print_stats(struct tu_device *dev) +tu_GetDeviceGroupPeerMemoryFeatures( + VkDevice device, + uint32_t heapIndex, + uint32_t localDeviceIndex, + uint32_t remoteDeviceIndex, + VkPeerMemoryFeatureFlags *pPeerMemoryFeatures) { - if (likely(!dev->bo_sizes)) - return; - - mtx_lock(&dev->bo_mutex); - - /* Put the HT's sizes data in an array so we can sort by number of allocations. 
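/* Sketch of the gather-sort-print step described above, written with plain C
 * arrays instead of mesa's hash table and util_dynarray; debug_entry and
 * entries[] stand in for the per-name BO statistics collected earlier. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct debug_entry {
   const char *name;
   unsigned    count;   /* number of live BOs allocated under this name */
   uint64_t    size;    /* total bytes, page-aligned at allocation time */
};

static int
count_compare(const void *a, const void *b)
{
   const struct debug_entry *ea = *(const struct debug_entry *const *)a;
   const struct debug_entry *eb = *(const struct debug_entry *const *)b;
   return (int)ea->count - (int)eb->count;   /* ascending by allocation count */
}

static void
print_bo_stats(const struct debug_entry **entries, size_t n)
{
   qsort(entries, n, sizeof(entries[0]), count_compare);
   for (size_t i = 0; i < n; i++)
      printf("%30s: %4u bos, %llu kb\n", entries[i]->name, entries[i]->count,
             (unsigned long long)(entries[i]->size / 1024));
}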
*/ - struct util_dynarray dyn; - util_dynarray_init(&dyn, NULL); - - uint32_t size = 0; - uint32_t count = 0; - hash_table_foreach(dev->bo_sizes, entry) - { - struct tu_debug_bos_entry *debug_bos = (void *) entry->data; - util_dynarray_append(&dyn, struct tu_debug_bos_entry *, debug_bos); - size += debug_bos->size / 1024; - count += debug_bos->count; - } - - qsort(dyn.data, - util_dynarray_num_elements(&dyn, struct tu_debug_bos_entry *), - sizeof(struct tu_debug_bos_entryos_entry *), debug_bos_count_compare); - - util_dynarray_foreach(&dyn, struct tu_debug_bos_entry *, entryp) - { - struct tu_debug_bos_entry *debug_bos = *entryp; - mesa_logi("%30s: %4d bos, %lld kb\n", debug_bos->name, debug_bos->count, - (long long) (debug_bos->size / 1024)); - } - - mesa_logi("submitted %d bos (%d MB)\n", count, DIV_ROUND_UP(size, 1024)); - - util_dynarray_fini(&dyn); + assert(localDeviceIndex == remoteDeviceIndex); - mtx_unlock(&dev->bo_mutex); + *pPeerMemoryFeatures = VK_PEER_MEMORY_FEATURE_COPY_SRC_BIT | + VK_PEER_MEMORY_FEATURE_COPY_DST_BIT | + VK_PEER_MEMORY_FEATURE_GENERIC_SRC_BIT | + VK_PEER_MEMORY_FEATURE_GENERIC_DST_BIT; } diff --git a/lib/mesa/src/freedreno/vulkan/tu_drm.c b/lib/mesa/src/freedreno/vulkan/tu_drm.c index 9a57c6644..9b2e6f788 100644 --- a/lib/mesa/src/freedreno/vulkan/tu_drm.c +++ b/lib/mesa/src/freedreno/vulkan/tu_drm.c @@ -1,61 +1,36 @@ /* * Copyright © 2018 Google, Inc. * Copyright © 2015 Intel Corporation - * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
*/ -#include "tu_drm.h" +#include "tu_private.h" #include <errno.h> #include <fcntl.h> +#include <stdint.h> #include <sys/ioctl.h> -#include <sys/mman.h> #include <xf86drm.h> -#ifdef MAJOR_IN_MKDEV -#include <sys/mkdev.h> -#endif -#ifdef MAJOR_IN_SYSMACROS -#include <sys/sysmacros.h> -#endif - -#include "vk_util.h" - #include "drm-uapi/msm_drm.h" -#include "util/u_debug.h" -#include "util/hash_table.h" -#include "util/timespec.h" -#include "util/os_time.h" - -#include "tu_cmd_buffer.h" -#include "tu_cs.h" -#include "tu_device.h" -#include "tu_dynamic_rendering.h" - -struct tu_queue_submit -{ - struct vk_queue_submit *vk_submit; - struct tu_u_trace_submission_data *u_trace_submission_data; - - struct tu_cmd_buffer **cmd_buffers; - struct drm_msm_gem_submit_cmd *cmds; - struct drm_msm_gem_submit_syncobj *in_syncobjs; - struct drm_msm_gem_submit_syncobj *out_syncobjs; - - uint32_t nr_cmd_buffers; - uint32_t nr_in_syncobjs; - uint32_t nr_out_syncobjs; - uint32_t entry_count; - uint32_t perf_pass_index; - - bool autotune_fence; -}; - -struct tu_u_trace_syncobj -{ - uint32_t msm_queue_id; - uint32_t fence; -}; static int tu_drm_get_param(const struct tu_physical_device *dev, @@ -80,7 +55,7 @@ tu_drm_get_param(const struct tu_physical_device *dev, return 0; } -static int +int tu_drm_get_gpu_id(const struct tu_physical_device *dev, uint32_t *id) { uint64_t value; @@ -92,7 +67,7 @@ tu_drm_get_gpu_id(const struct tu_physical_device *dev, uint32_t *id) return 0; } -static int +int tu_drm_get_gmem_size(const struct tu_physical_device *dev, uint32_t *size) { uint64_t value; @@ -104,85 +79,17 @@ tu_drm_get_gmem_size(const struct tu_physical_device *dev, uint32_t *size) return 0; } -static int -tu_drm_get_gmem_base(const struct tu_physical_device *dev, uint64_t *base) -{ - return tu_drm_get_param(dev, MSM_PARAM_GMEM_BASE, base); -} - -static int -tu_drm_get_va_prop(const struct tu_physical_device *dev, - uint64_t *va_start, uint64_t *va_size) -{ - uint64_t value; - int ret = tu_drm_get_param(dev, MSM_PARAM_VA_START, &value); - if (ret) - return ret; - - *va_start = value; - - ret = tu_drm_get_param(dev, MSM_PARAM_VA_SIZE, &value); - if (ret) - return ret; - - *va_size = value; - - return 0; -} - -static uint32_t -tu_drm_get_priorities(const struct tu_physical_device *dev) -{ - uint64_t val = 1; - tu_drm_get_param(dev, MSM_PARAM_PRIORITIES, &val); - assert(val >= 1); - - return val; -} - -int -tu_device_get_gpu_timestamp(struct tu_device *dev, uint64_t *ts) -{ - return tu_drm_get_param(dev->physical_device, MSM_PARAM_TIMESTAMP, ts); -} - -int -tu_device_get_suspend_count(struct tu_device *dev, uint64_t *suspend_count) -{ - int ret = tu_drm_get_param(dev->physical_device, MSM_PARAM_SUSPENDS, suspend_count); - return ret; -} - -VkResult -tu_device_check_status(struct vk_device *vk_device) -{ - struct tu_device *device = container_of(vk_device, struct tu_device, vk); - struct tu_physical_device *physical_device = device->physical_device; - - uint64_t last_fault_count = physical_device->fault_count; - int ret = tu_drm_get_param(physical_device, MSM_PARAM_FAULTS, &physical_device->fault_count); - if (ret != 0) - return vk_device_set_lost(&device->vk, "error getting GPU fault count: %d", ret); - - if (last_fault_count != physical_device->fault_count) - return vk_device_set_lost(&device->vk, "GPU faulted or hung"); - - return VK_SUCCESS; -} - int tu_drm_submitqueue_new(const struct tu_device *dev, int priority, uint32_t *queue_id) { - assert(priority >= 0 && - priority < 
dev->physical_device->submitqueue_priority_count); struct drm_msm_submitqueue req = { .flags = 0, .prio = priority, }; - int ret = drmCommandWriteRead(dev->fd, + int ret = drmCommandWriteRead(dev->physical_device->local_fd, DRM_MSM_SUBMITQUEUE_NEW, &req, sizeof(req)); if (ret) return ret; @@ -194,1156 +101,94 @@ tu_drm_submitqueue_new(const struct tu_device *dev, void tu_drm_submitqueue_close(const struct tu_device *dev, uint32_t queue_id) { - drmCommandWrite(dev->fd, DRM_MSM_SUBMITQUEUE_CLOSE, + drmCommandWrite(dev->physical_device->local_fd, DRM_MSM_SUBMITQUEUE_CLOSE, &queue_id, sizeof(uint32_t)); } -static void -tu_gem_close(const struct tu_device *dev, uint32_t gem_handle) -{ - struct drm_gem_close req = { - .handle = gem_handle, - }; - - drmIoctl(dev->fd, DRM_IOCTL_GEM_CLOSE, &req); -} - -/** Helper for DRM_MSM_GEM_INFO, returns 0 on error. */ -static uint64_t -tu_gem_info(const struct tu_device *dev, uint32_t gem_handle, uint32_t info) -{ - struct drm_msm_gem_info req = { - .handle = gem_handle, - .info = info, - }; - - int ret = drmCommandWriteRead(dev->fd, - DRM_MSM_GEM_INFO, &req, sizeof(req)); - if (ret < 0) - return 0; - - return req.value; -} - - -static VkResult -tu_allocate_userspace_iova(struct tu_device *dev, - uint32_t gem_handle, - uint64_t size, - uint64_t client_iova, - enum tu_bo_alloc_flags flags, - uint64_t *iova) -{ - mtx_lock(&dev->physical_device->vma_mutex); - - *iova = 0; - - if (flags & TU_BO_ALLOC_REPLAYABLE) { - if (client_iova) { - if (util_vma_heap_alloc_addr(&dev->physical_device->vma, client_iova, - size)) { - *iova = client_iova; - } else { - return VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS; - } - } else { - /* We have to separate replayable IOVAs from ordinary one in order to - * for them not to clash. The easiest way to do this is to allocate - * them from the other end of the address space. 
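/* Sketch of the two-ended address-space split described above, using the
 * util_vma_heap helpers that appear in this diff (assuming mesa's
 * util/vma.h): a replayable BO with a client-provided address goes through
 * alloc_addr, other replayable BOs come down from the top of the heap, and
 * ordinary BOs come up from the bottom, so the two kinds cannot collide. */
#include <stdbool.h>
#include <stdint.h>
#include "util/vma.h"

static uint64_t
pick_iova(struct util_vma_heap *heap, uint64_t size, uint64_t client_iova,
          bool replayable)
{
   if (replayable && client_iova) {
      /* capture/replay: honour the address recorded at capture time */
      return util_vma_heap_alloc_addr(heap, client_iova, size) ? client_iova : 0;
   }

   /* replayable allocations grow down from the top of the address space,
    * everything else grows up from the bottom */
   heap->alloc_high = replayable;
   return util_vma_heap_alloc(heap, size, 0x1000);
}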
- */ - dev->physical_device->vma.alloc_high = true; - *iova = - util_vma_heap_alloc(&dev->physical_device->vma, size, 0x1000); - } - } else { - dev->physical_device->vma.alloc_high = false; - *iova = util_vma_heap_alloc(&dev->physical_device->vma, size, 0x1000); - } - - mtx_unlock(&dev->physical_device->vma_mutex); - - if (!*iova) - return VK_ERROR_OUT_OF_DEVICE_MEMORY; - - struct drm_msm_gem_info req = { - .handle = gem_handle, - .info = MSM_INFO_SET_IOVA, - .value = *iova, - }; - - int ret = - drmCommandWriteRead(dev->fd, DRM_MSM_GEM_INFO, &req, sizeof(req)); - if (ret < 0) - return VK_ERROR_OUT_OF_HOST_MEMORY; - - return VK_SUCCESS; -} - -static VkResult -tu_allocate_kernel_iova(struct tu_device *dev, - uint32_t gem_handle, - uint64_t *iova) -{ - *iova = tu_gem_info(dev, gem_handle, MSM_INFO_GET_IOVA); - if (!*iova) - return VK_ERROR_OUT_OF_DEVICE_MEMORY; - - return VK_SUCCESS; -} - -static VkResult -tu_bo_init(struct tu_device *dev, - struct tu_bo *bo, - uint32_t gem_handle, - uint64_t size, - uint64_t client_iova, - enum tu_bo_alloc_flags flags, - const char *name) -{ - VkResult result = VK_SUCCESS; - uint64_t iova = 0; - - assert(!client_iova || dev->physical_device->has_set_iova); - - if (dev->physical_device->has_set_iova) { - result = tu_allocate_userspace_iova(dev, gem_handle, size, client_iova, - flags, &iova); - } else { - result = tu_allocate_kernel_iova(dev, gem_handle, &iova); - } - - if (result != VK_SUCCESS) - goto fail_bo_list; - - name = tu_debug_bos_add(dev, size, name); - - mtx_lock(&dev->bo_mutex); - uint32_t idx = dev->bo_count++; - - /* grow the bo list if needed */ - if (idx >= dev->bo_list_size) { - uint32_t new_len = idx + 64; - struct drm_msm_gem_submit_bo *new_ptr = - vk_realloc(&dev->vk.alloc, dev->bo_list, new_len * sizeof(*dev->bo_list), - 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); - if (!new_ptr) { - result = VK_ERROR_OUT_OF_HOST_MEMORY; - goto fail_bo_list; - } - - dev->bo_list = new_ptr; - dev->bo_list_size = new_len; - } - - bool dump = flags & TU_BO_ALLOC_ALLOW_DUMP; - dev->bo_list[idx] = (struct drm_msm_gem_submit_bo) { - .flags = MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE | - COND(dump, MSM_SUBMIT_BO_DUMP), - .handle = gem_handle, - .presumed = iova, - }; - - *bo = (struct tu_bo) { - .gem_handle = gem_handle, - .size = size, - .iova = iova, - .refcnt = 1, - .bo_list_idx = idx, - .name = name, - }; - - mtx_unlock(&dev->bo_mutex); - - return VK_SUCCESS; - -fail_bo_list: - tu_gem_close(dev, gem_handle); - return result; -} - /** - * Sets the name in the kernel so that the contents of /debug/dri/0/gem are more - * useful. - * - * We skip this on release builds (when we're also not doing BO debugging) to - * reduce overhead. + * Return gem handle on success. Return 0 on failure. 
*/ -static void -tu_bo_set_kernel_name(struct tu_device *dev, struct tu_bo *bo, const char *name) +uint32_t +tu_gem_new(const struct tu_device *dev, uint64_t size, uint32_t flags) { - bool kernel_bo_names = dev->bo_sizes != NULL; -#ifdef DEBUG - kernel_bo_names = true; -#endif - if (!kernel_bo_names) - return; - - struct drm_msm_gem_info req = { - .handle = bo->gem_handle, - .info = MSM_INFO_SET_NAME, - .value = (uintptr_t)(void *)name, - .len = strlen(name), - }; - - int ret = drmCommandWrite(dev->fd, DRM_MSM_GEM_INFO, &req, sizeof(req)); - if (ret) { - mesa_logw_once("Failed to set BO name with DRM_MSM_GEM_INFO: %d", - ret); - } -} - -VkResult -tu_bo_init_new_explicit_iova(struct tu_device *dev, - struct tu_bo **out_bo, - uint64_t size, - uint64_t client_iova, - enum tu_bo_alloc_flags flags, - const char *name) -{ - /* TODO: Choose better flags. As of 2018-11-12, freedreno/drm/msm_bo.c - * always sets `flags = MSM_BO_WC`, and we copy that behavior here. - */ struct drm_msm_gem_new req = { .size = size, - .flags = MSM_BO_WC + .flags = flags, }; - if (flags & TU_BO_ALLOC_GPU_READ_ONLY) - req.flags |= MSM_BO_GPU_READONLY; - - int ret = drmCommandWriteRead(dev->fd, + int ret = drmCommandWriteRead(dev->physical_device->local_fd, DRM_MSM_GEM_NEW, &req, sizeof(req)); if (ret) - return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY); - - struct tu_bo* bo = tu_device_lookup_bo(dev, req.handle); - assert(bo && bo->gem_handle == 0); - - VkResult result = - tu_bo_init(dev, bo, req.handle, size, client_iova, flags, name); - - if (result != VK_SUCCESS) - memset(bo, 0, sizeof(*bo)); - else - *out_bo = bo; - - /* We don't use bo->name here because for the !TU_DEBUG=bo case bo->name is NULL. */ - tu_bo_set_kernel_name(dev, bo, name); + return 0; - return result; + return req.handle; } -VkResult -tu_bo_init_dmabuf(struct tu_device *dev, - struct tu_bo **out_bo, - uint64_t size, - int prime_fd) +uint32_t +tu_gem_import_dmabuf(const struct tu_device *dev, int prime_fd, uint64_t size) { /* lseek() to get the real size */ off_t real_size = lseek(prime_fd, 0, SEEK_END); lseek(prime_fd, 0, SEEK_SET); if (real_size < 0 || (uint64_t) real_size < size) - return vk_error(dev, VK_ERROR_INVALID_EXTERNAL_HANDLE); - - /* Importing the same dmabuf several times would yield the same - * gem_handle. Thus there could be a race when destroying - * BO and importing the same dmabuf from different threads. - * We must not permit the creation of dmabuf BO and its release - * to happen in parallel. 
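/* Sketch of the import/free race being described: importing takes the lock
 * exclusively because drmPrimeFDToHandle may return a gem handle that another
 * thread is still tearing down, while ordinary frees only need the shared
 * side.  pthread_rwlock_t stands in for the u_rwlock used in the diff, and
 * lookup_bo()/last_ref()/really_free() are hypothetical helpers. */
#include <pthread.h>
#include <stdint.h>

struct bo { uint32_t gem_handle; int refcnt; };

static pthread_rwlock_t dma_bo_lock = PTHREAD_RWLOCK_INITIALIZER;

struct bo *lookup_bo(uint32_t gem_handle);  /* assumed: table keyed by handle */
int        last_ref(struct bo *bo);         /* assumed: atomic dec-and-test */
void       really_free(struct bo *bo);

static struct bo *
import_dmabuf_bo(uint32_t gem_handle)
{
   pthread_rwlock_wrlock(&dma_bo_lock);   /* exclusive: no free may run concurrently */
   struct bo *bo = lookup_bo(gem_handle);
   if (bo->refcnt++ == 0) {
      /* first import of this handle: initialize the BO here */
   }
   pthread_rwlock_unlock(&dma_bo_lock);
   return bo;
}

static void
free_bo(struct bo *bo)
{
   pthread_rwlock_rdlock(&dma_bo_lock);   /* shared: frees may overlap each other */
   if (last_ref(bo))
      really_free(bo);
   pthread_rwlock_unlock(&dma_bo_lock);
}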
- */ - u_rwlock_wrlock(&dev->dma_bo_lock); + return 0; uint32_t gem_handle; - int ret = drmPrimeFDToHandle(dev->fd, prime_fd, + int ret = drmPrimeFDToHandle(dev->physical_device->local_fd, prime_fd, &gem_handle); - if (ret) { - u_rwlock_wrunlock(&dev->dma_bo_lock); - return vk_error(dev, VK_ERROR_INVALID_EXTERNAL_HANDLE); - } - - struct tu_bo* bo = tu_device_lookup_bo(dev, gem_handle); - - if (bo->refcnt != 0) { - p_atomic_inc(&bo->refcnt); - u_rwlock_wrunlock(&dev->dma_bo_lock); - - *out_bo = bo; - return VK_SUCCESS; - } - - VkResult result = - tu_bo_init(dev, bo, gem_handle, size, 0, TU_BO_ALLOC_NO_FLAGS, "dmabuf"); - - if (result != VK_SUCCESS) - memset(bo, 0, sizeof(*bo)); - else - *out_bo = bo; - - u_rwlock_wrunlock(&dev->dma_bo_lock); + if (ret) + return 0; - return result; + return gem_handle; } int -tu_bo_export_dmabuf(struct tu_device *dev, struct tu_bo *bo) +tu_gem_export_dmabuf(const struct tu_device *dev, uint32_t gem_handle) { int prime_fd; - int ret = drmPrimeHandleToFD(dev->fd, bo->gem_handle, - DRM_CLOEXEC | DRM_RDWR, &prime_fd); + int ret = drmPrimeHandleToFD(dev->physical_device->local_fd, gem_handle, + DRM_CLOEXEC, &prime_fd); return ret == 0 ? prime_fd : -1; } -VkResult -tu_bo_map(struct tu_device *dev, struct tu_bo *bo) -{ - if (bo->map) - return VK_SUCCESS; - - uint64_t offset = tu_gem_info(dev, bo->gem_handle, MSM_INFO_GET_OFFSET); - if (!offset) - return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY); - - /* TODO: Should we use the wrapper os_mmap() like Freedreno does? */ - void *map = mmap(0, bo->size, PROT_READ | PROT_WRITE, MAP_SHARED, - dev->fd, offset); - if (map == MAP_FAILED) - return vk_error(dev, VK_ERROR_MEMORY_MAP_FAILED); - - bo->map = map; - return VK_SUCCESS; -} - void -tu_bo_finish(struct tu_device *dev, struct tu_bo *bo) -{ - assert(bo->gem_handle); - - u_rwlock_rdlock(&dev->dma_bo_lock); - - if (!p_atomic_dec_zero(&bo->refcnt)) { - u_rwlock_rdunlock(&dev->dma_bo_lock); - return; - } - - if (bo->map) - munmap(bo->map, bo->size); - - tu_debug_bos_del(dev, bo); - - mtx_lock(&dev->bo_mutex); - dev->bo_count--; - dev->bo_list[bo->bo_list_idx] = dev->bo_list[dev->bo_count]; - - struct tu_bo* exchanging_bo = tu_device_lookup_bo(dev, dev->bo_list[bo->bo_list_idx].handle); - exchanging_bo->bo_list_idx = bo->bo_list_idx; - - if (bo->implicit_sync) - dev->implicit_sync_bo_count--; - - mtx_unlock(&dev->bo_mutex); - - if (dev->physical_device->has_set_iova) { - mtx_lock(&dev->physical_device->vma_mutex); - util_vma_heap_free(&dev->physical_device->vma, bo->iova, bo->size); - mtx_unlock(&dev->physical_device->vma_mutex); - } - - /* Our BO structs are stored in a sparse array in the physical device, - * so we don't want to free the BO pointer, instead we want to reset it - * to 0, to signal that array entry as being free. 
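/* Sketch of the "zeroed slot means free" convention described above: BO
 * structs live in a table indexed by gem handle, so releasing one just wipes
 * the slot rather than freeing the pointer, and gem_handle == 0 (never a real
 * kernel handle) marks the slot as unused.  bo_table is a hypothetical
 * fixed-size stand-in for the driver's growable sparse array. */
#include <stdint.h>
#include <string.h>

struct bo_slot { uint32_t gem_handle; uint64_t iova; uint64_t size; };

static struct bo_slot bo_table[4096];

static struct bo_slot *
lookup_slot(uint32_t gem_handle)
{
   return &bo_table[gem_handle];   /* the handle doubles as the table index */
}

static void
release_slot(struct bo_slot *bo)
{
   /* other code holds stable pointers into the table, so the entry is not
    * freed; an all-zero entry is the "this slot is available" marker */
   memset(bo, 0, sizeof(*bo));
}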
- */ - uint32_t gem_handle = bo->gem_handle; - memset(bo, 0, sizeof(*bo)); - - tu_gem_close(dev, gem_handle); - - u_rwlock_rdunlock(&dev->dma_bo_lock); -} - -extern const struct vk_sync_type tu_timeline_sync_type; - -static inline bool -vk_sync_is_tu_timeline_sync(const struct vk_sync *sync) -{ - return sync->type == &tu_timeline_sync_type; -} - -static struct tu_timeline_sync * -to_tu_timeline_sync(struct vk_sync *sync) -{ - assert(sync->type == &tu_timeline_sync_type); - return container_of(sync, struct tu_timeline_sync, base); -} - -static uint32_t -tu_syncobj_from_vk_sync(struct vk_sync *sync) -{ - uint32_t syncobj = -1; - if (vk_sync_is_tu_timeline_sync(sync)) { - syncobj = to_tu_timeline_sync(sync)->syncobj; - } else if (vk_sync_type_is_drm_syncobj(sync->type)) { - syncobj = vk_sync_as_drm_syncobj(sync)->syncobj; - } - - assert(syncobj != -1); - - return syncobj; -} - -static VkResult -tu_timeline_sync_init(struct vk_device *vk_device, - struct vk_sync *vk_sync, - uint64_t initial_value) -{ - struct tu_device *device = container_of(vk_device, struct tu_device, vk); - struct tu_timeline_sync *sync = to_tu_timeline_sync(vk_sync); - uint32_t flags = 0; - - assert(device->fd >= 0); - - int err = drmSyncobjCreate(device->fd, flags, &sync->syncobj); - - if (err < 0) { - return vk_error(device, VK_ERROR_DEVICE_LOST); - } - - sync->state = initial_value ? TU_TIMELINE_SYNC_STATE_SIGNALED : - TU_TIMELINE_SYNC_STATE_RESET; - - return VK_SUCCESS; -} - -static void -tu_timeline_sync_finish(struct vk_device *vk_device, - struct vk_sync *vk_sync) -{ - struct tu_device *dev = container_of(vk_device, struct tu_device, vk); - struct tu_timeline_sync *sync = to_tu_timeline_sync(vk_sync); - - assert(dev->fd >= 0); - ASSERTED int err = drmSyncobjDestroy(dev->fd, sync->syncobj); - assert(err == 0); -} - -static VkResult -tu_timeline_sync_reset(struct vk_device *vk_device, - struct vk_sync *vk_sync) -{ - struct tu_device *dev = container_of(vk_device, struct tu_device, vk); - struct tu_timeline_sync *sync = to_tu_timeline_sync(vk_sync); - - int err = drmSyncobjReset(dev->fd, &sync->syncobj, 1); - if (err) { - return vk_errorf(dev, VK_ERROR_UNKNOWN, - "DRM_IOCTL_SYNCOBJ_RESET failed: %m"); - } else { - sync->state = TU_TIMELINE_SYNC_STATE_RESET; - } - - return VK_SUCCESS; -} - -static VkResult -drm_syncobj_wait(struct tu_device *device, - uint32_t *handles, uint32_t count_handles, - uint64_t timeout_nsec, bool wait_all) -{ - uint32_t syncobj_wait_flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT; - if (wait_all) syncobj_wait_flags |= DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL; - - /* syncobj absolute timeouts are signed. clamp OS_TIMEOUT_INFINITE down. 
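/* Sketch of the timeout clamp described above: callers pass an unsigned
 * absolute timeout (with ~0ull meaning "wait forever"), but drmSyncobjWait
 * takes a signed nanosecond value, so anything above INT64_MAX must be
 * clamped before the ioctl. */
#include <stdint.h>
#include <xf86drm.h>

static int
wait_syncobjs(int fd, uint32_t *handles, uint32_t count,
              uint64_t abs_timeout_ns, uint32_t flags)
{
   int64_t timeout = abs_timeout_ns > (uint64_t)INT64_MAX
                        ? INT64_MAX : (int64_t)abs_timeout_ns;
   return drmSyncobjWait(fd, handles, count, timeout, flags,
                         NULL /* first_signaled */);
}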
*/ - timeout_nsec = MIN2(timeout_nsec, (uint64_t)INT64_MAX); - - int err = drmSyncobjWait(device->fd, handles, - count_handles, timeout_nsec, - syncobj_wait_flags, - NULL /* first_signaled */); - if (err && errno == ETIME) { - return VK_TIMEOUT; - } else if (err) { - return vk_errorf(device, VK_ERROR_UNKNOWN, - "DRM_IOCTL_SYNCOBJ_WAIT failed: %m"); - } - - return VK_SUCCESS; -} - -/* Based on anv_bo_sync_wait */ -static VkResult -tu_timeline_sync_wait(struct vk_device *vk_device, - uint32_t wait_count, - const struct vk_sync_wait *waits, - enum vk_sync_wait_flags wait_flags, - uint64_t abs_timeout_ns) -{ - struct tu_device *dev = container_of(vk_device, struct tu_device, vk); - bool wait_all = !(wait_flags & VK_SYNC_WAIT_ANY); - - uint32_t handles[wait_count]; - uint32_t submit_count; - VkResult ret = VK_SUCCESS; - uint32_t pending = wait_count; - struct tu_timeline_sync *submitted_syncs[wait_count]; - - while (pending) { - pending = 0; - submit_count = 0; - - for (unsigned i = 0; i < wait_count; ++i) { - struct tu_timeline_sync *sync = to_tu_timeline_sync(waits[i].sync); - - if (sync->state == TU_TIMELINE_SYNC_STATE_RESET) { - assert(!(wait_flags & VK_SYNC_WAIT_PENDING)); - pending++; - } else if (sync->state == TU_TIMELINE_SYNC_STATE_SIGNALED) { - if (wait_flags & VK_SYNC_WAIT_ANY) - return VK_SUCCESS; - } else if (sync->state == TU_TIMELINE_SYNC_STATE_SUBMITTED) { - if (!(wait_flags & VK_SYNC_WAIT_PENDING)) { - handles[submit_count] = sync->syncobj; - submitted_syncs[submit_count++] = sync; - } - } - } - - if (submit_count > 0) { - do { - ret = drm_syncobj_wait(dev, handles, submit_count, abs_timeout_ns, wait_all); - } while (ret == VK_TIMEOUT && os_time_get_nano() < abs_timeout_ns); - - if (ret == VK_SUCCESS) { - for (unsigned i = 0; i < submit_count; ++i) { - struct tu_timeline_sync *sync = submitted_syncs[i]; - sync->state = TU_TIMELINE_SYNC_STATE_SIGNALED; - } - } else { - /* return error covering timeout */ - return ret; - } - } else if (pending > 0) { - /* If we've hit this then someone decided to vkWaitForFences before - * they've actually submitted any of them to a queue. This is a - * fairly pessimal case, so it's ok to lock here and use a standard - * pthreads condition variable. - */ - pthread_mutex_lock(&dev->submit_mutex); - - /* It's possible that some of the fences have changed state since the - * last time we checked. Now that we have the lock, check for - * pending fences again and don't wait if it's changed. 
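/* Sketch of the lock/re-check/wait pattern described above: the fence state
 * is inspected again after taking the mutex, and the condition-variable wait
 * only happens if nothing changed in the meantime.  still_pending() is a
 * hypothetical predicate over the fences being waited on. */
#include <pthread.h>
#include <stdbool.h>
#include <time.h>

bool still_pending(void);   /* assumed: true while no fence has been submitted yet */

static int
wait_for_submission(pthread_mutex_t *mtx, pthread_cond_t *cond,
                    const struct timespec *abstime)
{
   int ret = 0;
   pthread_mutex_lock(mtx);
   if (still_pending())                      /* state may have changed before the lock */
      ret = pthread_cond_timedwait(cond, mtx, abstime);
   pthread_mutex_unlock(mtx);
   return ret;                               /* ETIMEDOUT maps to VK_TIMEOUT */
}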
- */ - uint32_t now_pending = 0; - for (uint32_t i = 0; i < wait_count; i++) { - struct tu_timeline_sync *sync = to_tu_timeline_sync(waits[i].sync); - if (sync->state == TU_TIMELINE_SYNC_STATE_RESET) - now_pending++; - } - assert(now_pending <= pending); - - if (now_pending == pending) { - struct timespec abstime = { - .tv_sec = abs_timeout_ns / NSEC_PER_SEC, - .tv_nsec = abs_timeout_ns % NSEC_PER_SEC, - }; - - ASSERTED int ret; - ret = pthread_cond_timedwait(&dev->timeline_cond, - &dev->submit_mutex, &abstime); - assert(ret != EINVAL); - if (os_time_get_nano() >= abs_timeout_ns) { - pthread_mutex_unlock(&dev->submit_mutex); - return VK_TIMEOUT; - } - } - - pthread_mutex_unlock(&dev->submit_mutex); - } - } - - return ret; -} - -const struct vk_sync_type tu_timeline_sync_type = { - .size = sizeof(struct tu_timeline_sync), - .features = VK_SYNC_FEATURE_BINARY | - VK_SYNC_FEATURE_GPU_WAIT | - VK_SYNC_FEATURE_GPU_MULTI_WAIT | - VK_SYNC_FEATURE_CPU_WAIT | - VK_SYNC_FEATURE_CPU_RESET | - VK_SYNC_FEATURE_WAIT_ANY | - VK_SYNC_FEATURE_WAIT_PENDING, - .init = tu_timeline_sync_init, - .finish = tu_timeline_sync_finish, - .reset = tu_timeline_sync_reset, - .wait_many = tu_timeline_sync_wait, -}; - -VkResult -tu_physical_device_try_create(struct vk_instance *vk_instance, - struct _drmDevice *drm_device, - struct vk_physical_device **out) -{ - struct tu_instance *instance = - container_of(vk_instance, struct tu_instance, vk); - - if (!(drm_device->available_nodes & (1 << DRM_NODE_RENDER)) || - drm_device->bustype != DRM_BUS_PLATFORM) - return VK_ERROR_INCOMPATIBLE_DRIVER; - - const char *primary_path = drm_device->nodes[DRM_NODE_PRIMARY]; - const char *path = drm_device->nodes[DRM_NODE_RENDER]; - VkResult result = VK_SUCCESS; - drmVersionPtr version; - int fd; - int master_fd = -1; - - fd = open(path, O_RDWR | O_CLOEXEC); - if (fd < 0) { - return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, - "failed to open device %s", path); - } - - /* Version 1.6 added SYNCOBJ support. 
*/ - const int min_version_major = 1; - const int min_version_minor = 6; - - version = drmGetVersion(fd); - if (!version) { - close(fd); - return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, - "failed to query kernel driver version for device %s", - path); - } - - if (strcmp(version->name, "msm")) { - drmFreeVersion(version); - close(fd); - return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, - "device %s does not use the msm kernel driver", - path); - } - - if (version->version_major != min_version_major || - version->version_minor < min_version_minor) { - result = vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, - "kernel driver for device %s has version %d.%d, " - "but Vulkan requires version >= %d.%d", - path, - version->version_major, version->version_minor, - min_version_major, min_version_minor); - drmFreeVersion(version); - close(fd); - return result; - } - - struct tu_physical_device *device = - vk_zalloc(&instance->vk.alloc, sizeof(*device), 8, - VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); - if (!device) { - result = vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY); - drmFreeVersion(version); - goto fail; - } - - device->msm_major_version = version->version_major; - device->msm_minor_version = version->version_minor; - - drmFreeVersion(version); - - if (instance->debug_flags & TU_DEBUG_STARTUP) - mesa_logi("Found compatible device '%s'.", path); - - device->instance = instance; - - if (instance->vk.enabled_extensions.KHR_display) { - master_fd = open(primary_path, O_RDWR | O_CLOEXEC); - if (master_fd >= 0) { - /* TODO: free master_fd is accel is not working? */ - } - } - - device->master_fd = master_fd; - device->local_fd = fd; - - if (tu_drm_get_gpu_id(device, &device->dev_id.gpu_id)) { - result = vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, - "could not get GPU ID"); - goto fail; - } - - if (tu_drm_get_param(device, MSM_PARAM_CHIP_ID, &device->dev_id.chip_id)) { - result = vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, - "could not get CHIP ID"); - goto fail; - } - - if (tu_drm_get_gmem_size(device, &device->gmem_size)) { - result = vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, - "could not get GMEM size"); - goto fail; - } - device->gmem_size = debug_get_num_option("TU_GMEM", device->gmem_size); - - if (tu_drm_get_gmem_base(device, &device->gmem_base)) { - result = vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, - "could not get GMEM size"); - goto fail; - } - - /* - * device->has_set_iova = !tu_drm_get_va_prop(device, &device->va_start, - * &device->va_size); - * - * If BO is freed while kernel considers it busy, our VMA state gets - * desynchronized from kernel's VMA state, because kernel waits - * until BO stops being busy. And whether BO is busy decided at - * submission granularity. - * - * Disable this capability until solution is found. 
- */ - device->has_set_iova = false; - - struct stat st; - - if (stat(primary_path, &st) == 0) { - device->has_master = true; - device->master_major = major(st.st_rdev); - device->master_minor = minor(st.st_rdev); - } else { - device->has_master = false; - device->master_major = 0; - device->master_minor = 0; - } - - if (stat(path, &st) == 0) { - device->has_local = true; - device->local_major = major(st.st_rdev); - device->local_minor = minor(st.st_rdev); - } else { - result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, - "failed to stat DRM render node %s", path); - goto fail; - } - - int ret = tu_drm_get_param(device, MSM_PARAM_FAULTS, &device->fault_count); - if (ret != 0) { - result = vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, - "Failed to get initial fault count: %d", ret); - goto fail; - } - - device->submitqueue_priority_count = tu_drm_get_priorities(device); - - device->syncobj_type = vk_drm_syncobj_get_type(fd); - /* we don't support DRM_CAP_SYNCOBJ_TIMELINE, but drm-shim does */ - if (!(device->syncobj_type.features & VK_SYNC_FEATURE_TIMELINE)) - device->timeline_type = vk_sync_timeline_get_type(&tu_timeline_sync_type); - - device->sync_types[0] = &device->syncobj_type; - device->sync_types[1] = &device->timeline_type.sync; - device->sync_types[2] = NULL; - - device->heap.size = tu_get_system_heap_size(); - device->heap.used = 0u; - device->heap.flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT; - - result = tu_physical_device_init(device, instance); - - if (result == VK_SUCCESS) { - *out = &device->vk; - return result; - } - -fail: - if (device) - vk_free(&instance->vk.alloc, device); - close(fd); - if (master_fd != -1) - close(master_fd); - return result; -} - -static VkResult -tu_queue_submit_create_locked(struct tu_queue *queue, - struct vk_queue_submit *vk_submit, - const uint32_t nr_in_syncobjs, - const uint32_t nr_out_syncobjs, - uint32_t perf_pass_index, - struct tu_queue_submit *new_submit) -{ - VkResult result; - - bool u_trace_enabled = u_trace_context_actively_tracing(&queue->device->trace_context); - bool has_trace_points = false; - - struct vk_command_buffer **vk_cmd_buffers = vk_submit->command_buffers; - - memset(new_submit, 0, sizeof(struct tu_queue_submit)); - - new_submit->cmd_buffers = (void *)vk_cmd_buffers; - new_submit->nr_cmd_buffers = vk_submit->command_buffer_count; - tu_insert_dynamic_cmdbufs(queue->device, &new_submit->cmd_buffers, - &new_submit->nr_cmd_buffers); - - uint32_t entry_count = 0; - for (uint32_t j = 0; j < new_submit->nr_cmd_buffers; ++j) { - struct tu_cmd_buffer *cmdbuf = new_submit->cmd_buffers[j]; - - if (perf_pass_index != ~0) - entry_count++; - - entry_count += cmdbuf->cs.entry_count; - - if (u_trace_enabled && u_trace_has_points(&cmdbuf->trace)) { - if (!(cmdbuf->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT)) - entry_count++; - - has_trace_points = true; - } - } - - new_submit->autotune_fence = - tu_autotune_submit_requires_fence(new_submit->cmd_buffers, new_submit->nr_cmd_buffers); - if (new_submit->autotune_fence) - entry_count++; - - new_submit->cmds = vk_zalloc(&queue->device->vk.alloc, - entry_count * sizeof(*new_submit->cmds), 8, - VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); - - if (new_submit->cmds == NULL) { - result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY); - goto fail_cmds; - } - - if (has_trace_points) { - result = - tu_u_trace_submission_data_create( - queue->device, new_submit->cmd_buffers, - new_submit->nr_cmd_buffers, - &new_submit->u_trace_submission_data); - - if (result != VK_SUCCESS) { - 
goto fail_u_trace_submission_data; - } - } - - /* Allocate without wait timeline semaphores */ - new_submit->in_syncobjs = vk_zalloc(&queue->device->vk.alloc, - nr_in_syncobjs * sizeof(*new_submit->in_syncobjs), 8, - VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); - - if (new_submit->in_syncobjs == NULL) { - result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY); - goto fail_in_syncobjs; - } - - /* Allocate with signal timeline semaphores considered */ - new_submit->out_syncobjs = vk_zalloc(&queue->device->vk.alloc, - nr_out_syncobjs * sizeof(*new_submit->out_syncobjs), 8, - VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); - - if (new_submit->out_syncobjs == NULL) { - result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY); - goto fail_out_syncobjs; - } - - new_submit->entry_count = entry_count; - new_submit->nr_in_syncobjs = nr_in_syncobjs; - new_submit->nr_out_syncobjs = nr_out_syncobjs; - new_submit->perf_pass_index = perf_pass_index; - new_submit->vk_submit = vk_submit; - - return VK_SUCCESS; - -fail_out_syncobjs: - vk_free(&queue->device->vk.alloc, new_submit->in_syncobjs); -fail_in_syncobjs: - if (new_submit->u_trace_submission_data) - tu_u_trace_submission_data_finish(queue->device, - new_submit->u_trace_submission_data); -fail_u_trace_submission_data: - vk_free(&queue->device->vk.alloc, new_submit->cmds); -fail_cmds: - return result; -} - -static void -tu_queue_submit_finish(struct tu_queue *queue, struct tu_queue_submit *submit) -{ - vk_free(&queue->device->vk.alloc, submit->cmds); - vk_free(&queue->device->vk.alloc, submit->in_syncobjs); - vk_free(&queue->device->vk.alloc, submit->out_syncobjs); - if (submit->cmd_buffers != (void *) submit->vk_submit->command_buffers) - vk_free(&queue->device->vk.alloc, submit->cmd_buffers); -} - -static void -tu_fill_msm_gem_submit(struct tu_device *dev, - struct drm_msm_gem_submit_cmd *cmd, - struct tu_cs_entry *cs_entry) -{ - cmd->type = MSM_SUBMIT_CMD_BUF; - cmd->submit_idx = cs_entry->bo->bo_list_idx; - cmd->submit_offset = cs_entry->offset; - cmd->size = cs_entry->size; - cmd->pad = 0; - cmd->nr_relocs = 0; - cmd->relocs = 0; -} - -static void -tu_queue_build_msm_gem_submit_cmds(struct tu_queue *queue, - struct tu_queue_submit *submit, - struct tu_cs *autotune_cs) +tu_gem_close(const struct tu_device *dev, uint32_t gem_handle) { - struct tu_device *dev = queue->device; - struct drm_msm_gem_submit_cmd *cmds = submit->cmds; - - uint32_t entry_idx = 0; - for (uint32_t j = 0; j < submit->nr_cmd_buffers; ++j) { - struct tu_device *dev = queue->device; - struct tu_cmd_buffer *cmdbuf = submit->cmd_buffers[j]; - struct tu_cs *cs = &cmdbuf->cs; - - if (submit->perf_pass_index != ~0) { - struct tu_cs_entry *perf_cs_entry = - &dev->perfcntrs_pass_cs_entries[submit->perf_pass_index]; - - tu_fill_msm_gem_submit(dev, &cmds[entry_idx], perf_cs_entry); - entry_idx++; - } - - for (unsigned i = 0; i < cs->entry_count; ++i, ++entry_idx) { - tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &cs->entries[i]); - } - - if (submit->u_trace_submission_data) { - struct tu_cs *ts_cs = - submit->u_trace_submission_data->cmd_trace_data[j].timestamp_copy_cs; - if (ts_cs) { - tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &ts_cs->entries[0]); - entry_idx++; - } - } - } + struct drm_gem_close req = { + .handle = gem_handle, + }; - if (autotune_cs) { - assert(autotune_cs->entry_count == 1); - tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &autotune_cs->entries[0]); - entry_idx++; - } + drmIoctl(dev->physical_device->local_fd, DRM_IOCTL_GEM_CLOSE, &req); } -static VkResult -tu_queue_submit_locked(struct 
tu_queue *queue, struct tu_queue_submit *submit) +/** Return UINT64_MAX on error. */ +static uint64_t +tu_gem_info(const struct tu_device *dev, uint32_t gem_handle, uint32_t info) { - queue->device->submit_count++; - - struct tu_cs *autotune_cs = NULL; - if (submit->autotune_fence) { - autotune_cs = tu_autotune_on_submit(queue->device, - &queue->device->autotune, - submit->cmd_buffers, - submit->nr_cmd_buffers); - } - - uint32_t flags = MSM_PIPE_3D0; - - if (submit->vk_submit->wait_count) - flags |= MSM_SUBMIT_SYNCOBJ_IN; - - if (submit->vk_submit->signal_count) - flags |= MSM_SUBMIT_SYNCOBJ_OUT; - - mtx_lock(&queue->device->bo_mutex); - - if (queue->device->implicit_sync_bo_count == 0) - flags |= MSM_SUBMIT_NO_IMPLICIT; - - /* drm_msm_gem_submit_cmd requires index of bo which could change at any - * time when bo_mutex is not locked. So we build submit cmds here the real - * place to submit. - */ - tu_queue_build_msm_gem_submit_cmds(queue, submit, autotune_cs); - - struct drm_msm_gem_submit req = { - .flags = flags, - .queueid = queue->msm_queue_id, - .bos = (uint64_t)(uintptr_t) queue->device->bo_list, - .nr_bos = submit->entry_count ? queue->device->bo_count : 0, - .cmds = (uint64_t)(uintptr_t)submit->cmds, - .nr_cmds = submit->entry_count, - .in_syncobjs = (uint64_t)(uintptr_t)submit->in_syncobjs, - .out_syncobjs = (uint64_t)(uintptr_t)submit->out_syncobjs, - .nr_in_syncobjs = submit->nr_in_syncobjs, - .nr_out_syncobjs = submit->nr_out_syncobjs, - .syncobj_stride = sizeof(struct drm_msm_gem_submit_syncobj), + struct drm_msm_gem_info req = { + .handle = gem_handle, + .info = info, }; - int ret = drmCommandWriteRead(queue->device->fd, - DRM_MSM_GEM_SUBMIT, - &req, sizeof(req)); - - mtx_unlock(&queue->device->bo_mutex); - - tu_debug_bos_print_stats(queue->device); - - if (ret) - return vk_device_set_lost(&queue->device->vk, "submit failed: %m"); - -#if HAVE_PERFETTO - tu_perfetto_submit(queue->device, queue->device->submit_count); -#endif - - if (submit->u_trace_submission_data) { - struct tu_u_trace_submission_data *submission_data = - submit->u_trace_submission_data; - submission_data->submission_id = queue->device->submit_count; - /* We have to allocate it here since it is different between drm/kgsl */ - submission_data->syncobj = - vk_alloc(&queue->device->vk.alloc, sizeof(struct tu_u_trace_syncobj), - 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); - submission_data->syncobj->fence = req.fence; - submission_data->syncobj->msm_queue_id = queue->msm_queue_id; - - submit->u_trace_submission_data = NULL; - - for (uint32_t i = 0; i < submission_data->cmd_buffer_count; i++) { - bool free_data = i == submission_data->last_buffer_with_tracepoints; - if (submission_data->cmd_trace_data[i].trace) - u_trace_flush(submission_data->cmd_trace_data[i].trace, - submission_data, free_data); - - if (!submission_data->cmd_trace_data[i].timestamp_copy_cs) { - /* u_trace is owned by cmd_buffer */ - submission_data->cmd_trace_data[i].trace = NULL; - } - } - } - - for (uint32_t i = 0; i < submit->vk_submit->wait_count; i++) { - if (!vk_sync_is_tu_timeline_sync(submit->vk_submit->waits[i].sync)) - continue; - - struct tu_timeline_sync *sync = - container_of(submit->vk_submit->waits[i].sync, struct tu_timeline_sync, base); - - assert(sync->state != TU_TIMELINE_SYNC_STATE_RESET); - - /* Set SIGNALED to the state of the wait timeline sync since this means the syncobj - * is done and ready again so this can be garbage-collectioned later. 
- */ - sync->state = TU_TIMELINE_SYNC_STATE_SIGNALED; - } - - for (uint32_t i = 0; i < submit->vk_submit->signal_count; i++) { - if (!vk_sync_is_tu_timeline_sync(submit->vk_submit->signals[i].sync)) - continue; - - struct tu_timeline_sync *sync = - container_of(submit->vk_submit->signals[i].sync, struct tu_timeline_sync, base); - - assert(sync->state == TU_TIMELINE_SYNC_STATE_RESET); - /* Set SUBMITTED to the state of the signal timeline sync so we could wait for - * this timeline sync until completed if necessary. - */ - sync->state = TU_TIMELINE_SYNC_STATE_SUBMITTED; - } - - pthread_cond_broadcast(&queue->device->timeline_cond); - - return VK_SUCCESS; -} + int ret = drmCommandWriteRead(dev->physical_device->local_fd, + DRM_MSM_GEM_INFO, &req, sizeof(req)); + if (ret == -1) + return UINT64_MAX; -static inline void -get_abs_timeout(struct drm_msm_timespec *tv, uint64_t ns) -{ - struct timespec t; - clock_gettime(CLOCK_MONOTONIC, &t); - tv->tv_sec = t.tv_sec + ns / 1000000000; - tv->tv_nsec = t.tv_nsec + ns % 1000000000; + return req.value; } -VkResult -tu_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj) +/** Return UINT64_MAX on error. */ +uint64_t +tu_gem_info_offset(const struct tu_device *dev, uint32_t gem_handle) { - struct drm_msm_wait_fence req = { - .fence = syncobj->fence, - .queueid = syncobj->msm_queue_id, - }; - int ret; - - get_abs_timeout(&req.timeout, 1000000000); - - ret = drmCommandWrite(dev->fd, DRM_MSM_WAIT_FENCE, &req, sizeof(req)); - if (ret && (ret != -ETIMEDOUT)) { - fprintf(stderr, "wait-fence failed! %d (%s)", ret, strerror(errno)); - return VK_TIMEOUT; - } - - return VK_SUCCESS; + return tu_gem_info(dev, gem_handle, MSM_INFO_GET_OFFSET); } -VkResult -tu_queue_submit(struct vk_queue *vk_queue, struct vk_queue_submit *submit) +/** Return UINT64_MAX on error. */ +uint64_t +tu_gem_info_iova(const struct tu_device *dev, uint32_t gem_handle) { - MESA_TRACE_FUNC(); - struct tu_queue *queue = container_of(vk_queue, struct tu_queue, vk); - uint32_t perf_pass_index = queue->device->perfcntrs_pass_cs ? 
- submit->perf_pass_index : ~0; - struct tu_queue_submit submit_req; - - if (unlikely(queue->device->physical_device->instance->debug_flags & - TU_DEBUG_LOG_SKIP_GMEM_OPS)) { - tu_dbg_log_gmem_load_store_skips(queue->device); - } - - pthread_mutex_lock(&queue->device->submit_mutex); - - VkResult ret = tu_queue_submit_create_locked(queue, submit, - submit->wait_count, submit->signal_count, - perf_pass_index, &submit_req); - - if (ret != VK_SUCCESS) { - pthread_mutex_unlock(&queue->device->submit_mutex); - return ret; - } - - /* note: assuming there won't be any very large semaphore counts */ - struct drm_msm_gem_submit_syncobj *in_syncobjs = submit_req.in_syncobjs; - struct drm_msm_gem_submit_syncobj *out_syncobjs = submit_req.out_syncobjs; - - uint32_t nr_in_syncobjs = 0, nr_out_syncobjs = 0; - - for (uint32_t i = 0; i < submit->wait_count; i++) { - struct vk_sync *sync = submit->waits[i].sync; - - in_syncobjs[nr_in_syncobjs++] = (struct drm_msm_gem_submit_syncobj) { - .handle = tu_syncobj_from_vk_sync(sync), - .flags = 0, - }; - } - - for (uint32_t i = 0; i < submit->signal_count; i++) { - struct vk_sync *sync = submit->signals[i].sync; - - out_syncobjs[nr_out_syncobjs++] = (struct drm_msm_gem_submit_syncobj) { - .handle = tu_syncobj_from_vk_sync(sync), - .flags = 0, - }; - } - - ret = tu_queue_submit_locked(queue, &submit_req); - - pthread_mutex_unlock(&queue->device->submit_mutex); - tu_queue_submit_finish(queue, &submit_req); - - if (ret != VK_SUCCESS) - return ret; - - u_trace_context_process(&queue->device->trace_context, true); - - return VK_SUCCESS; + return tu_gem_info(dev, gem_handle, MSM_INFO_GET_IOVA); } diff --git a/lib/mesa/src/freedreno/vulkan/tu_formats.c b/lib/mesa/src/freedreno/vulkan/tu_formats.c index 75a3ce74e..537b59d25 100644 --- a/lib/mesa/src/freedreno/vulkan/tu_formats.c +++ b/lib/mesa/src/freedreno/vulkan/tu_formats.c @@ -1,395 +1,661 @@ + /* * Copyright © 2016 Red Hat. * Copyright © 2016 Bas Nieuwenhuizen - * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. */ -#include "tu_formats.h" +#include "tu_private.h" -#include "fdl/fd6_format_table.h" +#include "registers/adreno_common.xml.h" +#include "registers/a6xx.xml.h" +#include "util/format_r11g11b10f.h" +#include "util/format_srgb.h" +#include "util/u_half.h" +#include "vk_format.h" #include "vk_util.h" -#include "drm-uapi/drm_fourcc.h" -#include "tu_device.h" -#include "tu_image.h" +/** + * Declare a format table. 
A format table is an array of tu_native_format. + * It can map a consecutive range of VkFormat to the corresponding + * tu_native_format. + * + * TU_FORMAT_TABLE_FIRST and TU_FORMAT_TABLE_LAST must already be defined and + * have the values of the first and last VkFormat of the array respectively. + */ +#define TU_FORMAT_TABLE(var) \ + static const VkFormat var##_first = TU_FORMAT_TABLE_FIRST; \ + static const VkFormat var##_last = TU_FORMAT_TABLE_LAST; \ + static const struct tu_native_format var[TU_FORMAT_TABLE_LAST - TU_FORMAT_TABLE_FIRST + 1] +#undef TU_FORMAT_TABLE_FIRST +#undef TU_FORMAT_TABLE_LAST + +#define VFMT6_x -1 +#define TFMT6_x -1 +#define RB6_x -1 + +#define TU6_FMT(vkfmt, vtxfmt, texfmt, rbfmt, swapfmt, valid) \ + [VK_FORMAT_##vkfmt - TU_FORMAT_TABLE_FIRST] = { \ + .vtx = VFMT6_##vtxfmt, \ + .tex = TFMT6_##texfmt, \ + .rb = RB6_##rbfmt, \ + .swap = swapfmt, \ + .present = valid, \ + } -struct tu_native_format -tu6_format_vtx(VkFormat vk_format) -{ - enum pipe_format format = vk_format_to_pipe_format(vk_format); - struct tu_native_format fmt = { - .fmt = fd6_vertex_format(format), - .swap = fd6_vertex_swap(format), - }; - assert(fmt.fmt != FMT6_NONE); - return fmt; -} +/** + * fmt/alias/swap are derived from VkFormat mechanically (and might not even + * exist). It is the macro of choice that decides whether a VkFormat is + * supported and how. + */ +#define TU6_VTC(vk, fmt, alias, swap) TU6_FMT(vk, fmt, fmt, alias, swap, true) +#define TU6_xTC(vk, fmt, alias, swap) TU6_FMT(vk, x, fmt, alias, swap, true) +#define TU6_VTx(vk, fmt, alias, swap) TU6_FMT(vk, fmt, fmt, x, swap, true) +#define TU6_Vxx(vk, fmt, alias, swap) TU6_FMT(vk, fmt, x, x, swap, true) +#define TU6_xTx(vk, fmt, alias, swap) TU6_FMT(vk, x, fmt, x, swap, true) +#define TU6_xxx(vk, fmt, alias, swap) TU6_FMT(vk, x, x, x, WZYX, false) + +#define TU_FORMAT_TABLE_FIRST VK_FORMAT_UNDEFINED +#define TU_FORMAT_TABLE_LAST VK_FORMAT_ASTC_12x12_SRGB_BLOCK +TU_FORMAT_TABLE(tu6_format_table0) = { + TU6_xxx(UNDEFINED, x, x, x), /* 0 */ + + /* 8-bit packed */ + TU6_xxx(R4G4_UNORM_PACK8, 4_4_UNORM, R4G4_UNORM, WZXY), /* 1 */ + + /* 16-bit packed */ + TU6_xTC(R4G4B4A4_UNORM_PACK16, 4_4_4_4_UNORM, R4G4B4A4_UNORM, XYZW), /* 2 */ + TU6_xTC(B4G4R4A4_UNORM_PACK16, 4_4_4_4_UNORM, R4G4B4A4_UNORM, ZYXW), /* 3 */ + TU6_xTC(R5G6B5_UNORM_PACK16, 5_6_5_UNORM, R5G6B5_UNORM, WXYZ), /* 4 */ + TU6_xTC(B5G6R5_UNORM_PACK16, 5_6_5_UNORM, R5G6B5_UNORM, WXYZ), /* 5 */ + TU6_xxx(R5G5B5A1_UNORM_PACK16, 1_5_5_5_UNORM, A1R5G5B5_UNORM, XYZW), /* 6 */ + TU6_xxx(B5G5R5A1_UNORM_PACK16, 1_5_5_5_UNORM, A1R5G5B5_UNORM, XYZW), /* 7 */ + TU6_xTC(A1R5G5B5_UNORM_PACK16, 5_5_5_1_UNORM, R5G5B5A1_UNORM, WXYZ), /* 8 */ + + /* 8-bit R */ + TU6_VTC(R8_UNORM, 8_UNORM, R8_UNORM, WZYX), /* 9 */ + TU6_VTC(R8_SNORM, 8_SNORM, R8_SNORM, WZYX), /* 10 */ + TU6_Vxx(R8_USCALED, 8_UINT, R8_UINT, WZYX), /* 11 */ + TU6_Vxx(R8_SSCALED, 8_SINT, R8_SINT, WZYX), /* 12 */ + TU6_VTC(R8_UINT, 8_UINT, R8_UINT, WZYX), /* 13 */ + TU6_VTC(R8_SINT, 8_SINT, R8_SINT, WZYX), /* 14 */ + TU6_xTC(R8_SRGB, 8_UNORM, R8_UNORM, WZYX), /* 15 */ + + /* 16-bit RG */ + TU6_VTC(R8G8_UNORM, 8_8_UNORM, R8G8_UNORM, WZYX), /* 16 */ + TU6_VTC(R8G8_SNORM, 8_8_SNORM, R8G8_SNORM, WZYX), /* 17 */ + TU6_Vxx(R8G8_USCALED, 8_8_UINT, R8G8_UINT, WZYX), /* 18 */ + TU6_Vxx(R8G8_SSCALED, 8_8_SINT, R8G8_SINT, WZYX), /* 19 */ + TU6_VTC(R8G8_UINT, 8_8_UINT, R8G8_UINT, WZYX), /* 20 */ + TU6_VTC(R8G8_SINT, 8_8_SINT, R8G8_SINT, WZYX), /* 21 */ + TU6_xTC(R8G8_SRGB, 8_8_UNORM, R8G8_UNORM, WZYX), /* 22 */ + + /* 24-bit RGB */ + 
TU6_Vxx(R8G8B8_UNORM, 8_8_8_UNORM, R8G8B8_UNORM, WZYX), /* 23 */ + TU6_Vxx(R8G8B8_SNORM, 8_8_8_SNORM, R8G8B8_SNORM, WZYX), /* 24 */ + TU6_Vxx(R8G8B8_USCALED, 8_8_8_UINT, R8G8B8_UINT, WZYX), /* 25 */ + TU6_Vxx(R8G8B8_SSCALED, 8_8_8_SINT, R8G8B8_SINT, WZYX), /* 26 */ + TU6_Vxx(R8G8B8_UINT, 8_8_8_UINT, R8G8B8_UINT, WZYX), /* 27 */ + TU6_Vxx(R8G8B8_SINT, 8_8_8_SINT, R8G8B8_SINT, WZYX), /* 28 */ + TU6_xxx(R8G8B8_SRGB, 8_8_8_UNORM, R8G8B8_UNORM, WZYX), /* 29 */ + + /* 24-bit BGR */ + TU6_Vxx(B8G8R8_UNORM, 8_8_8_UNORM, R8G8B8_UNORM, WXYZ), /* 30 */ + TU6_Vxx(B8G8R8_SNORM, 8_8_8_SNORM, R8G8B8_SNORM, WXYZ), /* 31 */ + TU6_Vxx(B8G8R8_USCALED, 8_8_8_UINT, R8G8B8_UINT, WXYZ), /* 32 */ + TU6_Vxx(B8G8R8_SSCALED, 8_8_8_SINT, R8G8B8_SINT, WXYZ), /* 33 */ + TU6_Vxx(B8G8R8_UINT, 8_8_8_UINT, R8G8B8_UINT, WXYZ), /* 34 */ + TU6_Vxx(B8G8R8_SINT, 8_8_8_SINT, R8G8B8_SINT, WXYZ), /* 35 */ + TU6_xxx(B8G8R8_SRGB, 8_8_8_UNORM, R8G8B8_UNORM, WXYZ), /* 36 */ + + /* 32-bit RGBA */ + TU6_VTC(R8G8B8A8_UNORM, 8_8_8_8_UNORM, R8G8B8A8_UNORM, WZYX), /* 37 */ + TU6_VTC(R8G8B8A8_SNORM, 8_8_8_8_SNORM, R8G8B8A8_SNORM, WZYX), /* 38 */ + TU6_Vxx(R8G8B8A8_USCALED, 8_8_8_8_UINT, R8G8B8A8_UINT, WZYX), /* 39 */ + TU6_Vxx(R8G8B8A8_SSCALED, 8_8_8_8_SINT, R8G8B8A8_SINT, WZYX), /* 40 */ + TU6_VTC(R8G8B8A8_UINT, 8_8_8_8_UINT, R8G8B8A8_UINT, WZYX), /* 41 */ + TU6_VTC(R8G8B8A8_SINT, 8_8_8_8_SINT, R8G8B8A8_SINT, WZYX), /* 42 */ + TU6_xTC(R8G8B8A8_SRGB, 8_8_8_8_UNORM, R8G8B8A8_UNORM, WZYX), /* 43 */ + + /* 32-bit BGRA */ + TU6_VTC(B8G8R8A8_UNORM, 8_8_8_8_UNORM, R8G8B8A8_UNORM, WXYZ), /* 44 */ + TU6_VTC(B8G8R8A8_SNORM, 8_8_8_8_SNORM, R8G8B8A8_SNORM, WXYZ), /* 45 */ + TU6_Vxx(B8G8R8A8_USCALED, 8_8_8_8_UINT, R8G8B8A8_UINT, WXYZ), /* 46 */ + TU6_Vxx(B8G8R8A8_SSCALED, 8_8_8_8_SINT, R8G8B8A8_SINT, WXYZ), /* 47 */ + TU6_VTC(B8G8R8A8_UINT, 8_8_8_8_UINT, R8G8B8A8_UINT, WXYZ), /* 48 */ + TU6_VTC(B8G8R8A8_SINT, 8_8_8_8_SINT, R8G8B8A8_SINT, WXYZ), /* 49 */ + TU6_xTC(B8G8R8A8_SRGB, 8_8_8_8_UNORM, R8G8B8A8_UNORM, WXYZ), /* 50 */ + + /* 32-bit packed */ + TU6_VTC(A8B8G8R8_UNORM_PACK32, 8_8_8_8_UNORM, R8G8B8A8_UNORM, WZYX), /* 51 */ + TU6_VTC(A8B8G8R8_SNORM_PACK32, 8_8_8_8_SNORM, R8G8B8A8_SNORM, WZYX), /* 52 */ + TU6_Vxx(A8B8G8R8_USCALED_PACK32, 8_8_8_8_UINT, R8G8B8A8_UINT, WZYX), /* 53 */ + TU6_Vxx(A8B8G8R8_SSCALED_PACK32, 8_8_8_8_SINT, R8G8B8A8_SINT, WZYX), /* 54 */ + TU6_VTC(A8B8G8R8_UINT_PACK32, 8_8_8_8_UINT, R8G8B8A8_UINT, WZYX), /* 55 */ + TU6_VTC(A8B8G8R8_SINT_PACK32, 8_8_8_8_SINT, R8G8B8A8_SINT, WZYX), /* 56 */ + TU6_xTC(A8B8G8R8_SRGB_PACK32, 8_8_8_8_UNORM, R8G8B8A8_UNORM, WZYX), /* 57 */ + TU6_VTC(A2R10G10B10_UNORM_PACK32, 10_10_10_2_UNORM, R10G10B10A2_UNORM, WXYZ), /* 58 */ + TU6_Vxx(A2R10G10B10_SNORM_PACK32, 10_10_10_2_SNORM, R10G10B10A2_SNORM, WXYZ), /* 59 */ + TU6_Vxx(A2R10G10B10_USCALED_PACK32, 10_10_10_2_UINT, R10G10B10A2_UINT, WXYZ), /* 60 */ + TU6_Vxx(A2R10G10B10_SSCALED_PACK32, 10_10_10_2_SINT, R10G10B10A2_SINT, WXYZ), /* 61 */ + TU6_VTC(A2R10G10B10_UINT_PACK32, 10_10_10_2_UINT, R10G10B10A2_UINT, WXYZ), /* 62 */ + TU6_Vxx(A2R10G10B10_SINT_PACK32, 10_10_10_2_SINT, R10G10B10A2_SINT, WXYZ), /* 63 */ + TU6_VTC(A2B10G10R10_UNORM_PACK32, 10_10_10_2_UNORM, R10G10B10A2_UNORM, WZYX), /* 64 */ + TU6_Vxx(A2B10G10R10_SNORM_PACK32, 10_10_10_2_SNORM, R10G10B10A2_SNORM, WZYX), /* 65 */ + TU6_Vxx(A2B10G10R10_USCALED_PACK32, 10_10_10_2_UINT, R10G10B10A2_UINT, WZYX), /* 66 */ + TU6_Vxx(A2B10G10R10_SSCALED_PACK32, 10_10_10_2_SINT, R10G10B10A2_SINT, WZYX), /* 67 */ + TU6_VTC(A2B10G10R10_UINT_PACK32, 10_10_10_2_UINT, R10G10B10A2_UINT, WZYX), /* 68 */ + 
TU6_Vxx(A2B10G10R10_SINT_PACK32, 10_10_10_2_SINT, R10G10B10A2_SINT, WZYX), /* 69 */ + + /* 16-bit R */ + TU6_VTC(R16_UNORM, 16_UNORM, R16_UNORM, WZYX), /* 70 */ + TU6_VTC(R16_SNORM, 16_SNORM, R16_SNORM, WZYX), /* 71 */ + TU6_Vxx(R16_USCALED, 16_UINT, R16_UINT, WZYX), /* 72 */ + TU6_Vxx(R16_SSCALED, 16_SINT, R16_SINT, WZYX), /* 73 */ + TU6_VTC(R16_UINT, 16_UINT, R16_UINT, WZYX), /* 74 */ + TU6_VTC(R16_SINT, 16_SINT, R16_SINT, WZYX), /* 75 */ + TU6_VTC(R16_SFLOAT, 16_FLOAT, R16_FLOAT, WZYX), /* 76 */ + + /* 32-bit RG */ + TU6_VTC(R16G16_UNORM, 16_16_UNORM, R16G16_UNORM, WZYX), /* 77 */ + TU6_VTC(R16G16_SNORM, 16_16_SNORM, R16G16_SNORM, WZYX), /* 78 */ + TU6_VTx(R16G16_USCALED, 16_16_UINT, R16G16_UINT, WZYX), /* 79 */ + TU6_VTx(R16G16_SSCALED, 16_16_SINT, R16G16_SINT, WZYX), /* 80 */ + TU6_VTC(R16G16_UINT, 16_16_UINT, R16G16_UINT, WZYX), /* 81 */ + TU6_VTC(R16G16_SINT, 16_16_SINT, R16G16_SINT, WZYX), /* 82 */ + TU6_VTC(R16G16_SFLOAT, 16_16_FLOAT, R16G16_FLOAT, WZYX), /* 83 */ + + /* 48-bit RGB */ + TU6_Vxx(R16G16B16_UNORM, 16_16_16_UNORM, R16G16B16_UNORM, WZYX), /* 84 */ + TU6_Vxx(R16G16B16_SNORM, 16_16_16_SNORM, R16G16B16_SNORM, WZYX), /* 85 */ + TU6_Vxx(R16G16B16_USCALED, 16_16_16_UINT, R16G16B16_UINT, WZYX), /* 86 */ + TU6_Vxx(R16G16B16_SSCALED, 16_16_16_SINT, R16G16B16_SINT, WZYX), /* 87 */ + TU6_Vxx(R16G16B16_UINT, 16_16_16_UINT, R16G16B16_UINT, WZYX), /* 88 */ + TU6_Vxx(R16G16B16_SINT, 16_16_16_SINT, R16G16B16_SINT, WZYX), /* 89 */ + TU6_Vxx(R16G16B16_SFLOAT, 16_16_16_FLOAT, R16G16B16_FLOAT, WZYX), /* 90 */ + + /* 64-bit RGBA */ + TU6_VTC(R16G16B16A16_UNORM, 16_16_16_16_UNORM, R16G16B16A16_UNORM, WZYX), /* 91 */ + TU6_VTC(R16G16B16A16_SNORM, 16_16_16_16_SNORM, R16G16B16A16_SNORM, WZYX), /* 92 */ + TU6_VTx(R16G16B16A16_USCALED, 16_16_16_16_UINT, R16G16B16A16_UINT, WZYX), /* 93 */ + TU6_VTx(R16G16B16A16_SSCALED, 16_16_16_16_SINT, R16G16B16A16_SINT, WZYX), /* 94 */ + TU6_VTC(R16G16B16A16_UINT, 16_16_16_16_UINT, R16G16B16A16_UINT, WZYX), /* 95 */ + TU6_VTC(R16G16B16A16_SINT, 16_16_16_16_SINT, R16G16B16A16_SINT, WZYX), /* 96 */ + TU6_VTC(R16G16B16A16_SFLOAT, 16_16_16_16_FLOAT, R16G16B16A16_FLOAT, WZYX), /* 97 */ + + /* 32-bit R */ + TU6_VTC(R32_UINT, 32_UINT, R32_UINT, WZYX), /* 98 */ + TU6_VTC(R32_SINT, 32_SINT, R32_SINT, WZYX), /* 99 */ + TU6_VTC(R32_SFLOAT, 32_FLOAT, R32_FLOAT, WZYX), /* 100 */ + + /* 64-bit RG */ + TU6_VTC(R32G32_UINT, 32_32_UINT, R32G32_UINT, WZYX), /* 101 */ + TU6_VTC(R32G32_SINT, 32_32_SINT, R32G32_SINT, WZYX), /* 102 */ + TU6_VTC(R32G32_SFLOAT, 32_32_FLOAT, R32G32_FLOAT, WZYX), /* 103 */ + + /* 96-bit RGB */ + TU6_VTx(R32G32B32_UINT, 32_32_32_UINT, R32G32B32_UINT, WZYX), /* 104 */ + TU6_VTx(R32G32B32_SINT, 32_32_32_SINT, R32G32B32_SINT, WZYX), /* 105 */ + TU6_VTx(R32G32B32_SFLOAT, 32_32_32_FLOAT, R32G32B32_FLOAT, WZYX), /* 106 */ + + /* 128-bit RGBA */ + TU6_VTC(R32G32B32A32_UINT, 32_32_32_32_UINT, R32G32B32A32_UINT, WZYX), /* 107 */ + TU6_VTC(R32G32B32A32_SINT, 32_32_32_32_SINT, R32G32B32A32_SINT, WZYX), /* 108 */ + TU6_VTC(R32G32B32A32_SFLOAT, 32_32_32_32_FLOAT, R32G32B32A32_FLOAT, WZYX), /* 109 */ + + /* 64-bit R */ + TU6_xxx(R64_UINT, 64_UINT, R64_UINT, WZYX), /* 110 */ + TU6_xxx(R64_SINT, 64_SINT, R64_SINT, WZYX), /* 111 */ + TU6_xxx(R64_SFLOAT, 64_FLOAT, R64_FLOAT, WZYX), /* 112 */ + + /* 128-bit RG */ + TU6_xxx(R64G64_UINT, 64_64_UINT, R64G64_UINT, WZYX), /* 113 */ + TU6_xxx(R64G64_SINT, 64_64_SINT, R64G64_SINT, WZYX), /* 114 */ + TU6_xxx(R64G64_SFLOAT, 64_64_FLOAT, R64G64_FLOAT, WZYX), /* 115 */ + + /* 192-bit RGB */ + TU6_xxx(R64G64B64_UINT, 64_64_64_UINT, 
R64G64B64_UINT, WZYX), /* 116 */ + TU6_xxx(R64G64B64_SINT, 64_64_64_SINT, R64G64B64_SINT, WZYX), /* 117 */ + TU6_xxx(R64G64B64_SFLOAT, 64_64_64_FLOAT, R64G64B64_FLOAT, WZYX), /* 118 */ + + /* 256-bit RGBA */ + TU6_xxx(R64G64B64A64_UINT, 64_64_64_64_UINT, R64G64B64A64_UINT, WZYX), /* 119 */ + TU6_xxx(R64G64B64A64_SINT, 64_64_64_64_SINT, R64G64B64A64_SINT, WZYX), /* 120 */ + TU6_xxx(R64G64B64A64_SFLOAT, 64_64_64_64_FLOAT, R64G64B64A64_FLOAT, WZYX), /* 121 */ + + /* 32-bit packed float */ + TU6_VTC(B10G11R11_UFLOAT_PACK32, 11_11_10_FLOAT, R11G11B10_FLOAT, WZYX), /* 122 */ + TU6_xTx(E5B9G9R9_UFLOAT_PACK32, 9_9_9_E5_FLOAT, R9G9B9E5_FLOAT, WZYX), /* 123 */ + + /* depth/stencil */ + TU6_xTC(D16_UNORM, 16_UNORM, R16_UNORM, WZYX), /* 124 */ + TU6_xTC(X8_D24_UNORM_PACK32, X8Z24_UNORM, X8Z24_UNORM, WZYX), /* 125 */ + TU6_xTC(D32_SFLOAT, 32_FLOAT, R32_FLOAT, WZYX), /* 126 */ + TU6_xTC(S8_UINT, 8_UINT, R8_UNORM, WZYX), /* 127 */ + TU6_xxx(D16_UNORM_S8_UINT, X8Z16_UNORM, X8Z16_UNORM, WZYX), /* 128 */ + TU6_xTC(D24_UNORM_S8_UINT, X8Z24_UNORM, X8Z24_UNORM, WZYX), /* 129 */ + TU6_xTC(D32_SFLOAT_S8_UINT, 32_FLOAT, R32_FLOAT, WZYX), /* 130 */ + + /* compressed */ + TU6_xTx(BC1_RGB_UNORM_BLOCK, DXT1, DXT1, WZYX), /* 131 */ + TU6_xTx(BC1_RGB_SRGB_BLOCK, DXT1, DXT1, WZYX), /* 132 */ + TU6_xTx(BC1_RGBA_UNORM_BLOCK, DXT1, DXT1, WZYX), /* 133 */ + TU6_xTx(BC1_RGBA_SRGB_BLOCK, DXT1, DXT1, WZYX), /* 134 */ + TU6_xTx(BC2_UNORM_BLOCK, DXT3, DXT3, WZYX), /* 135 */ + TU6_xTx(BC2_SRGB_BLOCK, DXT3, DXT3, WZYX), /* 136 */ + TU6_xTx(BC3_UNORM_BLOCK, DXT5, DXT5, WZYX), /* 137 */ + TU6_xTx(BC3_SRGB_BLOCK, DXT5, DXT5, WZYX), /* 138 */ + TU6_xTx(BC4_UNORM_BLOCK, RGTC1_UNORM, RGTC1_UNORM, WZYX), /* 139 */ + TU6_xTx(BC4_SNORM_BLOCK, RGTC1_SNORM, RGTC1_SNORM, WZYX), /* 140 */ + TU6_xTx(BC5_UNORM_BLOCK, RGTC2_UNORM, RGTC2_UNORM, WZYX), /* 141 */ + TU6_xTx(BC5_SNORM_BLOCK, RGTC2_SNORM, RGTC2_SNORM, WZYX), /* 142 */ + TU6_xTx(BC6H_UFLOAT_BLOCK, BPTC_UFLOAT, BPTC_UFLOAT, WZYX), /* 143 */ + TU6_xTx(BC6H_SFLOAT_BLOCK, BPTC_FLOAT, BPTC_FLOAT, WZYX), /* 144 */ + TU6_xTx(BC7_UNORM_BLOCK, BPTC, BPTC, WZYX), /* 145 */ + TU6_xTx(BC7_SRGB_BLOCK, BPTC, BPTC, WZYX), /* 146 */ + TU6_xTx(ETC2_R8G8B8_UNORM_BLOCK, ETC2_RGB8, ETC2_RGB8, WZYX), /* 147 */ + TU6_xTx(ETC2_R8G8B8_SRGB_BLOCK, ETC2_RGB8, ETC2_RGB8, WZYX), /* 148 */ + TU6_xTx(ETC2_R8G8B8A1_UNORM_BLOCK, ETC2_RGB8A1, ETC2_RGB8A1, WZYX), /* 149 */ + TU6_xTx(ETC2_R8G8B8A1_SRGB_BLOCK, ETC2_RGB8A1, ETC2_RGB8A1, WZYX), /* 150 */ + TU6_xTx(ETC2_R8G8B8A8_UNORM_BLOCK, ETC2_RGBA8, ETC2_RGBA8, WZYX), /* 151 */ + TU6_xTx(ETC2_R8G8B8A8_SRGB_BLOCK, ETC2_RGBA8, ETC2_RGBA8, WZYX), /* 152 */ + TU6_xTx(EAC_R11_UNORM_BLOCK, ETC2_R11_UNORM, ETC2_R11_UNORM, WZYX), /* 153 */ + TU6_xTx(EAC_R11_SNORM_BLOCK, ETC2_R11_SNORM, ETC2_R11_SNORM, WZYX), /* 154 */ + TU6_xTx(EAC_R11G11_UNORM_BLOCK, ETC2_RG11_UNORM, ETC2_RG11_UNORM, WZYX), /* 155 */ + TU6_xTx(EAC_R11G11_SNORM_BLOCK, ETC2_RG11_SNORM, ETC2_RG11_SNORM, WZYX), /* 156 */ + TU6_xTx(ASTC_4x4_UNORM_BLOCK, ASTC_4x4, ASTC_4x4, WZYX), /* 157 */ + TU6_xTx(ASTC_4x4_SRGB_BLOCK, ASTC_4x4, ASTC_4x4, WZYX), /* 158 */ + TU6_xTx(ASTC_5x4_UNORM_BLOCK, ASTC_5x4, ASTC_5x4, WZYX), /* 159 */ + TU6_xTx(ASTC_5x4_SRGB_BLOCK, ASTC_5x4, ASTC_5x4, WZYX), /* 160 */ + TU6_xTx(ASTC_5x5_UNORM_BLOCK, ASTC_5x5, ASTC_5x5, WZYX), /* 161 */ + TU6_xTx(ASTC_5x5_SRGB_BLOCK, ASTC_5x5, ASTC_5x5, WZYX), /* 162 */ + TU6_xTx(ASTC_6x5_UNORM_BLOCK, ASTC_6x5, ASTC_6x5, WZYX), /* 163 */ + TU6_xTx(ASTC_6x5_SRGB_BLOCK, ASTC_6x5, ASTC_6x5, WZYX), /* 164 */ + TU6_xTx(ASTC_6x6_UNORM_BLOCK, ASTC_6x6, ASTC_6x6, 
WZYX), /* 165 */ + TU6_xTx(ASTC_6x6_SRGB_BLOCK, ASTC_6x6, ASTC_6x6, WZYX), /* 166 */ + TU6_xTx(ASTC_8x5_UNORM_BLOCK, ASTC_8x5, ASTC_8x5, WZYX), /* 167 */ + TU6_xTx(ASTC_8x5_SRGB_BLOCK, ASTC_8x5, ASTC_8x5, WZYX), /* 168 */ + TU6_xTx(ASTC_8x6_UNORM_BLOCK, ASTC_8x6, ASTC_8x6, WZYX), /* 169 */ + TU6_xTx(ASTC_8x6_SRGB_BLOCK, ASTC_8x6, ASTC_8x6, WZYX), /* 170 */ + TU6_xTx(ASTC_8x8_UNORM_BLOCK, ASTC_8x8, ASTC_8x8, WZYX), /* 171 */ + TU6_xTx(ASTC_8x8_SRGB_BLOCK, ASTC_8x8, ASTC_8x8, WZYX), /* 172 */ + TU6_xTx(ASTC_10x5_UNORM_BLOCK, ASTC_10x5, ASTC_10x5, WZYX), /* 173 */ + TU6_xTx(ASTC_10x5_SRGB_BLOCK, ASTC_10x5, ASTC_10x5, WZYX), /* 174 */ + TU6_xTx(ASTC_10x6_UNORM_BLOCK, ASTC_10x6, ASTC_10x6, WZYX), /* 175 */ + TU6_xTx(ASTC_10x6_SRGB_BLOCK, ASTC_10x6, ASTC_10x6, WZYX), /* 176 */ + TU6_xTx(ASTC_10x8_UNORM_BLOCK, ASTC_10x8, ASTC_10x8, WZYX), /* 177 */ + TU6_xTx(ASTC_10x8_SRGB_BLOCK, ASTC_10x8, ASTC_10x8, WZYX), /* 178 */ + TU6_xTx(ASTC_10x10_UNORM_BLOCK, ASTC_10x10, ASTC_10x10, WZYX), /* 179 */ + TU6_xTx(ASTC_10x10_SRGB_BLOCK, ASTC_10x10, ASTC_10x10, WZYX), /* 180 */ + TU6_xTx(ASTC_12x10_UNORM_BLOCK, ASTC_12x10, ASTC_12x10, WZYX), /* 181 */ + TU6_xTx(ASTC_12x10_SRGB_BLOCK, ASTC_12x10, ASTC_12x10, WZYX), /* 182 */ + TU6_xTx(ASTC_12x12_UNORM_BLOCK, ASTC_12x12, ASTC_12x12, WZYX), /* 183 */ + TU6_xTx(ASTC_12x12_SRGB_BLOCK, ASTC_12x12, ASTC_12x12, WZYX), /* 184 */ +}; +#undef TU_FORMAT_TABLE_FIRST +#undef TU_FORMAT_TABLE_LAST -bool -tu6_format_vtx_supported(VkFormat vk_format) +const struct tu_native_format * +tu6_get_native_format(VkFormat format) { - enum pipe_format format = vk_format_to_pipe_format(vk_format); - return fd6_vertex_format(format) != FMT6_NONE; + const struct tu_native_format *fmt = NULL; + + if (format >= tu6_format_table0_first && format <= tu6_format_table0_last) + fmt = &tu6_format_table0[format - tu6_format_table0_first]; + + return (fmt && fmt->present) ? fmt : NULL; } -/* Map non-colorspace-converted YUV formats to RGB pipe formats where we can, - * since our hardware doesn't support colorspace conversion. - * - * Really, we should probably be returning the RGB formats in - * vk_format_to_pipe_format, but we don't have all the equivalent pipe formats - * for VK RGB formats yet, and we'd have to switch all consumers of that - * function at once. 
- */ -enum pipe_format -tu_vk_format_to_pipe_format(VkFormat vk_format) +enum a6xx_2d_ifmt +tu6_rb_fmt_to_ifmt(enum a6xx_color_fmt fmt) { - switch (vk_format) { - case VK_FORMAT_G8B8G8R8_422_UNORM: /* YUYV */ - return PIPE_FORMAT_R8G8_R8B8_UNORM; - case VK_FORMAT_B8G8R8G8_422_UNORM: /* UYVY */ - return PIPE_FORMAT_G8R8_B8R8_UNORM; - case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM: - return PIPE_FORMAT_G8_B8R8_420_UNORM; - case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM: - return PIPE_FORMAT_G8_B8_R8_420_UNORM; + switch (fmt) { + case RB6_A8_UNORM: + case RB6_R8_UNORM: + case RB6_R8_SNORM: + case RB6_R8G8_UNORM: + case RB6_R8G8_SNORM: + case RB6_R8G8B8A8_UNORM: + case RB6_R8G8B8_UNORM: + case RB6_R8G8B8A8_SNORM: + return R2D_UNORM8; + + case RB6_R32_UINT: + case RB6_R32_SINT: + case RB6_R32G32_UINT: + case RB6_R32G32_SINT: + case RB6_R32G32B32A32_UINT: + case RB6_R32G32B32A32_SINT: + return R2D_INT32; + + case RB6_R16_UINT: + case RB6_R16_SINT: + case RB6_R16G16_UINT: + case RB6_R16G16_SINT: + case RB6_R16G16B16A16_UINT: + case RB6_R16G16B16A16_SINT: + return R2D_INT16; + + case RB6_R8_UINT: + case RB6_R8_SINT: + case RB6_R8G8_UINT: + case RB6_R8G8_SINT: + case RB6_R8G8B8A8_UINT: + case RB6_R8G8B8A8_SINT: + return R2D_INT8; + + case RB6_R16_UNORM: + case RB6_R16_SNORM: + case RB6_R16G16_UNORM: + case RB6_R16G16_SNORM: + case RB6_R16G16B16A16_UNORM: + case RB6_R16G16B16A16_SNORM: + case RB6_R32_FLOAT: + case RB6_R32G32_FLOAT: + case RB6_R32G32B32A32_FLOAT: + return R2D_FLOAT32; + + case RB6_R16_FLOAT: + case RB6_R16G16_FLOAT: + case RB6_R16G16B16A16_FLOAT: + return R2D_FLOAT16; + + case RB6_R4G4B4A4_UNORM: + case RB6_R5G5B5A1_UNORM: + case RB6_R5G6B5_UNORM: + case RB6_R10G10B10A2_UNORM: + case RB6_R10G10B10A2_UINT: + case RB6_R11G11B10_FLOAT: + case RB6_X8Z24_UNORM: + // ??? 
+ return 0; default: - return vk_format_to_pipe_format(vk_format); + unreachable("bad format"); + return 0; } } -static struct tu_native_format -tu6_format_color_unchecked(enum pipe_format format, enum a6xx_tile_mode tile_mode) +static uint32_t +tu_pack_mask(int bits) { - struct tu_native_format fmt = { - .fmt = fd6_color_format(format, tile_mode), - .swap = fd6_color_swap(format, tile_mode), - }; - - switch (format) { - case PIPE_FORMAT_Z24X8_UNORM: - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - fmt.fmt = FMT6_8_8_8_8_UNORM; - break; - - default: - break; - } - - return fmt; + assert(bits <= 32); + return (1ull << bits) - 1; } -bool -tu6_format_color_supported(enum pipe_format format) +static uint32_t +tu_pack_float32_for_unorm(float val, int bits) { - return tu6_format_color_unchecked(format, TILE6_LINEAR).fmt != FMT6_NONE; + const uint32_t max = tu_pack_mask(bits); + if (val < 0.0f) + return 0; + else if (val > 1.0f) + return max; + else + return _mesa_lroundevenf(val * (float) max); } -struct tu_native_format -tu6_format_color(enum pipe_format format, enum a6xx_tile_mode tile_mode) +static uint32_t +tu_pack_float32_for_snorm(float val, int bits) { - struct tu_native_format fmt = tu6_format_color_unchecked(format, tile_mode); - assert(fmt.fmt != FMT6_NONE); - return fmt; + const int32_t max = tu_pack_mask(bits - 1); + int32_t tmp; + if (val < -1.0f) + tmp = -max; + else if (val > 1.0f) + tmp = max; + else + tmp = _mesa_lroundevenf(val * (float) max); + + return tmp & tu_pack_mask(bits); } -static struct tu_native_format -tu6_format_texture_unchecked(enum pipe_format format, enum a6xx_tile_mode tile_mode) +static uint32_t +tu_pack_float32_for_uscaled(float val, int bits) { - struct tu_native_format fmt = { - .fmt = fd6_texture_format(format, tile_mode), - .swap = fd6_texture_swap(format, tile_mode), - }; - - switch (format) { - case PIPE_FORMAT_Z24X8_UNORM: - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - /* freedreno uses Z24_UNORM_S8_UINT (sampling) or - * FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 (blits) for this format, while we use - * FMT6_8_8_8_8_UNORM or FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 - */ - fmt.fmt = FMT6_8_8_8_8_UNORM; - break; + const uint32_t max = tu_pack_mask(bits); + if (val < 0.0f) + return 0; + else if (val > (float) max) + return max; + else + return (uint32_t) val; +} - default: - break; - } +static uint32_t +tu_pack_float32_for_sscaled(float val, int bits) +{ + const int32_t max = tu_pack_mask(bits - 1); + const int32_t min = -max - 1; + int32_t tmp; + if (val < (float) min) + tmp = min; + else if (val > (float) max) + tmp = max; + else + tmp = (int32_t) val; + + return tmp & tu_pack_mask(bits); +} - return fmt; +static uint32_t +tu_pack_uint32_for_uint(uint32_t val, int bits) +{ + return val & tu_pack_mask(bits); } -struct tu_native_format -tu6_format_texture(enum pipe_format format, enum a6xx_tile_mode tile_mode) +static uint32_t +tu_pack_int32_for_sint(int32_t val, int bits) { - struct tu_native_format fmt = tu6_format_texture_unchecked(format, tile_mode); - assert(fmt.fmt != FMT6_NONE); - return fmt; + return val & tu_pack_mask(bits); } -bool -tu6_format_texture_supported(enum pipe_format format) +static uint32_t +tu_pack_float32_for_sfloat(float val, int bits) { - return tu6_format_texture_unchecked(format, TILE6_LINEAR).fmt != FMT6_NONE; + assert(bits == 16 || bits == 32); + return bits == 16 ? 
util_float_to_half(val) : fui(val); } -enum tu6_ubwc_compat_type { - TU6_UBWC_UNKNOWN_COMPAT, - TU6_UBWC_R8G8_UNORM, - TU6_UBWC_R8G8_INT, - TU6_UBWC_R8G8B8A8_UNORM, - TU6_UBWC_R8G8B8A8_INT, - TU6_UBWC_B8G8R8A8_UNORM, - TU6_UBWC_R16G16_INT, - TU6_UBWC_R16G16B16A16_INT, - TU6_UBWC_R32_INT, - TU6_UBWC_R32G32_INT, - TU6_UBWC_R32G32B32A32_INT, - TU6_UBWC_R32_FLOAT, +union tu_clear_component_value { + float float32; + int32_t int32; + uint32_t uint32; }; -static enum tu6_ubwc_compat_type -tu6_ubwc_compat_mode(VkFormat format) +static uint32_t +tu_pack_clear_component_value(union tu_clear_component_value val, + const struct vk_format_channel_description *ch) { - switch (format) { - case VK_FORMAT_R8G8_UNORM: - case VK_FORMAT_R8G8_SRGB: - return TU6_UBWC_R8G8_UNORM; - - case VK_FORMAT_R8G8_UINT: - case VK_FORMAT_R8G8_SINT: - return TU6_UBWC_R8G8_INT; - - case VK_FORMAT_R8G8B8A8_UNORM: - case VK_FORMAT_R8G8B8A8_SRGB: - case VK_FORMAT_A8B8G8R8_UNORM_PACK32: - case VK_FORMAT_A8B8G8R8_SRGB_PACK32: - return TU6_UBWC_R8G8B8A8_UNORM; - - case VK_FORMAT_R8G8B8A8_UINT: - case VK_FORMAT_R8G8B8A8_SINT: - case VK_FORMAT_A8B8G8R8_UINT_PACK32: - case VK_FORMAT_A8B8G8R8_SINT_PACK32: - return TU6_UBWC_R8G8B8A8_INT; - - case VK_FORMAT_R16G16_UINT: - case VK_FORMAT_R16G16_SINT: - return TU6_UBWC_R16G16_INT; - - case VK_FORMAT_R16G16B16A16_UINT: - case VK_FORMAT_R16G16B16A16_SINT: - return TU6_UBWC_R16G16B16A16_INT; - - case VK_FORMAT_R32_UINT: - case VK_FORMAT_R32_SINT: - return TU6_UBWC_R32_INT; - - case VK_FORMAT_R32G32_UINT: - case VK_FORMAT_R32G32_SINT: - return TU6_UBWC_R32G32_INT; - - case VK_FORMAT_R32G32B32A32_UINT: - case VK_FORMAT_R32G32B32A32_SINT: - return TU6_UBWC_R32G32B32A32_INT; - - case VK_FORMAT_D32_SFLOAT: - case VK_FORMAT_R32_SFLOAT: - /* TODO: a630 blob allows these, but not a660. When is it legal? */ - return TU6_UBWC_UNKNOWN_COMPAT; - - case VK_FORMAT_B8G8R8A8_UNORM: - case VK_FORMAT_B8G8R8A8_SRGB: - /* The blob doesn't list these as compatible, but they surely are. - * freedreno's happy to cast between them, and zink would really like - * to. 
- */ - return TU6_UBWC_B8G8R8A8_UNORM; - + uint32_t packed; + + switch (ch->type) { + case VK_FORMAT_TYPE_UNSIGNED: + /* normalized, scaled, or pure integer */ + assert(ch->normalized + ch->scaled + ch->pure_integer == 1); + if (ch->normalized) + packed = tu_pack_float32_for_unorm(val.float32, ch->size); + else if (ch->scaled) + packed = tu_pack_float32_for_uscaled(val.float32, ch->size); + else + packed = tu_pack_uint32_for_uint(val.uint32, ch->size); + break; + case VK_FORMAT_TYPE_SIGNED: + /* normalized, scaled, or pure integer */ + assert(ch->normalized + ch->scaled + ch->pure_integer == 1); + if (ch->normalized) + packed = tu_pack_float32_for_snorm(val.float32, ch->size); + else if (ch->scaled) + packed = tu_pack_float32_for_sscaled(val.float32, ch->size); + else + packed = tu_pack_int32_for_sint(val.int32, ch->size); + break; + case VK_FORMAT_TYPE_FLOAT: + packed = tu_pack_float32_for_sfloat(val.float32, ch->size); + break; default: - return TU6_UBWC_UNKNOWN_COMPAT; + unreachable("unexpected channel type"); + packed = 0; + break; } + + assert((packed & tu_pack_mask(ch->size)) == packed); + return packed; } -bool -tu6_mutable_format_list_ubwc_compatible(const VkImageFormatListCreateInfo *fmt_list) +static const struct vk_format_channel_description * +tu_get_format_channel_description(const struct vk_format_description *desc, + int comp) { - if (!fmt_list || !fmt_list->viewFormatCount) - return false; - - /* We're only looking at format list cross compatibility here, check - * ubwc_possible() for the base "is the format UBWC-able at all?" - */ - if (fmt_list->viewFormatCount == 1) - return true; - - enum tu6_ubwc_compat_type type = - tu6_ubwc_compat_mode(fmt_list->pViewFormats[0]); - if (type == TU6_UBWC_UNKNOWN_COMPAT) - return false; - - for (uint32_t i = 1; i < fmt_list->viewFormatCount; i++) { - if (tu6_ubwc_compat_mode(fmt_list->pViewFormats[i]) != type) - return false; + switch (desc->swizzle[comp]) { + case VK_SWIZZLE_X: + return &desc->channel[0]; + case VK_SWIZZLE_Y: + return &desc->channel[1]; + case VK_SWIZZLE_Z: + return &desc->channel[2]; + case VK_SWIZZLE_W: + return &desc->channel[3]; + default: + return NULL; } - - return true; } -static void -tu_physical_device_get_format_properties( - struct tu_physical_device *physical_device, - VkFormat vk_format, - VkFormatProperties3 *out_properties) +static union tu_clear_component_value +tu_get_clear_component_value(const VkClearValue *val, int comp, bool color) { - VkFormatFeatureFlags2 linear = 0, optimal = 0, buffer = 0; - enum pipe_format format = tu_vk_format_to_pipe_format(vk_format); - const struct util_format_description *desc = util_format_description(format); - - bool supported_vtx = tu6_format_vtx_supported(vk_format); - bool supported_color = tu6_format_color_supported(format); - bool supported_tex = tu6_format_texture_supported(format); - bool is_npot = !util_is_power_of_two_or_zero(desc->block.bits); - - if (format == PIPE_FORMAT_NONE || - !(supported_vtx || supported_color || supported_tex)) { - goto end; + union tu_clear_component_value tmp; + if (color) { + assert(comp < 4); + tmp.uint32 = val->color.uint32[comp]; + } else { + assert(comp < 2); + if (comp == 0) + tmp.float32 = val->depthStencil.depth; + else + tmp.uint32 = val->depthStencil.stencil; } - /* We don't support BufferToImage/ImageToBuffer for npot formats */ - if (!is_npot) - buffer |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT | VK_FORMAT_FEATURE_TRANSFER_DST_BIT; + return tmp; +} - if (supported_vtx) - buffer |= VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT; +/** + 
* Pack a VkClearValue into a 128-bit buffer. \a format is respected except + * for the component order. The components are always packed in WZYX order + * (i.e., msb is white and lsb is red). + * + * Return the number of uint32_t's used. + */ +int +tu_pack_clear_value(const VkClearValue *val, VkFormat format, uint32_t buf[4]) +{ + const struct vk_format_description *desc = vk_format_description(format); + assert(desc && desc->layout == VK_FORMAT_LAYOUT_PLAIN); + + /* S8_UINT is special and has no depth */ + const int max_components = + format == VK_FORMAT_S8_UINT ? 2 : desc->nr_channels; + + int buf_offset = 0; + int bit_shift = 0; + for (int comp = 0; comp < max_components; comp++) { + const struct vk_format_channel_description *ch = + tu_get_format_channel_description(desc, comp); + if (!ch) { + assert(format == VK_FORMAT_S8_UINT && comp == 0); + continue; + } - if (supported_tex) - buffer |= VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT; + union tu_clear_component_value v = tu_get_clear_component_value( + val, comp, desc->colorspace != VK_FORMAT_COLORSPACE_ZS); - /* Don't support anything but texel buffers for non-power-of-two formats - * with 3 components. We'd need several workarounds for copying and - * clearing them because they're not renderable. - */ - if (supported_tex && !is_npot) { - optimal |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT | - VK_FORMAT_FEATURE_TRANSFER_DST_BIT | - VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT | - VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT | - VK_FORMAT_FEATURE_COSITED_CHROMA_SAMPLES_BIT | - VK_FORMAT_FEATURE_MIDPOINT_CHROMA_SAMPLES_BIT; - - /* no blit src bit for YUYV/NV12/I420 formats */ - if (desc->layout != UTIL_FORMAT_LAYOUT_SUBSAMPLED && - desc->layout != UTIL_FORMAT_LAYOUT_PLANAR2 && - desc->layout != UTIL_FORMAT_LAYOUT_PLANAR3) - optimal |= VK_FORMAT_FEATURE_BLIT_SRC_BIT; - - if (desc->layout != UTIL_FORMAT_LAYOUT_SUBSAMPLED) - optimal |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_YCBCR_CONVERSION_LINEAR_FILTER_BIT; - - if (!vk_format_is_int(vk_format)) { - optimal |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT; - - if (physical_device->vk.supported_extensions.EXT_filter_cubic) - optimal |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_CUBIC_BIT_EXT; + /* move to the next uint32_t when there is not enough space */ + assert(ch->size <= 32); + if (bit_shift + ch->size > 32) { + buf_offset++; + bit_shift = 0; } - } - if (supported_color) { - assert(supported_tex); - optimal |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT | - VK_FORMAT_FEATURE_BLIT_DST_BIT | - VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT | - VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT | - VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT; + if (bit_shift == 0) + buf[buf_offset] = 0; - buffer |= VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT | - VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT | - VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT; + buf[buf_offset] |= tu_pack_clear_component_value(v, ch) << bit_shift; + bit_shift += ch->size; + } - /* TODO: The blob also exposes these for R16G16_UINT/R16G16_SINT, but we - * don't have any tests for those. 
- */ - if (vk_format == VK_FORMAT_R32_UINT || vk_format == VK_FORMAT_R32_SINT) { - optimal |= VK_FORMAT_FEATURE_STORAGE_IMAGE_ATOMIC_BIT; - buffer |= VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_ATOMIC_BIT; - } + return buf_offset + 1; +} - if (!util_format_is_pure_integer(format)) - optimal |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT; +static void +tu_physical_device_get_format_properties( + struct tu_physical_device *physical_device, + VkFormat format, + VkFormatProperties *out_properties) +{ + VkFormatFeatureFlags linear = 0, tiled = 0, buffer = 0; + const struct vk_format_description *desc = vk_format_description(format); + const struct tu_native_format *native_fmt = tu6_get_native_format(format); + if (!desc || !native_fmt) { + out_properties->linearTilingFeatures = linear; + out_properties->optimalTilingFeatures = tiled; + out_properties->bufferFeatures = buffer; + return; } - /* For the most part, we can do anything with a linear image that we could - * do with a tiled image. However, we can't support sysmem rendering with a - * linear depth texture, because we don't know if there's a bit to control - * the tiling of the depth buffer in BYPASS mode, and the blob also - * disables linear depth rendering, so there's no way to discover it. We - * also can't force GMEM mode, because there are other situations where we - * have to use sysmem rendering. So follow the blob here, and only enable - * DEPTH_STENCIL_ATTACHMENT_BIT for the optimal features. - */ - linear = optimal; - if (tu6_pipe2depth(vk_format) != (enum a6xx_depth_format)~0) - optimal |= VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT; - - if (!tiling_possible(vk_format) && - /* We don't actually support tiling for this format, but we need to - * fake it as it's required by VK_KHR_sampler_ycbcr_conversion. - */ - vk_format != VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM) { - optimal = 0; - } + linear |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT | VK_FORMAT_FEATURE_TRANSFER_DST_BIT; + tiled |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT | VK_FORMAT_FEATURE_TRANSFER_DST_BIT; + buffer |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT | VK_FORMAT_FEATURE_TRANSFER_DST_BIT; - if (vk_format == VK_FORMAT_G8B8G8R8_422_UNORM || - vk_format == VK_FORMAT_B8G8R8G8_422_UNORM || - vk_format == VK_FORMAT_G8_B8R8_2PLANE_420_UNORM || - vk_format == VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM) { - /* Disable buffer texturing of subsampled (422) and planar YUV textures. - * The subsampling requirement comes from "If format is a block-compressed - * format, then bufferFeatures must not support any features for the - * format" plus the specification of subsampled as 2x1 compressed block - * format. I couldn't find the citation for planar, but 1D access of - * planar YUV would be really silly. - */ - buffer = 0; + if (native_fmt->tex >= 0) { + linear |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT; + tiled |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT; + buffer |= VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT; } - /* We don't support writing into VK__FORMAT_*_PACK16 images/buffers */ - if (desc->nr_channels > 2 && desc->block.bits == 16) { - buffer &= VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT; - linear &= ~(VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT | - VK_FORMAT_FEATURE_STORAGE_IMAGE_ATOMIC_BIT); - optimal &= ~(VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT | - VK_FORMAT_FEATURE_STORAGE_IMAGE_ATOMIC_BIT); + if (native_fmt->rb >= 0) { + linear |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT; + tiled |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT; } - /* All our depth formats support shadow comparisons. 
*/ - if (vk_format_has_depth(vk_format) && (optimal & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT)) { - optimal |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_DEPTH_COMPARISON_BIT; - linear |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_DEPTH_COMPARISON_BIT; + if (native_fmt->vtx >= 0) { + buffer |= VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT; } - /* From the Vulkan 1.3.205 spec, section 19.3 "43.3. Required Format Support": - * - * Mandatory format support: depth/stencil with VkImageType - * VK_IMAGE_TYPE_2D - * [...] - * bufferFeatures must not support any features for these formats - */ - if (vk_format_is_depth_or_stencil(vk_format)) - buffer = 0; - - /* D32_SFLOAT_S8_UINT is tiled as two images, so no linear format - * blob enables some linear features, but its not useful, so don't bother. - */ - if (vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) - linear = 0; - -end: out_properties->linearTilingFeatures = linear; - out_properties->optimalTilingFeatures = optimal; + out_properties->optimalTilingFeatures = tiled; out_properties->bufferFeatures = buffer; } -VKAPI_ATTR void VKAPI_CALL +void +tu_GetPhysicalDeviceFormatProperties(VkPhysicalDevice physicalDevice, + VkFormat format, + VkFormatProperties *pFormatProperties) +{ + TU_FROM_HANDLE(tu_physical_device, physical_device, physicalDevice); + + tu_physical_device_get_format_properties(physical_device, format, + pFormatProperties); +} + +void tu_GetPhysicalDeviceFormatProperties2( VkPhysicalDevice physicalDevice, VkFormat format, @@ -397,61 +663,18 @@ tu_GetPhysicalDeviceFormatProperties2( { TU_FROM_HANDLE(tu_physical_device, physical_device, physicalDevice); - VkFormatProperties3 local_props3; - VkFormatProperties3 *props3 = - vk_find_struct(pFormatProperties->pNext, FORMAT_PROPERTIES_3); - if (!props3) - props3 = &local_props3; - tu_physical_device_get_format_properties( - physical_device, format, props3); - - pFormatProperties->formatProperties = (VkFormatProperties) { - .linearTilingFeatures = props3->linearTilingFeatures, - .optimalTilingFeatures = props3->optimalTilingFeatures, - .bufferFeatures = props3->bufferFeatures, - }; - - VkDrmFormatModifierPropertiesListEXT *list = - vk_find_struct(pFormatProperties->pNext, DRM_FORMAT_MODIFIER_PROPERTIES_LIST_EXT); - if (list) { - VK_OUTARRAY_MAKE_TYPED(VkDrmFormatModifierPropertiesEXT, out, - list->pDrmFormatModifierProperties, - &list->drmFormatModifierCount); - - if (pFormatProperties->formatProperties.linearTilingFeatures) { - vk_outarray_append_typed(VkDrmFormatModifierPropertiesEXT, &out, mod_props) { - mod_props->drmFormatModifier = DRM_FORMAT_MOD_LINEAR; - mod_props->drmFormatModifierPlaneCount = tu6_plane_count(format); - mod_props->drmFormatModifierTilingFeatures = - pFormatProperties->formatProperties.linearTilingFeatures; - } - } - - /* note: ubwc_possible() argument values to be ignored except for format */ - if (pFormatProperties->formatProperties.optimalTilingFeatures && - tiling_possible(format) && - ubwc_possible(NULL, format, VK_IMAGE_TYPE_2D, 0, 0, - physical_device->info, VK_SAMPLE_COUNT_1_BIT, - false)) { - vk_outarray_append_typed(VkDrmFormatModifierPropertiesEXT, &out, mod_props) { - mod_props->drmFormatModifier = DRM_FORMAT_MOD_QCOM_COMPRESSED; - mod_props->drmFormatModifierPlaneCount = tu6_plane_count(format); - mod_props->drmFormatModifierTilingFeatures = - pFormatProperties->formatProperties.optimalTilingFeatures; - } - } - } + physical_device, format, &pFormatProperties->formatProperties); } static VkResult tu_get_image_format_properties( struct tu_physical_device *physical_device, const 
VkPhysicalDeviceImageFormatInfo2 *info, - VkImageFormatProperties *pImageFormatProperties, - VkFormatFeatureFlags *p_feature_flags) + VkImageFormatProperties *pImageFormatProperties) + { - VkFormatProperties3 format_props; + VkFormatProperties format_props; VkFormatFeatureFlags format_feature_flags; VkExtent3D maxExtent; uint32_t maxMipLevels; @@ -460,53 +683,12 @@ tu_get_image_format_properties( tu_physical_device_get_format_properties(physical_device, info->format, &format_props); - - switch (info->tiling) { - case VK_IMAGE_TILING_LINEAR: + if (info->tiling == VK_IMAGE_TILING_LINEAR) { format_feature_flags = format_props.linearTilingFeatures; - break; - - case VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT: { - const VkPhysicalDeviceImageDrmFormatModifierInfoEXT *drm_info = - vk_find_struct_const(info->pNext, PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT); - - switch (drm_info->drmFormatModifier) { - case DRM_FORMAT_MOD_QCOM_COMPRESSED: - /* falling back to linear/non-UBWC isn't possible with explicit modifier */ - - /* formats which don't support tiling */ - if (!format_props.optimalTilingFeatures || - !tiling_possible(info->format)) - return VK_ERROR_FORMAT_NOT_SUPPORTED; - - if (info->flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT) { - const VkImageFormatListCreateInfo *format_list = - vk_find_struct_const(info->pNext, - IMAGE_FORMAT_LIST_CREATE_INFO); - if (!tu6_mutable_format_list_ubwc_compatible(format_list)) - return VK_ERROR_FORMAT_NOT_SUPPORTED; - } - - if (!ubwc_possible(NULL, info->format, info->type, info->usage, - info->usage, physical_device->info, sampleCounts, - false)) { - return VK_ERROR_FORMAT_NOT_SUPPORTED; - } - - format_feature_flags = format_props.optimalTilingFeatures; - break; - case DRM_FORMAT_MOD_LINEAR: - format_feature_flags = format_props.linearTilingFeatures; - break; - default: - return VK_ERROR_FORMAT_NOT_SUPPORTED; - } - } break; - case VK_IMAGE_TILING_OPTIMAL: + } else if (info->tiling == VK_IMAGE_TILING_OPTIMAL) { format_feature_flags = format_props.optimalTilingFeatures; - break; - default: - unreachable("bad VkPhysicalDeviceImageFormatInfo2"); + } else { + unreachable("bad VkImageTiling"); } if (format_feature_flags == 0) @@ -549,50 +731,29 @@ tu_get_image_format_properties( VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT)) && !(info->flags & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT) && !(info->usage & VK_IMAGE_USAGE_STORAGE_BIT)) { - sampleCounts |= VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT; - /* note: most operations support 8 samples (GMEM render/resolve do at least) - * but some do not (which ones?), just disable 8 samples completely, - * (no 8x msaa matches the blob driver behavior) - */ + sampleCounts |= VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT | + VK_SAMPLE_COUNT_8_BIT; } - /* From the Vulkan 1.3.206 spec: - * - * "VK_IMAGE_CREATE_EXTENDED_USAGE_BIT specifies that the image can be - * created with usage flags that are not supported for the format the image - * is created with but are supported for at least one format a VkImageView - * created from the image can have." - * - * This means we should relax checks that only depend on the - * format_feature_flags, to allow the user to create images that may be - * e.g. reinterpreted as storage when the original format doesn't allow it. - * The user will have to check against the format features anyway. - * Otherwise we'd unnecessarily disallow it. 
- */ - - VkImageUsageFlags image_usage = info->usage; - if (info->flags & VK_IMAGE_CREATE_EXTENDED_USAGE_BIT) - image_usage = 0; - - if (image_usage & VK_IMAGE_USAGE_SAMPLED_BIT) { + if (info->usage & VK_IMAGE_USAGE_SAMPLED_BIT) { if (!(format_feature_flags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT)) { goto unsupported; } } - if (image_usage & VK_IMAGE_USAGE_STORAGE_BIT) { + if (info->usage & VK_IMAGE_USAGE_STORAGE_BIT) { if (!(format_feature_flags & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT)) { goto unsupported; } } - if (image_usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) { + if (info->usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) { if (!(format_feature_flags & VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT)) { goto unsupported; } } - if (image_usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) { + if (info->usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) { if (!(format_feature_flags & VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT)) { goto unsupported; @@ -611,9 +772,6 @@ tu_get_image_format_properties( .maxResourceSize = UINT32_MAX, }; - if (p_feature_flags) - *p_feature_flags = format_feature_flags; - return VK_SUCCESS; unsupported: *pImageFormatProperties = (VkImageFormatProperties) { @@ -627,12 +785,38 @@ unsupported: return VK_ERROR_FORMAT_NOT_SUPPORTED; } +VkResult +tu_GetPhysicalDeviceImageFormatProperties( + VkPhysicalDevice physicalDevice, + VkFormat format, + VkImageType type, + VkImageTiling tiling, + VkImageUsageFlags usage, + VkImageCreateFlags createFlags, + VkImageFormatProperties *pImageFormatProperties) +{ + TU_FROM_HANDLE(tu_physical_device, physical_device, physicalDevice); + + const VkPhysicalDeviceImageFormatInfo2 info = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2, + .pNext = NULL, + .format = format, + .type = type, + .tiling = tiling, + .usage = usage, + .flags = createFlags, + }; + + return tu_get_image_format_properties(physical_device, &info, + pImageFormatProperties); +} + static VkResult tu_get_external_image_format_properties( const struct tu_physical_device *physical_device, const VkPhysicalDeviceImageFormatInfo2 *pImageFormatInfo, VkExternalMemoryHandleTypeFlagBits handleType, - VkExternalImageFormatProperties *external_properties) + VkExternalMemoryProperties *external_properties) { VkExternalMemoryFeatureFlagBits flags = 0; VkExternalMemoryHandleTypeFlags export_flags = 0; @@ -659,7 +843,7 @@ tu_get_external_image_format_properties( VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT; break; default: - return vk_errorf(physical_device, VK_ERROR_FORMAT_NOT_SUPPORTED, + return vk_errorf(physical_device->instance, VK_ERROR_FORMAT_NOT_SUPPORTED, "VkExternalMemoryTypeFlagBits(0x%x) unsupported for VkImageType(%d)", handleType, pImageFormatInfo->type); } @@ -669,24 +853,21 @@ tu_get_external_image_format_properties( compat_flags = VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT; break; default: - return vk_errorf(physical_device, VK_ERROR_FORMAT_NOT_SUPPORTED, + return vk_errorf(physical_device->instance, VK_ERROR_FORMAT_NOT_SUPPORTED, "VkExternalMemoryTypeFlagBits(0x%x) unsupported", handleType); } - if (external_properties) { - external_properties->externalMemoryProperties = - (VkExternalMemoryProperties) { - .externalMemoryFeatures = flags, - .exportFromImportedHandleTypes = export_flags, - .compatibleHandleTypes = compat_flags, - }; - } + *external_properties = (VkExternalMemoryProperties) { + .externalMemoryFeatures = flags, + .exportFromImportedHandleTypes = export_flags, + .compatibleHandleTypes = compat_flags, + }; return VK_SUCCESS; } 
-VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_GetPhysicalDeviceImageFormatProperties2( VkPhysicalDevice physicalDevice, const VkPhysicalDeviceImageFormatInfo2 *base_info, @@ -694,15 +875,11 @@ tu_GetPhysicalDeviceImageFormatProperties2( { TU_FROM_HANDLE(tu_physical_device, physical_device, physicalDevice); const VkPhysicalDeviceExternalImageFormatInfo *external_info = NULL; - const VkPhysicalDeviceImageViewImageFormatInfoEXT *image_view_info = NULL; VkExternalImageFormatProperties *external_props = NULL; - VkFilterCubicImageViewImageFormatPropertiesEXT *cubic_props = NULL; - VkFormatFeatureFlags format_feature_flags; - VkSamplerYcbcrConversionImageFormatProperties *ycbcr_props = NULL; VkResult result; - result = tu_get_image_format_properties(physical_device, - base_info, &base_props->imageFormatProperties, &format_feature_flags); + result = tu_get_image_format_properties( + physical_device, base_info, &base_props->imageFormatProperties); if (result != VK_SUCCESS) return result; @@ -713,9 +890,6 @@ tu_GetPhysicalDeviceImageFormatProperties2( case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO: external_info = (const void *) s; break; - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_VIEW_IMAGE_FORMAT_INFO_EXT: - image_view_info = (const void *) s; - break; default: break; } @@ -728,12 +902,6 @@ tu_GetPhysicalDeviceImageFormatProperties2( case VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES: external_props = (void *) s; break; - case VK_STRUCTURE_TYPE_FILTER_CUBIC_IMAGE_VIEW_IMAGE_FORMAT_PROPERTIES_EXT: - cubic_props = (void *) s; - break; - case VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_IMAGE_FORMAT_PROPERTIES: - ycbcr_props = (void *) s; - break; default: break; } @@ -748,29 +916,11 @@ tu_GetPhysicalDeviceImageFormatProperties2( if (external_info && external_info->handleType != 0) { result = tu_get_external_image_format_properties( physical_device, base_info, external_info->handleType, - external_props); + &external_props->externalMemoryProperties); if (result != VK_SUCCESS) goto fail; } - if (cubic_props) { - /* note: blob only allows cubic filtering for 2D and 2D array views - * its likely we can enable it for 1D and CUBE, needs testing however - */ - if ((image_view_info->imageViewType == VK_IMAGE_VIEW_TYPE_2D || - image_view_info->imageViewType == VK_IMAGE_VIEW_TYPE_2D_ARRAY) && - (format_feature_flags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_CUBIC_BIT_EXT)) { - cubic_props->filterCubic = true; - cubic_props->filterCubicMinmax = true; - } else { - cubic_props->filterCubic = false; - cubic_props->filterCubicMinmax = false; - } - } - - if (ycbcr_props) - ycbcr_props->combinedImageSamplerDescriptorCount = 1; - return VK_SUCCESS; fail: @@ -782,13 +932,28 @@ fail: * the implementation for use in vkCreateImage, then all members of * imageFormatProperties will be filled with zero. */ - base_props->imageFormatProperties = (VkImageFormatProperties) {}; + base_props->imageFormatProperties = (VkImageFormatProperties) { 0 }; } return result; } -VKAPI_ATTR void VKAPI_CALL +void +tu_GetPhysicalDeviceSparseImageFormatProperties( + VkPhysicalDevice physicalDevice, + VkFormat format, + VkImageType type, + uint32_t samples, + VkImageUsageFlags usage, + VkImageTiling tiling, + uint32_t *pNumProperties, + VkSparseImageFormatProperties *pProperties) +{ + /* Sparse images are not yet supported. 
*/ + *pNumProperties = 0; +} + +void tu_GetPhysicalDeviceSparseImageFormatProperties2( VkPhysicalDevice physicalDevice, const VkPhysicalDeviceSparseImageFormatInfo2 *pFormatInfo, @@ -799,7 +964,7 @@ tu_GetPhysicalDeviceSparseImageFormatProperties2( *pPropertyCount = 0; } -VKAPI_ATTR void VKAPI_CALL +void tu_GetPhysicalDeviceExternalBufferProperties( VkPhysicalDevice physicalDevice, const VkPhysicalDeviceExternalBufferInfo *pExternalBufferInfo, diff --git a/lib/mesa/src/freedreno/vulkan/tu_image.c b/lib/mesa/src/freedreno/vulkan/tu_image.c index 15a0649a2..657612d42 100644 --- a/lib/mesa/src/freedreno/vulkan/tu_image.c +++ b/lib/mesa/src/freedreno/vulkan/tu_image.c @@ -1,733 +1,266 @@ /* * Copyright © 2016 Red Hat. * Copyright © 2016 Bas Nieuwenhuizen - * SPDX-License-Identifier: MIT * * based in part on anv driver which is: * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. */ -#include "tu_image.h" - -#include "fdl/fd6_format_table.h" +#include "tu_private.h" -#include "util/u_debug.h" -#include "util/format/u_format.h" +#include "util/debug.h" +#include "util/u_atomic.h" +#include "vk_format.h" #include "vk_util.h" -#include "drm-uapi/drm_fourcc.h" -#include "tu_android.h" -#include "tu_cs.h" -#include "tu_descriptor_set.h" -#include "tu_device.h" -#include "tu_formats.h" - -uint32_t -tu6_plane_count(VkFormat format) +static inline bool +image_level_linear(struct tu_image *image, int level) { - switch (format) { - default: - return 1; - case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM: - case VK_FORMAT_D32_SFLOAT_S8_UINT: - return 2; - case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM: - return 3; - } -} - -enum pipe_format -tu6_plane_format(VkFormat format, uint32_t plane) -{ - switch (format) { - case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM: - return plane ? PIPE_FORMAT_R8G8_UNORM : PIPE_FORMAT_Y8_UNORM; - case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM: - return PIPE_FORMAT_R8_UNORM; - case VK_FORMAT_D32_SFLOAT_S8_UINT: - return plane ? 
PIPE_FORMAT_S8_UINT : PIPE_FORMAT_Z32_FLOAT; - default: - return tu_vk_format_to_pipe_format(format); - } -} - -uint32_t -tu6_plane_index(VkFormat format, VkImageAspectFlags aspect_mask) -{ - switch (aspect_mask) { - default: - assert(aspect_mask != VK_IMAGE_ASPECT_MEMORY_PLANE_3_BIT_EXT); - return 0; - case VK_IMAGE_ASPECT_PLANE_1_BIT: - case VK_IMAGE_ASPECT_MEMORY_PLANE_1_BIT_EXT: - return 1; - case VK_IMAGE_ASPECT_PLANE_2_BIT: - case VK_IMAGE_ASPECT_MEMORY_PLANE_2_BIT_EXT: - return 2; - case VK_IMAGE_ASPECT_STENCIL_BIT: - return format == VK_FORMAT_D32_SFLOAT_S8_UINT; - } -} - -enum pipe_format -tu_format_for_aspect(enum pipe_format format, VkImageAspectFlags aspect_mask) -{ - switch (format) { - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - if (aspect_mask & VK_IMAGE_ASPECT_COLOR_BIT) - return PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8; - if (aspect_mask & VK_IMAGE_ASPECT_STENCIL_BIT) { - if (aspect_mask & VK_IMAGE_ASPECT_DEPTH_BIT) - return PIPE_FORMAT_Z24_UNORM_S8_UINT; - else - return PIPE_FORMAT_X24S8_UINT; - } else { - return PIPE_FORMAT_Z24X8_UNORM; - } - case PIPE_FORMAT_Z24X8_UNORM: - if (aspect_mask & VK_IMAGE_ASPECT_COLOR_BIT) - return PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8; - return PIPE_FORMAT_Z24X8_UNORM; - default: - return format; - } + unsigned w = u_minify(image->extent.width, level); + return w < 16; } -static bool -tu_is_r8g8(enum pipe_format format) +/* indexed by cpp: */ +static const struct { - return (util_format_get_blocksize(format) == 2) && - (util_format_get_nr_components(format) == 2); -} - -static bool -tu_is_r8g8_compatible(enum pipe_format format) -{ - return (util_format_get_blocksize(format) == 2) && - !util_format_is_depth_or_stencil(format); -} - -void -tu_cs_image_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer) -{ - tu_cs_emit(cs, iview->PITCH); - tu_cs_emit(cs, iview->layer_size >> 6); - tu_cs_emit_qw(cs, iview->base_addr + iview->layer_size * layer); -} - -void -tu_cs_image_stencil_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer) -{ - tu_cs_emit(cs, iview->stencil_PITCH); - tu_cs_emit(cs, iview->stencil_layer_size >> 6); - tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer); -} - -void -tu_cs_image_depth_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer) -{ - tu_cs_emit(cs, iview->depth_PITCH); - tu_cs_emit(cs, iview->depth_layer_size >> 6); - tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer); -} - -void -tu_cs_image_ref_2d(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer, bool src) -{ - tu_cs_emit_qw(cs, iview->base_addr + iview->layer_size * layer); - /* SP_PS_2D_SRC_PITCH has shifted pitch field */ - tu_cs_emit(cs, iview->PITCH << (src ? 
9 : 0)); -} - -void -tu_cs_image_flag_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer) -{ - tu_cs_emit_qw(cs, iview->ubwc_addr + iview->ubwc_layer_size * layer); - tu_cs_emit(cs, iview->FLAG_BUFFER_PITCH); -} + unsigned pitchalign; + unsigned heightalign; +} tile_alignment[] = { + [1] = { 128, 32 }, [2] = { 128, 16 }, [3] = { 128, 16 }, [4] = { 64, 16 }, + [8] = { 64, 16 }, [12] = { 64, 16 }, [16] = { 64, 16 }, +}; static void -tu_image_view_init(struct tu_device *device, - struct tu_image_view *iview, - const VkImageViewCreateInfo *pCreateInfo, - bool has_z24uint_s8uint) +setup_slices(struct tu_image *image, const VkImageCreateInfo *pCreateInfo) { - TU_FROM_HANDLE(tu_image, image, pCreateInfo->image); - const VkImageSubresourceRange *range = &pCreateInfo->subresourceRange; - VkFormat vk_format = pCreateInfo->format; - VkImageAspectFlagBits aspect_mask = pCreateInfo->subresourceRange.aspectMask; - - const struct VkSamplerYcbcrConversionInfo *ycbcr_conversion = - vk_find_struct_const(pCreateInfo->pNext, SAMPLER_YCBCR_CONVERSION_INFO); - const struct tu_sampler_ycbcr_conversion *conversion = ycbcr_conversion ? - tu_sampler_ycbcr_conversion_from_handle(ycbcr_conversion->conversion) : NULL; - - vk_image_view_init(&device->vk, &iview->vk, false, pCreateInfo); - - iview->image = image; - - const struct fdl_layout *layouts[3]; - - layouts[0] = &image->layout[tu6_plane_index(image->vk.format, aspect_mask)]; - - enum pipe_format format; - if (aspect_mask != VK_IMAGE_ASPECT_COLOR_BIT) - format = tu6_plane_format(vk_format, tu6_plane_index(vk_format, aspect_mask)); - else - format = tu_vk_format_to_pipe_format(vk_format); - - if (image->vk.format == VK_FORMAT_G8_B8R8_2PLANE_420_UNORM && - aspect_mask == VK_IMAGE_ASPECT_PLANE_0_BIT) { - if (vk_format == VK_FORMAT_R8_UNORM) { - /* The 0'th plane of this format has a different UBWC compression. */ - format = PIPE_FORMAT_Y8_UNORM; + enum vk_format_layout layout = + vk_format_description(pCreateInfo->format)->layout; + uint32_t layer_size = 0; + uint32_t width = pCreateInfo->extent.width; + uint32_t height = pCreateInfo->extent.height; + uint32_t depth = pCreateInfo->extent.depth; + bool layer_first = pCreateInfo->imageType != VK_IMAGE_TYPE_3D; + uint32_t alignment = pCreateInfo->imageType == VK_IMAGE_TYPE_3D ? 4096 : 1; + uint32_t cpp = vk_format_get_blocksize(pCreateInfo->format); + + uint32_t heightalign = tile_alignment[cpp].heightalign; + + for (unsigned level = 0; level < pCreateInfo->mipLevels; level++) { + struct tu_image_level *slice = &image->levels[level]; + bool linear_level = image_level_linear(image, level); + uint32_t aligned_height = height; + uint32_t blocks; + uint32_t pitchalign; + + if (image->tile_mode && !linear_level) { + pitchalign = tile_alignment[cpp].pitchalign; + aligned_height = align(aligned_height, heightalign); } else { - /* If the user wants to reinterpret this plane, then they should've - * set MUTABLE_FORMAT_BIT which should disable UBWC and tiling. + pitchalign = 64; + + /* The blits used for mem<->gmem work at a granularity of + * 32x32, which can cause faults due to over-fetch on the + * last level. The simple solution is to over-allocate a + * bit the last level to ensure any over-fetch is harmless. 
+ * The pitch is already sufficiently aligned, but height + * may not be: */ - assert(!layouts[0]->ubwc); + if ((level + 1 == pCreateInfo->mipLevels)) + aligned_height = align(aligned_height, 32); } - } - if (aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT && - (vk_format == VK_FORMAT_G8_B8R8_2PLANE_420_UNORM || - vk_format == VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM)) { - layouts[1] = &image->layout[1]; - layouts[2] = &image->layout[2]; - } - - struct fdl_view_args args = {}; - args.iova = image->iova; - args.base_array_layer = range->baseArrayLayer; - args.base_miplevel = range->baseMipLevel; - args.layer_count = vk_image_subresource_layer_count(&image->vk, range); - args.level_count = vk_image_subresource_level_count(&image->vk, range); - args.min_lod_clamp = iview->vk.min_lod; - args.format = tu_format_for_aspect(format, aspect_mask); - vk_component_mapping_to_pipe_swizzle(pCreateInfo->components, args.swiz); - if (conversion) { - unsigned char conversion_swiz[4], create_swiz[4]; - memcpy(create_swiz, args.swiz, sizeof(create_swiz)); - vk_component_mapping_to_pipe_swizzle(conversion->components, - conversion_swiz); - util_format_compose_swizzles(create_swiz, conversion_swiz, args.swiz); - } - - switch (pCreateInfo->viewType) { - case VK_IMAGE_VIEW_TYPE_1D: - case VK_IMAGE_VIEW_TYPE_1D_ARRAY: - args.type = FDL_VIEW_TYPE_1D; - break; - case VK_IMAGE_VIEW_TYPE_2D: - case VK_IMAGE_VIEW_TYPE_2D_ARRAY: - args.type = FDL_VIEW_TYPE_2D; - break; - case VK_IMAGE_VIEW_TYPE_CUBE: - case VK_IMAGE_VIEW_TYPE_CUBE_ARRAY: - args.type = FDL_VIEW_TYPE_CUBE; - break; - case VK_IMAGE_VIEW_TYPE_3D: - args.type = FDL_VIEW_TYPE_3D; - break; - default: - unreachable("unknown view type"); - } + if (layout == VK_FORMAT_LAYOUT_ASTC) + slice->pitch = util_align_npot( + width, + pitchalign * vk_format_get_blockwidth(pCreateInfo->format)); + else + slice->pitch = align(width, pitchalign); - STATIC_ASSERT((unsigned)VK_CHROMA_LOCATION_COSITED_EVEN == (unsigned)FDL_CHROMA_LOCATION_COSITED_EVEN); - STATIC_ASSERT((unsigned)VK_CHROMA_LOCATION_MIDPOINT == (unsigned)FDL_CHROMA_LOCATION_MIDPOINT); - if (conversion) { - args.chroma_offsets[0] = (enum fdl_chroma_location) conversion->chroma_offsets[0]; - args.chroma_offsets[1] = (enum fdl_chroma_location) conversion->chroma_offsets[1]; - } + slice->offset = layer_size; + blocks = vk_format_get_block_count(pCreateInfo->format, slice->pitch, + aligned_height); - fdl6_view_init(&iview->view, layouts, &args, has_z24uint_s8uint); + /* 1d array and 2d array textures must all have the same layer size + * for each miplevel on a3xx. 3d textures can have different layer + * sizes for high levels, but the hw auto-sizer is buggy (or at least + * different than what this code does), so as soon as the layer size + * range gets into range, we stop reducing it. 
+ */ + if (pCreateInfo->imageType == VK_IMAGE_TYPE_3D && + (level == 1 || + (level > 1 && image->levels[level - 1].size > 0xf000))) + slice->size = align(blocks * cpp, alignment); + else if (level == 0 || layer_first || alignment == 1) + slice->size = align(blocks * cpp, alignment); + else + slice->size = image->levels[level - 1].size; - if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) { - struct fdl_layout *layout = &image->layout[0]; - iview->depth_base_addr = image->iova + - fdl_surface_offset(layout, range->baseMipLevel, range->baseArrayLayer); - iview->depth_layer_size = fdl_layer_stride(layout, range->baseMipLevel); - iview->depth_PITCH = A6XX_RB_DEPTH_BUFFER_PITCH(fdl_pitch(layout, range->baseMipLevel)).value; + layer_size += slice->size * depth; - layout = &image->layout[1]; - iview->stencil_base_addr = image->iova + - fdl_surface_offset(layout, range->baseMipLevel, range->baseArrayLayer); - iview->stencil_layer_size = fdl_layer_stride(layout, range->baseMipLevel); - iview->stencil_PITCH = A6XX_RB_STENCIL_BUFFER_PITCH(fdl_pitch(layout, range->baseMipLevel)).value; + width = u_minify(width, 1); + height = u_minify(height, 1); + depth = u_minify(depth, 1); } -} -bool -tiling_possible(VkFormat format) -{ - if (format == VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM || - format == VK_FORMAT_G8B8G8R8_422_UNORM || - format == VK_FORMAT_B8G8R8G8_422_UNORM) - return false; - - return true; + image->layer_size = layer_size; } -/* Checks if we should advertise UBWC support for the given usage. - * - * Used by both vkCreateImage and vkGetPhysicalDeviceFormatProperties2, so the - * logical tu_device may be NULL. - */ -bool -ubwc_possible(struct tu_device *device, - VkFormat format, - VkImageType type, - VkImageUsageFlags usage, - VkImageUsageFlags stencil_usage, - const struct fd_dev_info *info, - VkSampleCountFlagBits samples, - bool use_z24uint_s8uint) +VkResult +tu_image_create(VkDevice _device, + const struct tu_image_create_info *create_info, + const VkAllocationCallbacks *alloc, + VkImage *pImage) { - /* no UBWC with compressed formats, E5B9G9R9, S8_UINT - * (S8_UINT because separate stencil doesn't have UBWC-enable bit) - */ - if (vk_format_is_compressed(format) || - format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 || - format == VK_FORMAT_S8_UINT) - return false; - - /* In copy_format, we treat snorm as unorm to avoid clamping. But snorm - * and unorm are UBWC incompatible for special values such as all 0's or - * all 1's. Disable UBWC for snorm. - */ - if (vk_format_is_snorm(format)) - return false; - - if (!info->a6xx.has_8bpp_ubwc && - (format == VK_FORMAT_R8_UNORM || - format == VK_FORMAT_R8_SNORM || - format == VK_FORMAT_R8_UINT || - format == VK_FORMAT_R8_SINT || - format == VK_FORMAT_R8_SRGB)) - return false; - - if (type == VK_IMAGE_TYPE_3D) { - if (device) { - perf_debug(device, - "Disabling UBWC for %s 3D image, but it should be " - "possible to support.", - util_format_name(vk_format_to_pipe_format(format))); - } - return false; - } - - /* Disable UBWC for storage images. - * - * The closed GL driver skips UBWC for storage images (and additionally - * uses linear for writeonly images). We seem to have image tiling working - * in freedreno in general, so turnip matches that. freedreno also enables - * UBWC on images, but it's not really tested due to the lack of - * UBWC-enabled mipmaps in freedreno currently. Just match the closed GL - * behavior of no UBWC. 
- */ - if ((usage | stencil_usage) & VK_IMAGE_USAGE_STORAGE_BIT) { - if (device) { - perf_debug(device, - "Disabling UBWC for %s storage image, but should be " - "possible to support", - util_format_name(vk_format_to_pipe_format(format))); - } - return false; + TU_FROM_HANDLE(tu_device, device, _device); + const VkImageCreateInfo *pCreateInfo = create_info->vk_info; + struct tu_image *image = NULL; + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO); + + tu_assert(pCreateInfo->mipLevels > 0); + tu_assert(pCreateInfo->arrayLayers > 0); + tu_assert(pCreateInfo->samples > 0); + tu_assert(pCreateInfo->extent.width > 0); + tu_assert(pCreateInfo->extent.height > 0); + tu_assert(pCreateInfo->extent.depth > 0); + + image = vk_zalloc2(&device->alloc, alloc, sizeof(*image), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!image) + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + + image->type = pCreateInfo->imageType; + + image->vk_format = pCreateInfo->format; + image->tiling = pCreateInfo->tiling; + image->usage = pCreateInfo->usage; + image->flags = pCreateInfo->flags; + image->extent = pCreateInfo->extent; + image->level_count = pCreateInfo->mipLevels; + image->layer_count = pCreateInfo->arrayLayers; + + image->exclusive = pCreateInfo->sharingMode == VK_SHARING_MODE_EXCLUSIVE; + if (pCreateInfo->sharingMode == VK_SHARING_MODE_CONCURRENT) { + for (uint32_t i = 0; i < pCreateInfo->queueFamilyIndexCount; ++i) + if (pCreateInfo->pQueueFamilyIndices[i] == + VK_QUEUE_FAMILY_EXTERNAL) + image->queue_family_mask |= (1u << TU_MAX_QUEUE_FAMILIES) - 1u; + else + image->queue_family_mask |= + 1u << pCreateInfo->pQueueFamilyIndices[i]; } - /* Disable UBWC for D24S8 on A630 in some cases - * - * VK_IMAGE_ASPECT_STENCIL_BIT image view requires to be able to sample - * from the stencil component as UINT, however no format allows this - * on a630 (the special FMT6_Z24_UINT_S8_UINT format is missing) - * - * It must be sampled as FMT6_8_8_8_8_UINT, which is not UBWC-compatible - * - * If we wish to get the border colors correct without knowing the format - * when creating the sampler, we also have to use the A630 workaround. - * - * Additionally, the special AS_R8G8B8A8 format is broken without UBWC, - * so we have to fallback to 8_8_8_8_UNORM when UBWC is disabled - */ - if (!use_z24uint_s8uint && - format == VK_FORMAT_D24_UNORM_S8_UINT && - (stencil_usage & (VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT))) - return false; - - /* This meant to disable UBWC for MSAA z24s8, but accidentally disables it - * for all MSAA. https://gitlab.freedesktop.org/mesa/mesa/-/issues/7438 - */ - if (!info->a6xx.has_z24uint_s8uint && samples > VK_SAMPLE_COUNT_1_BIT) { - if (device) { - perf_debug(device, - "Disabling UBWC for %d-sample %s image, but it should be " - "possible to support", - samples, - util_format_name(vk_format_to_pipe_format(format))); - } - return false; - } + image->shareable = + vk_find_struct_const(pCreateInfo->pNext, + EXTERNAL_MEMORY_IMAGE_CREATE_INFO) != NULL; - return true; -} + image->tile_mode = pCreateInfo->tiling == VK_IMAGE_TILING_OPTIMAL ? 3 : 0; + setup_slices(image, pCreateInfo); -/* R8G8 have a different block width/height and height alignment from other - * formats that would normally be compatible (like R16), and so if we are - * trying to, for example, sample R16 as R8G8 we need to demote to linear. 
- */ -static bool -format_list_reinterprets_r8g8_r16(enum pipe_format format, const VkImageFormatListCreateInfo *fmt_list) -{ - /* Check if it's actually a 2-cpp color format. */ - if (!tu_is_r8g8_compatible(format)) - return false; - - /* If there's no format list, then the app may reinterpret to any compatible - * format. - */ - if (!fmt_list || !fmt_list->viewFormatCount) - return true; - - bool has_r8g8 = false; - bool has_non_r8g8 = false; - for (uint32_t i = 0; i < fmt_list->viewFormatCount; i++) { - enum pipe_format format = - tu_vk_format_to_pipe_format(fmt_list->pViewFormats[i]); - if (tu_is_r8g8(format)) - has_r8g8 = true; - else - has_non_r8g8 = true; - } - return has_r8g8 && has_non_r8g8; -} + image->size = image->layer_size * pCreateInfo->arrayLayers; + *pImage = tu_image_to_handle(image); -static bool -format_list_has_swaps(const VkImageFormatListCreateInfo *fmt_list) -{ - /* If there's no format list, then the app may reinterpret to any compatible - * format, and presumably one would have the swap set. - */ - if (!fmt_list || !fmt_list->viewFormatCount) - return true; - - for (uint32_t i = 0; i < fmt_list->viewFormatCount; i++) { - enum pipe_format format = - tu_vk_format_to_pipe_format(fmt_list->pViewFormats[i]); - - if (tu6_format_texture(format, TILE6_LINEAR).swap) - return true; - } - return false; + return VK_SUCCESS; } -static VkResult -tu_image_init(struct tu_device *device, struct tu_image *image, - const VkImageCreateInfo *pCreateInfo, uint64_t modifier, - const VkSubresourceLayout *plane_layouts) +void +tu_image_view_init(struct tu_image_view *iview, + struct tu_device *device, + const VkImageViewCreateInfo *pCreateInfo) { - vk_image_init(&device->vk, &image->vk, pCreateInfo); - image->vk.drm_format_mod = modifier; - - enum a6xx_tile_mode tile_mode = TILE6_3; - bool ubwc_enabled = true; - - /* use linear tiling if requested */ - if (pCreateInfo->tiling == VK_IMAGE_TILING_LINEAR || modifier == DRM_FORMAT_MOD_LINEAR) { - tile_mode = TILE6_LINEAR; - ubwc_enabled = false; - } - - /* Force linear tiling for formats with "fake" optimalTilingFeatures */ - if (!tiling_possible(image->vk.format)) { - tile_mode = TILE6_LINEAR; - ubwc_enabled = false; - } - - /* No sense in tiling a 1D image, you'd just waste space and cache locality. */ - if (pCreateInfo->imageType == VK_IMAGE_TYPE_1D) { - tile_mode = TILE6_LINEAR; - ubwc_enabled = false; - } - - enum pipe_format format = - tu_vk_format_to_pipe_format(image->vk.format); - /* Whether a view of the image with an R8G8 format could be made. */ - bool has_r8g8 = tu_is_r8g8(format); - - if (ubwc_enabled && - !ubwc_possible(device, image->vk.format, pCreateInfo->imageType, - pCreateInfo->usage, image->vk.stencil_usage, - device->physical_device->info, pCreateInfo->samples, - device->use_z24uint_s8uint)) - ubwc_enabled = false; - - /* Mutable images can be reinterpreted as any other compatible format. - * This is a problem with UBWC (compression for different formats is different), - * but also tiling ("swap" affects how tiled formats are stored in memory) - * Depth and stencil formats cannot be reintepreted as another format, and - * cannot be linear with sysmem rendering, so don't fall back for those. 
- * - * TODO: - * - if the fmt_list contains only formats which are swapped, but compatible - * with each other (B8G8R8A8_UNORM and B8G8R8A8_UINT for example), then - * tiling is still possible - * - figure out which UBWC compressions are compatible to keep it enabled - */ - if ((pCreateInfo->flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT) && - !vk_format_is_depth_or_stencil(image->vk.format)) { - const VkImageFormatListCreateInfo *fmt_list = - vk_find_struct_const(pCreateInfo->pNext, IMAGE_FORMAT_LIST_CREATE_INFO); - if (!tu6_mutable_format_list_ubwc_compatible(fmt_list)) { - if (ubwc_enabled) { - if (fmt_list && fmt_list->viewFormatCount == 2) { - perf_debug( - device, - "Disabling UBWC on %dx%d %s resource due to mutable formats " - "(fmt list %s, %s)", - image->vk.extent.width, image->vk.extent.height, - util_format_name(vk_format_to_pipe_format(image->vk.format)), - util_format_name(vk_format_to_pipe_format(fmt_list->pViewFormats[0])), - util_format_name(vk_format_to_pipe_format(fmt_list->pViewFormats[1]))); - } else { - perf_debug( - device, - "Disabling UBWC on %dx%d %s resource due to mutable formats " - "(fmt list %s)", - image->vk.extent.width, image->vk.extent.height, - util_format_name(vk_format_to_pipe_format(image->vk.format)), - fmt_list ? "present" : "missing"); - } - ubwc_enabled = false; - } - - if (format_list_reinterprets_r8g8_r16(format, fmt_list) || - format_list_has_swaps(fmt_list)) { - tile_mode = TILE6_LINEAR; - } - } - } + TU_FROM_HANDLE(tu_image, image, pCreateInfo->image); + const VkImageSubresourceRange *range = &pCreateInfo->subresourceRange; - /* expect UBWC enabled if we asked for it */ - if (modifier == DRM_FORMAT_MOD_QCOM_COMPRESSED) - assert(ubwc_enabled); - else if (device->physical_device->instance->debug_flags & TU_DEBUG_NOUBWC) - ubwc_enabled = false; - - /* Non-UBWC tiled R8G8 is probably buggy since media formats are always - * either linear or UBWC. There is no simple test to reproduce the bug. - * However it was observed in the wild leading to an unrecoverable hang - * on a650/a660. 
- */ - if (has_r8g8 && tile_mode == TILE6_3 && !ubwc_enabled) { - tile_mode = TILE6_LINEAR; + switch (image->type) { + case VK_IMAGE_TYPE_1D: + case VK_IMAGE_TYPE_2D: + assert(range->baseArrayLayer + tu_get_layerCount(image, range) <= + image->layer_count); + break; + case VK_IMAGE_TYPE_3D: + assert(range->baseArrayLayer + tu_get_layerCount(image, range) <= + tu_minify(image->extent.depth, range->baseMipLevel)); + break; + default: + unreachable("bad VkImageType"); } - for (uint32_t i = 0; i < tu6_plane_count(image->vk.format); i++) { - struct fdl_layout *layout = &image->layout[i]; - enum pipe_format format = tu6_plane_format(image->vk.format, i); - uint32_t width0 = pCreateInfo->extent.width; - uint32_t height0 = pCreateInfo->extent.height; - - if (i > 0) { - switch (image->vk.format) { - case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM: - case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM: - /* half width/height on chroma planes */ - width0 = (width0 + 1) >> 1; - height0 = (height0 + 1) >> 1; - break; - case VK_FORMAT_D32_SFLOAT_S8_UINT: - /* no UBWC for separate stencil */ - ubwc_enabled = false; - break; - default: - break; - } - } - - struct fdl_explicit_layout plane_layout; - - if (plane_layouts) { - /* only expect simple 2D images for now */ - if (pCreateInfo->mipLevels != 1 || - pCreateInfo->arrayLayers != 1 || - pCreateInfo->extent.depth != 1) - return vk_error(device, VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT); - - plane_layout.offset = plane_layouts[i].offset; - plane_layout.pitch = plane_layouts[i].rowPitch; - /* note: use plane_layouts[0].arrayPitch to support array formats */ - } - - layout->tile_mode = tile_mode; - layout->ubwc = ubwc_enabled; - - if (!fdl6_layout(layout, format, - pCreateInfo->samples, - width0, height0, - pCreateInfo->extent.depth, - pCreateInfo->mipLevels, - pCreateInfo->arrayLayers, - pCreateInfo->imageType == VK_IMAGE_TYPE_3D, - plane_layouts ? 
&plane_layout : NULL)) { - assert(plane_layouts); /* can only fail with explicit layout */ - return vk_error(device, VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT); - } - - if (device->instance->debug_flags & TU_DEBUG_LAYOUT) - fdl_dump_layout(layout); - - /* fdl6_layout can't take explicit offset without explicit pitch - * add offset manually for extra layouts for planes - */ - if (!plane_layouts && i > 0) { - uint32_t offset = ALIGN_POT(image->total_size, 4096); - for (int i = 0; i < pCreateInfo->mipLevels; i++) { - layout->slices[i].offset += offset; - layout->ubwc_slices[i].offset += offset; - } - layout->size += offset; - } - - image->total_size = MAX2(image->total_size, layout->size); + iview->image = image; + iview->type = pCreateInfo->viewType; + iview->vk_format = pCreateInfo->format; + iview->aspect_mask = pCreateInfo->subresourceRange.aspectMask; + + if (iview->aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) { + iview->vk_format = vk_format_stencil_only(iview->vk_format); + } else if (iview->aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT) { + iview->vk_format = vk_format_depth_only(iview->vk_format); } - const struct util_format_description *desc = util_format_description(image->layout[0].format); - if (util_format_has_depth(desc) && !(device->instance->debug_flags & TU_DEBUG_NOLRZ)) - { - /* Depth plane is the first one */ - struct fdl_layout *layout = &image->layout[0]; - unsigned width = layout->width0; - unsigned height = layout->height0; - - /* LRZ buffer is super-sampled */ - switch (layout->nr_samples) { - case 4: - width *= 2; - FALLTHROUGH; - case 2: - height *= 2; - break; - default: - break; - } - - unsigned lrz_pitch = align(DIV_ROUND_UP(width, 8), 32); - unsigned lrz_height = align(DIV_ROUND_UP(height, 8), 16); - - image->lrz_height = lrz_height; - image->lrz_pitch = lrz_pitch; - image->lrz_offset = image->total_size; - unsigned lrz_size = lrz_pitch * lrz_height * 2; - image->total_size += lrz_size; - - unsigned nblocksx = DIV_ROUND_UP(DIV_ROUND_UP(width, 8), 16); - unsigned nblocksy = DIV_ROUND_UP(DIV_ROUND_UP(height, 8), 4); - - /* Fast-clear buffer is 1bit/block */ - image->lrz_fc_size = DIV_ROUND_UP(nblocksx * nblocksy, 8); - - /* Fast-clear buffer cannot be larger than 512 bytes (HW limitation) */ - bool has_lrz_fc = image->lrz_fc_size <= 512 && - device->physical_device->info->a6xx.enable_lrz_fast_clear && - !unlikely(device->physical_device->instance->debug_flags & TU_DEBUG_NOLRZFC); + // should we minify? 
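   /* If the answer to the question above is "yes", a minimal sketch of what
    * minifying the view extent could look like (reusing the tu_minify()
    * helper already called elsewhere in this function; whether the rest of
    * the driver expects a minified or unminified view extent is an
    * assumption not verified here) would be:
    *
    *    iview->extent = (VkExtent3D) {
    *       .width  = tu_minify(image->extent.width,  range->baseMipLevel),
    *       .height = tu_minify(image->extent.height, range->baseMipLevel),
    *       .depth  = tu_minify(image->extent.depth,  range->baseMipLevel),
    *    };
    *
    * The imported code below keeps the full image extent instead.
    */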
+ iview->extent = image->extent; - if (has_lrz_fc || device->physical_device->info->a6xx.has_lrz_dir_tracking) { - image->lrz_fc_offset = image->total_size; - image->total_size += 512; - - if (device->physical_device->info->a6xx.has_lrz_dir_tracking) { - /* Direction tracking uses 1 byte */ - image->total_size += 1; - /* GRAS_LRZ_DEPTH_VIEW needs 5 bytes: 4 for view data and 1 for padding */ - image->total_size += 5; - } - } - - if (!has_lrz_fc) { - image->lrz_fc_size = 0; - } - } else { - image->lrz_height = 0; - } + iview->base_layer = range->baseArrayLayer; + iview->layer_count = tu_get_layerCount(image, range); + iview->base_mip = range->baseMipLevel; + iview->level_count = tu_get_levelCount(image, range); +} - return VK_SUCCESS; +unsigned +tu_image_queue_family_mask(const struct tu_image *image, + uint32_t family, + uint32_t queue_family) +{ + if (!image->exclusive) + return image->queue_family_mask; + if (family == VK_QUEUE_FAMILY_EXTERNAL) + return (1u << TU_MAX_QUEUE_FAMILIES) - 1u; + if (family == VK_QUEUE_FAMILY_IGNORED) + return 1u << queue_family; + return 1u << family; } -VKAPI_ATTR VkResult VKAPI_CALL -tu_CreateImage(VkDevice _device, +VkResult +tu_CreateImage(VkDevice device, const VkImageCreateInfo *pCreateInfo, - const VkAllocationCallbacks *alloc, + const VkAllocationCallbacks *pAllocator, VkImage *pImage) { - uint64_t modifier = DRM_FORMAT_MOD_INVALID; - const VkSubresourceLayout *plane_layouts = NULL; - - TU_FROM_HANDLE(tu_device, device, _device); - struct tu_image *image = - vk_object_zalloc(&device->vk, alloc, sizeof(*image), VK_OBJECT_TYPE_IMAGE); - - if (!image) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - - if (pCreateInfo->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { - const VkImageDrmFormatModifierListCreateInfoEXT *mod_info = - vk_find_struct_const(pCreateInfo->pNext, - IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT); - const VkImageDrmFormatModifierExplicitCreateInfoEXT *drm_explicit_info = - vk_find_struct_const(pCreateInfo->pNext, - IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT); - - assert(mod_info || drm_explicit_info); - - if (mod_info) { - modifier = DRM_FORMAT_MOD_LINEAR; - for (unsigned i = 0; i < mod_info->drmFormatModifierCount; i++) { - if (mod_info->pDrmFormatModifiers[i] == DRM_FORMAT_MOD_QCOM_COMPRESSED) - modifier = DRM_FORMAT_MOD_QCOM_COMPRESSED; - } - } else { - modifier = drm_explicit_info->drmFormatModifier; - assert(modifier == DRM_FORMAT_MOD_LINEAR || - modifier == DRM_FORMAT_MOD_QCOM_COMPRESSED); - plane_layouts = drm_explicit_info->pPlaneLayouts; - } - } else { - const struct wsi_image_create_info *wsi_info = - vk_find_struct_const(pCreateInfo->pNext, WSI_IMAGE_CREATE_INFO_MESA); - if (wsi_info && wsi_info->scanout) - modifier = DRM_FORMAT_MOD_LINEAR; - } - #ifdef ANDROID const VkNativeBufferANDROID *gralloc_info = vk_find_struct_const(pCreateInfo->pNext, NATIVE_BUFFER_ANDROID); - int dma_buf; - if (gralloc_info) { - VkResult result = tu_gralloc_info(device, gralloc_info, &dma_buf, &modifier); - if (result != VK_SUCCESS) - return result; - } -#endif - VkResult result = tu_image_init(device, image, pCreateInfo, modifier, - plane_layouts); - if (result != VK_SUCCESS) { - vk_object_free(&device->vk, alloc, image); - return result; - } - - *pImage = tu_image_to_handle(image); - -#ifdef ANDROID if (gralloc_info) - return tu_import_memory_from_gralloc_handle(_device, dma_buf, alloc, - *pImage); + return tu_image_from_gralloc(device, pCreateInfo, gralloc_info, + pAllocator, pImage); #endif - return VK_SUCCESS; + + 
return tu_image_create(device, + &(struct tu_image_create_info) { + .vk_info = pCreateInfo, + .scanout = false, + }, + pAllocator, pImage); } -VKAPI_ATTR void VKAPI_CALL +void tu_DestroyImage(VkDevice _device, VkImage _image, const VkAllocationCallbacks *pAllocator) @@ -738,87 +271,13 @@ tu_DestroyImage(VkDevice _device, if (!image) return; -#ifdef ANDROID if (image->owned_memory != VK_NULL_HANDLE) tu_FreeMemory(_device, image->owned_memory, pAllocator); -#endif - - vk_object_free(&device->vk, pAllocator, image); -} - -static void -tu_get_image_memory_requirements(struct tu_image *image, - VkMemoryRequirements2 *pMemoryRequirements) -{ - pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) { - .memoryTypeBits = 1, - .alignment = image->layout[0].base_align, - .size = image->total_size - }; - - vk_foreach_struct(ext, pMemoryRequirements->pNext) { - switch (ext->sType) { - case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: { - VkMemoryDedicatedRequirements *req = - (VkMemoryDedicatedRequirements *) ext; - req->requiresDedicatedAllocation = - image->vk.external_handle_types != 0; - req->prefersDedicatedAllocation = req->requiresDedicatedAllocation; - break; - } - default: - break; - } - } -} - -VKAPI_ATTR void VKAPI_CALL -tu_GetImageMemoryRequirements2(VkDevice device, - const VkImageMemoryRequirementsInfo2 *pInfo, - VkMemoryRequirements2 *pMemoryRequirements) -{ - TU_FROM_HANDLE(tu_image, image, pInfo->image); - - tu_get_image_memory_requirements(image, pMemoryRequirements); -} - -VKAPI_ATTR void VKAPI_CALL -tu_GetImageSparseMemoryRequirements2( - VkDevice device, - const VkImageSparseMemoryRequirementsInfo2 *pInfo, - uint32_t *pSparseMemoryRequirementCount, - VkSparseImageMemoryRequirements2 *pSparseMemoryRequirements) -{ - tu_stub(); -} - -VKAPI_ATTR void VKAPI_CALL -tu_GetDeviceImageMemoryRequirements( - VkDevice _device, - const VkDeviceImageMemoryRequirements *pInfo, - VkMemoryRequirements2 *pMemoryRequirements) -{ - TU_FROM_HANDLE(tu_device, device, _device); - - struct tu_image image = {0}; - tu_image_init(device, &image, pInfo->pCreateInfo, DRM_FORMAT_MOD_INVALID, - NULL); - - tu_get_image_memory_requirements(&image, pMemoryRequirements); -} - -VKAPI_ATTR void VKAPI_CALL -tu_GetDeviceImageSparseMemoryRequirements( - VkDevice device, - const VkDeviceImageMemoryRequirements *pInfo, - uint32_t *pSparseMemoryRequirementCount, - VkSparseImageMemoryRequirements2 *pSparseMemoryRequirements) -{ - tu_stub(); + vk_free2(&device->alloc, pAllocator, image); } -VKAPI_ATTR void VKAPI_CALL +void tu_GetImageSubresourceLayout(VkDevice _device, VkImage _image, const VkImageSubresource *pSubresource, @@ -826,26 +285,19 @@ tu_GetImageSubresourceLayout(VkDevice _device, { TU_FROM_HANDLE(tu_image, image, _image); - struct fdl_layout *layout = - &image->layout[tu6_plane_index(image->vk.format, pSubresource->aspectMask)]; - const struct fdl_slice *slice = layout->slices + pSubresource->mipLevel; - - pLayout->offset = - fdl_surface_offset(layout, pSubresource->mipLevel, pSubresource->arrayLayer); - pLayout->rowPitch = fdl_pitch(layout, pSubresource->mipLevel); - pLayout->arrayPitch = fdl_layer_stride(layout, pSubresource->mipLevel); - pLayout->depthPitch = slice->size0; - pLayout->size = pLayout->depthPitch * layout->depth0; - - if (fdl_ubwc_enabled(layout, pSubresource->mipLevel)) { - /* UBWC starts at offset 0 */ - pLayout->offset = 0; - /* UBWC scanout won't match what the kernel wants if we have levels/layers */ - assert(image->vk.mip_levels == 1 && image->vk.array_layers == 1); - } + 
const uint32_t layer_offset = image->layer_size * pSubresource->arrayLayer; + const struct tu_image_level *level = + image->levels + pSubresource->mipLevel; + + pLayout->offset = layer_offset + level->offset; + pLayout->size = level->size; + pLayout->rowPitch = + level->pitch * vk_format_get_blocksize(image->vk_format); + pLayout->arrayPitch = image->layer_size; + pLayout->depthPitch = level->size; } -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_CreateImageView(VkDevice _device, const VkImageViewCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, @@ -854,19 +306,19 @@ tu_CreateImageView(VkDevice _device, TU_FROM_HANDLE(tu_device, device, _device); struct tu_image_view *view; - view = vk_object_alloc(&device->vk, pAllocator, sizeof(*view), - VK_OBJECT_TYPE_IMAGE_VIEW); + view = vk_alloc2(&device->alloc, pAllocator, sizeof(*view), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (view == NULL) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - tu_image_view_init(device, view, pCreateInfo, device->use_z24uint_s8uint); + tu_image_view_init(view, device, pCreateInfo); *pView = tu_image_view_to_handle(view); return VK_SUCCESS; } -VKAPI_ATTR void VKAPI_CALL +void tu_DestroyImageView(VkDevice _device, VkImageView _iview, const VkAllocationCallbacks *pAllocator) @@ -876,8 +328,7 @@ tu_DestroyImageView(VkDevice _device, if (!iview) return; - - vk_object_free(&device->vk, pAllocator, iview); + vk_free2(&device->alloc, pAllocator, iview); } void @@ -887,19 +338,13 @@ tu_buffer_view_init(struct tu_buffer_view *view, { TU_FROM_HANDLE(tu_buffer, buffer, pCreateInfo->buffer); - view->buffer = buffer; - - uint32_t range = vk_buffer_range(&buffer->vk, pCreateInfo->offset, - pCreateInfo->range); - uint8_t swiz[4] = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, - PIPE_SWIZZLE_W }; - - fdl6_buffer_view_init( - view->descriptor, tu_vk_format_to_pipe_format(pCreateInfo->format), - swiz, buffer->iova + pCreateInfo->offset, range); + view->range = pCreateInfo->range == VK_WHOLE_SIZE + ? buffer->size - pCreateInfo->offset + : pCreateInfo->range; + view->vk_format = pCreateInfo->format; } -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_CreateBufferView(VkDevice _device, const VkBufferViewCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, @@ -908,10 +353,10 @@ tu_CreateBufferView(VkDevice _device, TU_FROM_HANDLE(tu_device, device, _device); struct tu_buffer_view *view; - view = vk_object_alloc(&device->vk, pAllocator, sizeof(*view), - VK_OBJECT_TYPE_BUFFER_VIEW); + view = vk_alloc2(&device->alloc, pAllocator, sizeof(*view), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!view) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); tu_buffer_view_init(view, device, pCreateInfo); @@ -920,7 +365,7 @@ tu_CreateBufferView(VkDevice _device, return VK_SUCCESS; } -VKAPI_ATTR void VKAPI_CALL +void tu_DestroyBufferView(VkDevice _device, VkBufferView bufferView, const VkAllocationCallbacks *pAllocator) @@ -931,5 +376,5 @@ tu_DestroyBufferView(VkDevice _device, if (!view) return; - vk_object_free(&device->vk, pAllocator, view); + vk_free2(&device->alloc, pAllocator, view); } diff --git a/lib/mesa/src/freedreno/vulkan/tu_pass.c b/lib/mesa/src/freedreno/vulkan/tu_pass.c index 84c1c3061..e3d9f23df 100644 --- a/lib/mesa/src/freedreno/vulkan/tu_pass.c +++ b/lib/mesa/src/freedreno/vulkan/tu_pass.c @@ -1,796 +1,245 @@ /* * Copyright © 2016 Red Hat. 
* Copyright © 2016 Bas Nieuwenhuizen - * SPDX-License-Identifier: MIT * * based in part on anv driver which is: * Copyright © 2015 Intel Corporation - */ - -#include "tu_pass.h" - -#include "vk_util.h" - -#include "tu_cmd_buffer.h" -#include "tu_device.h" -#include "tu_image.h" - -/* Return true if we have to fallback to sysmem rendering because the - * dependency can't be satisfied with tiled rendering. - */ - -static bool -dep_invalid_for_gmem(const VkSubpassDependency2 *dep, - VkPipelineStageFlags2 src_stage_mask, - VkPipelineStageFlags2 dst_stage_mask) -{ - /* External dependencies don't matter here. */ - if (dep->srcSubpass == VK_SUBPASS_EXTERNAL || - dep->dstSubpass == VK_SUBPASS_EXTERNAL) - return false; - - /* We can conceptually break down the process of rewriting a sysmem - * renderpass into a gmem one into two parts: - * - * 1. Split each draw and multisample resolve into N copies, one for each - * bin. (If hardware binning, add one more copy where the FS is disabled - * for the binning pass). This is always allowed because the vertex stage - * is allowed to run an arbitrary number of times and there are no extra - * ordering constraints within a draw. - * 2. Take the last copy of the second-to-last draw and slide it down to - * before the last copy of the last draw. Repeat for each earlier draw - * until the draw pass for the last bin is complete, then repeat for each - * earlier bin until we finish with the first bin. - * - * During this rearranging process, we can't slide draws past each other in - * a way that breaks the subpass dependencies. For each draw, we must slide - * it past (copies of) the rest of the draws in the renderpass. We can - * slide a draw past another if there isn't a dependency between them, or - * if the dependenc(ies) are dependencies between framebuffer-space stages - * only with the BY_REGION bit set. Note that this includes - * self-dependencies, since these may result in pipeline barriers that also - * break the rearranging process. - */ - - /* This is straight from the Vulkan 1.2 spec, section 6.1.4 "Framebuffer - * Region Dependencies": - */ - const VkPipelineStageFlags2 framebuffer_space_stages = - VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT | - VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT | - VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT | - VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT; - - return - (src_stage_mask & ~(framebuffer_space_stages | VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT)) || - (dst_stage_mask & ~(framebuffer_space_stages | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT)) || - !(dep->dependencyFlags & VK_DEPENDENCY_BY_REGION_BIT); -} - -static void -tu_render_pass_add_subpass_dep(struct tu_render_pass *pass, - const VkSubpassDependency2 *dep) -{ - uint32_t src = dep->srcSubpass; - uint32_t dst = dep->dstSubpass; - - /* Ignore subpass self-dependencies as they allow the app to call - * vkCmdPipelineBarrier() inside the render pass and the driver should only - * do the barrier when called, not when starting the render pass. - * - * We cannot decide whether to allow gmem rendering before a barrier - * is actually emitted, so we delay the decision until then. - */ - if (src == dst) - return; - - /* From the Vulkan 1.2.195 spec: - * - * "If an instance of VkMemoryBarrier2 is included in the pNext chain, srcStageMask, - * dstStageMask, srcAccessMask, and dstAccessMask parameters are ignored. The synchronization - * and access scopes instead are defined by the parameters of VkMemoryBarrier2." 
- */ - const VkMemoryBarrier2 *barrier = - vk_find_struct_const(dep->pNext, MEMORY_BARRIER_2); - VkPipelineStageFlags2 src_stage_mask = barrier ? barrier->srcStageMask : dep->srcStageMask; - VkAccessFlags2 src_access_mask = barrier ? barrier->srcAccessMask : dep->srcAccessMask; - VkPipelineStageFlags2 dst_stage_mask = barrier ? barrier->dstStageMask : dep->dstStageMask; - VkAccessFlags2 dst_access_mask = barrier ? barrier->dstAccessMask : dep->dstAccessMask; - - if (dep_invalid_for_gmem(dep, src_stage_mask, dst_stage_mask)) { - perf_debug((struct tu_device *)pass->base.device, "Disabling gmem rendering due to invalid subpass dependency"); - for (int i = 0; i < ARRAY_SIZE(pass->gmem_pixels); i++) - pass->gmem_pixels[i] = 0; - } - - struct tu_subpass_barrier *dst_barrier; - if (dst == VK_SUBPASS_EXTERNAL) { - dst_barrier = &pass->end_barrier; - } else { - dst_barrier = &pass->subpasses[dst].start_barrier; - } - - dst_barrier->src_stage_mask |= src_stage_mask; - dst_barrier->dst_stage_mask |= dst_stage_mask; - dst_barrier->src_access_mask |= src_access_mask; - dst_barrier->dst_access_mask |= dst_access_mask; -} - -/* We currently only care about undefined layouts, because we have to - * flush/invalidate CCU for those. PREINITIALIZED is the same thing as - * UNDEFINED for anything not linear tiled, but we don't know yet whether the - * images used are tiled, so just assume they are. - */ - -static bool -layout_undefined(VkImageLayout layout) -{ - return layout == VK_IMAGE_LAYOUT_UNDEFINED || - layout == VK_IMAGE_LAYOUT_PREINITIALIZED; -} - -/* This implements the following bit of spec text: - * - * If there is no subpass dependency from VK_SUBPASS_EXTERNAL to the - * first subpass that uses an attachment, then an implicit subpass - * dependency exists from VK_SUBPASS_EXTERNAL to the first subpass it is - * used in. The implicit subpass dependency only exists if there - * exists an automatic layout transition away from initialLayout. - * The subpass dependency operates as if defined with the - * following parameters: - * - * VkSubpassDependency implicitDependency = { - * .srcSubpass = VK_SUBPASS_EXTERNAL; - * .dstSubpass = firstSubpass; // First subpass attachment is used in - * .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - * .dstStageMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT; - * .srcAccessMask = 0; - * .dstAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | - * VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | - * VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | - * VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | - * VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; - * .dependencyFlags = 0; - * }; * - * Similarly, if there is no subpass dependency from the last subpass - * that uses an attachment to VK_SUBPASS_EXTERNAL, then an implicit - * subpass dependency exists from the last subpass it is used in to - * VK_SUBPASS_EXTERNAL. The implicit subpass dependency only exists - * if there exists an automatic layout transition into finalLayout. 
- * The subpass dependency operates as if defined with the following - * parameters: + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: * - * VkSubpassDependency implicitDependency = { - * .srcSubpass = lastSubpass; // Last subpass attachment is used in - * .dstSubpass = VK_SUBPASS_EXTERNAL; - * .srcStageMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT; - * .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; - * .srcAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | - * VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | - * VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | - * VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | - * VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; - * .dstAccessMask = 0; - * .dependencyFlags = 0; - * }; + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. * - * Note: currently this is the only use we have for layout transitions, - * besides needing to invalidate CCU at the beginning, so we also flag - * transitions from UNDEFINED here. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. */ -static void -tu_render_pass_add_implicit_deps(struct tu_render_pass *pass, - const VkRenderPassCreateInfo2 *info) -{ - const VkAttachmentDescription2* att = info->pAttachments; - bool has_external_src[info->subpassCount]; - bool has_external_dst[info->subpassCount]; - bool att_used[pass->attachment_count]; - - memset(has_external_src, 0, sizeof(has_external_src)); - memset(has_external_dst, 0, sizeof(has_external_dst)); - - for (uint32_t i = 0; i < info->dependencyCount; i++) { - uint32_t src = info->pDependencies[i].srcSubpass; - uint32_t dst = info->pDependencies[i].dstSubpass; - - if (src == dst) - continue; +#include "tu_private.h" - if (src == VK_SUBPASS_EXTERNAL) - has_external_src[dst] = true; - if (dst == VK_SUBPASS_EXTERNAL) - has_external_dst[src] = true; - } - - memset(att_used, 0, sizeof(att_used)); - - for (unsigned i = 0; i < info->subpassCount; i++) { - const VkSubpassDescription2 *subpass = &info->pSubpasses[i]; - bool src_implicit_dep = false; - - for (unsigned j = 0; j < subpass->inputAttachmentCount; j++) { - uint32_t a = subpass->pInputAttachments[j].attachment; - - if (a == VK_ATTACHMENT_UNUSED) - continue; - - uint32_t stencil_layout = vk_format_has_stencil(att[a].format) ? 
- vk_att_ref_stencil_layout(&subpass->pInputAttachments[j], att) : - VK_IMAGE_LAYOUT_UNDEFINED; - uint32_t stencil_initial_layout = vk_att_desc_stencil_layout(&att[a], false); - - if ((att[a].initialLayout != subpass->pInputAttachments[j].layout || - stencil_initial_layout != stencil_layout) && - !att_used[a] && !has_external_src[i]) - src_implicit_dep = true; - att_used[a] = true; - } - - for (unsigned j = 0; j < subpass->colorAttachmentCount; j++) { - uint32_t a = subpass->pColorAttachments[j].attachment; - if (a == VK_ATTACHMENT_UNUSED) - continue; - if (att[a].initialLayout != subpass->pColorAttachments[j].layout && - !att_used[a] && !has_external_src[i]) - src_implicit_dep = true; - att_used[a] = true; - } - - if (subpass->pDepthStencilAttachment && - subpass->pDepthStencilAttachment->attachment != VK_ATTACHMENT_UNUSED) { - uint32_t a = subpass->pDepthStencilAttachment->attachment; - uint32_t stencil_layout = vk_att_ref_stencil_layout(subpass->pDepthStencilAttachment, att); - uint32_t stencil_initial_layout = vk_att_desc_stencil_layout(&att[a], false); +#include "vk_util.h" - if ((att[a].initialLayout != subpass->pDepthStencilAttachment->layout || - stencil_initial_layout != stencil_layout) && - !att_used[a] && !has_external_src[i]) { - src_implicit_dep = true; - } - att_used[a] = true; - } +VkResult +tu_CreateRenderPass(VkDevice _device, + const VkRenderPassCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkRenderPass *pRenderPass) +{ + TU_FROM_HANDLE(tu_device, device, _device); + struct tu_render_pass *pass; + size_t size; + size_t attachments_offset; + VkRenderPassMultiviewCreateInfo *multiview_info = NULL; - if (subpass->pResolveAttachments) { - for (unsigned j = 0; j < subpass->colorAttachmentCount; j++) { - uint32_t a = subpass->pResolveAttachments[j].attachment; - if (a == VK_ATTACHMENT_UNUSED) - continue; - if (att[a].initialLayout != subpass->pResolveAttachments[j].layout && - !att_used[a] && !has_external_src[i]) - src_implicit_dep = true; - att_used[a] = true; - } - } + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO); - const VkSubpassDescriptionDepthStencilResolve *ds_resolve = - vk_find_struct_const(subpass->pNext, SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE); + size = sizeof(*pass); + size += pCreateInfo->subpassCount * sizeof(pass->subpasses[0]); + attachments_offset = size; + size += pCreateInfo->attachmentCount * sizeof(pass->attachments[0]); - if (ds_resolve && ds_resolve->pDepthStencilResolveAttachment && - ds_resolve->pDepthStencilResolveAttachment->attachment != VK_ATTACHMENT_UNUSED) { - uint32_t a = ds_resolve->pDepthStencilResolveAttachment->attachment; - uint32_t stencil_layout = vk_att_ref_stencil_layout(ds_resolve->pDepthStencilResolveAttachment, att); - uint32_t stencil_initial_layout = vk_att_desc_stencil_layout(&att[a], false); + pass = vk_alloc2(&device->alloc, pAllocator, size, 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (pass == NULL) + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - if ((att[a].initialLayout != subpass->pDepthStencilAttachment->layout || - stencil_initial_layout != stencil_layout) && - !att_used[a] && !has_external_src[i]) - src_implicit_dep = true; - att_used[a] = true; - } + memset(pass, 0, size); + pass->attachment_count = pCreateInfo->attachmentCount; + pass->subpass_count = pCreateInfo->subpassCount; + pass->attachments = (void *) pass + attachments_offset; - if (src_implicit_dep) { - tu_render_pass_add_subpass_dep(pass, &(VkSubpassDependency2) { - .srcSubpass = 
VK_SUBPASS_EXTERNAL, - .dstSubpass = i, - .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, - .dstStageMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, - .srcAccessMask = 0, - .dstAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | - VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | - VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, - .dependencyFlags = 0, - }); + vk_foreach_struct(ext, pCreateInfo->pNext) + { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_RENDER_PASS_MULTIVIEW_CREATE_INFO: + multiview_info = (VkRenderPassMultiviewCreateInfo *) ext; + break; + default: + break; } } - memset(att_used, 0, sizeof(att_used)); - - for (int i = info->subpassCount - 1; i >= 0; i--) { - const VkSubpassDescription2 *subpass = &info->pSubpasses[i]; - bool dst_implicit_dep = false; - - for (unsigned j = 0; j < subpass->inputAttachmentCount; j++) { - uint32_t a = subpass->pInputAttachments[j].attachment; - if (a == VK_ATTACHMENT_UNUSED) - continue; - - uint32_t stencil_layout = vk_format_has_stencil(att[a].format) ? - vk_att_ref_stencil_layout(&subpass->pInputAttachments[j], att) : - VK_IMAGE_LAYOUT_UNDEFINED; - uint32_t stencil_final_layout = vk_att_desc_stencil_layout(&att[a], true); - - if ((att[a].finalLayout != subpass->pInputAttachments[j].layout || - stencil_final_layout != stencil_layout) && - !att_used[a] && !has_external_dst[i]) - dst_implicit_dep = true; - att_used[a] = true; - } - - for (unsigned j = 0; j < subpass->colorAttachmentCount; j++) { - uint32_t a = subpass->pColorAttachments[j].attachment; - if (a == VK_ATTACHMENT_UNUSED) - continue; - if (att[a].finalLayout != subpass->pColorAttachments[j].layout && - !att_used[a] && !has_external_dst[i]) - dst_implicit_dep = true; - att_used[a] = true; - } - - if (subpass->pDepthStencilAttachment && - subpass->pDepthStencilAttachment->attachment != VK_ATTACHMENT_UNUSED) { - uint32_t a = subpass->pDepthStencilAttachment->attachment; - uint32_t stencil_layout = vk_att_ref_stencil_layout(subpass->pDepthStencilAttachment, att); - uint32_t stencil_final_layout = vk_att_desc_stencil_layout(&att[a], true); - - if ((att[a].finalLayout != subpass->pDepthStencilAttachment->layout || - stencil_final_layout != stencil_layout) && - !att_used[a] && !has_external_dst[i]) { - dst_implicit_dep = true; - } - att_used[a] = true; - } - - if (subpass->pResolveAttachments) { - for (unsigned j = 0; j < subpass->colorAttachmentCount; j++) { - uint32_t a = subpass->pResolveAttachments[j].attachment; - if (a == VK_ATTACHMENT_UNUSED) - continue; - if (att[a].finalLayout != subpass->pResolveAttachments[j].layout && - !att_used[a] && !has_external_dst[i]) - dst_implicit_dep = true; - att_used[a] = true; - } - } - - const VkSubpassDescriptionDepthStencilResolve *ds_resolve = - vk_find_struct_const(subpass->pNext, SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE); - - if (ds_resolve && ds_resolve->pDepthStencilResolveAttachment && - ds_resolve->pDepthStencilResolveAttachment->attachment != VK_ATTACHMENT_UNUSED) { - uint32_t a = ds_resolve->pDepthStencilResolveAttachment->attachment; - uint32_t stencil_layout = vk_att_ref_stencil_layout(ds_resolve->pDepthStencilResolveAttachment, att); - uint32_t stencil_final_layout = vk_att_desc_stencil_layout(&att[a], true); - - if ((att[a].finalLayout != subpass->pDepthStencilAttachment->layout || - stencil_final_layout != stencil_layout) && - !att_used[a] && !has_external_src[i]) - dst_implicit_dep = true; - att_used[a] = true; - } + for (uint32_t i = 0; i < 
pCreateInfo->attachmentCount; i++) { + struct tu_render_pass_attachment *att = &pass->attachments[i]; - if (dst_implicit_dep) { - tu_render_pass_add_subpass_dep(pass, &(VkSubpassDependency2) { - .srcSubpass = i, - .dstSubpass = VK_SUBPASS_EXTERNAL, - .srcStageMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, - .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, - .srcAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | - VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | - VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, - .dstAccessMask = 0, - .dependencyFlags = 0, - }); - } + att->format = pCreateInfo->pAttachments[i].format; + att->samples = pCreateInfo->pAttachments[i].samples; + att->load_op = pCreateInfo->pAttachments[i].loadOp; + att->stencil_load_op = pCreateInfo->pAttachments[i].stencilLoadOp; + att->initial_layout = pCreateInfo->pAttachments[i].initialLayout; + att->final_layout = pCreateInfo->pAttachments[i].finalLayout; + // att->store_op = pCreateInfo->pAttachments[i].storeOp; + // att->stencil_store_op = pCreateInfo->pAttachments[i].stencilStoreOp; } + uint32_t subpass_attachment_count = 0; + struct tu_subpass_attachment *p; + for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) { + const VkSubpassDescription *desc = &pCreateInfo->pSubpasses[i]; - /* Handle UNDEFINED transitions, similar to the handling in tu_barrier(). - * Assume that if an attachment has an initial layout of UNDEFINED, it gets - * transitioned eventually. - */ - for (unsigned i = 0; i < info->attachmentCount; i++) { - if (layout_undefined(att[i].initialLayout)) { - if (vk_format_is_depth_or_stencil(att[i].format)) { - pass->subpasses[0].start_barrier.incoherent_ccu_depth = true; - } else { - pass->subpasses[0].start_barrier.incoherent_ccu_color = true; - } - } + subpass_attachment_count += + desc->inputAttachmentCount + desc->colorAttachmentCount + + (desc->pResolveAttachments ? desc->colorAttachmentCount : 0) + + (desc->pDepthStencilAttachment != NULL); } -} -/* If an input attachment is used without an intervening write to the same - * attachment, then we can just use the original image, even in GMEM mode. - * This is an optimization, but it's also important because it allows us to - * avoid having to invalidate UCHE at the beginning of each tile due to it - * becoming invalid. The only reads of GMEM via UCHE should be after an - * earlier subpass modified it, which only works if there's already an - * appropriate dependency that will add the CACHE_INVALIDATE anyway. We - * don't consider this in the dependency code, so this is also required for - * correctness. 
- */ -static void -tu_render_pass_patch_input_gmem(struct tu_render_pass *pass) -{ - bool written[pass->attachment_count]; - - memset(written, 0, sizeof(written)); - - for (unsigned i = 0; i < pass->subpass_count; i++) { - struct tu_subpass *subpass = &pass->subpasses[i]; - - for (unsigned j = 0; j < subpass->input_count; j++) { - uint32_t a = subpass->input_attachments[j].attachment; - if (a == VK_ATTACHMENT_UNUSED) - continue; - subpass->input_attachments[j].patch_input_gmem = written[a]; + if (subpass_attachment_count) { + pass->subpass_attachments = vk_alloc2( + &device->alloc, pAllocator, + subpass_attachment_count * sizeof(struct tu_subpass_attachment), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (pass->subpass_attachments == NULL) { + vk_free2(&device->alloc, pAllocator, pass); + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); } + } else + pass->subpass_attachments = NULL; - for (unsigned j = 0; j < subpass->color_count; j++) { - uint32_t a = subpass->color_attachments[j].attachment; - if (a == VK_ATTACHMENT_UNUSED) - continue; - written[a] = true; + p = pass->subpass_attachments; + for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) { + const VkSubpassDescription *desc = &pCreateInfo->pSubpasses[i]; + uint32_t color_sample_count = 1, depth_sample_count = 1; + struct tu_subpass *subpass = &pass->subpasses[i]; - for (unsigned k = 0; k < subpass->input_count; k++) { - if (subpass->input_attachments[k].attachment == a && - !subpass->input_attachments[k].patch_input_gmem) { - /* For render feedback loops, we have no idea whether the use - * as a color attachment or input attachment will come first, - * so we have to always use GMEM in case the color attachment - * comes first and defensively invalidate UCHE in case the - * input attachment comes first. 
- */ - subpass->feedback_invalidate = true; - subpass->input_attachments[k].patch_input_gmem = true; - } - } - } + subpass->input_count = desc->inputAttachmentCount; + subpass->color_count = desc->colorAttachmentCount; + if (multiview_info) + subpass->view_mask = multiview_info->pViewMasks[i]; - for (unsigned j = 0; j < subpass->resolve_count; j++) { - uint32_t a = subpass->resolve_attachments[j].attachment; - if (a == VK_ATTACHMENT_UNUSED) - continue; - written[a] = true; - } + if (desc->inputAttachmentCount > 0) { + subpass->input_attachments = p; + p += desc->inputAttachmentCount; - if (subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) { - written[subpass->depth_stencil_attachment.attachment] = true; - for (unsigned k = 0; k < subpass->input_count; k++) { - if (subpass->input_attachments[k].attachment == - subpass->depth_stencil_attachment.attachment && - !subpass->input_attachments[k].patch_input_gmem) { - subpass->feedback_invalidate = true; - subpass->input_attachments[k].patch_input_gmem = true; - } + for (uint32_t j = 0; j < desc->inputAttachmentCount; j++) { + subpass->input_attachments[j] = (struct tu_subpass_attachment) { + .attachment = desc->pInputAttachments[j].attachment, + .layout = desc->pInputAttachments[j].layout, + }; + if (desc->pInputAttachments[j].attachment != VK_ATTACHMENT_UNUSED) + pass->attachments[desc->pInputAttachments[j].attachment] + .view_mask |= subpass->view_mask; } } - } -} - -static void -tu_render_pass_check_feedback_loop(struct tu_render_pass *pass) -{ - for (unsigned i = 0; i < pass->subpass_count; i++) { - struct tu_subpass *subpass = &pass->subpasses[i]; - for (unsigned j = 0; j < subpass->color_count; j++) { - uint32_t a = subpass->color_attachments[j].attachment; - if (a == VK_ATTACHMENT_UNUSED) - continue; - for (unsigned k = 0; k < subpass->input_count; k++) { - if (subpass->input_attachments[k].attachment == a) { - subpass->feedback_loop_color = true; - break; - } - } - } + if (desc->colorAttachmentCount > 0) { + subpass->color_attachments = p; + p += desc->colorAttachmentCount; - if (subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) { - for (unsigned k = 0; k < subpass->input_count; k++) { - if (subpass->input_attachments[k].attachment == - subpass->depth_stencil_attachment.attachment) { - subpass->feedback_loop_ds = true; - break; + for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) { + subpass->color_attachments[j] = (struct tu_subpass_attachment) { + .attachment = desc->pColorAttachments[j].attachment, + .layout = desc->pColorAttachments[j].layout, + }; + if (desc->pColorAttachments[j].attachment != + VK_ATTACHMENT_UNUSED) { + pass->attachments[desc->pColorAttachments[j].attachment] + .view_mask |= subpass->view_mask; + color_sample_count = + pCreateInfo + ->pAttachments[desc->pColorAttachments[j].attachment] + .samples; } } } - } -} - -static void update_samples(struct tu_subpass *subpass, - VkSampleCountFlagBits samples) -{ - assert(subpass->samples == 0 || subpass->samples == samples); - subpass->samples = samples; -} -static void -tu_render_pass_calc_hash(struct tu_render_pass *pass) -{ - #define HASH(hash, data) XXH64(&(data), sizeof(data), hash) - - uint64_t hash = HASH(0, pass->attachment_count); - hash = XXH64(pass->attachments, - pass->attachment_count * sizeof(pass->attachments[0]), hash); - hash = HASH(hash, pass->subpass_count); - for (unsigned i = 0; i < pass->subpass_count; i++) { - hash = HASH(hash, pass->subpasses[i].samples); - hash = HASH(hash, 
pass->subpasses[i].input_count); - hash = HASH(hash, pass->subpasses[i].color_count); - hash = HASH(hash, pass->subpasses[i].resolve_count); - } - - pass->autotune_hash = hash; - - #undef HASH -} - -static void -tu_render_pass_cond_config(struct tu_render_pass *pass) -{ - for (uint32_t i = 0; i < pass->attachment_count; i++) { - struct tu_render_pass_attachment *att = &pass->attachments[i]; - - att->cond_load_allowed = - (att->load || att->load_stencil) && !att->clear_mask && !att->will_be_resolved; - att->cond_store_allowed = - (att->store || att->store_stencil) && !att->clear_mask; - } -} - -static void -tu_render_pass_gmem_config(struct tu_render_pass *pass, - const struct tu_physical_device *phys_dev) -{ - for (enum tu_gmem_layout layout = 0; layout < TU_GMEM_LAYOUT_COUNT; - layout++) { - /* From the VK_KHR_multiview spec: - * - * Multiview is all-or-nothing for a render pass - that is, either all - * subpasses must have a non-zero view mask (though some subpasses may - * have only one view) or all must be zero. - * - * This means we only have to check one of the view masks. - */ - if (pass->subpasses[0].multiview_mask) { - /* It seems multiview must use sysmem rendering. */ - pass->gmem_pixels[layout] = 0; - continue; - } - - /* log2(gmem_align/(tile_align_w*tile_align_h)) */ - uint32_t block_align_shift = 3; - uint32_t tile_align_w = phys_dev->info->tile_align_w; - uint32_t gmem_align = (1 << block_align_shift) * tile_align_w * - phys_dev->info->tile_align_h; - - /* calculate total bytes per pixel */ - uint32_t cpp_total = 0; - for (uint32_t i = 0; i < pass->attachment_count; i++) { - struct tu_render_pass_attachment *att = &pass->attachments[i]; - bool cpp1 = (att->cpp == 1); - if (att->gmem) { - cpp_total += att->cpp; - - /* take into account the separate stencil: */ - if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { - cpp1 = (att->samples == 1); - cpp_total += att->samples; - } + subpass->has_resolve = false; + if (desc->pResolveAttachments) { + subpass->resolve_attachments = p; + p += desc->colorAttachmentCount; - /* texture pitch must be aligned to 64, use a tile_align_w that is - * a multiple of 64 for cpp==1 attachment to work as input - * attachment - */ - if (cpp1 && tile_align_w % 64 != 0) { - tile_align_w *= 2; - block_align_shift -= 1; + for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) { + uint32_t a = desc->pResolveAttachments[j].attachment; + subpass->resolve_attachments[j] = (struct tu_subpass_attachment) { + .attachment = desc->pResolveAttachments[j].attachment, + .layout = desc->pResolveAttachments[j].layout, + }; + if (a != VK_ATTACHMENT_UNUSED) { + subpass->has_resolve = true; + pass->attachments[desc->pResolveAttachments[j].attachment] + .view_mask |= subpass->view_mask; } } } - pass->tile_align_w = tile_align_w; - - /* no gmem attachments */ - if (cpp_total == 0) { - /* any value non-zero value so tiling config works with no - * attachments - */ - pass->gmem_pixels[layout] = 1024 * 1024; - continue; - } - - /* TODO: this algorithm isn't optimal - * for example, two attachments with cpp = {1, 4} - * result: nblocks = {12, 52}, pixels = 196608 - * optimal: nblocks = {13, 51}, pixels = 208896 - */ - uint32_t gmem_size = layout == TU_GMEM_LAYOUT_FULL - ? 
phys_dev->gmem_size - : phys_dev->ccu_offset_gmem; - uint32_t gmem_blocks = gmem_size / gmem_align; - uint32_t offset = 0, pixels = ~0u, i; - for (i = 0; i < pass->attachment_count; i++) { - struct tu_render_pass_attachment *att = &pass->attachments[i]; - if (!att->gmem) - continue; - - att->gmem_offset[layout] = offset; - - uint32_t align = MAX2(1, att->cpp >> block_align_shift); - uint32_t nblocks = - MAX2((gmem_blocks * att->cpp / cpp_total) & ~(align - 1), align); - - if (nblocks > gmem_blocks) - break; - - gmem_blocks -= nblocks; - cpp_total -= att->cpp; - offset += nblocks * gmem_align; - pixels = MIN2(pixels, nblocks * gmem_align / att->cpp); - - /* repeat the same for separate stencil */ - if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { - att->gmem_offset_stencil[layout] = offset; - - /* note: for s8_uint, block align is always 1 */ - uint32_t nblocks = gmem_blocks * att->samples / cpp_total; - if (nblocks > gmem_blocks) - break; - - gmem_blocks -= nblocks; - cpp_total -= att->samples; - offset += nblocks * gmem_align; - pixels = MIN2(pixels, nblocks * gmem_align / att->samples); + if (desc->pDepthStencilAttachment) { + subpass->depth_stencil_attachment = (struct tu_subpass_attachment) { + .attachment = desc->pDepthStencilAttachment->attachment, + .layout = desc->pDepthStencilAttachment->layout, + }; + if (desc->pDepthStencilAttachment->attachment != + VK_ATTACHMENT_UNUSED) { + pass->attachments[desc->pDepthStencilAttachment->attachment] + .view_mask |= subpass->view_mask; + depth_sample_count = + pCreateInfo + ->pAttachments[desc->pDepthStencilAttachment->attachment] + .samples; } + } else { + subpass->depth_stencil_attachment.attachment = VK_ATTACHMENT_UNUSED; } - /* if the loop didn't complete then the gmem config is impossible */ - if (i == pass->attachment_count) - pass->gmem_pixels[layout] = pixels; + subpass->max_sample_count = + MAX2(color_sample_count, depth_sample_count); } -} -static void -tu_render_pass_bandwidth_config(struct tu_render_pass *pass) -{ - pass->gmem_bandwidth_per_pixel = 0; - pass->sysmem_bandwidth_per_pixel = 0; - - for (uint32_t i = 0; i < pass->attachment_count; i++) { - const struct tu_render_pass_attachment *att = &pass->attachments[i]; - - /* approximate tu_load_gmem_attachment */ - if (att->load) - pass->gmem_bandwidth_per_pixel += att->cpp; - - /* approximate tu_store_gmem_attachment */ - if (att->store) - pass->gmem_bandwidth_per_pixel += att->cpp; - - /* approximate tu_clear_sysmem_attachment */ - if (att->clear_mask) - pass->sysmem_bandwidth_per_pixel += att->cpp; - - /* approximate tu6_emit_sysmem_resolves */ - if (att->will_be_resolved) { - pass->sysmem_bandwidth_per_pixel += - att->cpp + att->cpp / att->samples; + for (unsigned i = 0; i < pCreateInfo->dependencyCount; ++i) { + uint32_t dst = pCreateInfo->pDependencies[i].dstSubpass; + if (dst == VK_SUBPASS_EXTERNAL) { + pass->end_barrier.src_stage_mask = + pCreateInfo->pDependencies[i].srcStageMask; + pass->end_barrier.src_access_mask = + pCreateInfo->pDependencies[i].srcAccessMask; + pass->end_barrier.dst_access_mask = + pCreateInfo->pDependencies[i].dstAccessMask; + } else { + pass->subpasses[dst].start_barrier.src_stage_mask = + pCreateInfo->pDependencies[i].srcStageMask; + pass->subpasses[dst].start_barrier.src_access_mask = + pCreateInfo->pDependencies[i].srcAccessMask; + pass->subpasses[dst].start_barrier.dst_access_mask = + pCreateInfo->pDependencies[i].dstAccessMask; } } -} - -static void -attachment_set_ops(struct tu_device *device, - struct tu_render_pass_attachment *att, - 
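/*
 * Standalone sketch (illustration only, not from the Mesa sources): the
 * proportional GMEM split performed by tu_render_pass_gmem_config().
 * Each attachment gets blocks in proportion to its bytes per pixel, and
 * the tile pixel budget is the minimum of nblocks * gmem_align / cpp
 * over the attachments. The figures below (64 blocks of 16384 bytes,
 * cpp = {1, 4}) are assumptions chosen to reproduce the worked example
 * in the TODO comment above; the per-attachment alignment masking is
 * omitted because it is a no-op for these cpp values.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   uint32_t gmem_align = 16384;   /* bytes per allocation block (assumed) */
   uint32_t gmem_blocks = 64;     /* total blocks available (assumed) */
   uint32_t cpp[2] = { 1, 4 };    /* bytes per pixel of each attachment */
   uint32_t cpp_total = 1 + 4;
   uint32_t pixels = ~0u;

   for (int i = 0; i < 2; i++) {
      /* share of the remaining blocks proportional to this cpp */
      uint32_t nblocks = gmem_blocks * cpp[i] / cpp_total;
      gmem_blocks -= nblocks;
      cpp_total -= cpp[i];

      uint32_t att_pixels = nblocks * gmem_align / cpp[i];
      if (att_pixels < pixels)
         pixels = att_pixels;
      printf("attachment %d: %u blocks, %u pixels\n",
             i, (unsigned)nblocks, (unsigned)att_pixels);
   }
   /* prints 12 and 52 blocks and a 196608-pixel budget, matching the TODO */
   printf("tile budget: %u pixels\n", (unsigned)pixels);
   return 0;
}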
VkAttachmentLoadOp load_op, - VkAttachmentLoadOp stencil_load_op, - VkAttachmentStoreOp store_op, - VkAttachmentStoreOp stencil_store_op) -{ - if (device->instance->debug_flags & TU_DEBUG_DONT_CARE_AS_LOAD) { - if (load_op == VK_ATTACHMENT_LOAD_OP_DONT_CARE) - load_op = VK_ATTACHMENT_LOAD_OP_LOAD; - if (stencil_load_op == VK_ATTACHMENT_LOAD_OP_DONT_CARE) - stencil_load_op = VK_ATTACHMENT_LOAD_OP_LOAD; - } - - /* load/store ops */ - att->clear_mask = - (load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) ? VK_IMAGE_ASPECT_COLOR_BIT : 0; - att->load = (load_op == VK_ATTACHMENT_LOAD_OP_LOAD); - att->store = (store_op == VK_ATTACHMENT_STORE_OP_STORE); - - bool stencil_clear = (stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR); - bool stencil_load = (stencil_load_op == VK_ATTACHMENT_LOAD_OP_LOAD); - bool stencil_store = (stencil_store_op == VK_ATTACHMENT_STORE_OP_STORE); - switch (att->format) { - case VK_FORMAT_D24_UNORM_S8_UINT: /* || stencil load/store */ - if (att->clear_mask) - att->clear_mask = VK_IMAGE_ASPECT_DEPTH_BIT; - if (stencil_clear) - att->clear_mask |= VK_IMAGE_ASPECT_STENCIL_BIT; - if (stencil_load) - att->load = true; - if (stencil_store) - att->store = true; - break; - case VK_FORMAT_S8_UINT: /* replace load/store with stencil load/store */ - att->clear_mask = stencil_clear ? VK_IMAGE_ASPECT_COLOR_BIT : 0; - att->load = stencil_load; - att->store = stencil_store; - break; - case VK_FORMAT_D32_SFLOAT_S8_UINT: /* separate stencil */ - if (att->clear_mask) - att->clear_mask = VK_IMAGE_ASPECT_DEPTH_BIT; - if (stencil_clear) - att->clear_mask |= VK_IMAGE_ASPECT_STENCIL_BIT; - if (stencil_load) - att->load_stencil = true; - if (stencil_store) - att->store_stencil = true; - break; - default: - break; - } -} - -static bool -is_depth_stencil_resolve_enabled(const VkSubpassDescriptionDepthStencilResolve *depth_stencil_resolve) -{ - if (depth_stencil_resolve && - depth_stencil_resolve->pDepthStencilResolveAttachment && - depth_stencil_resolve->pDepthStencilResolveAttachment->attachment != VK_ATTACHMENT_UNUSED) { - return true; - } - return false; -} - -static void -tu_subpass_use_attachment(struct tu_render_pass *pass, int i, uint32_t a, const VkRenderPassCreateInfo2 *pCreateInfo) -{ - struct tu_subpass *subpass = &pass->subpasses[i]; + *pRenderPass = tu_render_pass_to_handle(pass); - pass->attachments[a].gmem = true; - update_samples(subpass, pCreateInfo->pAttachments[a].samples); - pass->attachments[a].clear_views |= subpass->multiview_mask; + return VK_SUCCESS; } -VKAPI_ATTR VkResult VKAPI_CALL -tu_CreateRenderPass2(VkDevice _device, - const VkRenderPassCreateInfo2 *pCreateInfo, - const VkAllocationCallbacks *pAllocator, - VkRenderPass *pRenderPass) +VkResult +tu_CreateRenderPass2KHR(VkDevice _device, + const VkRenderPassCreateInfo2KHR *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkRenderPass *pRenderPass) { TU_FROM_HANDLE(tu_device, device, _device); - - if (unlikely(device->instance->debug_flags & TU_DEBUG_DYNAMIC)) - return vk_common_CreateRenderPass2(_device, pCreateInfo, pAllocator, - pRenderPass); - struct tu_render_pass *pass; size_t size; size_t attachments_offset; - assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2); + assert(pCreateInfo->sType == + VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2_KHR); size = sizeof(*pass); size += pCreateInfo->subpassCount * sizeof(pass->subpasses[0]); attachments_offset = size; size += pCreateInfo->attachmentCount * sizeof(pass->attachments[0]); - pass = vk_object_zalloc(&device->vk, pAllocator, size, - 
VK_OBJECT_TYPE_RENDER_PASS); + pass = vk_alloc2(&device->alloc, pAllocator, size, 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (pass == NULL) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + memset(pass, 0, size); pass->attachment_count = pCreateInfo->attachmentCount; pass->subpass_count = pCreateInfo->subpassCount; pass->attachments = (void *) pass + attachments_offset; @@ -800,82 +249,58 @@ tu_CreateRenderPass2(VkDevice _device, att->format = pCreateInfo->pAttachments[i].format; att->samples = pCreateInfo->pAttachments[i].samples; - /* for d32s8, cpp is for the depth image, and - * att->samples will be used as the cpp for the stencil image - */ - if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) - att->cpp = 4 * att->samples; - else - att->cpp = vk_format_get_blocksize(att->format) * att->samples; - /* Initially not allocated into gmem, tu_subpass_use_attachment() will move it there. */ - att->gmem = false; - - VkAttachmentLoadOp loadOp = pCreateInfo->pAttachments[i].loadOp; - VkAttachmentLoadOp stencilLoadOp = pCreateInfo->pAttachments[i].stencilLoadOp; - - attachment_set_ops(device, att, loadOp, stencilLoadOp, - pCreateInfo->pAttachments[i].storeOp, - pCreateInfo->pAttachments[i].stencilStoreOp); + att->load_op = pCreateInfo->pAttachments[i].loadOp; + att->stencil_load_op = pCreateInfo->pAttachments[i].stencilLoadOp; + att->initial_layout = pCreateInfo->pAttachments[i].initialLayout; + att->final_layout = pCreateInfo->pAttachments[i].finalLayout; + // att->store_op = pCreateInfo->pAttachments[i].storeOp; + // att->stencil_store_op = pCreateInfo->pAttachments[i].stencilStoreOp; } uint32_t subpass_attachment_count = 0; struct tu_subpass_attachment *p; for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) { - const VkSubpassDescription2 *desc = &pCreateInfo->pSubpasses[i]; - const VkSubpassDescriptionDepthStencilResolve *ds_resolve = - vk_find_struct_const(desc->pNext, SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE); + const VkSubpassDescription2KHR *desc = &pCreateInfo->pSubpasses[i]; subpass_attachment_count += desc->inputAttachmentCount + desc->colorAttachmentCount + (desc->pResolveAttachments ? desc->colorAttachmentCount : 0) + - (is_depth_stencil_resolve_enabled(ds_resolve) ? 
1 : 0); + (desc->pDepthStencilAttachment != NULL); } if (subpass_attachment_count) { pass->subpass_attachments = vk_alloc2( - &device->vk.alloc, pAllocator, + &device->alloc, pAllocator, subpass_attachment_count * sizeof(struct tu_subpass_attachment), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (pass->subpass_attachments == NULL) { - vk_object_free(&device->vk, pAllocator, pass); - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + vk_free2(&device->alloc, pAllocator, pass); + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); } } else pass->subpass_attachments = NULL; p = pass->subpass_attachments; for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) { - const VkSubpassDescription2 *desc = &pCreateInfo->pSubpasses[i]; - const VkSubpassDescriptionDepthStencilResolve *ds_resolve = - vk_find_struct_const(desc->pNext, SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE); + const VkSubpassDescription2KHR *desc = &pCreateInfo->pSubpasses[i]; + uint32_t color_sample_count = 1, depth_sample_count = 1; struct tu_subpass *subpass = &pass->subpasses[i]; subpass->input_count = desc->inputAttachmentCount; subpass->color_count = desc->colorAttachmentCount; - subpass->resolve_count = 0; - subpass->resolve_depth_stencil = is_depth_stencil_resolve_enabled(ds_resolve); - subpass->samples = 0; - subpass->srgb_cntl = 0; - - const VkSubpassDescriptionFlagBits raster_order_access_bits = - VK_SUBPASS_DESCRIPTION_RASTERIZATION_ORDER_ATTACHMENT_COLOR_ACCESS_BIT_EXT | - VK_SUBPASS_DESCRIPTION_RASTERIZATION_ORDER_ATTACHMENT_DEPTH_ACCESS_BIT_EXT | - VK_SUBPASS_DESCRIPTION_RASTERIZATION_ORDER_ATTACHMENT_STENCIL_ACCESS_BIT_EXT; - - subpass->raster_order_attachment_access = desc->flags & raster_order_access_bits; - - subpass->multiview_mask = desc->viewMask; + subpass->view_mask = desc->viewMask; if (desc->inputAttachmentCount > 0) { subpass->input_attachments = p; p += desc->inputAttachmentCount; for (uint32_t j = 0; j < desc->inputAttachmentCount; j++) { - uint32_t a = desc->pInputAttachments[j].attachment; - subpass->input_attachments[j].attachment = a; - /* Note: attachments only used as input attachments will be read - * directly instead of through gmem, so we don't mark input - * attachments as needing gmem. 
- */ + subpass->input_attachments[j] = (struct tu_subpass_attachment) { + .attachment = desc->pInputAttachments[j].attachment, + .layout = desc->pInputAttachments[j].layout, + }; + if (desc->pInputAttachments[j].attachment != VK_ATTACHMENT_UNUSED) + pass->attachments[desc->pInputAttachments[j].attachment] + .view_mask |= subpass->view_mask; } } @@ -884,313 +309,108 @@ tu_CreateRenderPass2(VkDevice _device, p += desc->colorAttachmentCount; for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) { - uint32_t a = desc->pColorAttachments[j].attachment; - subpass->color_attachments[j].attachment = a; - - if (a != VK_ATTACHMENT_UNUSED) { - tu_subpass_use_attachment(pass, i, a, pCreateInfo); - - if (vk_format_is_srgb(pass->attachments[a].format)) - subpass->srgb_cntl |= 1 << j; + subpass->color_attachments[j] = (struct tu_subpass_attachment) { + .attachment = desc->pColorAttachments[j].attachment, + .layout = desc->pColorAttachments[j].layout, + }; + if (desc->pColorAttachments[j].attachment != + VK_ATTACHMENT_UNUSED) { + pass->attachments[desc->pColorAttachments[j].attachment] + .view_mask |= subpass->view_mask; + color_sample_count = + pCreateInfo + ->pAttachments[desc->pColorAttachments[j].attachment] + .samples; } } } - subpass->resolve_attachments = (desc->pResolveAttachments || subpass->resolve_depth_stencil) ? p : NULL; + subpass->has_resolve = false; if (desc->pResolveAttachments) { + subpass->resolve_attachments = p; p += desc->colorAttachmentCount; - subpass->resolve_count += desc->colorAttachmentCount; - for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) { - subpass->resolve_attachments[j].attachment = - desc->pResolveAttachments[j].attachment; - uint32_t src_a = desc->pColorAttachments[j].attachment; - if (src_a != VK_ATTACHMENT_UNUSED) { - pass->attachments[src_a].will_be_resolved = - desc->pResolveAttachments[j].attachment != VK_ATTACHMENT_UNUSED; + for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) { + uint32_t a = desc->pResolveAttachments[j].attachment; + subpass->resolve_attachments[j] = (struct tu_subpass_attachment) { + .attachment = desc->pResolveAttachments[j].attachment, + .layout = desc->pResolveAttachments[j].layout, + }; + if (a != VK_ATTACHMENT_UNUSED) { + subpass->has_resolve = true; + pass->attachments[desc->pResolveAttachments[j].attachment] + .view_mask |= subpass->view_mask; } } } - if (subpass->resolve_depth_stencil) { - p++; - subpass->resolve_count++; - uint32_t a = ds_resolve->pDepthStencilResolveAttachment->attachment; - subpass->resolve_attachments[subpass->resolve_count - 1].attachment = a; - - uint32_t src_a = desc->pDepthStencilAttachment->attachment; - if (src_a != VK_ATTACHMENT_UNUSED) { - pass->attachments[src_a].will_be_resolved = a != VK_ATTACHMENT_UNUSED; + if (desc->pDepthStencilAttachment) { + subpass->depth_stencil_attachment = (struct tu_subpass_attachment) { + .attachment = desc->pDepthStencilAttachment->attachment, + .layout = desc->pDepthStencilAttachment->layout, + }; + if (desc->pDepthStencilAttachment->attachment != + VK_ATTACHMENT_UNUSED) { + pass->attachments[desc->pDepthStencilAttachment->attachment] + .view_mask |= subpass->view_mask; + depth_sample_count = + pCreateInfo + ->pAttachments[desc->pDepthStencilAttachment->attachment] + .samples; } + } else { + subpass->depth_stencil_attachment.attachment = VK_ATTACHMENT_UNUSED; } - uint32_t a = desc->pDepthStencilAttachment ? 
- desc->pDepthStencilAttachment->attachment : VK_ATTACHMENT_UNUSED; - subpass->depth_stencil_attachment.attachment = a; - if (a != VK_ATTACHMENT_UNUSED) - tu_subpass_use_attachment(pass, i, a, pCreateInfo); - } - - tu_render_pass_patch_input_gmem(pass); - - tu_render_pass_check_feedback_loop(pass); - - /* disable unused attachments */ - for (uint32_t i = 0; i < pass->attachment_count; i++) { - struct tu_render_pass_attachment *att = &pass->attachments[i]; - if (!att->gmem) { - att->clear_mask = 0; - att->load = false; - } + subpass->max_sample_count = + MAX2(color_sample_count, depth_sample_count); } - tu_render_pass_cond_config(pass); - tu_render_pass_gmem_config(pass, device->physical_device); - tu_render_pass_bandwidth_config(pass); - tu_render_pass_calc_hash(pass); - for (unsigned i = 0; i < pCreateInfo->dependencyCount; ++i) { - tu_render_pass_add_subpass_dep(pass, &pCreateInfo->pDependencies[i]); + uint32_t dst = pCreateInfo->pDependencies[i].dstSubpass; + if (dst == VK_SUBPASS_EXTERNAL) { + pass->end_barrier.src_stage_mask = + pCreateInfo->pDependencies[i].srcStageMask; + pass->end_barrier.src_access_mask = + pCreateInfo->pDependencies[i].srcAccessMask; + pass->end_barrier.dst_access_mask = + pCreateInfo->pDependencies[i].dstAccessMask; + } else { + pass->subpasses[dst].start_barrier.src_stage_mask = + pCreateInfo->pDependencies[i].srcStageMask; + pass->subpasses[dst].start_barrier.src_access_mask = + pCreateInfo->pDependencies[i].srcAccessMask; + pass->subpasses[dst].start_barrier.dst_access_mask = + pCreateInfo->pDependencies[i].dstAccessMask; + } } - tu_render_pass_add_implicit_deps(pass, pCreateInfo); - *pRenderPass = tu_render_pass_to_handle(pass); return VK_SUCCESS; } -VKAPI_ATTR void VKAPI_CALL +void tu_DestroyRenderPass(VkDevice _device, VkRenderPass _pass, const VkAllocationCallbacks *pAllocator) { TU_FROM_HANDLE(tu_device, device, _device); - - if (unlikely(device->instance->debug_flags & TU_DEBUG_DYNAMIC)) { - vk_common_DestroyRenderPass(_device, _pass, pAllocator); - return; - } - TU_FROM_HANDLE(tu_render_pass, pass, _pass); if (!_pass) return; - - vk_free2(&device->vk.alloc, pAllocator, pass->subpass_attachments); - vk_object_free(&device->vk, pAllocator, pass); -} - -static void -tu_setup_dynamic_attachment(struct tu_render_pass_attachment *att, - struct tu_image_view *view) -{ - att->format = view->vk.format; - att->samples = view->image->layout->nr_samples; - - /* for d32s8, cpp is for the depth image, and - * att->samples will be used as the cpp for the stencil image - */ - if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) - att->cpp = 4 * att->samples; - else - att->cpp = vk_format_get_blocksize(att->format) * att->samples; -} - -void -tu_setup_dynamic_render_pass(struct tu_cmd_buffer *cmd_buffer, - const VkRenderingInfo *info) -{ - struct tu_device *device = cmd_buffer->device; - struct tu_render_pass *pass = &cmd_buffer->dynamic_pass; - struct tu_subpass *subpass = &cmd_buffer->dynamic_subpass; - - pass->subpass_count = 1; - pass->attachments = cmd_buffer->dynamic_rp_attachments; - - subpass->color_count = subpass->resolve_count = info->colorAttachmentCount; - subpass->resolve_depth_stencil = false; - subpass->color_attachments = cmd_buffer->dynamic_color_attachments; - subpass->resolve_attachments = cmd_buffer->dynamic_resolve_attachments; - subpass->feedback_invalidate = false; - subpass->feedback_loop_ds = subpass->feedback_loop_color = false; - subpass->input_count = 0; - subpass->samples = 0; - subpass->srgb_cntl = 0; - subpass->raster_order_attachment_access 
= false; - subpass->multiview_mask = info->viewMask; - - uint32_t a = 0; - for (uint32_t i = 0; i < info->colorAttachmentCount; i++) { - struct tu_render_pass_attachment *att = &pass->attachments[a]; - const VkRenderingAttachmentInfo *att_info = &info->pColorAttachments[i]; - - if (att_info->imageView == VK_NULL_HANDLE) { - subpass->color_attachments[i].attachment = VK_ATTACHMENT_UNUSED; - subpass->resolve_attachments[i].attachment = VK_ATTACHMENT_UNUSED; - continue; - } - - TU_FROM_HANDLE(tu_image_view, view, att_info->imageView); - tu_setup_dynamic_attachment(att, view); - att->gmem = true; - att->clear_views = info->viewMask; - attachment_set_ops(device, att, att_info->loadOp, 0, - att_info->storeOp, 0); - subpass->color_attachments[i].attachment = a++; - - subpass->samples = view->image->layout->nr_samples; - - if (vk_format_is_srgb(view->vk.format)) - subpass->srgb_cntl |= 1 << i; - - if (att_info->resolveMode != VK_RESOLVE_MODE_NONE) { - struct tu_render_pass_attachment *resolve_att = &pass->attachments[a]; - TU_FROM_HANDLE(tu_image_view, resolve_view, att_info->resolveImageView); - tu_setup_dynamic_attachment(resolve_att, resolve_view); - resolve_att->gmem = false; - attachment_set_ops(device, resolve_att, - VK_ATTACHMENT_LOAD_OP_DONT_CARE, 0, - VK_ATTACHMENT_STORE_OP_STORE, 0); - subpass->resolve_attachments[i].attachment = a++; - att->will_be_resolved = true; - } else { - subpass->resolve_attachments[i].attachment = VK_ATTACHMENT_UNUSED; - att->will_be_resolved = false; - } - } - - if (info->pDepthAttachment || info->pStencilAttachment) { - const struct VkRenderingAttachmentInfo *common_info = - (info->pDepthAttachment && - info->pDepthAttachment->imageView != VK_NULL_HANDLE) ? - info->pDepthAttachment : - info->pStencilAttachment; - - if (common_info && common_info->imageView != VK_NULL_HANDLE) { - TU_FROM_HANDLE(tu_image_view, view, common_info->imageView); - - struct tu_render_pass_attachment *att = &pass->attachments[a]; - tu_setup_dynamic_attachment(att, view); - att->gmem = true; - att->clear_views = info->viewMask; - subpass->depth_stencil_attachment.attachment = a++; - - attachment_set_ops(device, att, - info->pDepthAttachment ? info->pDepthAttachment->loadOp : 0, - info->pStencilAttachment ? info->pStencilAttachment->loadOp : 0, - info->pDepthAttachment ? info->pDepthAttachment->storeOp : 0, - info->pStencilAttachment ? 
info->pStencilAttachment->storeOp : 0); - - subpass->samples = view->image->layout->nr_samples; - - if (common_info->resolveMode != VK_RESOLVE_MODE_NONE) { - unsigned i = subpass->resolve_count++; - struct tu_render_pass_attachment *resolve_att = &pass->attachments[a]; - TU_FROM_HANDLE(tu_image_view, resolve_view, - common_info->resolveImageView); - tu_setup_dynamic_attachment(resolve_att, resolve_view); - resolve_att->gmem = false; - attachment_set_ops(device, resolve_att, - VK_ATTACHMENT_LOAD_OP_DONT_CARE, - VK_ATTACHMENT_LOAD_OP_DONT_CARE, - VK_ATTACHMENT_STORE_OP_STORE, - VK_ATTACHMENT_STORE_OP_STORE); - subpass->resolve_attachments[i].attachment = a++; - att->will_be_resolved = true; - subpass->resolve_depth_stencil = true; - } else { - att->will_be_resolved = false; - } - } else { - subpass->depth_stencil_attachment.attachment = VK_ATTACHMENT_UNUSED; - } - } else { - subpass->depth_stencil_attachment.attachment = VK_ATTACHMENT_UNUSED; - } - - pass->attachment_count = a; - - tu_render_pass_cond_config(pass); - tu_render_pass_gmem_config(pass, device->physical_device); - tu_render_pass_bandwidth_config(pass); - tu_render_pass_calc_hash(pass); + vk_free2(&device->alloc, pAllocator, pass->subpass_attachments); + vk_free2(&device->alloc, pAllocator, pass); } void -tu_setup_dynamic_inheritance(struct tu_cmd_buffer *cmd_buffer, - const VkCommandBufferInheritanceRenderingInfo *info) -{ - struct tu_render_pass *pass = &cmd_buffer->dynamic_pass; - struct tu_subpass *subpass = &cmd_buffer->dynamic_subpass; - - pass->subpass_count = 1; - pass->attachments = cmd_buffer->dynamic_rp_attachments; - - subpass->color_count = info->colorAttachmentCount; - subpass->resolve_count = 0; - subpass->resolve_depth_stencil = false; - subpass->color_attachments = cmd_buffer->dynamic_color_attachments; - subpass->resolve_attachments = NULL; - subpass->feedback_invalidate = false; - subpass->feedback_loop_ds = subpass->feedback_loop_color = false; - subpass->input_count = 0; - subpass->samples = 0; - subpass->srgb_cntl = 0; - subpass->raster_order_attachment_access = false; - subpass->multiview_mask = info->viewMask; - subpass->samples = info->rasterizationSamples; - - unsigned a = 0; - for (unsigned i = 0; i < info->colorAttachmentCount; i++) { - struct tu_render_pass_attachment *att = &pass->attachments[a]; - VkFormat format = info->pColorAttachmentFormats[i]; - - if (format == VK_FORMAT_UNDEFINED) { - subpass->color_attachments[i].attachment = VK_ATTACHMENT_UNUSED; - continue; - } - - att->format = format; - att->samples = info->rasterizationSamples; - subpass->samples = info->rasterizationSamples; - subpass->color_attachments[i].attachment = a++; - - /* conservatively assume that the attachment may be conditionally - * loaded/stored. - */ - att->cond_load_allowed = att->cond_store_allowed = true; - } - - if (info->depthAttachmentFormat != VK_FORMAT_UNDEFINED || - info->stencilAttachmentFormat != VK_FORMAT_UNDEFINED) { - struct tu_render_pass_attachment *att = &pass->attachments[a]; - att->format = info->depthAttachmentFormat != VK_FORMAT_UNDEFINED ? 
- info->depthAttachmentFormat : info->stencilAttachmentFormat; - att->samples = info->rasterizationSamples; - subpass->depth_stencil_attachment.attachment = a++; - att->cond_load_allowed = att->cond_store_allowed = true; - } else { - subpass->depth_stencil_attachment.attachment = VK_ATTACHMENT_UNUSED; - } -} - -VKAPI_ATTR void VKAPI_CALL tu_GetRenderAreaGranularity(VkDevice _device, VkRenderPass renderPass, VkExtent2D *pGranularity) { TU_FROM_HANDLE(tu_device, device, _device); - pGranularity->width = device->physical_device->info->gmem_align_w; - pGranularity->height = device->physical_device->info->gmem_align_h; -} - -uint32_t -tu_subpass_get_attachment_to_resolve(const struct tu_subpass *subpass, uint32_t index) -{ - if (subpass->resolve_depth_stencil && - index == (subpass->resolve_count - 1)) - return subpass->depth_stencil_attachment.attachment; - return subpass->color_attachments[index].attachment; + pGranularity->width = device->physical_device->tile_align_w; + pGranularity->height = device->physical_device->tile_align_h; } diff --git a/lib/mesa/src/freedreno/vulkan/tu_pipeline.c b/lib/mesa/src/freedreno/vulkan/tu_pipeline.c index d4d3c9735..9964020a8 100644 --- a/lib/mesa/src/freedreno/vulkan/tu_pipeline.c +++ b/lib/mesa/src/freedreno/vulkan/tu_pipeline.c @@ -1,297 +1,140 @@ /* * Copyright © 2016 Red Hat. * Copyright © 2016 Bas Nieuwenhuizen - * SPDX-License-Identifier: MIT * * based in part on anv driver which is: * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. */ -#include "tu_pipeline.h" +#include "tu_private.h" -#include "common/freedreno_guardband.h" - -#include "ir3/ir3_nir.h" #include "main/menums.h" #include "nir/nir.h" #include "nir/nir_builder.h" -#include "nir/nir_serialize.h" #include "spirv/nir_spirv.h" -#include "util/u_debug.h" +#include "util/debug.h" #include "util/mesa-sha1.h" -#include "vk_pipeline.h" -#include "vk_render_pass.h" +#include "util/u_atomic.h" +#include "vk_format.h" #include "vk_util.h" -#include "tu_cmd_buffer.h" #include "tu_cs.h" -#include "tu_device.h" -#include "tu_drm.h" -#include "tu_formats.h" -#include "tu_lrz.h" -#include "tu_pass.h" - -/* Emit IB that preloads the descriptors that the shader uses */ - -static void -emit_load_state(struct tu_cs *cs, unsigned opcode, enum a6xx_state_type st, - enum a6xx_state_block sb, unsigned base, unsigned offset, - unsigned count) -{ - /* Note: just emit one packet, even if count overflows NUM_UNIT. 
It's not - * clear if emitting more packets will even help anything. Presumably the - * descriptor cache is relatively small, and these packets stop doing - * anything when there are too many descriptors. - */ - tu_cs_emit_pkt7(cs, opcode, 3); - tu_cs_emit(cs, - CP_LOAD_STATE6_0_STATE_TYPE(st) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_BINDLESS) | - CP_LOAD_STATE6_0_STATE_BLOCK(sb) | - CP_LOAD_STATE6_0_NUM_UNIT(MIN2(count, 1024-1))); - tu_cs_emit_qw(cs, offset | (base << 28)); -} - -static unsigned -tu6_load_state_size(struct tu_pipeline *pipeline, - struct tu_pipeline_layout *layout) -{ - const unsigned load_state_size = 4; - unsigned size = 0; - for (unsigned i = 0; i < layout->num_sets; i++) { - if (!(pipeline->active_desc_sets & (1u << i))) - continue; - - struct tu_descriptor_set_layout *set_layout = layout->set[i].layout; - for (unsigned j = 0; j < set_layout->binding_count; j++) { - struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j]; - unsigned count = 0; - /* See comment in tu6_emit_load_state(). */ - VkShaderStageFlags stages = pipeline->active_stages & binding->shader_stages; - unsigned stage_count = util_bitcount(stages); - - if (!binding->array_size) - continue; - - switch (binding->type) { - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: - case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: - case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - /* IBO-backed resources only need one packet for all graphics stages */ - if (stage_count) - count += 1; - break; - case VK_DESCRIPTOR_TYPE_SAMPLER: - case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: - case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: - case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: - /* Textures and UBO's needs a packet for each stage */ - count = stage_count; - break; - case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: - /* Because of how we pack combined images and samplers, we - * currently can't use one packet for the whole array. - */ - count = stage_count * binding->array_size * 2; - break; - case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: - case VK_DESCRIPTOR_TYPE_MUTABLE_EXT: - break; - default: - unreachable("bad descriptor type"); - } - size += count * load_state_size; - } - } - return size; -} - -static void -tu6_emit_load_state(struct tu_pipeline *pipeline, - struct tu_pipeline_layout *layout) -{ - unsigned size = tu6_load_state_size(pipeline, layout); - if (size == 0) - return; - - struct tu_cs cs; - tu_cs_begin_sub_stream(&pipeline->cs, size, &cs); - - for (unsigned i = 0; i < layout->num_sets; i++) { - /* From 13.2.7. Descriptor Set Binding: - * - * A compatible descriptor set must be bound for all set numbers that - * any shaders in a pipeline access, at the time that a draw or - * dispatch command is recorded to execute using that pipeline. - * However, if none of the shaders in a pipeline statically use any - * bindings with a particular set number, then no descriptor set need - * be bound for that set number, even if the pipeline layout includes - * a non-trivial descriptor set layout for that set number. - * - * This means that descriptor sets unused by the pipeline may have a - * garbage or 0 BINDLESS_BASE register, which will cause context faults - * when prefetching descriptors from these sets. Skip prefetching for - * descriptors from them to avoid this. This is also an optimization, - * since these prefetches would be useless. 
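/*
 * Standalone sketch (illustration only, not from the Mesa sources): the
 * per-binding packet counting used by tu6_load_state_size() above.
 * IBO-backed resources take one CP_LOAD_STATE6 packet covering all
 * graphics stages, textures and UBOs take one packet per stage, and
 * combined image/samplers take two packets per array element per stage.
 * Each reserved packet is 4 dwords. The enum and the numbers in main()
 * are hypothetical, only meant to show the arithmetic.
 */
#include <stdio.h>

enum binding_kind { STORAGE, TEXTURE_OR_UBO, COMBINED_IMAGE_SAMPLER };

static unsigned
packets_for_binding(enum binding_kind kind, unsigned stage_count,
                    unsigned array_size)
{
   switch (kind) {
   case STORAGE:
      return stage_count ? 1 : 0;            /* one packet for all stages */
   case TEXTURE_OR_UBO:
      return stage_count;                    /* one packet per stage */
   case COMBINED_IMAGE_SAMPLER:
      return stage_count * array_size * 2;   /* texture + sampler each */
   }
   return 0;
}

int main(void)
{
   /* e.g. a combined image/sampler array of 4 visible to VS and FS */
   unsigned n = packets_for_binding(COMBINED_IMAGE_SAMPLER, 2, 4);
   printf("%u packets, %u reserved dwords\n", n, n * 4);   /* 16, 64 */
   return 0;
}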
- */ - if (!(pipeline->active_desc_sets & (1u << i))) - continue; - - struct tu_descriptor_set_layout *set_layout = layout->set[i].layout; - for (unsigned j = 0; j < set_layout->binding_count; j++) { - struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j]; - unsigned base = i; - unsigned offset = binding->offset / 4; - /* Note: amber sets VK_SHADER_STAGE_ALL for its descriptor layout, and - * zink has descriptors for each stage in the push layout even if some - * stages aren't present in a used pipeline. We don't want to emit - * loads for unused descriptors. - */ - VkShaderStageFlags stages = pipeline->active_stages & binding->shader_stages; - unsigned count = binding->array_size; - - /* If this is a variable-count descriptor, then the array_size is an - * upper bound on the size, but we don't know how many descriptors - * will actually be used. Therefore we can't pre-load them here. - */ - if (j == set_layout->binding_count - 1 && - set_layout->has_variable_descriptors) - continue; - - if (count == 0 || stages == 0) - continue; - switch (binding->type) { - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: - base = MAX_SETS; - offset = (layout->set[i].dynamic_offset_start + - binding->dynamic_offset_offset) / 4; - FALLTHROUGH; - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: - case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: - case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: { - unsigned mul = binding->size / (A6XX_TEX_CONST_DWORDS * 4); - /* IBO-backed resources only need one packet for all graphics stages */ - if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT) { - emit_load_state(&cs, CP_LOAD_STATE6, ST6_SHADER, SB6_IBO, - base, offset, count * mul); - } - if (stages & VK_SHADER_STAGE_COMPUTE_BIT) { - emit_load_state(&cs, CP_LOAD_STATE6_FRAG, ST6_IBO, SB6_CS_SHADER, - base, offset, count * mul); - } - break; - } - case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: - case VK_DESCRIPTOR_TYPE_MUTABLE_EXT: - /* nothing - input attachment doesn't use bindless */ - break; - case VK_DESCRIPTOR_TYPE_SAMPLER: - case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: - case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: { - tu_foreach_stage(stage, stages) { - emit_load_state(&cs, tu6_stage2opcode(stage), - binding->type == VK_DESCRIPTOR_TYPE_SAMPLER ? - ST6_SHADER : ST6_CONSTANTS, - tu6_stage2texsb(stage), base, offset, count); - } - break; - } - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: - base = MAX_SETS; - offset = (layout->set[i].dynamic_offset_start + - binding->dynamic_offset_offset) / 4; - FALLTHROUGH; - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: - case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: { - tu_foreach_stage(stage, stages) { - emit_load_state(&cs, tu6_stage2opcode(stage), ST6_UBO, - tu6_stage2shadersb(stage), base, offset, count); - } - break; - } - case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: { - tu_foreach_stage(stage, stages) { - /* TODO: We could emit less CP_LOAD_STATE6 if we used - * struct-of-arrays instead of array-of-structs. 
- */ - for (unsigned i = 0; i < count; i++) { - unsigned tex_offset = offset + 2 * i * A6XX_TEX_CONST_DWORDS; - unsigned sam_offset = offset + (2 * i + 1) * A6XX_TEX_CONST_DWORDS; - emit_load_state(&cs, tu6_stage2opcode(stage), - ST6_CONSTANTS, tu6_stage2texsb(stage), - base, tex_offset, 1); - emit_load_state(&cs, tu6_stage2opcode(stage), - ST6_SHADER, tu6_stage2texsb(stage), - base, sam_offset, 1); - } - } - break; - } - default: - unreachable("bad descriptor type"); - } - } - } - - pipeline->load_state = tu_cs_end_draw_state(&pipeline->cs, &cs); -} struct tu_pipeline_builder { struct tu_device *device; - void *mem_ctx; - struct vk_pipeline_cache *cache; + struct tu_pipeline_cache *cache; const VkAllocationCallbacks *alloc; const VkGraphicsPipelineCreateInfo *create_info; - struct tu_pipeline_layout layout; - - struct tu_compiled_shaders *compiled_shaders; - - struct tu_const_state const_state[MESA_SHADER_FRAGMENT + 1]; - struct ir3_shader_variant *variants[MESA_SHADER_FRAGMENT + 1]; - struct ir3_shader_variant *binning_variant; - uint64_t shader_iova[MESA_SHADER_FRAGMENT + 1]; - uint64_t binning_vs_iova; - - uint32_t additional_cs_reserve_size; - - struct tu_pvtmem_config pvtmem; + struct tu_shader *shaders[MESA_SHADER_STAGES]; + uint32_t shader_offsets[MESA_SHADER_STAGES]; + uint32_t binning_vs_offset; + uint32_t shader_total_size; bool rasterizer_discard; /* these states are affectd by rasterizer_discard */ + VkSampleCountFlagBits samples; + bool use_depth_stencil_attachment; bool use_color_attachments; - bool attachment_state_valid; + uint32_t color_attachment_count; VkFormat color_attachment_formats[MAX_RTS]; - VkFormat depth_attachment_format; - uint32_t multiview_mask; - - bool subpass_raster_order_attachment_access; - bool subpass_feedback_loop_color; - bool subpass_feedback_loop_ds; - bool feedback_loop_may_involve_textures; - - /* Each library defines at least one piece of state in - * VkGraphicsPipelineLibraryFlagsEXT, and libraries cannot overlap, so - * there can be at most as many libraries as pieces of state, of which - * there are currently 4. - */ -#define MAX_LIBRARIES 4 +}; - unsigned num_libraries; - struct tu_pipeline *libraries[MAX_LIBRARIES]; +static enum tu_dynamic_state_bits +tu_dynamic_state_bit(VkDynamicState state) +{ + switch (state) { + case VK_DYNAMIC_STATE_VIEWPORT: + return TU_DYNAMIC_VIEWPORT; + case VK_DYNAMIC_STATE_SCISSOR: + return TU_DYNAMIC_SCISSOR; + case VK_DYNAMIC_STATE_LINE_WIDTH: + return TU_DYNAMIC_LINE_WIDTH; + case VK_DYNAMIC_STATE_DEPTH_BIAS: + return TU_DYNAMIC_DEPTH_BIAS; + case VK_DYNAMIC_STATE_BLEND_CONSTANTS: + return TU_DYNAMIC_BLEND_CONSTANTS; + case VK_DYNAMIC_STATE_DEPTH_BOUNDS: + return TU_DYNAMIC_DEPTH_BOUNDS; + case VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK: + return TU_DYNAMIC_STENCIL_COMPARE_MASK; + case VK_DYNAMIC_STATE_STENCIL_WRITE_MASK: + return TU_DYNAMIC_STENCIL_WRITE_MASK; + case VK_DYNAMIC_STATE_STENCIL_REFERENCE: + return TU_DYNAMIC_STENCIL_REFERENCE; + default: + unreachable("invalid dynamic state"); + return 0; + } +} - /* This is just the state that we are compiling now, whereas the final - * pipeline will include the state from the libraries. 
- */ - VkGraphicsPipelineLibraryFlagsEXT state; +static gl_shader_stage +tu_shader_stage(VkShaderStageFlagBits stage) +{ + switch (stage) { + case VK_SHADER_STAGE_VERTEX_BIT: + return MESA_SHADER_VERTEX; + case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT: + return MESA_SHADER_TESS_CTRL; + case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT: + return MESA_SHADER_TESS_EVAL; + case VK_SHADER_STAGE_GEOMETRY_BIT: + return MESA_SHADER_GEOMETRY; + case VK_SHADER_STAGE_FRAGMENT_BIT: + return MESA_SHADER_FRAGMENT; + case VK_SHADER_STAGE_COMPUTE_BIT: + return MESA_SHADER_COMPUTE; + default: + unreachable("invalid VkShaderStageFlagBits"); + return MESA_SHADER_NONE; + } +} - /* The stages we are compiling now. */ - VkShaderStageFlags active_stages; -}; +static const VkVertexInputAttributeDescription * +tu_find_vertex_input_attribute( + const VkPipelineVertexInputStateCreateInfo *vi_info, uint32_t slot) +{ + assert(slot >= VERT_ATTRIB_GENERIC0); + slot -= VERT_ATTRIB_GENERIC0; + for (uint32_t i = 0; i < vi_info->vertexAttributeDescriptionCount; i++) { + if (vi_info->pVertexAttributeDescriptions[i].location == slot) + return &vi_info->pVertexAttributeDescriptions[i]; + } + return NULL; +} + +static const VkVertexInputBindingDescription * +tu_find_vertex_input_binding( + const VkPipelineVertexInputStateCreateInfo *vi_info, + const VkVertexInputAttributeDescription *vi_attr) +{ + assert(vi_attr); + for (uint32_t i = 0; i < vi_info->vertexBindingDescriptionCount; i++) { + if (vi_info->pVertexBindingDescriptions[i].binding == vi_attr->binding) + return &vi_info->pVertexBindingDescriptions[i]; + } + return NULL; +} static bool tu_logic_op_reads_dst(VkLogicOp op) @@ -321,732 +164,418 @@ tu_blend_factor_no_dst_alpha(VkBlendFactor factor) } } -static bool tu_blend_factor_is_dual_src(VkBlendFactor factor) -{ - switch (factor) { - case VK_BLEND_FACTOR_SRC1_COLOR: - case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR: - case VK_BLEND_FACTOR_SRC1_ALPHA: - case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA: - return true; +static enum pc_di_primtype +tu6_primtype(VkPrimitiveTopology topology) +{ + switch (topology) { + case VK_PRIMITIVE_TOPOLOGY_POINT_LIST: + return DI_PT_POINTLIST; + case VK_PRIMITIVE_TOPOLOGY_LINE_LIST: + return DI_PT_LINELIST; + case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP: + return DI_PT_LINESTRIP; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST: + return DI_PT_TRILIST; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP: + return DI_PT_TRILIST; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN: + return DI_PT_TRIFAN; + case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY: + return DI_PT_LINE_ADJ; + case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY: + return DI_PT_LINESTRIP_ADJ; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY: + return DI_PT_TRI_ADJ; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY: + return DI_PT_TRISTRIP_ADJ; + case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST: default: - return false; + unreachable("invalid primitive topology"); + return DI_PT_NONE; } } -static bool -tu_blend_state_is_dual_src(const VkPipelineColorBlendStateCreateInfo *info) +static enum adreno_compare_func +tu6_compare_func(VkCompareOp op) { - if (!info) - return false; - - for (unsigned i = 0; i < info->attachmentCount; i++) { - const VkPipelineColorBlendAttachmentState *blend = &info->pAttachments[i]; - if (tu_blend_factor_is_dual_src(blend->srcColorBlendFactor) || - tu_blend_factor_is_dual_src(blend->dstColorBlendFactor) || - tu_blend_factor_is_dual_src(blend->srcAlphaBlendFactor) || - tu_blend_factor_is_dual_src(blend->dstAlphaBlendFactor)) - 
return true; + switch (op) { + case VK_COMPARE_OP_NEVER: + return FUNC_NEVER; + case VK_COMPARE_OP_LESS: + return FUNC_LESS; + case VK_COMPARE_OP_EQUAL: + return FUNC_EQUAL; + case VK_COMPARE_OP_LESS_OR_EQUAL: + return FUNC_LEQUAL; + case VK_COMPARE_OP_GREATER: + return FUNC_GREATER; + case VK_COMPARE_OP_NOT_EQUAL: + return FUNC_NOTEQUAL; + case VK_COMPARE_OP_GREATER_OR_EQUAL: + return FUNC_GEQUAL; + case VK_COMPARE_OP_ALWAYS: + return FUNC_ALWAYS; + default: + unreachable("invalid VkCompareOp"); + return FUNC_NEVER; } - - return false; } -static const struct xs_config { - uint16_t reg_sp_xs_ctrl; - uint16_t reg_sp_xs_config; - uint16_t reg_sp_xs_instrlen; - uint16_t reg_hlsq_xs_ctrl; - uint16_t reg_sp_xs_first_exec_offset; - uint16_t reg_sp_xs_pvt_mem_hw_stack_offset; -} xs_config[] = { - [MESA_SHADER_VERTEX] = { - REG_A6XX_SP_VS_CTRL_REG0, - REG_A6XX_SP_VS_CONFIG, - REG_A6XX_SP_VS_INSTRLEN, - REG_A6XX_HLSQ_VS_CNTL, - REG_A6XX_SP_VS_OBJ_FIRST_EXEC_OFFSET, - REG_A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET, - }, - [MESA_SHADER_TESS_CTRL] = { - REG_A6XX_SP_HS_CTRL_REG0, - REG_A6XX_SP_HS_CONFIG, - REG_A6XX_SP_HS_INSTRLEN, - REG_A6XX_HLSQ_HS_CNTL, - REG_A6XX_SP_HS_OBJ_FIRST_EXEC_OFFSET, - REG_A6XX_SP_HS_PVT_MEM_HW_STACK_OFFSET, - }, - [MESA_SHADER_TESS_EVAL] = { - REG_A6XX_SP_DS_CTRL_REG0, - REG_A6XX_SP_DS_CONFIG, - REG_A6XX_SP_DS_INSTRLEN, - REG_A6XX_HLSQ_DS_CNTL, - REG_A6XX_SP_DS_OBJ_FIRST_EXEC_OFFSET, - REG_A6XX_SP_DS_PVT_MEM_HW_STACK_OFFSET, - }, - [MESA_SHADER_GEOMETRY] = { - REG_A6XX_SP_GS_CTRL_REG0, - REG_A6XX_SP_GS_CONFIG, - REG_A6XX_SP_GS_INSTRLEN, - REG_A6XX_HLSQ_GS_CNTL, - REG_A6XX_SP_GS_OBJ_FIRST_EXEC_OFFSET, - REG_A6XX_SP_GS_PVT_MEM_HW_STACK_OFFSET, - }, - [MESA_SHADER_FRAGMENT] = { - REG_A6XX_SP_FS_CTRL_REG0, - REG_A6XX_SP_FS_CONFIG, - REG_A6XX_SP_FS_INSTRLEN, - REG_A6XX_HLSQ_FS_CNTL, - REG_A6XX_SP_FS_OBJ_FIRST_EXEC_OFFSET, - REG_A6XX_SP_FS_PVT_MEM_HW_STACK_OFFSET, - }, - [MESA_SHADER_COMPUTE] = { - REG_A6XX_SP_CS_CTRL_REG0, - REG_A6XX_SP_CS_CONFIG, - REG_A6XX_SP_CS_INSTRLEN, - REG_A6XX_HLSQ_CS_CNTL, - REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET, - REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET, - }, -}; - -static uint32_t -tu_xs_get_immediates_packet_size_dwords(const struct ir3_shader_variant *xs) +static enum adreno_stencil_op +tu6_stencil_op(VkStencilOp op) { - const struct ir3_const_state *const_state = ir3_const_state(xs); - uint32_t base = const_state->offsets.immediate; - int32_t size = DIV_ROUND_UP(const_state->immediates_count, 4); - - /* truncate size to avoid writing constants that shader - * does not use: - */ - size = MIN2(size + base, xs->constlen) - base; - - return MAX2(size, 0) * 4; + switch (op) { + case VK_STENCIL_OP_KEEP: + return STENCIL_KEEP; + case VK_STENCIL_OP_ZERO: + return STENCIL_ZERO; + case VK_STENCIL_OP_REPLACE: + return STENCIL_REPLACE; + case VK_STENCIL_OP_INCREMENT_AND_CLAMP: + return STENCIL_INCR_CLAMP; + case VK_STENCIL_OP_DECREMENT_AND_CLAMP: + return STENCIL_DECR_CLAMP; + case VK_STENCIL_OP_INVERT: + return STENCIL_INVERT; + case VK_STENCIL_OP_INCREMENT_AND_WRAP: + return STENCIL_INCR_WRAP; + case VK_STENCIL_OP_DECREMENT_AND_WRAP: + return STENCIL_DECR_WRAP; + default: + unreachable("invalid VkStencilOp"); + return STENCIL_KEEP; + } } -/* We allocate fixed-length substreams for shader state, however some - * parts of the state may have unbound length. Their additional space - * requirements should be calculated here. 
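/*
 * Standalone sketch (illustration only, not from the Mesa sources): the
 * constlen clamp in tu_xs_get_immediates_packet_size_dwords() above.
 * Immediates start at vec4 slot `base`, and any part of them that falls
 * past the shader's constlen is simply not uploaded. The figures in
 * main() are made up to show the arithmetic.
 */
#include <stdio.h>

static int
immediates_dwords(int base, int immediates_count, int constlen)
{
   int size = (immediates_count + 3) / 4;      /* vec4 slots needed */
   int clamped = size + base < constlen ? size + base : constlen;
   size = clamped - base;
   if (size < 0)
      size = 0;
   return size * 4;                            /* back to dwords */
}

int main(void)
{
   /* 40 immediate dwords starting at slot 16 with a constlen of 20:
    * only 4 of the 10 vec4s fit, so 16 dwords get uploaded */
   printf("%d dwords\n", immediates_dwords(16, 40, 20));
   return 0;
}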
- */ -static uint32_t -tu_xs_get_additional_cs_size_dwords(const struct ir3_shader_variant *xs) +static enum a3xx_rop_code +tu6_rop(VkLogicOp op) { - const struct ir3_const_state *const_state = ir3_const_state(xs); - - uint32_t size = tu_xs_get_immediates_packet_size_dwords(xs); - - /* Variable number of UBO upload ranges. */ - size += 4 * const_state->ubo_state.num_enabled; - - /* Variable number of dwords for the primitive map */ - size += xs->input_size; - - size += xs->constant_data_size / 4; - - return size; + switch (op) { + case VK_LOGIC_OP_CLEAR: + return ROP_CLEAR; + case VK_LOGIC_OP_AND: + return ROP_AND; + case VK_LOGIC_OP_AND_REVERSE: + return ROP_AND_REVERSE; + case VK_LOGIC_OP_COPY: + return ROP_COPY; + case VK_LOGIC_OP_AND_INVERTED: + return ROP_AND_INVERTED; + case VK_LOGIC_OP_NO_OP: + return ROP_NOOP; + case VK_LOGIC_OP_XOR: + return ROP_XOR; + case VK_LOGIC_OP_OR: + return ROP_OR; + case VK_LOGIC_OP_NOR: + return ROP_NOR; + case VK_LOGIC_OP_EQUIVALENT: + return ROP_EQUIV; + case VK_LOGIC_OP_INVERT: + return ROP_INVERT; + case VK_LOGIC_OP_OR_REVERSE: + return ROP_OR_REVERSE; + case VK_LOGIC_OP_COPY_INVERTED: + return ROP_COPY_INVERTED; + case VK_LOGIC_OP_OR_INVERTED: + return ROP_OR_INVERTED; + case VK_LOGIC_OP_NAND: + return ROP_NAND; + case VK_LOGIC_OP_SET: + return ROP_SET; + default: + unreachable("invalid VkLogicOp"); + return ROP_NOOP; + } } -void -tu6_emit_xs_config(struct tu_cs *cs, - gl_shader_stage stage, /* xs->type, but xs may be NULL */ - const struct ir3_shader_variant *xs) +static enum adreno_rb_blend_factor +tu6_blend_factor(VkBlendFactor factor) { - const struct xs_config *cfg = &xs_config[stage]; - - if (!xs) { - /* shader stage disabled */ - tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1); - tu_cs_emit(cs, 0); - - tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1); - tu_cs_emit(cs, 0); - return; + switch (factor) { + case VK_BLEND_FACTOR_ZERO: + return FACTOR_ZERO; + case VK_BLEND_FACTOR_ONE: + return FACTOR_ONE; + case VK_BLEND_FACTOR_SRC_COLOR: + return FACTOR_SRC_COLOR; + case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR: + return FACTOR_ONE_MINUS_SRC_COLOR; + case VK_BLEND_FACTOR_DST_COLOR: + return FACTOR_DST_COLOR; + case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR: + return FACTOR_ONE_MINUS_DST_COLOR; + case VK_BLEND_FACTOR_SRC_ALPHA: + return FACTOR_SRC_ALPHA; + case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA: + return FACTOR_ONE_MINUS_SRC_ALPHA; + case VK_BLEND_FACTOR_DST_ALPHA: + return FACTOR_DST_ALPHA; + case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA: + return FACTOR_ONE_MINUS_DST_ALPHA; + case VK_BLEND_FACTOR_CONSTANT_COLOR: + return FACTOR_CONSTANT_COLOR; + case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR: + return FACTOR_ONE_MINUS_CONSTANT_COLOR; + case VK_BLEND_FACTOR_CONSTANT_ALPHA: + return FACTOR_CONSTANT_ALPHA; + case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA: + return FACTOR_ONE_MINUS_CONSTANT_ALPHA; + case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE: + return FACTOR_SRC_ALPHA_SATURATE; + case VK_BLEND_FACTOR_SRC1_COLOR: + return FACTOR_SRC1_COLOR; + case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR: + return FACTOR_ONE_MINUS_SRC1_COLOR; + case VK_BLEND_FACTOR_SRC1_ALPHA: + return FACTOR_SRC1_ALPHA; + case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA: + return FACTOR_ONE_MINUS_SRC1_ALPHA; + default: + unreachable("invalid VkBlendFactor"); + return FACTOR_ZERO; } - - tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1); - tu_cs_emit(cs, A6XX_SP_VS_CONFIG_ENABLED | - COND(xs->bindless_tex, A6XX_SP_VS_CONFIG_BINDLESS_TEX) | - COND(xs->bindless_samp, A6XX_SP_VS_CONFIG_BINDLESS_SAMP) | - COND(xs->bindless_ibo, 
A6XX_SP_VS_CONFIG_BINDLESS_IBO) | - COND(xs->bindless_ubo, A6XX_SP_VS_CONFIG_BINDLESS_UBO) | - A6XX_SP_VS_CONFIG_NTEX(xs->num_samp) | - A6XX_SP_VS_CONFIG_NSAMP(xs->num_samp)); - - tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1); - tu_cs_emit(cs, A6XX_HLSQ_VS_CNTL_CONSTLEN(xs->constlen) | - A6XX_HLSQ_VS_CNTL_ENABLED); } -void -tu6_emit_xs(struct tu_cs *cs, - gl_shader_stage stage, /* xs->type, but xs may be NULL */ - const struct ir3_shader_variant *xs, - const struct tu_pvtmem_config *pvtmem, - uint64_t binary_iova) +static enum a3xx_rb_blend_opcode +tu6_blend_op(VkBlendOp op) { - const struct xs_config *cfg = &xs_config[stage]; - - if (!xs) { - /* shader stage disabled */ - return; - } - - enum a6xx_threadsize thrsz = - xs->info.double_threadsize ? THREAD128 : THREAD64; - switch (stage) { - case MESA_SHADER_VERTEX: - tu_cs_emit_regs(cs, A6XX_SP_VS_CTRL_REG0( - .fullregfootprint = xs->info.max_reg + 1, - .halfregfootprint = xs->info.max_half_reg + 1, - .branchstack = ir3_shader_branchstack_hw(xs), - .mergedregs = xs->mergedregs, - )); - break; - case MESA_SHADER_TESS_CTRL: - tu_cs_emit_regs(cs, A6XX_SP_HS_CTRL_REG0( - .fullregfootprint = xs->info.max_reg + 1, - .halfregfootprint = xs->info.max_half_reg + 1, - .branchstack = ir3_shader_branchstack_hw(xs), - )); - break; - case MESA_SHADER_TESS_EVAL: - tu_cs_emit_regs(cs, A6XX_SP_DS_CTRL_REG0( - .fullregfootprint = xs->info.max_reg + 1, - .halfregfootprint = xs->info.max_half_reg + 1, - .branchstack = ir3_shader_branchstack_hw(xs), - )); - break; - case MESA_SHADER_GEOMETRY: - tu_cs_emit_regs(cs, A6XX_SP_GS_CTRL_REG0( - .fullregfootprint = xs->info.max_reg + 1, - .halfregfootprint = xs->info.max_half_reg + 1, - .branchstack = ir3_shader_branchstack_hw(xs), - )); - break; - case MESA_SHADER_FRAGMENT: - tu_cs_emit_regs(cs, A6XX_SP_FS_CTRL_REG0( - .fullregfootprint = xs->info.max_reg + 1, - .halfregfootprint = xs->info.max_half_reg + 1, - .branchstack = ir3_shader_branchstack_hw(xs), - .mergedregs = xs->mergedregs, - .threadsize = thrsz, - .pixlodenable = xs->need_pixlod, - .diff_fine = xs->need_fine_derivatives, - .varying = xs->total_in != 0, - /* unknown bit, seems unnecessary */ - .unk24 = true, - )); - break; - case MESA_SHADER_COMPUTE: - tu_cs_emit_regs(cs, A6XX_SP_CS_CTRL_REG0( - .fullregfootprint = xs->info.max_reg + 1, - .halfregfootprint = xs->info.max_half_reg + 1, - .branchstack = ir3_shader_branchstack_hw(xs), - .mergedregs = xs->mergedregs, - .threadsize = thrsz, - )); - break; + switch (op) { + case VK_BLEND_OP_ADD: + return BLEND_DST_PLUS_SRC; + case VK_BLEND_OP_SUBTRACT: + return BLEND_SRC_MINUS_DST; + case VK_BLEND_OP_REVERSE_SUBTRACT: + return BLEND_DST_MINUS_SRC; + case VK_BLEND_OP_MIN: + return BLEND_MIN_DST_SRC; + case VK_BLEND_OP_MAX: + return BLEND_MAX_DST_SRC; default: - unreachable("bad shader stage"); + unreachable("invalid VkBlendOp"); + return BLEND_DST_PLUS_SRC; } +} - tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_instrlen, 1); - tu_cs_emit(cs, xs->instrlen); - - /* emit program binary & private memory layout - * binary_iova should be aligned to 1 instrlen unit (128 bytes) - */ +static void +tu6_emit_vs_config(struct tu_cs *cs, const struct ir3_shader_variant *vs) +{ + uint32_t sp_vs_ctrl = + A6XX_SP_VS_CTRL_REG0_THREADSIZE(FOUR_QUADS) | + A6XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(vs->info.max_reg + 1) | + A6XX_SP_VS_CTRL_REG0_MERGEDREGS | + A6XX_SP_VS_CTRL_REG0_BRANCHSTACK(vs->branchstack); + if (vs->num_samp) + sp_vs_ctrl |= A6XX_SP_VS_CTRL_REG0_PIXLODENABLE; - assert((binary_iova & 0x7f) == 0); - assert((pvtmem->iova & 0x1f) == 
0); + uint32_t sp_vs_config = A6XX_SP_VS_CONFIG_NTEX(vs->num_samp) | + A6XX_SP_VS_CONFIG_NSAMP(vs->num_samp); + if (vs->instrlen) + sp_vs_config |= A6XX_SP_VS_CONFIG_ENABLED; - tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_first_exec_offset, 7); - tu_cs_emit(cs, 0); - tu_cs_emit_qw(cs, binary_iova); - tu_cs_emit(cs, - A6XX_SP_VS_PVT_MEM_PARAM_MEMSIZEPERITEM(pvtmem->per_fiber_size)); - tu_cs_emit_qw(cs, pvtmem->iova); - tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(pvtmem->per_sp_size) | - COND(pvtmem->per_wave, A6XX_SP_VS_PVT_MEM_SIZE_PERWAVEMEMLAYOUT)); - - tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_pvt_mem_hw_stack_offset, 1); - tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET_OFFSET(pvtmem->per_sp_size)); - - uint32_t shader_preload_size = - MIN2(xs->instrlen, cs->device->physical_device->info->a6xx.instr_cache_size); - - tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3); - tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) | - CP_LOAD_STATE6_0_NUM_UNIT(shader_preload_size)); - tu_cs_emit_qw(cs, binary_iova); + tu_cs_emit_pkt4(cs, REG_A6XX_SP_VS_CTRL_REG0, 1); + tu_cs_emit(cs, sp_vs_ctrl); - /* emit immediates */ + tu_cs_emit_pkt4(cs, REG_A6XX_SP_VS_CONFIG, 2); + tu_cs_emit(cs, sp_vs_config); + tu_cs_emit(cs, vs->instrlen); - const struct ir3_const_state *const_state = ir3_const_state(xs); - uint32_t base = const_state->offsets.immediate; - unsigned immediate_size = tu_xs_get_immediates_packet_size_dwords(xs); - - if (immediate_size > 0) { - tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + immediate_size); - tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) | - CP_LOAD_STATE6_0_NUM_UNIT(immediate_size / 4)); - tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); - tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); + tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_VS_CNTL, 1); + tu_cs_emit(cs, A6XX_HLSQ_VS_CNTL_CONSTLEN(align(vs->constlen, 4)) | 0x100); +} - tu_cs_emit_array(cs, const_state->immediates, immediate_size); - } +static void +tu6_emit_hs_config(struct tu_cs *cs, const struct ir3_shader_variant *hs) +{ + uint32_t sp_hs_config = 0; + if (hs->instrlen) + sp_hs_config |= A6XX_SP_HS_CONFIG_ENABLED; - if (const_state->constant_data_ubo != -1) { - uint64_t iova = binary_iova + xs->info.constant_data_offset; - - /* Upload UBO state for the constant data. */ - tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5); - tu_cs_emit(cs, - CP_LOAD_STATE6_0_DST_OFF(const_state->constant_data_ubo) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO)| - CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) | - CP_LOAD_STATE6_0_NUM_UNIT(1)); - tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); - tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); - int size_vec4s = DIV_ROUND_UP(xs->constant_data_size, 16); - tu_cs_emit_qw(cs, - iova | - (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32); - - /* Upload the constant data to the const file if needed. 
*/ - const struct ir3_ubo_analysis_state *ubo_state = &const_state->ubo_state; - - for (int i = 0; i < ubo_state->num_enabled; i++) { - if (ubo_state->range[i].ubo.block != const_state->constant_data_ubo || - ubo_state->range[i].ubo.bindless) { - continue; - } + tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_UNKNOWN_A831, 1); + tu_cs_emit(cs, 0); - uint32_t start = ubo_state->range[i].start; - uint32_t end = ubo_state->range[i].end; - uint32_t size = MIN2(end - start, - (16 * xs->constlen) - ubo_state->range[i].offset); - - tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3); - tu_cs_emit(cs, - CP_LOAD_STATE6_0_DST_OFF(ubo_state->range[i].offset / 16) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) | - CP_LOAD_STATE6_0_NUM_UNIT(size / 16)); - tu_cs_emit_qw(cs, iova + start); - } - } + tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_CONFIG, 2); + tu_cs_emit(cs, sp_hs_config); + tu_cs_emit(cs, hs->instrlen); - /* emit FS driver param */ - if (stage == MESA_SHADER_FRAGMENT && const_state->num_driver_params > 0) { - uint32_t base = const_state->offsets.driver_param; - int32_t size = DIV_ROUND_UP(const_state->num_driver_params, 4); - size = MAX2(MIN2(size + base, xs->constlen) - base, 0); - - if (size > 0) { - tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + size * 4); - tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) | - CP_LOAD_STATE6_0_NUM_UNIT(size)); - tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); - tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); - - assert(size == 1); - tu_cs_emit(cs, xs->info.double_threadsize ? 128 : 64); - tu_cs_emit(cs, 0); - tu_cs_emit(cs, 0); - tu_cs_emit(cs, 0); - } - } + tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_HS_CNTL, 1); + tu_cs_emit(cs, A6XX_HLSQ_HS_CNTL_CONSTLEN(align(hs->constlen, 4))); } static void -tu6_emit_dynamic_offset(struct tu_cs *cs, - const struct ir3_shader_variant *xs, - struct tu_pipeline_builder *builder) +tu6_emit_ds_config(struct tu_cs *cs, const struct ir3_shader_variant *ds) { - if (!xs || builder->const_state[xs->type].dynamic_offset_loc == UINT32_MAX) - return; + uint32_t sp_ds_config = 0; + if (ds->instrlen) + sp_ds_config |= A6XX_SP_DS_CONFIG_ENABLED; - tu_cs_emit_pkt7(cs, tu6_stage2opcode(xs->type), 3 + MAX_SETS); - tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(builder->const_state[xs->type].dynamic_offset_loc / 4) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(xs->type)) | - CP_LOAD_STATE6_0_NUM_UNIT(DIV_ROUND_UP(MAX_SETS, 4))); - tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); - tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); - - for (unsigned i = 0; i < MAX_SETS; i++) { - unsigned dynamic_offset_start = - builder->layout.set[i].dynamic_offset_start / (A6XX_TEX_CONST_DWORDS * 4); - tu_cs_emit(cs, i < builder->layout.num_sets ? 
dynamic_offset_start : 0); - } -} + tu_cs_emit_pkt4(cs, REG_A6XX_SP_DS_CONFIG, 2); + tu_cs_emit(cs, sp_ds_config); + tu_cs_emit(cs, ds->instrlen); -static void -tu6_emit_shared_consts_enable(struct tu_cs *cs, bool enable) -{ - /* Enable/disable shared constants */ - tu_cs_emit_regs(cs, A6XX_HLSQ_SHARED_CONSTS(.enable = enable)); - tu_cs_emit_regs(cs, A6XX_SP_MODE_CONTROL(.constant_demotion_enable = true, - .isammode = ISAMMODE_GL, - .shared_consts_enable = enable)); + tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_DS_CNTL, 1); + tu_cs_emit(cs, A6XX_HLSQ_DS_CNTL_CONSTLEN(align(ds->constlen, 4))); } static void -tu6_emit_cs_config(struct tu_cs *cs, - const struct ir3_shader_variant *v, - const struct tu_pvtmem_config *pvtmem, - uint64_t binary_iova) +tu6_emit_gs_config(struct tu_cs *cs, const struct ir3_shader_variant *gs) { - bool shared_consts_enable = ir3_const_state(v)->shared_consts_enable; - tu6_emit_shared_consts_enable(cs, shared_consts_enable); - - tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD( - .cs_state = true, - .cs_ibo = true, - .cs_shared_const = shared_consts_enable)); - - tu6_emit_xs_config(cs, MESA_SHADER_COMPUTE, v); - tu6_emit_xs(cs, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova); - - uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1); - tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1); - tu_cs_emit(cs, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) | - A6XX_SP_CS_UNKNOWN_A9B1_UNK6); - - if (cs->device->physical_device->info->a6xx.has_lpac) { - tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_UNKNOWN_B9D0, 1); - tu_cs_emit(cs, A6XX_HLSQ_CS_UNKNOWN_B9D0_SHARED_SIZE(shared_size) | - A6XX_HLSQ_CS_UNKNOWN_B9D0_UNK6); - } + uint32_t sp_gs_config = 0; + if (gs->instrlen) + sp_gs_config |= A6XX_SP_GS_CONFIG_ENABLED; - uint32_t local_invocation_id = - ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID); - uint32_t work_group_id = - ir3_find_sysval_regid(v, SYSTEM_VALUE_WORKGROUP_ID); + tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_UNKNOWN_A871, 1); + tu_cs_emit(cs, 0); - enum a6xx_threadsize thrsz = v->info.double_threadsize ? 
THREAD128 : THREAD64; - tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL_0, 2); - tu_cs_emit(cs, - A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) | - A6XX_HLSQ_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) | - A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) | - A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id)); - tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) | - A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz)); - - if (cs->device->physical_device->info->a6xx.has_lpac) { - tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 2); - tu_cs_emit(cs, - A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) | - A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) | - A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) | - A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id)); - tu_cs_emit(cs, A6XX_SP_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) | - A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz)); - } -} + tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_CONFIG, 2); + tu_cs_emit(cs, sp_gs_config); + tu_cs_emit(cs, gs->instrlen); -#define TU6_EMIT_VFD_DEST_MAX_DWORDS (MAX_VERTEX_ATTRIBS + 2) + tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_GS_CNTL, 1); + tu_cs_emit(cs, A6XX_HLSQ_GS_CNTL_CONSTLEN(align(gs->constlen, 4))); +} static void -tu6_emit_vfd_dest(struct tu_cs *cs, - const struct ir3_shader_variant *vs) -{ - int32_t input_for_attr[MAX_VERTEX_ATTRIBS]; - uint32_t attr_count = 0; +tu6_emit_fs_config(struct tu_cs *cs, const struct ir3_shader_variant *fs) +{ + uint32_t sp_fs_ctrl = + A6XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) | 0x1000000 | + A6XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fs->info.max_reg + 1) | + A6XX_SP_FS_CTRL_REG0_MERGEDREGS | + A6XX_SP_FS_CTRL_REG0_BRANCHSTACK(fs->branchstack); + if (fs->total_in > 0 || fs->frag_coord) + sp_fs_ctrl |= A6XX_SP_FS_CTRL_REG0_VARYING; + if (fs->num_samp > 0) + sp_fs_ctrl |= A6XX_SP_FS_CTRL_REG0_PIXLODENABLE; + + uint32_t sp_fs_config = A6XX_SP_FS_CONFIG_NTEX(fs->num_samp) | + A6XX_SP_FS_CONFIG_NSAMP(fs->num_samp); + if (fs->instrlen) + sp_fs_config |= A6XX_SP_FS_CONFIG_ENABLED; + + tu_cs_emit_pkt4(cs, REG_A6XX_SP_UNKNOWN_A99E, 1); + tu_cs_emit(cs, 0x7fc0); + + tu_cs_emit_pkt4(cs, REG_A6XX_SP_UNKNOWN_A9A8, 1); + tu_cs_emit(cs, 0); - for (unsigned i = 0; i < MAX_VERTEX_ATTRIBS; i++) - input_for_attr[i] = -1; + tu_cs_emit_pkt4(cs, REG_A6XX_SP_UNKNOWN_AB00, 1); + tu_cs_emit(cs, 0x5); - for (unsigned i = 0; i < vs->inputs_count; i++) { - if (vs->inputs[i].sysval || vs->inputs[i].regid == regid(63, 0)) - continue; + tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_CTRL_REG0, 1); + tu_cs_emit(cs, sp_fs_ctrl); - assert(vs->inputs[i].slot >= VERT_ATTRIB_GENERIC0); - unsigned loc = vs->inputs[i].slot - VERT_ATTRIB_GENERIC0; - input_for_attr[loc] = i; - attr_count = MAX2(attr_count, loc + 1); - } + tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_CONFIG, 2); + tu_cs_emit(cs, sp_fs_config); + tu_cs_emit(cs, fs->instrlen); - tu_cs_emit_regs(cs, - A6XX_VFD_CONTROL_0( - .fetch_cnt = attr_count, /* decode_cnt for binning pass ? 
*/ - .decode_cnt = attr_count)); - - if (attr_count) - tu_cs_emit_pkt4(cs, REG_A6XX_VFD_DEST_CNTL_INSTR(0), attr_count); - - for (unsigned i = 0; i < attr_count; i++) { - if (input_for_attr[i] >= 0) { - unsigned input_idx = input_for_attr[i]; - tu_cs_emit(cs, A6XX_VFD_DEST_CNTL_INSTR(0, - .writemask = vs->inputs[input_idx].compmask, - .regid = vs->inputs[input_idx].regid).value); - } else { - tu_cs_emit(cs, A6XX_VFD_DEST_CNTL_INSTR(0, - .writemask = 0, - .regid = regid(63, 0)).value); - } - } + tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL, 1); + tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_CONSTLEN(align(fs->constlen, 4)) | 0x100); } static void tu6_emit_vs_system_values(struct tu_cs *cs, - const struct ir3_shader_variant *vs, - const struct ir3_shader_variant *hs, - const struct ir3_shader_variant *ds, - const struct ir3_shader_variant *gs, - bool primid_passthru) + const struct ir3_shader_variant *vs) { const uint32_t vertexid_regid = - ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID); + ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE); const uint32_t instanceid_regid = - ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID); - const uint32_t tess_coord_x_regid = hs ? - ir3_find_sysval_regid(ds, SYSTEM_VALUE_TESS_COORD) : - regid(63, 0); - const uint32_t tess_coord_y_regid = VALIDREG(tess_coord_x_regid) ? - tess_coord_x_regid + 1 : - regid(63, 0); - const uint32_t hs_rel_patch_regid = hs ? - ir3_find_sysval_regid(hs, SYSTEM_VALUE_REL_PATCH_ID_IR3) : - regid(63, 0); - const uint32_t ds_rel_patch_regid = hs ? - ir3_find_sysval_regid(ds, SYSTEM_VALUE_REL_PATCH_ID_IR3) : - regid(63, 0); - const uint32_t hs_invocation_regid = hs ? - ir3_find_sysval_regid(hs, SYSTEM_VALUE_TCS_HEADER_IR3) : - regid(63, 0); - const uint32_t gs_primitiveid_regid = gs ? - ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID) : - regid(63, 0); - const uint32_t vs_primitiveid_regid = hs ? - ir3_find_sysval_regid(hs, SYSTEM_VALUE_PRIMITIVE_ID) : - gs_primitiveid_regid; - const uint32_t ds_primitiveid_regid = ds ? - ir3_find_sysval_regid(ds, SYSTEM_VALUE_PRIMITIVE_ID) : - regid(63, 0); - const uint32_t gsheader_regid = gs ? - ir3_find_sysval_regid(gs, SYSTEM_VALUE_GS_HEADER_IR3) : - regid(63, 0); - - /* Note: we currently don't support multiview with tess or GS. If we did, - * and the HW actually works, then we'd have to somehow share this across - * stages. Note that the blob doesn't support this either. 
- */ - const uint32_t viewid_regid = - ir3_find_sysval_regid(vs, SYSTEM_VALUE_VIEW_INDEX); + ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID); tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_1, 6); tu_cs_emit(cs, A6XX_VFD_CONTROL_1_REGID4VTX(vertexid_regid) | - A6XX_VFD_CONTROL_1_REGID4INST(instanceid_regid) | - A6XX_VFD_CONTROL_1_REGID4PRIMID(vs_primitiveid_regid) | - A6XX_VFD_CONTROL_1_REGID4VIEWID(viewid_regid)); - tu_cs_emit(cs, A6XX_VFD_CONTROL_2_REGID_HSRELPATCHID(hs_rel_patch_regid) | - A6XX_VFD_CONTROL_2_REGID_INVOCATIONID(hs_invocation_regid)); - tu_cs_emit(cs, A6XX_VFD_CONTROL_3_REGID_DSRELPATCHID(ds_rel_patch_regid) | - A6XX_VFD_CONTROL_3_REGID_TESSX(tess_coord_x_regid) | - A6XX_VFD_CONTROL_3_REGID_TESSY(tess_coord_y_regid) | - A6XX_VFD_CONTROL_3_REGID_DSPRIMID(ds_primitiveid_regid)); + A6XX_VFD_CONTROL_1_REGID4INST(instanceid_regid) | + 0xfcfc0000); + tu_cs_emit(cs, 0x0000fcfc); /* VFD_CONTROL_2 */ + tu_cs_emit(cs, 0xfcfcfcfc); /* VFD_CONTROL_3 */ tu_cs_emit(cs, 0x000000fc); /* VFD_CONTROL_4 */ - tu_cs_emit(cs, A6XX_VFD_CONTROL_5_REGID_GSHEADER(gsheader_regid) | - 0xfc00); /* VFD_CONTROL_5 */ - tu_cs_emit(cs, COND(primid_passthru, A6XX_VFD_CONTROL_6_PRIMID_PASSTHRU)); /* VFD_CONTROL_6 */ + tu_cs_emit(cs, 0x0000fcfc); /* VFD_CONTROL_5 */ + tu_cs_emit(cs, 0x00000000); /* VFD_CONTROL_6 */ } static void -tu6_setup_streamout(struct tu_cs *cs, - const struct ir3_shader_variant *v, - struct ir3_shader_linkage *l) +tu6_emit_vpc(struct tu_cs *cs, + const struct ir3_shader_variant *vs, + const struct ir3_shader_variant *fs, + bool binning_pass) { - const struct ir3_stream_output_info *info = &v->stream_output; - /* Note: 64 here comes from the HW layout of the program RAM. The program - * for stream N is at DWORD 64 * N. - */ -#define A6XX_SO_PROG_DWORDS 64 - uint32_t prog[A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS] = {}; - BITSET_DECLARE(valid_dwords, A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) = {0}; - - /* TODO: streamout state should be in a non-GMEM draw state */ - - /* no streamout: */ - if (info->num_outputs == 0) { - unsigned sizedw = 4; - if (cs->device->physical_device->info->a6xx.tess_use_shared) - sizedw += 2; - - tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, sizedw); - tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL); - tu_cs_emit(cs, 0); - tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL); - tu_cs_emit(cs, 0); + struct ir3_shader_linkage linkage = { 0 }; + ir3_link_shaders(&linkage, vs, fs); - if (cs->device->physical_device->info->a6xx.tess_use_shared) { - tu_cs_emit(cs, REG_A6XX_PC_SO_STREAM_CNTL); - tu_cs_emit(cs, 0); - } + if (vs->shader->stream_output.num_outputs && !binning_pass) + tu_finishme("stream output"); - return; + BITSET_DECLARE(vpc_var_enables, 128) = { 0 }; + for (uint32_t i = 0; i < linkage.cnt; i++) { + const uint32_t comp_count = util_last_bit(linkage.var[i].compmask); + for (uint32_t j = 0; j < comp_count; j++) + BITSET_SET(vpc_var_enables, linkage.var[i].loc + j); } - for (unsigned i = 0; i < info->num_outputs; i++) { - const struct ir3_stream_output *out = &info->output[i]; - unsigned k = out->register_index; - unsigned idx; - - /* Skip it, if it's an output that was never assigned a register. */ - if (k >= v->outputs_count || v->outputs[k].regid == INVALID_REG) - continue; + tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VAR_DISABLE(0), 4); + tu_cs_emit(cs, ~vpc_var_enables[0]); + tu_cs_emit(cs, ~vpc_var_enables[1]); + tu_cs_emit(cs, ~vpc_var_enables[2]); + tu_cs_emit(cs, ~vpc_var_enables[3]); - /* linkage map sorted by order frag shader wants things, so - * a bit less ideal here.. 
- */ - for (idx = 0; idx < l->cnt; idx++) - if (l->var[idx].slot == v->outputs[k].slot) - break; - - assert(idx < l->cnt); - - for (unsigned j = 0; j < out->num_components; j++) { - unsigned c = j + out->start_component; - unsigned loc = l->var[idx].loc + c; - unsigned off = j + out->dst_offset; /* in dwords */ - - assert(loc < A6XX_SO_PROG_DWORDS * 2); - unsigned dword = out->stream * A6XX_SO_PROG_DWORDS + loc/2; - if (loc & 1) { - prog[dword] |= A6XX_VPC_SO_PROG_B_EN | - A6XX_VPC_SO_PROG_B_BUF(out->output_buffer) | - A6XX_VPC_SO_PROG_B_OFF(off * 4); - } else { - prog[dword] |= A6XX_VPC_SO_PROG_A_EN | - A6XX_VPC_SO_PROG_A_BUF(out->output_buffer) | - A6XX_VPC_SO_PROG_A_OFF(off * 4); - } - BITSET_SET(valid_dwords, dword); - } + /* a6xx finds position/pointsize at the end */ + const uint32_t position_regid = + ir3_find_output_regid(vs, VARYING_SLOT_POS); + const uint32_t pointsize_regid = + ir3_find_output_regid(vs, VARYING_SLOT_PSIZ); + uint32_t pointsize_loc = 0xff; + if (position_regid != regid(63, 0)) + ir3_link_add(&linkage, position_regid, 0xf, linkage.max_loc); + if (pointsize_regid != regid(63, 0)) { + pointsize_loc = linkage.max_loc; + ir3_link_add(&linkage, pointsize_regid, 0x1, linkage.max_loc); } - unsigned prog_count = 0; - unsigned start, end; - BITSET_FOREACH_RANGE(start, end, valid_dwords, - A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) { - prog_count += end - start + 1; + /* map vs outputs to VPC */ + assert(linkage.cnt <= 32); + const uint32_t sp_vs_out_count = (linkage.cnt + 1) / 2; + const uint32_t sp_vs_vpc_dst_count = (linkage.cnt + 3) / 4; + uint32_t sp_vs_out[16]; + uint32_t sp_vs_vpc_dst[8]; + sp_vs_out[sp_vs_out_count - 1] = 0; + sp_vs_vpc_dst[sp_vs_vpc_dst_count - 1] = 0; + for (uint32_t i = 0; i < linkage.cnt; i++) { + ((uint16_t *) sp_vs_out)[i] = + A6XX_SP_VS_OUT_REG_A_REGID(linkage.var[i].regid) | + A6XX_SP_VS_OUT_REG_A_COMPMASK(linkage.var[i].compmask); + ((uint8_t *) sp_vs_vpc_dst)[i] = + A6XX_SP_VS_VPC_DST_REG_OUTLOC0(linkage.var[i].loc); } - const bool emit_pc_so_stream_cntl = - cs->device->physical_device->info->a6xx.tess_use_shared && - v->type == MESA_SHADER_TESS_EVAL; - - if (emit_pc_so_stream_cntl) - prog_count += 1; - - tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 10 + 2 * prog_count); - tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL); - tu_cs_emit(cs, A6XX_VPC_SO_STREAM_CNTL_STREAM_ENABLE(info->streams_written) | - COND(info->stride[0] > 0, - A6XX_VPC_SO_STREAM_CNTL_BUF0_STREAM(1 + info->buffer_to_stream[0])) | - COND(info->stride[1] > 0, - A6XX_VPC_SO_STREAM_CNTL_BUF1_STREAM(1 + info->buffer_to_stream[1])) | - COND(info->stride[2] > 0, - A6XX_VPC_SO_STREAM_CNTL_BUF2_STREAM(1 + info->buffer_to_stream[2])) | - COND(info->stride[3] > 0, - A6XX_VPC_SO_STREAM_CNTL_BUF3_STREAM(1 + info->buffer_to_stream[3]))); - for (uint32_t i = 0; i < 4; i++) { - tu_cs_emit(cs, REG_A6XX_VPC_SO_BUFFER_STRIDE(i)); - tu_cs_emit(cs, info->stride[i]); - } - bool first = true; - BITSET_FOREACH_RANGE(start, end, valid_dwords, - A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) { - tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL); - tu_cs_emit(cs, COND(first, A6XX_VPC_SO_CNTL_RESET) | - A6XX_VPC_SO_CNTL_ADDR(start)); - for (unsigned i = start; i < end; i++) { - tu_cs_emit(cs, REG_A6XX_VPC_SO_PROG); - tu_cs_emit(cs, prog[i]); - } - first = false; - } + tu_cs_emit_pkt4(cs, REG_A6XX_SP_VS_OUT_REG(0), sp_vs_out_count); + tu_cs_emit_array(cs, sp_vs_out, sp_vs_out_count); - if (emit_pc_so_stream_cntl) { - /* Possibly not tess_use_shared related, but the combination of - * tess + xfb fails some tests if we don't 
emit this. - */ - tu_cs_emit(cs, REG_A6XX_PC_SO_STREAM_CNTL); - tu_cs_emit(cs, A6XX_PC_SO_STREAM_CNTL_STREAM_ENABLE(info->streams_written)); - } -} + tu_cs_emit_pkt4(cs, REG_A6XX_SP_VS_VPC_DST_REG(0), sp_vs_vpc_dst_count); + tu_cs_emit_array(cs, sp_vs_vpc_dst, sp_vs_vpc_dst_count); -static void -tu6_emit_const(struct tu_cs *cs, uint32_t opcode, uint32_t base, - enum a6xx_state_block block, uint32_t offset, - uint32_t size, const uint32_t *dwords) { - assert(size % 4 == 0); - - tu_cs_emit_pkt7(cs, opcode, 3 + size); - tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(block) | - CP_LOAD_STATE6_0_NUM_UNIT(size / 4)); - - tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); - tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); - dwords = (uint32_t *)&((uint8_t *)dwords)[offset]; - - tu_cs_emit_array(cs, dwords, size); -} + tu_cs_emit_pkt4(cs, REG_A6XX_VPC_CNTL_0, 1); + tu_cs_emit(cs, A6XX_VPC_CNTL_0_NUMNONPOSVAR(fs->total_in) | + (fs->total_in > 0 ? A6XX_VPC_CNTL_0_VARYING : 0) | + 0xff00ff00); -static void -tu6_emit_link_map(struct tu_cs *cs, - const struct ir3_shader_variant *producer, - const struct ir3_shader_variant *consumer, - enum a6xx_state_block sb) -{ - const struct ir3_const_state *const_state = ir3_const_state(consumer); - uint32_t base = const_state->offsets.primitive_map; - int size = DIV_ROUND_UP(consumer->input_size, 4); + tu_cs_emit_pkt4(cs, REG_A6XX_VPC_PACK, 1); + tu_cs_emit(cs, A6XX_VPC_PACK_NUMNONPOSVAR(fs->total_in) | + A6XX_VPC_PACK_PSIZELOC(pointsize_loc) | + A6XX_VPC_PACK_STRIDE_IN_VPC(linkage.max_loc)); - size = (MIN2(size + base, consumer->constlen) - base) * 4; - if (size <= 0) - return; + tu_cs_emit_pkt4(cs, REG_A6XX_VPC_GS_SIV_CNTL, 1); + tu_cs_emit(cs, 0x0000ffff); /* XXX */ - tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, base, sb, 0, size, - producer->output_loc); -} + tu_cs_emit_pkt4(cs, REG_A6XX_SP_PRIMITIVE_CNTL, 1); + tu_cs_emit(cs, A6XX_SP_PRIMITIVE_CNTL_VSOUT(linkage.cnt)); -static uint16_t -primitive_to_tess(enum shader_prim primitive) { - switch (primitive) { - case SHADER_PRIM_POINTS: - return TESS_POINTS; - case SHADER_PRIM_LINE_STRIP: - return TESS_LINES; - case SHADER_PRIM_TRIANGLE_STRIP: - return TESS_CW_TRIS; - default: - unreachable(""); - } + tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_1, 1); + tu_cs_emit(cs, A6XX_PC_PRIMITIVE_CNTL_1_STRIDE_IN_VPC(linkage.max_loc) | + (vs->writes_psize ? A6XX_PC_PRIMITIVE_CNTL_1_PSIZE : 0)); } static int tu6_vpc_varying_mode(const struct ir3_shader_variant *fs, - const struct ir3_shader_variant *last_shader, uint32_t index, uint8_t *interp_mode, uint8_t *ps_repl_mode) @@ -1091,18 +620,8 @@ tu6_vpc_varying_mode(const struct ir3_shader_variant *fs, *interp_mode |= INTERP_ONE << 6; shift += 2; } - } else if (fs->inputs[index].slot == VARYING_SLOT_LAYER || - fs->inputs[index].slot == VARYING_SLOT_VIEWPORT) { - /* If the last geometry shader doesn't statically write these, they're - * implicitly zero and the FS is supposed to read zero. 
- */ - if (ir3_find_output(last_shader, fs->inputs[index].slot) < 0 && - (compmask & 0x1)) { - *interp_mode |= INTERP_ZERO; - } else { - *interp_mode |= INTERP_FLAT; - } - } else if (fs->inputs[index].flat) { + } else if ((fs->inputs[index].interpolate == INTERP_MODE_FLAT) || + fs->inputs[index].rasterflat) { for (int i = 0; i < 4; i++) { if (compmask & (1 << i)) { *interp_mode |= INTERP_FLAT << shift; @@ -1111,19 +630,18 @@ tu6_vpc_varying_mode(const struct ir3_shader_variant *fs, } } - return util_bitcount(compmask) * 2; + return shift; } static void tu6_emit_vpc_varying_modes(struct tu_cs *cs, const struct ir3_shader_variant *fs, - const struct ir3_shader_variant *last_shader) + bool binning_pass) { uint32_t interp_modes[8] = { 0 }; uint32_t ps_repl_modes[8] = { 0 }; - uint32_t interp_regs = 0; - if (fs) { + if (!binning_pass) { for (int i = -1; (i = ir3_next_varying(fs, i)) < (int) fs->inputs_count;) { @@ -1131,7 +649,7 @@ tu6_emit_vpc_varying_modes(struct tu_cs *cs, uint8_t interp_mode; uint8_t ps_repl_mode; const int bits = - tu6_vpc_varying_mode(fs, last_shader, i, &interp_mode, &ps_repl_mode); + tu6_vpc_varying_mode(fs, i, &interp_mode, &ps_repl_mode); /* OR the mode into the array */ const uint32_t inloc = fs->inputs[i].inloc * 2; @@ -1146,1043 +664,445 @@ tu6_emit_vpc_varying_modes(struct tu_cs *cs, interp_modes[n] |= interp_mode >> shift; ps_repl_modes[n] |= ps_repl_mode >> shift; } - interp_regs = MAX2(interp_regs, n + 1); } } - if (interp_regs) { - tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_INTERP_MODE(0), interp_regs); - tu_cs_emit_array(cs, interp_modes, interp_regs); + tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8); + tu_cs_emit_array(cs, interp_modes, 8); - tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), interp_regs); - tu_cs_emit_array(cs, ps_repl_modes, interp_regs); - } + tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8); + tu_cs_emit_array(cs, ps_repl_modes, 8); } -void -tu6_emit_vpc(struct tu_cs *cs, - const struct ir3_shader_variant *vs, - const struct ir3_shader_variant *hs, - const struct ir3_shader_variant *ds, - const struct ir3_shader_variant *gs, - const struct ir3_shader_variant *fs) -{ - /* note: doesn't compile as static because of the array regs.. 
*/ - const struct reg_config { - uint16_t reg_sp_xs_out_reg; - uint16_t reg_sp_xs_vpc_dst_reg; - uint16_t reg_vpc_xs_pack; - uint16_t reg_vpc_xs_clip_cntl; - uint16_t reg_gras_xs_cl_cntl; - uint16_t reg_pc_xs_out_cntl; - uint16_t reg_sp_xs_primitive_cntl; - uint16_t reg_vpc_xs_layer_cntl; - uint16_t reg_gras_xs_layer_cntl; - } reg_config[] = { - [MESA_SHADER_VERTEX] = { - REG_A6XX_SP_VS_OUT_REG(0), - REG_A6XX_SP_VS_VPC_DST_REG(0), - REG_A6XX_VPC_VS_PACK, - REG_A6XX_VPC_VS_CLIP_CNTL, - REG_A6XX_GRAS_VS_CL_CNTL, - REG_A6XX_PC_VS_OUT_CNTL, - REG_A6XX_SP_VS_PRIMITIVE_CNTL, - REG_A6XX_VPC_VS_LAYER_CNTL, - REG_A6XX_GRAS_VS_LAYER_CNTL - }, - [MESA_SHADER_TESS_CTRL] = { - 0, - 0, - 0, - 0, - 0, - REG_A6XX_PC_HS_OUT_CNTL, - 0, - 0, - 0 - }, - [MESA_SHADER_TESS_EVAL] = { - REG_A6XX_SP_DS_OUT_REG(0), - REG_A6XX_SP_DS_VPC_DST_REG(0), - REG_A6XX_VPC_DS_PACK, - REG_A6XX_VPC_DS_CLIP_CNTL, - REG_A6XX_GRAS_DS_CL_CNTL, - REG_A6XX_PC_DS_OUT_CNTL, - REG_A6XX_SP_DS_PRIMITIVE_CNTL, - REG_A6XX_VPC_DS_LAYER_CNTL, - REG_A6XX_GRAS_DS_LAYER_CNTL - }, - [MESA_SHADER_GEOMETRY] = { - REG_A6XX_SP_GS_OUT_REG(0), - REG_A6XX_SP_GS_VPC_DST_REG(0), - REG_A6XX_VPC_GS_PACK, - REG_A6XX_VPC_GS_CLIP_CNTL, - REG_A6XX_GRAS_GS_CL_CNTL, - REG_A6XX_PC_GS_OUT_CNTL, - REG_A6XX_SP_GS_PRIMITIVE_CNTL, - REG_A6XX_VPC_GS_LAYER_CNTL, - REG_A6XX_GRAS_GS_LAYER_CNTL - }, - }; - - const struct ir3_shader_variant *last_shader; - if (gs) { - last_shader = gs; - } else if (hs) { - last_shader = ds; - } else { - last_shader = vs; - } - - const struct reg_config *cfg = ®_config[last_shader->type]; - - struct ir3_shader_linkage linkage = { - .primid_loc = 0xff, - .clip0_loc = 0xff, - .clip1_loc = 0xff, - }; - if (fs) - ir3_link_shaders(&linkage, last_shader, fs, true); - - if (last_shader->stream_output.num_outputs) - ir3_link_stream_out(&linkage, last_shader); - - /* We do this after linking shaders in order to know whether PrimID - * passthrough needs to be enabled. - */ - bool primid_passthru = linkage.primid_loc != 0xff; - tu6_emit_vs_system_values(cs, vs, hs, ds, gs, primid_passthru); - - tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VAR_DISABLE(0), 4); - tu_cs_emit(cs, ~linkage.varmask[0]); - tu_cs_emit(cs, ~linkage.varmask[1]); - tu_cs_emit(cs, ~linkage.varmask[2]); - tu_cs_emit(cs, ~linkage.varmask[3]); - - /* a6xx finds position/pointsize at the end */ - const uint32_t pointsize_regid = - ir3_find_output_regid(last_shader, VARYING_SLOT_PSIZ); - const uint32_t layer_regid = - ir3_find_output_regid(last_shader, VARYING_SLOT_LAYER); - const uint32_t view_regid = - ir3_find_output_regid(last_shader, VARYING_SLOT_VIEWPORT); - const uint32_t clip0_regid = - ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST0); - const uint32_t clip1_regid = - ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST1); - uint32_t flags_regid = gs ? 
- ir3_find_output_regid(gs, VARYING_SLOT_GS_VERTEX_FLAGS_IR3) : 0; - - uint32_t pointsize_loc = 0xff, position_loc = 0xff, layer_loc = 0xff, view_loc = 0xff; - - if (layer_regid != regid(63, 0)) { - layer_loc = linkage.max_loc; - ir3_link_add(&linkage, VARYING_SLOT_LAYER, layer_regid, 0x1, linkage.max_loc); - } - - if (view_regid != regid(63, 0)) { - view_loc = linkage.max_loc; - ir3_link_add(&linkage, VARYING_SLOT_VIEWPORT, view_regid, 0x1, linkage.max_loc); - } - - unsigned extra_pos = 0; - - for (unsigned i = 0; i < last_shader->outputs_count; i++) { - if (last_shader->outputs[i].slot != VARYING_SLOT_POS) - continue; - - if (position_loc == 0xff) - position_loc = linkage.max_loc; - - ir3_link_add(&linkage, last_shader->outputs[i].slot, - last_shader->outputs[i].regid, - 0xf, position_loc + 4 * last_shader->outputs[i].view); - extra_pos = MAX2(extra_pos, last_shader->outputs[i].view); - } - - if (pointsize_regid != regid(63, 0)) { - pointsize_loc = linkage.max_loc; - ir3_link_add(&linkage, VARYING_SLOT_PSIZ, pointsize_regid, 0x1, linkage.max_loc); - } - - uint8_t clip_cull_mask = last_shader->clip_mask | last_shader->cull_mask; - - /* Handle the case where clip/cull distances aren't read by the FS */ - uint32_t clip0_loc = linkage.clip0_loc, clip1_loc = linkage.clip1_loc; - if (clip0_loc == 0xff && clip0_regid != regid(63, 0)) { - clip0_loc = linkage.max_loc; - ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST0, clip0_regid, - clip_cull_mask & 0xf, linkage.max_loc); - } - if (clip1_loc == 0xff && clip1_regid != regid(63, 0)) { - clip1_loc = linkage.max_loc; - ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST1, clip1_regid, - clip_cull_mask >> 4, linkage.max_loc); - } - - tu6_setup_streamout(cs, last_shader, &linkage); - - /* The GPU hangs on some models when there are no outputs (xs_pack::CNT), - * at least when a DS is the last stage, so add a dummy output to keep it - * happy if there aren't any. We do this late in order to avoid emitting - * any unused code and make sure that optimizations don't remove it. 
- */ - if (linkage.cnt == 0) - ir3_link_add(&linkage, 0, 0, 0x1, linkage.max_loc); - - /* map outputs of the last shader to VPC */ - assert(linkage.cnt <= 32); - const uint32_t sp_out_count = DIV_ROUND_UP(linkage.cnt, 2); - const uint32_t sp_vpc_dst_count = DIV_ROUND_UP(linkage.cnt, 4); - uint32_t sp_out[16] = {0}; - uint32_t sp_vpc_dst[8] = {0}; - for (uint32_t i = 0; i < linkage.cnt; i++) { - ((uint16_t *) sp_out)[i] = - A6XX_SP_VS_OUT_REG_A_REGID(linkage.var[i].regid) | - A6XX_SP_VS_OUT_REG_A_COMPMASK(linkage.var[i].compmask); - ((uint8_t *) sp_vpc_dst)[i] = - A6XX_SP_VS_VPC_DST_REG_OUTLOC0(linkage.var[i].loc); - } - - tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_out_reg, sp_out_count); - tu_cs_emit_array(cs, sp_out, sp_out_count); - - tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_vpc_dst_reg, sp_vpc_dst_count); - tu_cs_emit_array(cs, sp_vpc_dst, sp_vpc_dst_count); - - tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_pack, 1); - tu_cs_emit(cs, A6XX_VPC_VS_PACK_POSITIONLOC(position_loc) | - A6XX_VPC_VS_PACK_PSIZELOC(pointsize_loc) | - A6XX_VPC_VS_PACK_STRIDE_IN_VPC(linkage.max_loc) | - A6XX_VPC_VS_PACK_EXTRAPOS(extra_pos)); - - tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_clip_cntl, 1); - tu_cs_emit(cs, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) | - A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) | - A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc)); - - tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_cl_cntl, 1); - tu_cs_emit(cs, A6XX_GRAS_VS_CL_CNTL_CLIP_MASK(last_shader->clip_mask) | - A6XX_GRAS_VS_CL_CNTL_CULL_MASK(last_shader->cull_mask)); - - const struct ir3_shader_variant *geom_shaders[] = { vs, hs, ds, gs }; - - for (unsigned i = 0; i < ARRAY_SIZE(geom_shaders); i++) { - const struct ir3_shader_variant *shader = geom_shaders[i]; - if (!shader) - continue; - - bool primid = shader->type != MESA_SHADER_VERTEX && - VALIDREG(ir3_find_sysval_regid(shader, SYSTEM_VALUE_PRIMITIVE_ID)); - - tu_cs_emit_pkt4(cs, reg_config[shader->type].reg_pc_xs_out_cntl, 1); - if (shader == last_shader) { - tu_cs_emit(cs, A6XX_PC_VS_OUT_CNTL_STRIDE_IN_VPC(linkage.max_loc) | - CONDREG(pointsize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE) | - CONDREG(layer_regid, A6XX_PC_VS_OUT_CNTL_LAYER) | - CONDREG(view_regid, A6XX_PC_VS_OUT_CNTL_VIEW) | - COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID) | - A6XX_PC_VS_OUT_CNTL_CLIP_MASK(clip_cull_mask)); - } else { - tu_cs_emit(cs, COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID)); - } - } - - /* if vertex_flags somehow gets optimized out, your gonna have a bad time: */ - if (gs) - assert(flags_regid != INVALID_REG); - - tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_primitive_cntl, 1); - tu_cs_emit(cs, A6XX_SP_VS_PRIMITIVE_CNTL_OUT(linkage.cnt) | - A6XX_SP_GS_PRIMITIVE_CNTL_FLAGS_REGID(flags_regid)); - - tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_layer_cntl, 1); - tu_cs_emit(cs, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) | - A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(view_loc)); - - tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_layer_cntl, 1); - tu_cs_emit(cs, CONDREG(layer_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_LAYER) | - CONDREG(view_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_VIEW)); - - tu_cs_emit_regs(cs, A6XX_PC_PRIMID_PASSTHRU(primid_passthru)); - - tu_cs_emit_pkt4(cs, REG_A6XX_VPC_CNTL_0, 1); - tu_cs_emit(cs, A6XX_VPC_CNTL_0_NUMNONPOSVAR(fs ? 
fs->total_in : 0) | - COND(fs && fs->total_in, A6XX_VPC_CNTL_0_VARYING) | - A6XX_VPC_CNTL_0_PRIMIDLOC(linkage.primid_loc) | - A6XX_VPC_CNTL_0_VIEWIDLOC(linkage.viewid_loc)); - - if (hs) { - tu_cs_emit_pkt4(cs, REG_A6XX_PC_TESS_NUM_VERTEX, 1); - tu_cs_emit(cs, hs->tess.tcs_vertices_out); - - /* In SPIR-V generated from GLSL, the tessellation primitive params are - * are specified in the tess eval shader, but in SPIR-V generated from - * HLSL, they are specified in the tess control shader. */ - const struct ir3_shader_variant *tess = - ds->tess.spacing == TESS_SPACING_UNSPECIFIED ? hs : ds; - tu_cs_emit_pkt4(cs, REG_A6XX_PC_TESS_CNTL, 1); - uint32_t output; - if (tess->tess.point_mode) - output = TESS_POINTS; - else if (tess->tess.primitive_mode == TESS_PRIMITIVE_ISOLINES) - output = TESS_LINES; - else if (tess->tess.ccw) - output = TESS_CCW_TRIS; - else - output = TESS_CW_TRIS; - - enum a6xx_tess_spacing spacing; - switch (tess->tess.spacing) { - case TESS_SPACING_EQUAL: - spacing = TESS_EQUAL; - break; - case TESS_SPACING_FRACTIONAL_ODD: - spacing = TESS_FRACTIONAL_ODD; - break; - case TESS_SPACING_FRACTIONAL_EVEN: - spacing = TESS_FRACTIONAL_EVEN; - break; - case TESS_SPACING_UNSPECIFIED: - default: - unreachable("invalid tess spacing"); - } - tu_cs_emit(cs, A6XX_PC_TESS_CNTL_SPACING(spacing) | - A6XX_PC_TESS_CNTL_OUTPUT(output)); - - tu6_emit_link_map(cs, vs, hs, SB6_HS_SHADER); - tu6_emit_link_map(cs, hs, ds, SB6_DS_SHADER); - } - - - if (gs) { - uint32_t vertices_out, invocations, output, vec4_size; - uint32_t prev_stage_output_size = ds ? ds->output_size : vs->output_size; - - if (hs) { - tu6_emit_link_map(cs, ds, gs, SB6_GS_SHADER); - } else { - tu6_emit_link_map(cs, vs, gs, SB6_GS_SHADER); - } - vertices_out = gs->gs.vertices_out - 1; - output = primitive_to_tess(gs->gs.output_primitive); - invocations = gs->gs.invocations - 1; - /* Size of per-primitive alloction in ldlw memory in vec4s. */ - vec4_size = gs->gs.vertices_in * - DIV_ROUND_UP(prev_stage_output_size, 4); - - tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1); - tu_cs_emit(cs, - A6XX_PC_PRIMITIVE_CNTL_5_GS_VERTICES_OUT(vertices_out) | - A6XX_PC_PRIMITIVE_CNTL_5_GS_OUTPUT(output) | - A6XX_PC_PRIMITIVE_CNTL_5_GS_INVOCATIONS(invocations)); - - tu_cs_emit_pkt4(cs, REG_A6XX_VPC_GS_PARAM, 1); - tu_cs_emit(cs, 0xff); - - tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1); - tu_cs_emit(cs, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size)); - - uint32_t prim_size = prev_stage_output_size; - if (prim_size > 64) - prim_size = 64; - else if (prim_size == 64) - prim_size = 63; - tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_PRIM_SIZE, 1); - tu_cs_emit(cs, prim_size); - } +static void +tu6_emit_fs_system_values(struct tu_cs *cs, + const struct ir3_shader_variant *fs) +{ + const uint32_t frontfacing_regid = + ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE); + const uint32_t sampleid_regid = + ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID); + const uint32_t samplemaskin_regid = + ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN); + const uint32_t fragcoord_xy_regid = + ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD); + const uint32_t fragcoord_zw_regid = (fragcoord_xy_regid != regid(63, 0)) + ? 
(fragcoord_xy_regid + 2) + : fragcoord_xy_regid; + const uint32_t varyingcoord_regid = + ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PIXEL); - tu6_emit_vpc_varying_modes(cs, fs, last_shader); + tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CONTROL_1_REG, 5); + tu_cs_emit(cs, 0x7); + tu_cs_emit(cs, A6XX_HLSQ_CONTROL_2_REG_FACEREGID(frontfacing_regid) | + A6XX_HLSQ_CONTROL_2_REG_SAMPLEID(sampleid_regid) | + A6XX_HLSQ_CONTROL_2_REG_SAMPLEMASK(samplemaskin_regid) | + A6XX_HLSQ_CONTROL_2_REG_SIZE(regid(63, 0))); + tu_cs_emit(cs, + A6XX_HLSQ_CONTROL_3_REG_BARY_IJ_PIXEL(varyingcoord_regid) | + A6XX_HLSQ_CONTROL_3_REG_BARY_IJ_CENTROID(regid(63, 0)) | + 0xfc00fc00); + tu_cs_emit(cs, + A6XX_HLSQ_CONTROL_4_REG_XYCOORDREGID(fragcoord_xy_regid) | + A6XX_HLSQ_CONTROL_4_REG_ZWCOORDREGID(fragcoord_zw_regid) | + A6XX_HLSQ_CONTROL_4_REG_BARY_IJ_PIXEL_PERSAMP(regid(63, 0)) | + 0x0000fc00); + tu_cs_emit(cs, 0xfc); } -void +static void tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs) { - uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid; - uint32_t ij_regid[IJ_COUNT]; - uint32_t smask_in_regid; - - bool sample_shading = fs->per_samp | fs->key.sample_shading; - bool enable_varyings = fs->total_in > 0; - - samp_id_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID); - smask_in_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN); - face_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE); - coord_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD); - zwcoord_regid = VALIDREG(coord_regid) ? coord_regid + 2 : regid(63, 0); - for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++) - ij_regid[i] = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i); - - if (fs->num_sampler_prefetch > 0) { - assert(VALIDREG(ij_regid[IJ_PERSP_PIXEL])); - /* also, it seems like ij_pix is *required* to be r0.x */ - assert(ij_regid[IJ_PERSP_PIXEL] == regid(0, 0)); - } + tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_UNKNOWN_B980, 1); + tu_cs_emit(cs, fs->total_in > 0 ? 
3 : 1); - tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch); - tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) | - A6XX_SP_FS_PREFETCH_CNTL_UNK4(regid(63, 0)) | - 0x7000); // XXX); - for (int i = 0; i < fs->num_sampler_prefetch; i++) { - const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i]; - tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CMD_SRC(prefetch->src) | - A6XX_SP_FS_PREFETCH_CMD_SAMP_ID(prefetch->samp_id) | - A6XX_SP_FS_PREFETCH_CMD_TEX_ID(prefetch->tex_id) | - A6XX_SP_FS_PREFETCH_CMD_DST(prefetch->dst) | - A6XX_SP_FS_PREFETCH_CMD_WRMASK(prefetch->wrmask) | - COND(prefetch->half_precision, A6XX_SP_FS_PREFETCH_CMD_HALF) | - A6XX_SP_FS_PREFETCH_CMD_CMD(prefetch->cmd)); - } + tu_cs_emit_pkt4(cs, REG_A6XX_SP_UNKNOWN_A982, 1); + tu_cs_emit(cs, 0); /* XXX */ - if (fs->num_sampler_prefetch > 0) { - tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_BINDLESS_PREFETCH_CMD(0), fs->num_sampler_prefetch); - for (int i = 0; i < fs->num_sampler_prefetch; i++) { - const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i]; - tu_cs_emit(cs, - A6XX_SP_FS_BINDLESS_PREFETCH_CMD_SAMP_ID(prefetch->samp_bindless_id) | - A6XX_SP_FS_BINDLESS_PREFETCH_CMD_TEX_ID(prefetch->tex_bindless_id)); - } - } + tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_UPDATE_CNTL, 1); + tu_cs_emit(cs, 0xff); /* XXX */ - tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CONTROL_1_REG, 5); - tu_cs_emit(cs, 0x7); - tu_cs_emit(cs, A6XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) | - A6XX_HLSQ_CONTROL_2_REG_SAMPLEID(samp_id_regid) | - A6XX_HLSQ_CONTROL_2_REG_SAMPLEMASK(smask_in_regid) | - A6XX_HLSQ_CONTROL_2_REG_CENTERRHW(ij_regid[IJ_PERSP_CENTER_RHW])); - tu_cs_emit(cs, A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_PIXEL(ij_regid[IJ_PERSP_PIXEL]) | - A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_PIXEL(ij_regid[IJ_LINEAR_PIXEL]) | - A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_CENTROID(ij_regid[IJ_PERSP_CENTROID]) | - A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_CENTROID(ij_regid[IJ_LINEAR_CENTROID])); - tu_cs_emit(cs, A6XX_HLSQ_CONTROL_4_REG_XYCOORDREGID(coord_regid) | - A6XX_HLSQ_CONTROL_4_REG_ZWCOORDREGID(zwcoord_regid) | - A6XX_HLSQ_CONTROL_4_REG_IJ_PERSP_SAMPLE(ij_regid[IJ_PERSP_SAMPLE]) | - A6XX_HLSQ_CONTROL_4_REG_IJ_LINEAR_SAMPLE(ij_regid[IJ_LINEAR_SAMPLE])); - tu_cs_emit(cs, 0xfcfc); - - enum a6xx_threadsize thrsz = fs->info.double_threadsize ? 
THREAD128 : THREAD64; - tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL_0, 1); - tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(thrsz) | - COND(enable_varyings, A6XX_HLSQ_FS_CNTL_0_VARYINGS)); - - bool need_size = fs->frag_face || fs->fragcoord_compmask != 0; - bool need_size_persamp = false; - if (VALIDREG(ij_regid[IJ_PERSP_CENTER_RHW])) { - if (sample_shading) - need_size_persamp = true; - else - need_size = true; + uint32_t gras_cntl = 0; + if (fs->total_in > 0) + gras_cntl |= A6XX_GRAS_CNTL_VARYING; + if (fs->frag_coord) { + gras_cntl |= A6XX_GRAS_CNTL_SIZE | A6XX_GRAS_CNTL_XCOORD | + A6XX_GRAS_CNTL_YCOORD | A6XX_GRAS_CNTL_ZCOORD | + A6XX_GRAS_CNTL_WCOORD; } tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CNTL, 1); - tu_cs_emit(cs, - CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_GRAS_CNTL_IJ_PERSP_PIXEL) | - CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_GRAS_CNTL_IJ_PERSP_CENTROID) | - CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_GRAS_CNTL_IJ_PERSP_SAMPLE) | - CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) | - CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_GRAS_CNTL_IJ_LINEAR_CENTROID) | - CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) | - COND(need_size, A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) | - COND(need_size_persamp, A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) | - COND(fs->fragcoord_compmask != 0, A6XX_GRAS_CNTL_COORD_MASK(fs->fragcoord_compmask))); + tu_cs_emit(cs, gras_cntl); + + uint32_t rb_render_control = 0; + if (fs->total_in > 0) { + rb_render_control = + A6XX_RB_RENDER_CONTROL0_VARYING | A6XX_RB_RENDER_CONTROL0_UNK10; + } + if (fs->frag_coord) { + rb_render_control |= + A6XX_RB_RENDER_CONTROL0_SIZE | A6XX_RB_RENDER_CONTROL0_XCOORD | + A6XX_RB_RENDER_CONTROL0_YCOORD | A6XX_RB_RENDER_CONTROL0_ZCOORD | + A6XX_RB_RENDER_CONTROL0_WCOORD; + } tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_CONTROL0, 2); - tu_cs_emit(cs, - CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) | - CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) | - CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) | - CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) | - CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_CENTROID) | - CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) | - COND(need_size, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) | - COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) | - COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) | - COND(fs->fragcoord_compmask != 0, - A6XX_RB_RENDER_CONTROL0_COORD_MASK(fs->fragcoord_compmask))); - tu_cs_emit(cs, - A6XX_RB_RENDER_CONTROL1_FRAGCOORDSAMPLEMODE( - sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER) | - CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) | - CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) | - CONDREG(ij_regid[IJ_PERSP_CENTER_RHW], A6XX_RB_RENDER_CONTROL1_CENTERRHW) | - COND(fs->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS)); - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CNTL, 1); - tu_cs_emit(cs, COND(sample_shading, A6XX_RB_SAMPLE_CNTL_PER_SAMP_MODE)); - - tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 1); - tu_cs_emit(cs, CONDREG(samp_id_regid, A6XX_GRAS_LRZ_PS_INPUT_CNTL_SAMPLEID) | - A6XX_GRAS_LRZ_PS_INPUT_CNTL_FRAGCOORDSAMPLEMODE( - sample_shading ? 
FRAGCOORD_SAMPLE : FRAGCOORD_CENTER)); - - tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CNTL, 1); - tu_cs_emit(cs, COND(sample_shading, A6XX_GRAS_SAMPLE_CNTL_PER_SAMP_MODE)); + tu_cs_emit(cs, rb_render_control); + tu_cs_emit(cs, (fs->frag_face ? A6XX_RB_RENDER_CONTROL1_FACENESS : 0)); } static void tu6_emit_fs_outputs(struct tu_cs *cs, const struct ir3_shader_variant *fs, - struct tu_pipeline *pipeline) + uint32_t mrt_count) { - uint32_t smask_regid, posz_regid, stencilref_regid; - - posz_regid = ir3_find_output_regid(fs, FRAG_RESULT_DEPTH); - smask_regid = ir3_find_output_regid(fs, FRAG_RESULT_SAMPLE_MASK); - stencilref_regid = ir3_find_output_regid(fs, FRAG_RESULT_STENCIL); - - int output_reg_count = 0; + const uint32_t fragdepth_regid = + ir3_find_output_regid(fs, FRAG_RESULT_DEPTH); uint32_t fragdata_regid[8]; - - assert(!fs->color0_mrt); - for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++) { - fragdata_regid[i] = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + i); - if (VALIDREG(fragdata_regid[i])) - output_reg_count = i + 1; + if (fs->color0_mrt) { + fragdata_regid[0] = ir3_find_output_regid(fs, FRAG_RESULT_COLOR); + for (uint32_t i = 1; i < ARRAY_SIZE(fragdata_regid); i++) + fragdata_regid[i] = fragdata_regid[0]; + } else { + for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++) + fragdata_regid[i] = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + i); } - tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 1); - tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) | - A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) | - A6XX_SP_FS_OUTPUT_CNTL0_STENCILREF_REGID(stencilref_regid) | - COND(fs->dual_src_blend, A6XX_SP_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE)); - - /* There is no point in having component enabled which is not written - * by the shader. Per VK spec it is an UB, however a few apps depend on - * attachment not being changed if FS doesn't have corresponding output. - */ - uint32_t fs_render_components = 0; + tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2); + tu_cs_emit( + cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(fragdepth_regid) | 0xfcfc0000); + tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count)); - tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), output_reg_count); - for (uint32_t i = 0; i < output_reg_count; i++) { + tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 8); + for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++) { + // TODO we could have a mix of half and full precision outputs, + // we really need to figure out half-precision from IR3_REG_HALF tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(fragdata_regid[i]) | - (COND(fragdata_regid[i] & HALF_REG_ID, - A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION))); - - if (VALIDREG(fragdata_regid[i])) { - fs_render_components |= 0xf << (i * 4); - } + (false ? A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION : 0)); } - tu_cs_emit_regs(cs, - A6XX_SP_FS_RENDER_COMPONENTS(.dword = fs_render_components)); - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 1); - tu_cs_emit(cs, COND(fs->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) | - COND(fs->writes_smask, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_SAMPMASK) | - COND(fs->writes_stencilref, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_STENCILREF) | - COND(fs->dual_src_blend, A6XX_RB_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE)); + tu_cs_emit_pkt4(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 2); + tu_cs_emit(cs, fs->writes_pos ? 
A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z : 0); + tu_cs_emit(cs, A6XX_RB_FS_OUTPUT_CNTL1_MRT(mrt_count)); - tu_cs_emit_regs(cs, - A6XX_RB_RENDER_COMPONENTS(.dword = fs_render_components)); - - if (pipeline) { - pipeline->lrz.fs.has_kill = fs->has_kill; - pipeline->lrz.fs.early_fragment_tests = fs->fs.early_fragment_tests; - - if (!fs->fs.early_fragment_tests && - (fs->no_earlyz || fs->writes_pos || fs->writes_stencilref || fs->writes_smask)) { - pipeline->lrz.force_late_z = true; - } - - pipeline->lrz.fs.force_early_z = fs->fs.early_fragment_tests; + uint32_t gras_su_depth_plane_cntl = 0; + uint32_t rb_depth_plane_cntl = 0; + if (fs->no_earlyz | fs->writes_pos) { + gras_su_depth_plane_cntl |= A6XX_GRAS_SU_DEPTH_PLANE_CNTL_FRAG_WRITES_Z; + rb_depth_plane_cntl |= A6XX_RB_DEPTH_PLANE_CNTL_FRAG_WRITES_Z; } -} -static void -tu6_emit_vs_params(struct tu_cs *cs, - const struct ir3_const_state *const_state, - unsigned constlen, - unsigned param_stride, - unsigned num_vertices) -{ - uint32_t vs_params[4] = { - param_stride * num_vertices * 4, /* vs primitive stride */ - param_stride * 4, /* vs vertex stride */ - 0, - 0, - }; - uint32_t vs_base = const_state->offsets.primitive_param; - tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, vs_base, SB6_VS_SHADER, 0, - ARRAY_SIZE(vs_params), vs_params); + tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_DEPTH_PLANE_CNTL, 1); + tu_cs_emit(cs, gras_su_depth_plane_cntl); + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_PLANE_CNTL, 1); + tu_cs_emit(cs, rb_depth_plane_cntl); } static void -tu_get_tess_iova(struct tu_device *dev, - uint64_t *tess_factor_iova, - uint64_t *tess_param_iova) -{ - /* Create the shared tess factor BO the first time tess is used on the device. */ - if (!dev->tess_bo) { - mtx_lock(&dev->mutex); - if (!dev->tess_bo) - tu_bo_init_new(dev, &dev->tess_bo, TU_TESS_BO_SIZE, TU_BO_ALLOC_NO_FLAGS, "tess"); - mtx_unlock(&dev->mutex); +tu6_emit_shader_object(struct tu_cs *cs, + gl_shader_stage stage, + const struct ir3_shader_variant *variant, + const struct tu_bo *binary_bo, + uint32_t binary_offset) +{ + uint16_t reg; + uint8_t opcode; + enum a6xx_state_block sb; + switch (stage) { + case MESA_SHADER_VERTEX: + reg = REG_A6XX_SP_VS_OBJ_START_LO; + opcode = CP_LOAD_STATE6_GEOM; + sb = SB6_VS_SHADER; + break; + case MESA_SHADER_TESS_CTRL: + reg = REG_A6XX_SP_HS_OBJ_START_LO; + opcode = CP_LOAD_STATE6_GEOM; + sb = SB6_HS_SHADER; + break; + case MESA_SHADER_TESS_EVAL: + reg = REG_A6XX_SP_DS_OBJ_START_LO; + opcode = CP_LOAD_STATE6_GEOM; + sb = SB6_DS_SHADER; + break; + case MESA_SHADER_GEOMETRY: + reg = REG_A6XX_SP_GS_OBJ_START_LO; + opcode = CP_LOAD_STATE6_GEOM; + sb = SB6_GS_SHADER; + break; + case MESA_SHADER_FRAGMENT: + reg = REG_A6XX_SP_FS_OBJ_START_LO; + opcode = CP_LOAD_STATE6_FRAG; + sb = SB6_FS_SHADER; + break; + case MESA_SHADER_COMPUTE: + reg = REG_A6XX_SP_CS_OBJ_START_LO; + opcode = CP_LOAD_STATE6_FRAG; + sb = SB6_CS_SHADER; + break; + default: + unreachable("invalid gl_shader_stage"); + opcode = CP_LOAD_STATE6_GEOM; + sb = SB6_VS_SHADER; + break; } - *tess_factor_iova = dev->tess_bo->iova; - *tess_param_iova = dev->tess_bo->iova + TU_TESS_FACTOR_SIZE; -} - -void -tu6_emit_patch_control_points(struct tu_cs *cs, - const struct tu_pipeline *pipeline, - unsigned patch_control_points) -{ - if (!(pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT)) + if (!variant->instrlen) { + tu_cs_emit_pkt4(cs, reg, 2); + tu_cs_emit_qw(cs, 0); return; - - struct tu_device *dev = cs->device; - - tu6_emit_vs_params(cs, - 
&pipeline->program.link[MESA_SHADER_VERTEX].const_state, - pipeline->program.link[MESA_SHADER_VERTEX].constlen, - pipeline->program.vs_param_stride, - patch_control_points); - - uint64_t tess_factor_iova, tess_param_iova; - tu_get_tess_iova(dev, &tess_factor_iova, &tess_param_iova); - - uint32_t hs_params[8] = { - pipeline->program.vs_param_stride * patch_control_points * 4, /* hs primitive stride */ - pipeline->program.vs_param_stride * 4, /* hs vertex stride */ - pipeline->program.hs_param_stride, - patch_control_points, - tess_param_iova, - tess_param_iova >> 32, - tess_factor_iova, - tess_factor_iova >> 32, - }; - - const struct ir3_const_state *hs_const = - &pipeline->program.link[MESA_SHADER_TESS_CTRL].const_state; - uint32_t hs_base = hs_const->offsets.primitive_param; - tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, hs_base, SB6_HS_SHADER, 0, - pipeline->program.hs_param_dwords, hs_params); - - uint32_t patch_local_mem_size_16b = - patch_control_points * pipeline->program.vs_param_stride / 4; - - /* Total attribute slots in HS incoming patch. */ - tu_cs_emit_pkt4(cs, REG_A6XX_PC_HS_INPUT_SIZE, 1); - tu_cs_emit(cs, patch_local_mem_size_16b); - - const uint32_t wavesize = 64; - const uint32_t vs_hs_local_mem_size = 16384; - - uint32_t max_patches_per_wave; - if (dev->physical_device->info->a6xx.tess_use_shared) { - /* HS invocations for a patch are always within the same wave, - * making barriers less expensive. VS can't have barriers so we - * don't care about VS invocations being in the same wave. - */ - max_patches_per_wave = wavesize / pipeline->program.hs_vertices_out; - } else { - /* VS is also in the same wave */ - max_patches_per_wave = - wavesize / MAX2(patch_control_points, - pipeline->program.hs_vertices_out); } - uint32_t patches_per_wave = - MIN2(vs_hs_local_mem_size / (patch_local_mem_size_16b * 16), - max_patches_per_wave); - - uint32_t wave_input_size = DIV_ROUND_UP( - patches_per_wave * patch_local_mem_size_16b * 16, 256); + assert(variant->type == stage); - tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); - tu_cs_emit(cs, wave_input_size); - - /* maximum number of patches that can fit in tess factor/param buffers */ - uint32_t subdraw_size = MIN2(TU_TESS_FACTOR_SIZE / ir3_tess_factor_stride(pipeline->tess.patch_type), - TU_TESS_PARAM_SIZE / (pipeline->program.hs_param_stride * 4)); - /* convert from # of patches to draw count */ - subdraw_size *= patch_control_points; - - tu_cs_emit_pkt7(cs, CP_SET_SUBDRAW_SIZE, 1); - tu_cs_emit(cs, subdraw_size); -} + const uint64_t binary_iova = binary_bo->iova + binary_offset; + assert((binary_iova & 0x3) == 0); -static void -tu6_emit_geom_tess_consts(struct tu_cs *cs, - const struct ir3_shader_variant *vs, - const struct ir3_shader_variant *hs, - const struct ir3_shader_variant *ds, - const struct ir3_shader_variant *gs) -{ - struct tu_device *dev = cs->device; - - if (gs && !hs) { - tu6_emit_vs_params(cs, ir3_const_state(vs), vs->constlen, - vs->output_size, gs->gs.vertices_in); - } + tu_cs_emit_pkt4(cs, reg, 2); + tu_cs_emit_qw(cs, binary_iova); - if (hs) { - uint64_t tess_factor_iova, tess_param_iova; - tu_get_tess_iova(dev, &tess_factor_iova, &tess_param_iova); - - uint32_t ds_params[8] = { - gs ? 
ds->output_size * gs->gs.vertices_in * 4 : 0, /* ds primitive stride */ - ds->output_size * 4, /* ds vertex stride */ - hs->output_size, /* hs vertex stride (dwords) */ - hs->tess.tcs_vertices_out, - tess_param_iova, - tess_param_iova >> 32, - tess_factor_iova, - tess_factor_iova >> 32, - }; - - uint32_t ds_base = ds->const_state->offsets.primitive_param; - uint32_t ds_param_dwords = MIN2((ds->constlen - ds_base) * 4, ARRAY_SIZE(ds_params)); - tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, ds_base, SB6_DS_SHADER, 0, - ds_param_dwords, ds_params); - } + /* always indirect */ + const bool indirect = true; + if (indirect) { + tu_cs_emit_pkt7(cs, opcode, 3); + tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(sb) | + CP_LOAD_STATE6_0_NUM_UNIT(variant->instrlen)); + tu_cs_emit_qw(cs, binary_iova); + } else { + const void *binary = binary_bo->map + binary_offset; - if (gs) { - const struct ir3_shader_variant *prev = ds ? ds : vs; - uint32_t gs_params[4] = { - prev->output_size * gs->gs.vertices_in * 4, /* gs primitive stride */ - prev->output_size * 4, /* gs vertex stride */ - 0, - 0, - }; - uint32_t gs_base = gs->const_state->offsets.primitive_param; - tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, gs_base, SB6_GS_SHADER, 0, - ARRAY_SIZE(gs_params), gs_params); + tu_cs_emit_pkt7(cs, opcode, 3 + variant->info.sizedwords); + tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(sb) | + CP_LOAD_STATE6_0_NUM_UNIT(variant->instrlen)); + tu_cs_emit_qw(cs, 0); + tu_cs_emit_array(cs, binary, variant->info.sizedwords); } } static void -tu6_emit_program_config(struct tu_cs *cs, - struct tu_pipeline_builder *builder) +tu6_emit_program(struct tu_cs *cs, + const struct tu_pipeline_builder *builder, + const struct tu_bo *binary_bo, + bool binning_pass) { - gl_shader_stage stage = MESA_SHADER_VERTEX; - - STATIC_ASSERT(MESA_SHADER_VERTEX == 0); - - bool shared_consts_enable = tu6_shared_constants_enable(&builder->layout, - builder->device->compiler); - tu6_emit_shared_consts_enable(cs, shared_consts_enable); - - tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD( - .vs_state = true, - .hs_state = true, - .ds_state = true, - .gs_state = true, - .fs_state = true, - .gfx_ibo = true, - .gfx_shared_const = shared_consts_enable)); - for (; stage < ARRAY_SIZE(builder->shader_iova); stage++) { - tu6_emit_xs_config(cs, stage, builder->variants[stage]); - } + static const struct ir3_shader_variant dummy_variant = { + .type = MESA_SHADER_NONE + }; + assert(builder->shaders[MESA_SHADER_VERTEX]); + const struct ir3_shader_variant *vs = + &builder->shaders[MESA_SHADER_VERTEX]->variants[0]; + const struct ir3_shader_variant *hs = + builder->shaders[MESA_SHADER_TESS_CTRL] + ? &builder->shaders[MESA_SHADER_TESS_CTRL]->variants[0] + : &dummy_variant; + const struct ir3_shader_variant *ds = + builder->shaders[MESA_SHADER_TESS_EVAL] + ? &builder->shaders[MESA_SHADER_TESS_EVAL]->variants[0] + : &dummy_variant; + const struct ir3_shader_variant *gs = + builder->shaders[MESA_SHADER_GEOMETRY] + ? &builder->shaders[MESA_SHADER_GEOMETRY]->variants[0] + : &dummy_variant; + const struct ir3_shader_variant *fs = + builder->shaders[MESA_SHADER_FRAGMENT] + ? 
&builder->shaders[MESA_SHADER_FRAGMENT]->variants[0] + : &dummy_variant; + + if (binning_pass) { + vs = &builder->shaders[MESA_SHADER_VERTEX]->variants[1]; + fs = &dummy_variant; + } + + tu6_emit_vs_config(cs, vs); + tu6_emit_hs_config(cs, hs); + tu6_emit_ds_config(cs, ds); + tu6_emit_gs_config(cs, gs); + tu6_emit_fs_config(cs, fs); + + tu6_emit_vs_system_values(cs, vs); + tu6_emit_vpc(cs, vs, fs, binning_pass); + tu6_emit_vpc_varying_modes(cs, fs, binning_pass); + tu6_emit_fs_system_values(cs, fs); + tu6_emit_fs_inputs(cs, fs); + tu6_emit_fs_outputs(cs, fs, builder->color_attachment_count); + + tu6_emit_shader_object(cs, MESA_SHADER_VERTEX, vs, binary_bo, + builder->shader_offsets[MESA_SHADER_VERTEX]); + + tu6_emit_shader_object(cs, MESA_SHADER_FRAGMENT, fs, binary_bo, + builder->shader_offsets[MESA_SHADER_FRAGMENT]); } static void -tu6_emit_program(struct tu_cs *cs, - struct tu_pipeline_builder *builder, - bool binning_pass, - struct tu_pipeline *pipeline) -{ - const struct ir3_shader_variant *vs = builder->variants[MESA_SHADER_VERTEX]; - const struct ir3_shader_variant *bs = builder->binning_variant; - const struct ir3_shader_variant *hs = builder->variants[MESA_SHADER_TESS_CTRL]; - const struct ir3_shader_variant *ds = builder->variants[MESA_SHADER_TESS_EVAL]; - const struct ir3_shader_variant *gs = builder->variants[MESA_SHADER_GEOMETRY]; - const struct ir3_shader_variant *fs = builder->variants[MESA_SHADER_FRAGMENT]; - gl_shader_stage stage = MESA_SHADER_VERTEX; - bool multi_pos_output = vs->multi_pos_output; - - /* Don't use the binning pass variant when GS is present because we don't - * support compiling correct binning pass variants with GS. - */ - if (binning_pass && !gs) { - vs = bs; - tu6_emit_xs(cs, stage, bs, &builder->pvtmem, builder->binning_vs_iova); - tu6_emit_dynamic_offset(cs, bs, builder); - stage++; - } - - for (; stage < ARRAY_SIZE(builder->shader_iova); stage++) { - const struct ir3_shader_variant *xs = builder->variants[stage]; - - if (stage == MESA_SHADER_FRAGMENT && binning_pass) - fs = xs = NULL; +tu6_emit_vertex_input(struct tu_cs *cs, + const struct ir3_shader_variant *vs, + const VkPipelineVertexInputStateCreateInfo *vi_info, + uint8_t bindings[MAX_VERTEX_ATTRIBS], + uint16_t strides[MAX_VERTEX_ATTRIBS], + uint16_t offsets[MAX_VERTEX_ATTRIBS], + uint32_t *count) +{ + uint32_t vfd_decode_idx = 0; + + /* why do we go beyond inputs_count? */ + assert(vs->inputs_count + 1 <= MAX_VERTEX_ATTRIBS); + for (uint32_t i = 0; i <= vs->inputs_count; i++) { + if (vs->inputs[i].sysval || !vs->inputs[i].compmask) + continue; - tu6_emit_xs(cs, stage, xs, &builder->pvtmem, builder->shader_iova[stage]); - tu6_emit_dynamic_offset(cs, xs, builder); - } + const VkVertexInputAttributeDescription *vi_attr = + tu_find_vertex_input_attribute(vi_info, vs->inputs[i].slot); + const VkVertexInputBindingDescription *vi_binding = + tu_find_vertex_input_binding(vi_info, vi_attr); + assert(vi_attr && vi_binding); - uint32_t multiview_views = util_logbase2(pipeline->rast.multiview_mask) + 1; - uint32_t multiview_cntl = pipeline->rast.multiview_mask ? - A6XX_PC_MULTIVIEW_CNTL_ENABLE | - A6XX_PC_MULTIVIEW_CNTL_VIEWS(multiview_views) | - COND(!multi_pos_output, A6XX_PC_MULTIVIEW_CNTL_DISABLEMULTIPOS) - : 0; + const struct tu_native_format *format = + tu6_get_native_format(vi_attr->format); + assert(format && format->vtx >= 0); - /* Copy what the blob does here. This will emit an extra 0x3f - * CP_EVENT_WRITE when multiview is disabled. I'm not exactly sure what - * this is working around yet. 
- */ - if (builder->device->physical_device->info->a6xx.has_cp_reg_write) { - tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3); - tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE)); - tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL); - } else { - tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_CNTL, 1); - } - tu_cs_emit(cs, multiview_cntl); + uint32_t vfd_decode = A6XX_VFD_DECODE_INSTR_IDX(vfd_decode_idx) | + A6XX_VFD_DECODE_INSTR_FORMAT(format->vtx) | + A6XX_VFD_DECODE_INSTR_SWAP(format->swap) | + A6XX_VFD_DECODE_INSTR_UNK30; + if (vi_binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE) + vfd_decode |= A6XX_VFD_DECODE_INSTR_INSTANCED; + if (!vk_format_is_int(vi_attr->format)) + vfd_decode |= A6XX_VFD_DECODE_INSTR_FLOAT; - tu_cs_emit_pkt4(cs, REG_A6XX_VFD_MULTIVIEW_CNTL, 1); - tu_cs_emit(cs, multiview_cntl); + const uint32_t vfd_decode_step_rate = 1; - if (multiview_cntl && - builder->device->physical_device->info->a6xx.supports_multiview_mask) { - tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_MASK, 1); - tu_cs_emit(cs, pipeline->rast.multiview_mask); - } + const uint32_t vfd_dest_cntl = + A6XX_VFD_DEST_CNTL_INSTR_WRITEMASK(vs->inputs[i].compmask) | + A6XX_VFD_DEST_CNTL_INSTR_REGID(vs->inputs[i].regid); - tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); - tu_cs_emit(cs, 0); + tu_cs_emit_pkt4(cs, REG_A6XX_VFD_DECODE(vfd_decode_idx), 2); + tu_cs_emit(cs, vfd_decode); + tu_cs_emit(cs, vfd_decode_step_rate); - tu6_emit_vfd_dest(cs, vs); + tu_cs_emit_pkt4(cs, REG_A6XX_VFD_DEST_CNTL(vfd_decode_idx), 1); + tu_cs_emit(cs, vfd_dest_cntl); - tu6_emit_vpc(cs, vs, hs, ds, gs, fs); + bindings[vfd_decode_idx] = vi_binding->binding; + strides[vfd_decode_idx] = vi_binding->stride; + offsets[vfd_decode_idx] = vi_attr->offset; - if (fs) { - tu6_emit_fs_inputs(cs, fs); - tu6_emit_fs_outputs(cs, fs, pipeline); - } else { - /* TODO: check if these can be skipped if fs is disabled */ - struct ir3_shader_variant dummy_variant = {}; - tu6_emit_fs_inputs(cs, &dummy_variant); - tu6_emit_fs_outputs(cs, &dummy_variant, NULL); + vfd_decode_idx++; } - if (gs || hs) { - tu6_emit_geom_tess_consts(cs, vs, hs, ds, gs); - } + tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_0, 1); + tu_cs_emit( + cs, A6XX_VFD_CONTROL_0_VTXCNT(vfd_decode_idx) | (vfd_decode_idx << 8)); + + *count = vfd_decode_idx; } -void -tu6_emit_vertex_input(struct tu_cs *cs, - uint32_t binding_count, - const VkVertexInputBindingDescription2EXT *bindings, - uint32_t unsorted_attr_count, - const VkVertexInputAttributeDescription2EXT *unsorted_attrs) +static uint32_t +tu6_guardband_adj(uint32_t v) { - uint32_t binding_instanced = 0; /* bitmask of instanced bindings */ - uint32_t step_rate[MAX_VBS]; - - for (uint32_t i = 0; i < binding_count; i++) { - const VkVertexInputBindingDescription2EXT *binding = &bindings[i]; - - if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE) - binding_instanced |= 1u << binding->binding; - - step_rate[binding->binding] = binding->divisor; - } - - const VkVertexInputAttributeDescription2EXT *attrs[MAX_VERTEX_ATTRIBS] = { }; - unsigned attr_count = 0; - for (uint32_t i = 0; i < unsorted_attr_count; i++) { - const VkVertexInputAttributeDescription2EXT *attr = &unsorted_attrs[i]; - attrs[attr->location] = attr; - attr_count = MAX2(attr_count, attr->location + 1); - } - - if (attr_count != 0) - tu_cs_emit_pkt4(cs, REG_A6XX_VFD_DECODE_INSTR(0), attr_count * 2); - - for (uint32_t loc = 0; loc < attr_count; loc++) { - const VkVertexInputAttributeDescription2EXT *attr = attrs[loc]; - - if (attr) { - const struct tu_native_format format = 
tu6_format_vtx(attr->format); - tu_cs_emit(cs, A6XX_VFD_DECODE_INSTR(0, - .idx = attr->binding, - .offset = attr->offset, - .instanced = binding_instanced & (1 << attr->binding), - .format = format.fmt, - .swap = format.swap, - .unk30 = 1, - ._float = !vk_format_is_int(attr->format)).value); - tu_cs_emit(cs, A6XX_VFD_DECODE_STEP_RATE(0, step_rate[attr->binding]).value); - } else { - tu_cs_emit(cs, 0); - tu_cs_emit(cs, 0); - } - } + if (v > 256) + return (uint32_t)(511.0 - 65.0 * (log2(v) - 8.0)); + else + return 511; } void -tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewports, uint32_t num_viewport, - bool z_negative_one_to_one) -{ - VkExtent2D guardband = {511, 511}; - - tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_VPORT_XOFFSET(0), num_viewport * 6); - for (uint32_t i = 0; i < num_viewport; i++) { - const VkViewport *viewport = &viewports[i]; - float offsets[3]; - float scales[3]; - scales[0] = viewport->width / 2.0f; - scales[1] = viewport->height / 2.0f; - if (z_negative_one_to_one) { - scales[2] = 0.5 * (viewport->maxDepth - viewport->minDepth); - } else { - scales[2] = viewport->maxDepth - viewport->minDepth; - } - - offsets[0] = viewport->x + scales[0]; - offsets[1] = viewport->y + scales[1]; - if (z_negative_one_to_one) { - offsets[2] = 0.5 * (viewport->minDepth + viewport->maxDepth); - } else { - offsets[2] = viewport->minDepth; - } - - for (uint32_t j = 0; j < 3; j++) { - tu_cs_emit(cs, fui(offsets[j])); - tu_cs_emit(cs, fui(scales[j])); - } - - guardband.width = - MIN2(guardband.width, fd_calc_guardband(offsets[0], scales[0], false)); - guardband.height = - MIN2(guardband.height, fd_calc_guardband(offsets[1], scales[1], false)); - } - - tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0), num_viewport * 2); - for (uint32_t i = 0; i < num_viewport; i++) { - const VkViewport *viewport = &viewports[i]; - VkOffset2D min; - VkOffset2D max; - min.x = (int32_t) viewport->x; - max.x = (int32_t) ceilf(viewport->x + viewport->width); - if (viewport->height >= 0.0f) { - min.y = (int32_t) viewport->y; - max.y = (int32_t) ceilf(viewport->y + viewport->height); - } else { - min.y = (int32_t)(viewport->y + viewport->height); - max.y = (int32_t) ceilf(viewport->y); - } - /* the spec allows viewport->height to be 0.0f */ - if (min.y == max.y) - max.y++; - /* allow viewport->width = 0.0f for un-initialized viewports: */ - if (min.x == max.x) - max.x++; - - min.x = MAX2(min.x, 0); - min.y = MAX2(min.y, 0); - max.x = MAX2(max.x, 1); - max.y = MAX2(max.y, 1); - - assert(min.x < max.x); - assert(min.y < max.y); - - tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_X(min.x) | - A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_Y(min.y)); - tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_X(max.x - 1) | - A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_Y(max.y - 1)); - } +tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewport) +{ + float offsets[3]; + float scales[3]; + scales[0] = viewport->width / 2.0f; + scales[1] = viewport->height / 2.0f; + scales[2] = viewport->maxDepth - viewport->minDepth; + offsets[0] = viewport->x + scales[0]; + offsets[1] = viewport->y + scales[1]; + offsets[2] = viewport->minDepth; + + VkOffset2D min; + VkOffset2D max; + min.x = (int32_t) viewport->x; + max.x = (int32_t) ceilf(viewport->x + viewport->width); + if (viewport->height >= 0.0f) { + min.y = (int32_t) viewport->y; + max.y = (int32_t) ceilf(viewport->y + viewport->height); + } else { + min.y = (int32_t)(viewport->y + viewport->height); + max.y = (int32_t) ceilf(viewport->y); + } + /* the spec allows viewport->height to be 0.0f */ 
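The viewport emission in this hunk boils down to an affine scale/offset per axis plus a guardband estimate. Here is a self-contained sketch of that arithmetic (plain C, not driver code): the depth handling for both the [0,1] and [-1,1] conventions and the 511 / 65*log2 guardband formula come from the hunk, while the viewport struct and function names are illustrative.

#include <math.h>
#include <stdbool.h>
#include <stdio.h>

/* Minimal stand-in for VkViewport; field names follow the Vulkan struct. */
struct viewport {
   float x, y, width, height, min_depth, max_depth;
};

/* Guardband estimate matching tu6_guardband_adj() in the hunk: the full 511
 * units up to a 256-pixel extent, then shrinking logarithmically. */
static unsigned
guardband_adj(unsigned v)
{
   return v > 256 ? (unsigned)(511.0 - 65.0 * (log2(v) - 8.0)) : 511;
}

/* Hardware scale/offset pairs for one viewport.  When z_negative_one_to_one
 * is set (depth clip control style), depth is remapped from [-1, 1] instead
 * of [0, 1], which halves the scale and centers the offset. */
static void
viewport_transform(const struct viewport *vp, bool z_negative_one_to_one,
                   float scales[3], float offsets[3])
{
   scales[0] = vp->width / 2.0f;
   scales[1] = vp->height / 2.0f;
   scales[2] = z_negative_one_to_one ? 0.5f * (vp->max_depth - vp->min_depth)
                                     : vp->max_depth - vp->min_depth;

   offsets[0] = vp->x + scales[0];
   offsets[1] = vp->y + scales[1];
   offsets[2] = z_negative_one_to_one ? 0.5f * (vp->min_depth + vp->max_depth)
                                      : vp->min_depth;
}

int
main(void)
{
   const struct viewport vp = { 0, 0, 1920, 1080, 0.0f, 1.0f };
   float scales[3], offsets[3];

   viewport_transform(&vp, false, scales, offsets);
   /* Prints xscale=960.0 xoffset=960.0 guardband=322 for a 1920-wide view. */
   printf("xscale=%.1f xoffset=%.1f guardband=%u\n",
          scales[0], offsets[0], guardband_adj(1920));
   return 0;
}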
+ if (min.y == max.y) + max.y++; + assert(min.x >= 0 && min.x < max.x); + assert(min.y >= 0 && min.y < max.y); + + VkExtent2D guardband_adj; + guardband_adj.width = tu6_guardband_adj(max.x - min.x); + guardband_adj.height = tu6_guardband_adj(max.y - min.y); + + tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_VPORT_XOFFSET_0, 6); + tu_cs_emit(cs, A6XX_GRAS_CL_VPORT_XOFFSET_0(offsets[0])); + tu_cs_emit(cs, A6XX_GRAS_CL_VPORT_XSCALE_0(scales[0])); + tu_cs_emit(cs, A6XX_GRAS_CL_VPORT_YOFFSET_0(offsets[1])); + tu_cs_emit(cs, A6XX_GRAS_CL_VPORT_YSCALE_0(scales[1])); + tu_cs_emit(cs, A6XX_GRAS_CL_VPORT_ZOFFSET_0(offsets[2])); + tu_cs_emit(cs, A6XX_GRAS_CL_VPORT_ZSCALE_0(scales[2])); + + tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0, 2); + tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0_X(min.x) | + A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0_Y(min.y)); + tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0_X(max.x - 1) | + A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0_Y(max.y - 1)); - tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_Z_CLAMP(0), num_viewport * 2); - for (uint32_t i = 0; i < num_viewport; i++) { - const VkViewport *viewport = &viewports[i]; - tu_cs_emit(cs, fui(MIN2(viewport->minDepth, viewport->maxDepth))); - tu_cs_emit(cs, fui(MAX2(viewport->minDepth, viewport->maxDepth))); - } tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ, 1); - tu_cs_emit(cs, A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_HORZ(guardband.width) | - A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_VERT(guardband.height)); - - /* TODO: what to do about this and multi viewport ? */ - float z_clamp_min = num_viewport ? MIN2(viewports[0].minDepth, viewports[0].maxDepth) : 0; - float z_clamp_max = num_viewport ? MAX2(viewports[0].minDepth, viewports[0].maxDepth) : 0; - - tu_cs_emit_regs(cs, - A6XX_RB_Z_CLAMP_MIN(z_clamp_min), - A6XX_RB_Z_CLAMP_MAX(z_clamp_max)); + tu_cs_emit(cs, + A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_HORZ(guardband_adj.width) | + A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_VERT(guardband_adj.height)); } void -tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scissors, uint32_t scissor_count) +tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scissor) { - tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0), scissor_count * 2); - - for (uint32_t i = 0; i < scissor_count; i++) { - const VkRect2D *scissor = &scissors[i]; - - uint32_t min_x = scissor->offset.x; - uint32_t min_y = scissor->offset.y; - uint32_t max_x = min_x + scissor->extent.width - 1; - uint32_t max_y = min_y + scissor->extent.height - 1; - - if (!scissor->extent.width || !scissor->extent.height) { - min_x = min_y = 1; - max_x = max_y = 0; - } else { - /* avoid overflow */ - uint32_t scissor_max = BITFIELD_MASK(15); - min_x = MIN2(scissor_max, min_x); - min_y = MIN2(scissor_max, min_y); - max_x = MIN2(scissor_max, max_x); - max_y = MIN2(scissor_max, max_y); - } + const VkOffset2D min = scissor->offset; + const VkOffset2D max = { + scissor->offset.x + scissor->extent.width, + scissor->offset.y + scissor->extent.height, + }; - tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_TL_X(min_x) | - A6XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(min_y)); - tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_BR_X(max_x) | - A6XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(max_y)); - } + tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0, 2); + tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0_X(min.x) | + A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0_Y(min.y)); + tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0_X(max.x - 1) | + A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0_Y(max.y - 1)); } -void -tu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT 
*samp_loc) +static void +tu6_emit_gras_unknowns(struct tu_cs *cs) { - if (!samp_loc) { - tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CONFIG, 1); - tu_cs_emit(cs, 0); - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CONFIG, 1); - tu_cs_emit(cs, 0); - - tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_CONFIG, 1); - tu_cs_emit(cs, 0); - return; - } - - assert(samp_loc->sampleLocationsPerPixel == samp_loc->sampleLocationsCount); - assert(samp_loc->sampleLocationGridSize.width == 1); - assert(samp_loc->sampleLocationGridSize.height == 1); - - uint32_t sample_config = - A6XX_RB_SAMPLE_CONFIG_LOCATION_ENABLE; - uint32_t sample_locations = 0; - for (uint32_t i = 0; i < samp_loc->sampleLocationsCount; i++) { - sample_locations |= - (A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_X(samp_loc->pSampleLocations[i].x) | - A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_Y(samp_loc->pSampleLocations[i].y)) << i*8; - } - - tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CONFIG, 2); - tu_cs_emit(cs, sample_config); - tu_cs_emit(cs, sample_locations); - - tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CONFIG, 2); - tu_cs_emit(cs, sample_config); - tu_cs_emit(cs, sample_locations); + tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_UNKNOWN_8000, 1); + tu_cs_emit(cs, 0x80); + tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_UNKNOWN_8001, 1); + tu_cs_emit(cs, 0x0); + tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_UNKNOWN_8004, 1); + tu_cs_emit(cs, 0x0); +} - tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_CONFIG, 2); - tu_cs_emit(cs, sample_config); - tu_cs_emit(cs, sample_locations); +static void +tu6_emit_point_size(struct tu_cs *cs) +{ + tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_POINT_MINMAX, 2); + tu_cs_emit(cs, A6XX_GRAS_SU_POINT_MINMAX_MIN(1.0f / 16.0f) | + A6XX_GRAS_SU_POINT_MINMAX_MAX(4092.0f)); + tu_cs_emit(cs, A6XX_GRAS_SU_POINT_SIZE(1.0f)); } static uint32_t tu6_gras_su_cntl(const VkPipelineRasterizationStateCreateInfo *rast_info, - enum a5xx_line_mode line_mode, - bool multiview) + VkSampleCountFlagBits samples) { uint32_t gras_su_cntl = 0; @@ -2194,33 +1114,117 @@ tu6_gras_su_cntl(const VkPipelineRasterizationStateCreateInfo *rast_info, if (rast_info->frontFace == VK_FRONT_FACE_CLOCKWISE) gras_su_cntl |= A6XX_GRAS_SU_CNTL_FRONT_CW; - gras_su_cntl |= - A6XX_GRAS_SU_CNTL_LINEHALFWIDTH(rast_info->lineWidth / 2.0f); + /* don't set A6XX_GRAS_SU_CNTL_LINEHALFWIDTH */ if (rast_info->depthBiasEnable) gras_su_cntl |= A6XX_GRAS_SU_CNTL_POLY_OFFSET; - gras_su_cntl |= A6XX_GRAS_SU_CNTL_LINE_MODE(line_mode); - - if (multiview) { - gras_su_cntl |= - A6XX_GRAS_SU_CNTL_UNK17 | - A6XX_GRAS_SU_CNTL_MULTIVIEW_ENABLE; - } + if (samples > VK_SAMPLE_COUNT_1_BIT) + gras_su_cntl |= A6XX_GRAS_SU_CNTL_MSAA_ENABLE; return gras_su_cntl; } void +tu6_emit_gras_su_cntl(struct tu_cs *cs, + uint32_t gras_su_cntl, + float line_width) +{ + assert((gras_su_cntl & A6XX_GRAS_SU_CNTL_LINEHALFWIDTH__MASK) == 0); + gras_su_cntl |= A6XX_GRAS_SU_CNTL_LINEHALFWIDTH(line_width / 2.0f); + + tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_CNTL, 1); + tu_cs_emit(cs, gras_su_cntl); +} + +void tu6_emit_depth_bias(struct tu_cs *cs, float constant_factor, float clamp, float slope_factor) { tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_POLY_OFFSET_SCALE, 3); - tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_SCALE(slope_factor).value); - tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET(constant_factor).value); - tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET_CLAMP(clamp).value); + tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_SCALE(slope_factor)); + tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET(constant_factor)); + tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET_CLAMP(clamp)); +} + +static void 
+tu6_emit_alpha_control_disable(struct tu_cs *cs) +{ + tu_cs_emit_pkt4(cs, REG_A6XX_RB_ALPHA_CONTROL, 1); + tu_cs_emit(cs, 0); +} + +static void +tu6_emit_depth_control(struct tu_cs *cs, + const VkPipelineDepthStencilStateCreateInfo *ds_info) +{ + assert(!ds_info->depthBoundsTestEnable); + + uint32_t rb_depth_cntl = 0; + if (ds_info->depthTestEnable) { + rb_depth_cntl |= + A6XX_RB_DEPTH_CNTL_Z_ENABLE | + A6XX_RB_DEPTH_CNTL_ZFUNC(tu6_compare_func(ds_info->depthCompareOp)) | + A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE; + + if (ds_info->depthWriteEnable) + rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE; + } + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_CNTL, 1); + tu_cs_emit(cs, rb_depth_cntl); +} + +static void +tu6_emit_stencil_control(struct tu_cs *cs, + const VkPipelineDepthStencilStateCreateInfo *ds_info) +{ + uint32_t rb_stencil_control = 0; + if (ds_info->stencilTestEnable) { + const VkStencilOpState *front = &ds_info->front; + const VkStencilOpState *back = &ds_info->back; + rb_stencil_control |= + A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE | + A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF | + A6XX_RB_STENCIL_CONTROL_STENCIL_READ | + A6XX_RB_STENCIL_CONTROL_FUNC(tu6_compare_func(front->compareOp)) | + A6XX_RB_STENCIL_CONTROL_FAIL(tu6_stencil_op(front->failOp)) | + A6XX_RB_STENCIL_CONTROL_ZPASS(tu6_stencil_op(front->passOp)) | + A6XX_RB_STENCIL_CONTROL_ZFAIL(tu6_stencil_op(front->depthFailOp)) | + A6XX_RB_STENCIL_CONTROL_FUNC_BF(tu6_compare_func(back->compareOp)) | + A6XX_RB_STENCIL_CONTROL_FAIL_BF(tu6_stencil_op(back->failOp)) | + A6XX_RB_STENCIL_CONTROL_ZPASS_BF(tu6_stencil_op(back->passOp)) | + A6XX_RB_STENCIL_CONTROL_ZFAIL_BF(tu6_stencil_op(back->depthFailOp)); + } + + tu_cs_emit_pkt4(cs, REG_A6XX_RB_STENCIL_CONTROL, 1); + tu_cs_emit(cs, rb_stencil_control); +} + +void +tu6_emit_stencil_compare_mask(struct tu_cs *cs, uint32_t front, uint32_t back) +{ + tu_cs_emit_pkt4(cs, REG_A6XX_RB_STENCILMASK, 1); + tu_cs_emit( + cs, A6XX_RB_STENCILMASK_MASK(front) | A6XX_RB_STENCILMASK_BFMASK(back)); +} + +void +tu6_emit_stencil_write_mask(struct tu_cs *cs, uint32_t front, uint32_t back) +{ + tu_cs_emit_pkt4(cs, REG_A6XX_RB_STENCILWRMASK, 1); + tu_cs_emit(cs, A6XX_RB_STENCILWRMASK_WRMASK(front) | + A6XX_RB_STENCILWRMASK_BFWRMASK(back)); +} + +void +tu6_emit_stencil_reference(struct tu_cs *cs, uint32_t front, uint32_t back) +{ + tu_cs_emit_pkt4(cs, REG_A6XX_RB_STENCILREF, 1); + tu_cs_emit(cs, + A6XX_RB_STENCILREF_REF(front) | A6XX_RB_STENCILREF_BFREF(back)); } static uint32_t @@ -2251,11 +1255,18 @@ tu6_rb_mrt_blend_control(const VkPipelineColorBlendAttachmentState *att, static uint32_t tu6_rb_mrt_control(const VkPipelineColorBlendAttachmentState *att, uint32_t rb_mrt_control_rop, + bool is_int, bool has_alpha) { uint32_t rb_mrt_control = A6XX_RB_MRT_CONTROL_COMPONENT_ENABLE(att->colorWriteMask); + /* ignore blending and logic op for integer attachments */ + if (is_int) { + rb_mrt_control |= A6XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY); + return rb_mrt_control; + } + rb_mrt_control |= rb_mrt_control_rop; if (att->blendEnable) { @@ -2268,44 +1279,23 @@ tu6_rb_mrt_control(const VkPipelineColorBlendAttachmentState *att, return rb_mrt_control; } -uint32_t -tu6_rb_mrt_control_rop(VkLogicOp op, bool *rop_reads_dst) -{ - *rop_reads_dst = tu_logic_op_reads_dst(op); - return A6XX_RB_MRT_CONTROL_ROP_ENABLE | - A6XX_RB_MRT_CONTROL_ROP_CODE(tu6_rop(op)); -} - static void -tu6_emit_rb_mrt_controls(struct tu_pipeline *pipeline, +tu6_emit_rb_mrt_controls(struct tu_cs *cs, const VkPipelineColorBlendStateCreateInfo *blend_info, const 
VkFormat attachment_formats[MAX_RTS], - bool *rop_reads_dst, - uint32_t *color_bandwidth_per_sample) + uint32_t *blend_enable_mask) { - const VkPipelineColorWriteCreateInfoEXT *color_info = - vk_find_struct_const(blend_info->pNext, - PIPELINE_COLOR_WRITE_CREATE_INFO_EXT); - - /* The static state is ignored if it's dynamic. In that case assume - * everything is enabled and then the appropriate registers will be zero'd - * dynamically. - */ - if (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_COLOR_WRITE_ENABLE)) - color_info = NULL; - - *rop_reads_dst = false; - *color_bandwidth_per_sample = 0; + *blend_enable_mask = 0; + bool rop_reads_dst = false; uint32_t rb_mrt_control_rop = 0; if (blend_info->logicOpEnable) { - pipeline->blend.logic_op_enabled = true; - rb_mrt_control_rop = tu6_rb_mrt_control_rop(blend_info->logicOp, - rop_reads_dst); + rop_reads_dst = tu_logic_op_reads_dst(blend_info->logicOp); + rb_mrt_control_rop = + A6XX_RB_MRT_CONTROL_ROP_ENABLE | + A6XX_RB_MRT_CONTROL_ROP_CODE(tu6_rop(blend_info->logicOp)); } - uint32_t total_bpp = 0; - pipeline->blend.num_rts = blend_info->attachmentCount; for (uint32_t i = 0; i < blend_info->attachmentCount; i++) { const VkPipelineColorBlendAttachmentState *att = &blend_info->pAttachments[i]; @@ -2313,1273 +1303,179 @@ tu6_emit_rb_mrt_controls(struct tu_pipeline *pipeline, uint32_t rb_mrt_control = 0; uint32_t rb_mrt_blend_control = 0; - if (format != VK_FORMAT_UNDEFINED && - (!color_info || color_info->pColorWriteEnables[i])) { + if (format != VK_FORMAT_UNDEFINED) { + const bool is_int = vk_format_is_int(format); const bool has_alpha = vk_format_has_alpha(format); rb_mrt_control = - tu6_rb_mrt_control(att, rb_mrt_control_rop, has_alpha); + tu6_rb_mrt_control(att, rb_mrt_control_rop, is_int, has_alpha); rb_mrt_blend_control = tu6_rb_mrt_blend_control(att, has_alpha); - /* calculate bpp based on format and write mask */ - uint32_t write_bpp = 0; - if (att->colorWriteMask == 0xf) { - write_bpp = vk_format_get_blocksizebits(format); - } else { - const enum pipe_format pipe_format = vk_format_to_pipe_format(format); - for (uint32_t i = 0; i < 4; i++) { - if (att->colorWriteMask & (1 << i)) { - write_bpp += util_format_get_component_bits(pipe_format, - UTIL_FORMAT_COLORSPACE_RGB, i); - } - } - } - total_bpp += write_bpp; - - pipeline->blend.color_write_enable |= BIT(i); - if (att->blendEnable) - pipeline->blend.blend_enable |= BIT(i); - - if (att->blendEnable || *rop_reads_dst) { - total_bpp += write_bpp; - } + if (att->blendEnable || rop_reads_dst) + *blend_enable_mask |= 1 << i; } - pipeline->blend.rb_mrt_control[i] = rb_mrt_control & pipeline->blend.rb_mrt_control_mask; - pipeline->blend.rb_mrt_blend_control[i] = rb_mrt_blend_control; + tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_CONTROL(i), 2); + tu_cs_emit(cs, rb_mrt_control); + tu_cs_emit(cs, rb_mrt_blend_control); } - *color_bandwidth_per_sample = total_bpp / 8; + for (uint32_t i = blend_info->attachmentCount; i < MAX_RTS; i++) { + tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_CONTROL(i), 2); + tu_cs_emit(cs, 0); + tu_cs_emit(cs, 0); + } } static void -tu6_emit_blend_control(struct tu_pipeline *pipeline, +tu6_emit_blend_control(struct tu_cs *cs, uint32_t blend_enable_mask, - bool dual_src_blend, const VkPipelineMultisampleStateCreateInfo *msaa_info) { - const uint32_t sample_mask = - msaa_info->pSampleMask ? 
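The bandwidth accounting in tu6_emit_rb_mrt_controls() in this hunk is simple arithmetic over the write mask and blend state. A standalone sketch follows, using R8G8B8A8_UNORM component widths purely as example data; the struct and function names here are hypothetical, not part of the driver.

#include <stdint.h>
#include <stdio.h>

/* Per-component bit widths of one render target format; the values in
 * main() describe R8G8B8A8_UNORM and are only an example. */
struct rt_format { unsigned comp_bits[4]; unsigned block_bits; };

/* Bytes of color bandwidth one sample costs for one attachment, following
 * the accounting in the hunk: a full write mask uses the whole block size,
 * a partial mask only the enabled components, and blending (or a logic op
 * that reads the destination) doubles the traffic. */
static unsigned
color_bandwidth_per_sample(const struct rt_format *fmt, unsigned write_mask,
                           int blend_enable, int rop_reads_dst)
{
   unsigned write_bpp = 0;

   if (write_mask == 0xf) {
      write_bpp = fmt->block_bits;
   } else {
      for (unsigned i = 0; i < 4; i++)
         if (write_mask & (1u << i))
            write_bpp += fmt->comp_bits[i];
   }

   unsigned total_bpp = write_bpp;
   if (blend_enable || rop_reads_dst)
      total_bpp += write_bpp;     /* destination is read back */

   return total_bpp / 8;
}

int
main(void)
{
   const struct rt_format rgba8 = { { 8, 8, 8, 8 }, 32 };

   /* RGB-only writes, no blending: 24 bits -> 3 bytes per sample. */
   printf("%u\n", color_bandwidth_per_sample(&rgba8, 0x7, 0, 0));
   /* Full mask with blending: 32 written + 32 read -> 8 bytes per sample. */
   printf("%u\n", color_bandwidth_per_sample(&rgba8, 0xf, 1, 0));
   return 0;
}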
(*msaa_info->pSampleMask & 0xffff) - : ((1 << msaa_info->rasterizationSamples) - 1); + assert(!msaa_info->sampleShadingEnable); + assert(!msaa_info->alphaToOneEnable); + uint32_t sp_blend_cntl = A6XX_SP_BLEND_CNTL_UNK8; + if (blend_enable_mask) + sp_blend_cntl |= A6XX_SP_BLEND_CNTL_ENABLED; + if (msaa_info->alphaToCoverageEnable) + sp_blend_cntl |= A6XX_SP_BLEND_CNTL_ALPHA_TO_COVERAGE; - pipeline->blend.sp_blend_cntl = - A6XX_SP_BLEND_CNTL(.enable_blend = blend_enable_mask, - .dual_color_in_enable = dual_src_blend, - .alpha_to_coverage = msaa_info->alphaToCoverageEnable, - .unk8 = true).value & pipeline->blend.sp_blend_cntl_mask; + const uint32_t sample_mask = + msaa_info->pSampleMask ? *msaa_info->pSampleMask + : ((1 << msaa_info->rasterizationSamples) - 1); /* set A6XX_RB_BLEND_CNTL_INDEPENDENT_BLEND only when enabled? */ - pipeline->blend.rb_blend_cntl = - A6XX_RB_BLEND_CNTL(.enable_blend = blend_enable_mask, - .independent_blend = true, - .sample_mask = sample_mask, - .dual_color_in_enable = dual_src_blend, - .alpha_to_coverage = msaa_info->alphaToCoverageEnable, - .alpha_to_one = msaa_info->alphaToOneEnable).value & - pipeline->blend.rb_blend_cntl_mask; -} + uint32_t rb_blend_cntl = + A6XX_RB_BLEND_CNTL_ENABLE_BLEND(blend_enable_mask) | + A6XX_RB_BLEND_CNTL_INDEPENDENT_BLEND | + A6XX_RB_BLEND_CNTL_SAMPLE_MASK(sample_mask); + if (msaa_info->alphaToCoverageEnable) + rb_blend_cntl |= A6XX_RB_BLEND_CNTL_ALPHA_TO_COVERAGE; -static void -tu6_emit_blend(struct tu_cs *cs, - struct tu_pipeline *pipeline) -{ - tu_cs_emit_regs(cs, A6XX_SP_FS_OUTPUT_CNTL1(.mrt = pipeline->blend.num_rts)); - tu_cs_emit_regs(cs, A6XX_RB_FS_OUTPUT_CNTL1(.mrt = pipeline->blend.num_rts)); - tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL(.dword = pipeline->blend.sp_blend_cntl)); - tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.dword = pipeline->blend.rb_blend_cntl)); - - for (unsigned i = 0; i < pipeline->blend.num_rts; i++) { - tu_cs_emit_regs(cs, - A6XX_RB_MRT_CONTROL(i, .dword = pipeline->blend.rb_mrt_control[i]), - A6XX_RB_MRT_BLEND_CONTROL(i, .dword = pipeline->blend.rb_mrt_blend_control[i])); - } -} + tu_cs_emit_pkt4(cs, REG_A6XX_SP_BLEND_CNTL, 1); + tu_cs_emit(cs, sp_blend_cntl); -static VkResult -tu_setup_pvtmem(struct tu_device *dev, - struct tu_pipeline *pipeline, - struct tu_pvtmem_config *config, - uint32_t pvtmem_bytes, - bool per_wave) -{ - if (!pvtmem_bytes) { - memset(config, 0, sizeof(*config)); - return VK_SUCCESS; - } - - /* There is a substantial memory footprint from private memory BOs being - * allocated on a per-pipeline basis and it isn't required as the same - * BO can be utilized by multiple pipelines as long as they have the - * private memory layout (sizes and per-wave/per-fiber) to avoid being - * overwritten by other active pipelines using the same BO with differing - * private memory layouts resulting memory corruption. - * - * To avoid this, we create private memory BOs on a per-device level with - * an associated private memory layout then dynamically grow them when - * needed and reuse them across pipelines. Growth is done in terms of - * powers of two so that we can avoid frequent reallocation of the - * private memory BOs. - */ - - struct tu_pvtmem_bo *pvtmem_bo = - per_wave ? 
&dev->wave_pvtmem_bo : &dev->fiber_pvtmem_bo; - mtx_lock(&pvtmem_bo->mtx); - - if (pvtmem_bo->per_fiber_size < pvtmem_bytes) { - if (pvtmem_bo->bo) - tu_bo_finish(dev, pvtmem_bo->bo); - - pvtmem_bo->per_fiber_size = - util_next_power_of_two(ALIGN(pvtmem_bytes, 512)); - pvtmem_bo->per_sp_size = - ALIGN(pvtmem_bo->per_fiber_size * - dev->physical_device->info->a6xx.fibers_per_sp, - 1 << 12); - uint32_t total_size = - dev->physical_device->info->num_sp_cores * pvtmem_bo->per_sp_size; - - VkResult result = tu_bo_init_new(dev, &pvtmem_bo->bo, total_size, - TU_BO_ALLOC_NO_FLAGS, "pvtmem"); - if (result != VK_SUCCESS) { - mtx_unlock(&pvtmem_bo->mtx); - return result; - } - } - - config->per_wave = per_wave; - config->per_fiber_size = pvtmem_bo->per_fiber_size; - config->per_sp_size = pvtmem_bo->per_sp_size; - - pipeline->pvtmem_bo = tu_bo_get_ref(pvtmem_bo->bo); - config->iova = pipeline->pvtmem_bo->iova; - - mtx_unlock(&pvtmem_bo->mtx); - - return VK_SUCCESS; + tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLEND_CNTL, 1); + tu_cs_emit(cs, rb_blend_cntl); } -static bool -contains_all_shader_state(VkGraphicsPipelineLibraryFlagsEXT state) -{ - return (state & - (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT | - VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) == - (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT | - VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT); -} - -/* Return true if this pipeline contains all of the GPL stages listed but none - * of the libraries it uses do, so this is "the first time" that all of them - * are defined together. This is useful for state that needs to be combined - * from multiple GPL stages. - */ - -static bool -set_combined_state(struct tu_pipeline_builder *builder, - struct tu_pipeline *pipeline, - VkGraphicsPipelineLibraryFlagsEXT state) +void +tu6_emit_blend_constants(struct tu_cs *cs, const float constants[4]) { - if ((pipeline->state & state) != state) - return false; - - for (unsigned i = 0; i < builder->num_libraries; i++) { - if ((builder->libraries[i]->state & state) == state) - return false; - } - - return true; + tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLEND_RED_F32, 4); + tu_cs_emit_array(cs, (const uint32_t *) constants, 4); } static VkResult -tu_pipeline_allocate_cs(struct tu_device *dev, - struct tu_pipeline *pipeline, - struct tu_pipeline_layout *layout, - struct tu_pipeline_builder *builder, - struct ir3_shader_variant *compute) +tu_pipeline_builder_create_pipeline(struct tu_pipeline_builder *builder, + struct tu_pipeline **out_pipeline) { - uint32_t size = 1024; - - /* graphics case: */ - if (builder) { - if (builder->state & - VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT) { - size += TU6_EMIT_VERTEX_INPUT_MAX_DWORDS; - } - - if (set_combined_state(builder, pipeline, - VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT | - VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) { - size += 2 * TU6_EMIT_VFD_DEST_MAX_DWORDS; - size += tu6_load_state_size(pipeline, layout); + struct tu_device *dev = builder->device; - for (uint32_t i = 0; i < ARRAY_SIZE(builder->variants); i++) { - if (builder->variants[i]) { - size += builder->variants[i]->info.size / 4; - } - } - - size += builder->binning_variant->info.size / 4; - - builder->additional_cs_reserve_size = 0; - for (unsigned i = 0; i < ARRAY_SIZE(builder->variants); i++) { - struct ir3_shader_variant *variant = builder->variants[i]; - if (variant) { - builder->additional_cs_reserve_size += - tu_xs_get_additional_cs_size_dwords(variant); - - if 
(variant->binning) { - builder->additional_cs_reserve_size += - tu_xs_get_additional_cs_size_dwords(variant->binning); - } - } - } - - /* The additional size is used twice, once per tu6_emit_program() call. */ - size += builder->additional_cs_reserve_size * 2; - } - } else { - size += tu6_load_state_size(pipeline, layout); + struct tu_pipeline *pipeline = + vk_zalloc2(&dev->alloc, builder->alloc, sizeof(*pipeline), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!pipeline) + return VK_ERROR_OUT_OF_HOST_MEMORY; - size += compute->info.size / 4; + tu_cs_init(&pipeline->cs, TU_CS_MODE_SUB_STREAM, 2048); - size += tu_xs_get_additional_cs_size_dwords(compute); - } - - /* Allocate the space for the pipeline out of the device's RO suballocator. - * - * Sub-allocating BOs saves memory and also kernel overhead in refcounting of - * BOs at exec time. - * - * The pipeline cache would seem like a natural place to stick the - * suballocator, except that it is not guaranteed to outlive the pipelines - * created from it, so you can't store any long-lived state there, and you - * can't use its EXTERNALLY_SYNCHRONIZED flag to avoid atomics because - * pipeline destroy isn't synchronized by the cache. - */ - pthread_mutex_lock(&dev->pipeline_mutex); - VkResult result = tu_suballoc_bo_alloc(&pipeline->bo, &dev->pipeline_suballoc, - size * 4, 128); - pthread_mutex_unlock(&dev->pipeline_mutex); - if (result != VK_SUCCESS) + /* reserve the space now such that tu_cs_begin_sub_stream never fails */ + VkResult result = tu_cs_reserve_space(dev, &pipeline->cs, 2048); + if (result != VK_SUCCESS) { + vk_free2(&dev->alloc, builder->alloc, pipeline); return result; - - tu_cs_init_suballoc(&pipeline->cs, dev, &pipeline->bo); - - return VK_SUCCESS; -} - -static void -tu_pipeline_shader_key_init(struct ir3_shader_key *key, - const struct tu_pipeline *pipeline, - struct tu_pipeline_builder *builder, - nir_shader **nir) -{ - /* We set this after we compile to NIR because we need the prim mode */ - key->tessellation = IR3_TESS_NONE; - - for (unsigned i = 0; i < builder->num_libraries; i++) { - if (!(builder->libraries[i]->state & - (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT | - VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT))) - continue; - - const struct ir3_shader_key *library_key = - &builder->libraries[i]->ir3_key; - - if (library_key->tessellation != IR3_TESS_NONE) - key->tessellation = library_key->tessellation; - key->has_gs |= library_key->has_gs; - key->sample_shading |= library_key->sample_shading; - } - - for (uint32_t i = 0; i < builder->create_info->stageCount; i++) { - if (builder->create_info->pStages[i].stage == VK_SHADER_STAGE_GEOMETRY_BIT) { - key->has_gs = true; - break; - } - } - - if (!(builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) - return; - - if (builder->rasterizer_discard) - return; - - const VkPipelineMultisampleStateCreateInfo *msaa_info = - builder->create_info->pMultisampleState; - - /* The 1.3.215 spec says: - * - * Sample shading can be used to specify a minimum number of unique - * samples to process for each fragment. If sample shading is enabled, - * an implementation must provide a minimum of - * - * max(ceil(minSampleShadingFactor * totalSamples), 1) - * - * unique associated data for each fragment, where - * minSampleShadingFactor is the minimum fraction of sample shading. - * - * The definition is pretty much the same as OpenGL's GL_SAMPLE_SHADING. - * They both require unique associated data. 
- * - * There are discussions to change the definition, such that - * sampleShadingEnable does not imply unique associated data. Before the - * discussions are settled and before apps (i.e., ANGLE) are fixed to - * follow the new and incompatible definition, we should stick to the - * current definition. - * - * Note that ir3_shader_key::sample_shading is not actually used by ir3, - * just checked in tu6_emit_fs_inputs. We will also copy the value to - * tu_shader_key::force_sample_interp in a bit. - */ - if (msaa_info && msaa_info->sampleShadingEnable && - (msaa_info->minSampleShading * msaa_info->rasterizationSamples) > 1.0f) - key->sample_shading = true; -} - -static uint32_t -tu6_get_tessmode(struct tu_shader* shader) -{ - enum tess_primitive_mode primitive_mode = shader->ir3_shader->nir->info.tess._primitive_mode; - switch (primitive_mode) { - case TESS_PRIMITIVE_ISOLINES: - return IR3_TESS_ISOLINES; - case TESS_PRIMITIVE_TRIANGLES: - return IR3_TESS_TRIANGLES; - case TESS_PRIMITIVE_QUADS: - return IR3_TESS_QUADS; - case TESS_PRIMITIVE_UNSPECIFIED: - return IR3_TESS_NONE; - default: - unreachable("bad tessmode"); - } -} - -static uint64_t -tu_upload_variant(struct tu_pipeline *pipeline, - const struct ir3_shader_variant *variant) -{ - struct tu_cs_memory memory; - - if (!variant) - return 0; - - /* this expects to get enough alignment because shaders are allocated first - * and total size is always aligned correctly - * note: an assert in tu6_emit_xs_config validates the alignment - */ - tu_cs_alloc(&pipeline->cs, variant->info.size / 4, 1, &memory); - - memcpy(memory.map, variant->bin, variant->info.size); - return memory.iova; -} - -static void -tu_append_executable(struct tu_pipeline *pipeline, struct ir3_shader_variant *variant, - char *nir_from_spirv) -{ - struct tu_pipeline_executable exe = { - .stage = variant->type, - .nir_from_spirv = nir_from_spirv, - .nir_final = ralloc_strdup(pipeline->executables_mem_ctx, variant->disasm_info.nir), - .disasm = ralloc_strdup(pipeline->executables_mem_ctx, variant->disasm_info.disasm), - .stats = variant->info, - .is_binning = variant->binning_pass, - }; - - util_dynarray_append(&pipeline->executables, struct tu_pipeline_executable, exe); -} - -static bool -can_remove_out_var(nir_variable *var, void *data) -{ - return !var->data.explicit_xfb_buffer && !var->data.explicit_xfb_stride; -} - -static void -tu_link_shaders(struct tu_pipeline_builder *builder, - nir_shader **shaders, unsigned shaders_count) -{ - nir_shader *consumer = NULL; - for (gl_shader_stage stage = shaders_count - 1; - stage >= MESA_SHADER_VERTEX; stage--) { - if (!shaders[stage]) - continue; - - nir_shader *producer = shaders[stage]; - if (!consumer) { - consumer = producer; - continue; - } - - if (nir_link_opt_varyings(producer, consumer)) { - NIR_PASS_V(consumer, nir_opt_constant_folding); - NIR_PASS_V(consumer, nir_opt_algebraic); - NIR_PASS_V(consumer, nir_opt_dce); - } - - const nir_remove_dead_variables_options out_var_opts = { - .can_remove_var = can_remove_out_var, - }; - NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_out, &out_var_opts); - - NIR_PASS_V(consumer, nir_remove_dead_variables, nir_var_shader_in, NULL); - - bool progress = nir_remove_unused_varyings(producer, consumer); - - nir_compact_varyings(producer, consumer, true); - if (progress) { - if (nir_lower_global_vars_to_local(producer)) { - /* Remove dead writes, which can remove input loads */ - NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_temp, NULL); - 
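The sample-shading rule quoted from the spec in tu_pipeline_shader_key_init() reduces to a single comparison. A minimal sketch of that decision with illustrative names; only the max(ceil(minSampleShading * totalSamples), 1) rule itself comes from the hunk.

#include <math.h>
#include <stdbool.h>
#include <stdio.h>

/* The key only needs to record whether the spec's required number of unique
 * samples exceeds one, which is equivalent to the
 * (minSampleShading * rasterizationSamples) > 1.0f test in the hunk. */
static bool
needs_sample_shading(bool sample_shading_enable, float min_sample_shading,
                     unsigned rasterization_samples)
{
   if (!sample_shading_enable)
      return false;

   unsigned unique =
      (unsigned)ceilf(min_sample_shading * rasterization_samples);
   if (unique < 1)
      unique = 1;
   return unique > 1;
}

int
main(void)
{
   /* 0.5 * 4 samples -> 2 unique samples -> per-sample shading. */
   printf("%d\n", needs_sample_shading(true, 0.5f, 4));
   /* 0.2 * 4 samples -> ceil(0.8) = 1 -> no per-sample shading. */
   printf("%d\n", needs_sample_shading(true, 0.2f, 4));
   return 0;
}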
NIR_PASS_V(producer, nir_opt_dce); - } - nir_lower_global_vars_to_local(consumer); - } - - consumer = producer; - } -} - -static void -tu_shader_key_init(struct tu_shader_key *key, - const VkPipelineShaderStageCreateInfo *stage_info, - struct tu_device *dev) -{ - enum ir3_wavesize_option api_wavesize, real_wavesize; - - if (stage_info) { - if (stage_info->flags & - VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT) { - api_wavesize = real_wavesize = IR3_SINGLE_OR_DOUBLE; - } else { - const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *size_info = - vk_find_struct_const(stage_info->pNext, - PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO); - - if (size_info) { - if (size_info->requiredSubgroupSize == dev->compiler->threadsize_base) { - api_wavesize = IR3_SINGLE_ONLY; - } else { - assert(size_info->requiredSubgroupSize == dev->compiler->threadsize_base * 2); - api_wavesize = IR3_DOUBLE_ONLY; - } - } else { - /* Match the exposed subgroupSize. */ - api_wavesize = IR3_DOUBLE_ONLY; - } - - if (stage_info->flags & - VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT) - real_wavesize = api_wavesize; - else if (api_wavesize == IR3_SINGLE_ONLY) - real_wavesize = IR3_SINGLE_ONLY; - else - real_wavesize = IR3_SINGLE_OR_DOUBLE; - } - } else { - api_wavesize = real_wavesize = IR3_SINGLE_OR_DOUBLE; - } - - key->api_wavesize = api_wavesize; - key->real_wavesize = real_wavesize; -} - -static void -tu_hash_stage(struct mesa_sha1 *ctx, - const VkPipelineShaderStageCreateInfo *stage, - const nir_shader *nir, - const struct tu_shader_key *key) -{ - - if (nir) { - struct blob blob; - blob_init(&blob); - nir_serialize(&blob, nir, true); - _mesa_sha1_update(ctx, blob.data, blob.size); - blob_finish(&blob); - } else { - unsigned char stage_hash[SHA1_DIGEST_LENGTH]; - vk_pipeline_hash_shader_stage(stage, NULL, stage_hash); - _mesa_sha1_update(ctx, stage_hash, sizeof(stage_hash)); - } - _mesa_sha1_update(ctx, key, sizeof(*key)); -} - -/* Hash flags which can affect ir3 shader compilation which aren't known until - * logical device creation. 
- */ -static void -tu_hash_compiler(struct mesa_sha1 *ctx, const struct ir3_compiler *compiler) -{ - _mesa_sha1_update(ctx, &compiler->robust_buffer_access2, - sizeof(compiler->robust_buffer_access2)); - _mesa_sha1_update(ctx, &ir3_shader_debug, sizeof(ir3_shader_debug)); -} - -static void -tu_hash_shaders(unsigned char *hash, - const VkPipelineShaderStageCreateInfo **stages, - nir_shader *const *nir, - const struct tu_pipeline_layout *layout, - const struct tu_shader_key *keys, - const struct ir3_shader_key *ir3_key, - VkGraphicsPipelineLibraryFlagsEXT state, - const struct ir3_compiler *compiler) -{ - struct mesa_sha1 ctx; - - _mesa_sha1_init(&ctx); - - if (layout) - _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1)); - - _mesa_sha1_update(&ctx, ir3_key, sizeof(ir3_key)); - - for (int i = 0; i < MESA_SHADER_STAGES; ++i) { - if (stages[i] || nir[i]) { - tu_hash_stage(&ctx, stages[i], nir[i], &keys[i]); - } - } - _mesa_sha1_update(&ctx, &state, sizeof(state)); - tu_hash_compiler(&ctx, compiler); - _mesa_sha1_final(&ctx, hash); -} - -static void -tu_hash_compute(unsigned char *hash, - const VkPipelineShaderStageCreateInfo *stage, - const struct tu_pipeline_layout *layout, - const struct tu_shader_key *key, - const struct ir3_compiler *compiler) -{ - struct mesa_sha1 ctx; - - _mesa_sha1_init(&ctx); - - if (layout) - _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1)); - - tu_hash_stage(&ctx, stage, NULL, key); - - tu_hash_compiler(&ctx, compiler); - _mesa_sha1_final(&ctx, hash); -} - -static bool -tu_shaders_serialize(struct vk_pipeline_cache_object *object, - struct blob *blob); - -static struct vk_pipeline_cache_object * -tu_shaders_deserialize(struct vk_device *device, - const void *key_data, size_t key_size, - struct blob_reader *blob); - -static void -tu_shaders_destroy(struct vk_pipeline_cache_object *object) -{ - struct tu_compiled_shaders *shaders = - container_of(object, struct tu_compiled_shaders, base); - - for (unsigned i = 0; i < ARRAY_SIZE(shaders->variants); i++) - ralloc_free(shaders->variants[i]); - - for (unsigned i = 0; i < ARRAY_SIZE(shaders->safe_const_variants); i++) - ralloc_free(shaders->safe_const_variants[i]); - - vk_pipeline_cache_object_finish(&shaders->base); - vk_free(&object->device->alloc, shaders); -} - -const struct vk_pipeline_cache_object_ops tu_shaders_ops = { - .serialize = tu_shaders_serialize, - .deserialize = tu_shaders_deserialize, - .destroy = tu_shaders_destroy, -}; - -static struct tu_compiled_shaders * -tu_shaders_init(struct tu_device *dev, const void *key_data, size_t key_size) -{ - VK_MULTIALLOC(ma); - VK_MULTIALLOC_DECL(&ma, struct tu_compiled_shaders, shaders, 1); - VK_MULTIALLOC_DECL_SIZE(&ma, void, obj_key_data, key_size); - - if (!vk_multialloc_zalloc(&ma, &dev->vk.alloc, - VK_SYSTEM_ALLOCATION_SCOPE_DEVICE)) - return NULL; - - memcpy(obj_key_data, key_data, key_size); - vk_pipeline_cache_object_init(&dev->vk, &shaders->base, - &tu_shaders_ops, obj_key_data, key_size); - - return shaders; -} - -static bool -tu_shaders_serialize(struct vk_pipeline_cache_object *object, - struct blob *blob) -{ - struct tu_compiled_shaders *shaders = - container_of(object, struct tu_compiled_shaders, base); - - blob_write_bytes(blob, shaders->const_state, sizeof(shaders->const_state)); - blob_write_uint8(blob, shaders->active_desc_sets); - - for (unsigned i = 0; i < ARRAY_SIZE(shaders->variants); i++) { - if (shaders->variants[i]) { - blob_write_uint8(blob, 1); - ir3_store_variant(blob, shaders->variants[i]); - } else { - 
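The serialize/deserialize pair for cached compiled shaders in this hunk uses a simple presence-byte framing: one byte per variant slot, followed by the payload only when the slot is occupied, so the reader can walk the same slots in order. Below is a toy illustration of that framing with a fixed buffer standing in for Mesa's struct blob; every name here is invented for the example.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Toy blob used only for this illustration. */
struct toy_blob { uint8_t data[256]; size_t size, offset; };

static void write_u8(struct toy_blob *b, uint8_t v) { b->data[b->size++] = v; }
static uint8_t read_u8(struct toy_blob *b) { return b->data[b->offset++]; }

static void
write_bytes(struct toy_blob *b, const void *p, size_t n)
{
   memcpy(b->data + b->size, p, n);
   b->size += n;
}

int
main(void)
{
   /* Pretend these are the per-stage variants: slots 0 and 2 exist. */
   const char *variants[3] = { "vs-binary", NULL, "fs-binary" };
   struct toy_blob blob = { .size = 0, .offset = 0 };

   /* Serialize: one presence byte per slot, payload only when present. */
   for (int i = 0; i < 3; i++) {
      if (variants[i]) {
         write_u8(&blob, 1);
         write_bytes(&blob, variants[i], strlen(variants[i]) + 1);
      } else {
         write_u8(&blob, 0);
      }
   }

   /* Deserialize: the presence byte tells us whether a payload follows. */
   for (int i = 0; i < 3; i++) {
      if (read_u8(&blob)) {
         const char *s = (const char *)&blob.data[blob.offset];
         printf("slot %d: %s\n", i, s);
         blob.offset += strlen(s) + 1;
      } else {
         printf("slot %d: empty\n", i);
      }
   }
   return 0;
}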
blob_write_uint8(blob, 0); - } - - if (shaders->safe_const_variants[i]) { - blob_write_uint8(blob, 1); - ir3_store_variant(blob, shaders->safe_const_variants[i]); - } else { - blob_write_uint8(blob, 0); - } } - return true; -} - -static struct vk_pipeline_cache_object * -tu_shaders_deserialize(struct vk_device *_device, - const void *key_data, size_t key_size, - struct blob_reader *blob) -{ - struct tu_device *dev = container_of(_device, struct tu_device, vk); - struct tu_compiled_shaders *shaders = - tu_shaders_init(dev, key_data, key_size); - - if (!shaders) - return NULL; - - blob_copy_bytes(blob, shaders->const_state, sizeof(shaders->const_state)); - shaders->active_desc_sets = blob_read_uint8(blob); - - for (unsigned i = 0; i < ARRAY_SIZE(shaders->variants); i++) { - if (blob_read_uint8(blob)) { - shaders->variants[i] = ir3_retrieve_variant(blob, dev->compiler, NULL); - } - - if (blob_read_uint8(blob)) { - shaders->safe_const_variants[i] = ir3_retrieve_variant(blob, dev->compiler, NULL); - } - } - - return &shaders->base; -} - -static struct tu_compiled_shaders * -tu_pipeline_cache_lookup(struct vk_pipeline_cache *cache, - const void *key_data, size_t key_size, - bool *application_cache_hit) -{ - struct vk_pipeline_cache_object *object = - vk_pipeline_cache_lookup_object(cache, key_data, key_size, - &tu_shaders_ops, application_cache_hit); - if (object) - return container_of(object, struct tu_compiled_shaders, base); - else - return NULL; -} - -static struct tu_compiled_shaders * -tu_pipeline_cache_insert(struct vk_pipeline_cache *cache, - struct tu_compiled_shaders *shaders) -{ - struct vk_pipeline_cache_object *object = - vk_pipeline_cache_add_object(cache, &shaders->base); - return container_of(object, struct tu_compiled_shaders, base); -} - -static bool -tu_nir_shaders_serialize(struct vk_pipeline_cache_object *object, - struct blob *blob); - -static struct vk_pipeline_cache_object * -tu_nir_shaders_deserialize(struct vk_device *device, - const void *key_data, size_t key_size, - struct blob_reader *blob); - -static void -tu_nir_shaders_destroy(struct vk_pipeline_cache_object *object) -{ - struct tu_nir_shaders *shaders = - container_of(object, struct tu_nir_shaders, base); - - for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++) - ralloc_free(shaders->nir[i]); - - vk_pipeline_cache_object_finish(&shaders->base); - vk_free(&object->device->alloc, shaders); -} - -const struct vk_pipeline_cache_object_ops tu_nir_shaders_ops = { - .serialize = tu_nir_shaders_serialize, - .deserialize = tu_nir_shaders_deserialize, - .destroy = tu_nir_shaders_destroy, -}; - -static struct tu_nir_shaders * -tu_nir_shaders_init(struct tu_device *dev, const void *key_data, size_t key_size) -{ - VK_MULTIALLOC(ma); - VK_MULTIALLOC_DECL(&ma, struct tu_nir_shaders, shaders, 1); - VK_MULTIALLOC_DECL_SIZE(&ma, void, obj_key_data, key_size); - - if (!vk_multialloc_zalloc(&ma, &dev->vk.alloc, - VK_SYSTEM_ALLOCATION_SCOPE_DEVICE)) - return NULL; - - memcpy(obj_key_data, key_data, key_size); - vk_pipeline_cache_object_init(&dev->vk, &shaders->base, - &tu_nir_shaders_ops, obj_key_data, key_size); - - return shaders; -} - -static bool -tu_nir_shaders_serialize(struct vk_pipeline_cache_object *object, - struct blob *blob) -{ - struct tu_nir_shaders *shaders = - container_of(object, struct tu_nir_shaders, base); - - for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++) { - if (shaders->nir[i]) { - blob_write_uint8(blob, 1); - nir_serialize(blob, shaders->nir[i], true); - } else { - blob_write_uint8(blob, 0); - } - 
} - - return true; -} - -static struct vk_pipeline_cache_object * -tu_nir_shaders_deserialize(struct vk_device *_device, - const void *key_data, size_t key_size, - struct blob_reader *blob) -{ - struct tu_device *dev = container_of(_device, struct tu_device, vk); - struct tu_nir_shaders *shaders = - tu_nir_shaders_init(dev, key_data, key_size); - - if (!shaders) - return NULL; - - for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++) { - if (blob_read_uint8(blob)) { - shaders->nir[i] = - nir_deserialize(NULL, ir3_get_compiler_options(dev->compiler), blob); - } - } - - return &shaders->base; -} + *out_pipeline = pipeline; -static struct tu_nir_shaders * -tu_nir_cache_lookup(struct vk_pipeline_cache *cache, - const void *key_data, size_t key_size, - bool *application_cache_hit) -{ - struct vk_pipeline_cache_object *object = - vk_pipeline_cache_lookup_object(cache, key_data, key_size, - &tu_nir_shaders_ops, application_cache_hit); - if (object) - return container_of(object, struct tu_nir_shaders, base); - else - return NULL; -} - -static struct tu_nir_shaders * -tu_nir_cache_insert(struct vk_pipeline_cache *cache, - struct tu_nir_shaders *shaders) -{ - struct vk_pipeline_cache_object *object = - vk_pipeline_cache_add_object(cache, &shaders->base); - return container_of(object, struct tu_nir_shaders, base); + return VK_SUCCESS; } - static VkResult -tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder, - struct tu_pipeline *pipeline) +tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder) { - VkResult result = VK_SUCCESS; - const struct ir3_compiler *compiler = builder->device->compiler; const VkPipelineShaderStageCreateInfo *stage_infos[MESA_SHADER_STAGES] = { NULL }; - VkPipelineCreationFeedback pipeline_feedback = { - .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT, - }; - VkPipelineCreationFeedback stage_feedbacks[MESA_SHADER_STAGES] = { 0 }; - - int64_t pipeline_start = os_time_get_nano(); - - const VkPipelineCreationFeedbackCreateInfo *creation_feedback = - vk_find_struct_const(builder->create_info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO); - - bool must_compile = - builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT; for (uint32_t i = 0; i < builder->create_info->stageCount; i++) { gl_shader_stage stage = - vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage); + tu_shader_stage(builder->create_info->pStages[i].stage); stage_infos[stage] = &builder->create_info->pStages[i]; - must_compile = true; - } - - if (tu6_shared_constants_enable(&builder->layout, builder->device->compiler)) { - pipeline->shared_consts = (struct tu_push_constant_range) { - .lo = 0, - .dwords = builder->layout.push_constant_size / 4, - }; - } - - nir_shader *nir[ARRAY_SIZE(stage_infos)] = { NULL }; - - struct tu_shader_key keys[ARRAY_SIZE(stage_infos)] = { }; - for (gl_shader_stage stage = MESA_SHADER_VERTEX; - stage < ARRAY_SIZE(keys); stage++) { - tu_shader_key_init(&keys[stage], stage_infos[stage], builder->device); - } - - if (builder->create_info->flags & - VK_PIPELINE_CREATE_LINK_TIME_OPTIMIZATION_BIT_EXT) { - for (unsigned i = 0; i < builder->num_libraries; i++) { - struct tu_pipeline *library = builder->libraries[i]; - - for (unsigned j = 0; j < ARRAY_SIZE(library->shaders); j++) { - if (library->shaders[j].nir) { - assert(!nir[j]); - nir[j] = nir_shader_clone(builder->mem_ctx, - library->shaders[j].nir); - keys[j] = library->shaders[j].key; - must_compile = true; - } - } - } } - struct ir3_shader_key ir3_key = {}; - 
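The compile path in this hunk first gathers pStages[] into an array indexed by shader stage, so later code can address a stage directly instead of searching the create info. A sketch of that bit-to-index step, assuming the standard Vulkan stage flag values; the enum and struct names are placeholders, not the driver's.

#include <stdint.h>
#include <stdio.h>

/* Placeholder stage indices standing in for gl_shader_stage. */
enum { STAGE_VERTEX, STAGE_TESS_CTRL, STAGE_TESS_EVAL, STAGE_GEOMETRY,
       STAGE_FRAGMENT, STAGE_COUNT };

struct stage_create_info { uint32_t stage_bit; const char *entry_point; };

/* Convert a single VkShaderStageFlagBits value to an array index, the role
 * tu_shader_stage()/vk_to_mesa_shader_stage() play in the hunk. */
static int
stage_bit_to_index(uint32_t bit)
{
   switch (bit) {
   case 0x01: return STAGE_VERTEX;      /* VK_SHADER_STAGE_VERTEX_BIT */
   case 0x02: return STAGE_TESS_CTRL;   /* ..._TESSELLATION_CONTROL_BIT */
   case 0x04: return STAGE_TESS_EVAL;   /* ..._TESSELLATION_EVALUATION_BIT */
   case 0x08: return STAGE_GEOMETRY;    /* ..._GEOMETRY_BIT */
   case 0x10: return STAGE_FRAGMENT;    /* ..._FRAGMENT_BIT */
   default:   return -1;
   }
}

int
main(void)
{
   /* pStages[] can list stages in any order; index them by stage so later
    * code can simply look up stage_infos[STAGE_FRAGMENT] etc. */
   const struct stage_create_info stages[] = {
      { 0x10, "main_fs" }, { 0x01, "main_vs" },
   };
   const struct stage_create_info *stage_infos[STAGE_COUNT] = { 0 };

   for (unsigned i = 0; i < 2; i++) {
      int idx = stage_bit_to_index(stages[i].stage_bit);
      if (idx >= 0)
         stage_infos[idx] = &stages[i];
   }

   printf("vertex entry: %s\n", stage_infos[STAGE_VERTEX]->entry_point);
   return 0;
}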
tu_pipeline_shader_key_init(&ir3_key, pipeline, builder, nir); + struct tu_shader_compile_options options; + tu_shader_compile_options_init(&options, builder->create_info); - struct tu_compiled_shaders *compiled_shaders = NULL; - struct tu_nir_shaders *nir_shaders = NULL; - if (!must_compile) - goto done; - - if (builder->state & - VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) { - keys[MESA_SHADER_VERTEX].multiview_mask = builder->multiview_mask; - } - - if (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) { - keys[MESA_SHADER_FRAGMENT].multiview_mask = builder->multiview_mask; - keys[MESA_SHADER_FRAGMENT].force_sample_interp = ir3_key.sample_shading; - } - - unsigned char pipeline_sha1[20]; - tu_hash_shaders(pipeline_sha1, stage_infos, nir, &builder->layout, keys, - &ir3_key, builder->state, compiler); - - unsigned char nir_sha1[21]; - memcpy(nir_sha1, pipeline_sha1, sizeof(pipeline_sha1)); - nir_sha1[20] = 'N'; - - const bool executable_info = builder->create_info->flags & - VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR; - - char *nir_initial_disasm[ARRAY_SIZE(stage_infos)] = { NULL }; - - if (!executable_info) { - bool cache_hit = false; - bool application_cache_hit = false; - - compiled_shaders = - tu_pipeline_cache_lookup(builder->cache, &pipeline_sha1, - sizeof(pipeline_sha1), - &application_cache_hit); - - cache_hit = !!compiled_shaders; - - /* If the user asks us to keep the NIR around, we need to have it for a - * successful cache hit. If we only have a "partial" cache hit, then we - * still need to recompile in order to get the NIR. - */ - if (compiled_shaders && - (builder->create_info->flags & - VK_PIPELINE_CREATE_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT)) { - bool nir_application_cache_hit = false; - nir_shaders = - tu_nir_cache_lookup(builder->cache, &nir_sha1, - sizeof(nir_sha1), - &nir_application_cache_hit); - - application_cache_hit &= nir_application_cache_hit; - cache_hit &= !!nir_shaders; - } - - if (application_cache_hit && builder->cache != builder->device->mem_cache) { - pipeline_feedback.flags |= - VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT; - } - - if (cache_hit) - goto done; - } - - if (builder->create_info->flags & - VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT) { - return VK_PIPELINE_COMPILE_REQUIRED; - } - - struct tu_shader *shaders[ARRAY_SIZE(nir)] = { NULL }; - - for (gl_shader_stage stage = MESA_SHADER_VERTEX; - stage < ARRAY_SIZE(nir); stage++) { + /* compile shaders in reverse order */ + struct tu_shader *next_stage_shader = NULL; + for (gl_shader_stage stage = MESA_SHADER_STAGES - 1; + stage > MESA_SHADER_NONE; stage--) { const VkPipelineShaderStageCreateInfo *stage_info = stage_infos[stage]; if (!stage_info) continue; - int64_t stage_start = os_time_get_nano(); - - nir[stage] = tu_spirv_to_nir(builder->device, builder->mem_ctx, stage_info, stage); - if (!nir[stage]) { - result = VK_ERROR_OUT_OF_HOST_MEMORY; - goto fail; - } - - stage_feedbacks[stage].flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT; - stage_feedbacks[stage].duration += os_time_get_nano() - stage_start; - } - - if (!nir[MESA_SHADER_FRAGMENT] && - (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) { - const nir_shader_compiler_options *nir_options = - ir3_get_compiler_options(builder->device->compiler); - nir_builder fs_b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, - nir_options, - "noop_fs"); - nir[MESA_SHADER_FRAGMENT] = fs_b.shader; - } - - if (executable_info) { - 
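The retained-NIR cache object in this hunk reuses the pipeline hash with a single 'N' byte appended, so the compiled-variant object and the NIR object can live in the same pipeline cache without key collisions. A minimal sketch of that derivation; the buffer contents in main() are made up.

#include <stdio.h>
#include <string.h>

#define SHA1_DIGEST_LENGTH 20

/* Derive the 21-byte secondary key: the 20-byte pipeline hash plus one
 * trailing 'N', as in the hunk. */
static void
derive_nir_cache_key(const unsigned char pipeline_sha1[SHA1_DIGEST_LENGTH],
                     unsigned char nir_key[SHA1_DIGEST_LENGTH + 1])
{
   memcpy(nir_key, pipeline_sha1, SHA1_DIGEST_LENGTH);
   nir_key[SHA1_DIGEST_LENGTH] = 'N';
}

int
main(void)
{
   /* A made-up pipeline hash, just to show the 21-byte result. */
   unsigned char pipeline_sha1[SHA1_DIGEST_LENGTH] = { 0xde, 0xad, 0xbe, 0xef };
   unsigned char nir_key[SHA1_DIGEST_LENGTH + 1];

   derive_nir_cache_key(pipeline_sha1, nir_key);
   printf("last byte: %c\n", nir_key[SHA1_DIGEST_LENGTH]);
   return 0;
}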
for (gl_shader_stage stage = MESA_SHADER_VERTEX; - stage < ARRAY_SIZE(nir); stage++) { - if (!nir[stage]) - continue; - - nir_initial_disasm[stage] = - nir_shader_as_str(nir[stage], pipeline->executables_mem_ctx); - } - } - - tu_link_shaders(builder, nir, ARRAY_SIZE(nir)); - - if (builder->create_info->flags & - VK_PIPELINE_CREATE_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT) { - nir_shaders = - tu_nir_shaders_init(builder->device, &nir_sha1, sizeof(nir_sha1)); - for (gl_shader_stage stage = MESA_SHADER_VERTEX; - stage < ARRAY_SIZE(nir); stage++) { - if (!nir[stage]) - continue; - - nir_shaders->nir[stage] = nir_shader_clone(NULL, nir[stage]); - } - - nir_shaders = tu_nir_cache_insert(builder->cache, nir_shaders); - - if (compiled_shaders) - goto done; - } - - compiled_shaders = - tu_shaders_init(builder->device, &pipeline_sha1, sizeof(pipeline_sha1)); - - if (!compiled_shaders) { - result = VK_ERROR_OUT_OF_HOST_MEMORY; - goto fail; - } - - uint32_t desc_sets = 0; - for (gl_shader_stage stage = MESA_SHADER_VERTEX; - stage < ARRAY_SIZE(nir); stage++) { - if (!nir[stage]) - continue; - - int64_t stage_start = os_time_get_nano(); - struct tu_shader *shader = - tu_shader_create(builder->device, nir[stage], &keys[stage], - &builder->layout, builder->alloc); - if (!shader) { - result = VK_ERROR_OUT_OF_HOST_MEMORY; - goto fail; - } - - /* In SPIR-V generated from GLSL, the primitive mode is specified in the - * tessellation evaluation shader, but in SPIR-V generated from HLSL, - * the mode is specified in the tessellation control shader. */ - if ((stage == MESA_SHADER_TESS_EVAL || stage == MESA_SHADER_TESS_CTRL) && - ir3_key.tessellation == IR3_TESS_NONE) { - ir3_key.tessellation = tu6_get_tessmode(shader); - } - - if (stage > MESA_SHADER_TESS_CTRL) { - if (stage == MESA_SHADER_FRAGMENT) { - ir3_key.tcs_store_primid = ir3_key.tcs_store_primid || - (nir[stage]->info.inputs_read & (1ull << VARYING_SLOT_PRIMITIVE_ID)); - } else { - ir3_key.tcs_store_primid = ir3_key.tcs_store_primid || - BITSET_TEST(nir[stage]->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID); - } - } - - /* Keep track of the status of each shader's active descriptor sets, - * which is set in tu_lower_io. */ - desc_sets |= shader->active_desc_sets; - - shaders[stage] = shader; - - stage_feedbacks[stage].duration += os_time_get_nano() - stage_start; - } - - /* In the the tess-but-not-FS case we don't know whether the FS will read - * PrimID so we need to unconditionally store it. 
- */ - if (nir[MESA_SHADER_TESS_CTRL] && !nir[MESA_SHADER_FRAGMENT]) - ir3_key.tcs_store_primid = true; - - struct tu_shader *last_shader = shaders[MESA_SHADER_GEOMETRY]; - if (!last_shader) - last_shader = shaders[MESA_SHADER_TESS_EVAL]; - if (!last_shader) - last_shader = shaders[MESA_SHADER_VERTEX]; - - compiled_shaders->active_desc_sets = desc_sets; - - for (gl_shader_stage stage = MESA_SHADER_VERTEX; - stage < ARRAY_SIZE(shaders); stage++) { - if (!shaders[stage]) - continue; - - int64_t stage_start = os_time_get_nano(); - - compiled_shaders->variants[stage] = - ir3_shader_create_variant(shaders[stage]->ir3_shader, &ir3_key, - executable_info); - if (!compiled_shaders->variants[stage]) + tu_shader_create(builder->device, stage, stage_info, builder->alloc); + if (!shader) return VK_ERROR_OUT_OF_HOST_MEMORY; - compiled_shaders->const_state[stage] = shaders[stage]->const_state; - - stage_feedbacks[stage].duration += os_time_get_nano() - stage_start; - } - - uint32_t safe_constlens = ir3_trim_constlen(compiled_shaders->variants, compiler); - - ir3_key.safe_constlen = true; - - for (gl_shader_stage stage = MESA_SHADER_VERTEX; - stage < ARRAY_SIZE(shaders); stage++) { - if (!shaders[stage]) - continue; - - if (safe_constlens & (1 << stage)) { - int64_t stage_start = os_time_get_nano(); - - ralloc_free(compiled_shaders->variants[stage]); - compiled_shaders->variants[stage] = - ir3_shader_create_variant(shaders[stage]->ir3_shader, &ir3_key, - executable_info); - if (!compiled_shaders->variants[stage]) { - result = VK_ERROR_OUT_OF_HOST_MEMORY; - goto fail; - } - - stage_feedbacks[stage].duration += os_time_get_nano() - stage_start; - } else if (contains_all_shader_state(builder->state)) { - compiled_shaders->safe_const_variants[stage] = - ir3_shader_create_variant(shaders[stage]->ir3_shader, &ir3_key, - executable_info); - if (!compiled_shaders->variants[stage]) { - result = VK_ERROR_OUT_OF_HOST_MEMORY; - goto fail; - } - } - } - - ir3_key.safe_constlen = false; - - for (gl_shader_stage stage = MESA_SHADER_VERTEX; - stage < ARRAY_SIZE(nir); stage++) { - if (shaders[stage]) { - tu_shader_destroy(builder->device, shaders[stage], builder->alloc); - } - } - - compiled_shaders = - tu_pipeline_cache_insert(builder->cache, compiled_shaders); - -done:; - - struct ir3_shader_variant *safe_const_variants[ARRAY_SIZE(nir)] = { NULL }; - nir_shader *post_link_nir[ARRAY_SIZE(nir)] = { NULL }; - - if (compiled_shaders) { - for (gl_shader_stage stage = MESA_SHADER_VERTEX; - stage < ARRAY_SIZE(nir); stage++) { - if (compiled_shaders->variants[stage]) { - tu_append_executable(pipeline, compiled_shaders->variants[stage], - nir_initial_disasm[stage]); - builder->variants[stage] = compiled_shaders->variants[stage]; - safe_const_variants[stage] = - compiled_shaders->safe_const_variants[stage]; - builder->const_state[stage] = - compiled_shaders->const_state[stage]; - } - } - } - - if (nir_shaders) { - for (gl_shader_stage stage = MESA_SHADER_VERTEX; - stage < ARRAY_SIZE(nir); stage++) { - if (nir_shaders->nir[stage]) { - post_link_nir[stage] = nir_shaders->nir[stage]; - } - } - } - - /* In the case where we're building a library without link-time - * optimization but with sub-libraries that retain LTO info, we should - * retain it ourselves in case another pipeline includes us with LTO. 
- */ - for (unsigned i = 0; i < builder->num_libraries; i++) { - struct tu_pipeline *library = builder->libraries[i]; - for (gl_shader_stage stage = MESA_SHADER_VERTEX; - stage < ARRAY_SIZE(library->shaders); stage++) { - if (!post_link_nir[stage] && library->shaders[stage].nir) { - post_link_nir[stage] = library->shaders[stage].nir; - keys[stage] = library->shaders[stage].key; - } - } - } + VkResult result = + tu_shader_compile(builder->device, shader, next_stage_shader, + &options, builder->alloc); + if (result != VK_SUCCESS) + return result; - if (!(builder->create_info->flags & - VK_PIPELINE_CREATE_LINK_TIME_OPTIMIZATION_BIT_EXT)) { - for (unsigned i = 0; i < builder->num_libraries; i++) { - struct tu_pipeline *library = builder->libraries[i]; - for (gl_shader_stage stage = MESA_SHADER_VERTEX; - stage < ARRAY_SIZE(library->shaders); stage++) { - if (library->shaders[stage].variant) { - assert(!builder->variants[stage]); - builder->variants[stage] = library->shaders[stage].variant; - safe_const_variants[stage] = - library->shaders[stage].safe_const_variant; - builder->const_state[stage] = - library->shaders[stage].const_state; - post_link_nir[stage] = library->shaders[stage].nir; - } - } - } + builder->shaders[stage] = shader; + builder->shader_offsets[stage] = builder->shader_total_size; + builder->shader_total_size += + sizeof(uint32_t) * shader->variants[0].info.sizedwords; - /* Because we added more variants, we need to trim constlen again. - */ - if (builder->num_libraries > 0) { - uint32_t safe_constlens = ir3_trim_constlen(builder->variants, compiler); - for (gl_shader_stage stage = MESA_SHADER_VERTEX; - stage < ARRAY_SIZE(builder->variants); stage++) { - if (safe_constlens & (1u << stage)) - builder->variants[stage] = safe_const_variants[stage]; - } - } + next_stage_shader = shader; } - if (compiled_shaders) - pipeline->active_desc_sets = compiled_shaders->active_desc_sets; - - for (unsigned i = 0; i < builder->num_libraries; i++) { - struct tu_pipeline *library = builder->libraries[i]; - pipeline->active_desc_sets |= library->active_desc_sets; + if (builder->shaders[MESA_SHADER_VERTEX]->has_binning_pass) { + const struct tu_shader *vs = builder->shaders[MESA_SHADER_VERTEX]; + builder->binning_vs_offset = builder->shader_total_size; + builder->shader_total_size += + sizeof(uint32_t) * vs->variants[1].info.sizedwords; } - if (compiled_shaders && compiled_shaders->variants[MESA_SHADER_TESS_CTRL]) { - pipeline->tess.patch_type = - compiled_shaders->variants[MESA_SHADER_TESS_CTRL]->key.tessellation; - } + return VK_SUCCESS; +} - if (contains_all_shader_state(pipeline->state)) { - struct ir3_shader_variant *vs = - builder->variants[MESA_SHADER_VERTEX]; +static VkResult +tu_pipeline_builder_upload_shaders(struct tu_pipeline_builder *builder, + struct tu_pipeline *pipeline) +{ + struct tu_bo *bo = &pipeline->program.binary_bo; - struct ir3_shader_variant *variant; - if (!vs->stream_output.num_outputs && ir3_has_binning_vs(&vs->key)) { - tu_append_executable(pipeline, vs->binning, NULL); - variant = vs->binning; - } else { - variant = vs; - } + VkResult result = + tu_bo_init_new(builder->device, bo, builder->shader_total_size); + if (result != VK_SUCCESS) + return result; - builder->binning_variant = variant; + result = tu_bo_map(builder->device, bo); + if (result != VK_SUCCESS) + return result; - builder->compiled_shaders = compiled_shaders; + for (uint32_t i = 0; i < MESA_SHADER_STAGES; i++) { + const struct tu_shader *shader = builder->shaders[i]; + if (!shader) + continue; - /* It 
doesn't make much sense to use RETAIN_LINK_TIME_OPTIMIZATION_INFO - * when compiling all stages, but make sure we don't leak. - */ - if (nir_shaders) - vk_pipeline_cache_object_unref(&nir_shaders->base); - } else { - pipeline->compiled_shaders = compiled_shaders; - pipeline->nir_shaders = nir_shaders; - pipeline->ir3_key = ir3_key; - for (gl_shader_stage stage = MESA_SHADER_VERTEX; - stage < ARRAY_SIZE(pipeline->shaders); stage++) { - pipeline->shaders[stage].nir = post_link_nir[stage]; - pipeline->shaders[stage].key = keys[stage]; - pipeline->shaders[stage].const_state = builder->const_state[stage]; - pipeline->shaders[stage].variant = builder->variants[stage]; - pipeline->shaders[stage].safe_const_variant = - safe_const_variants[stage]; - } + memcpy(bo->map + builder->shader_offsets[i], shader->binary, + sizeof(uint32_t) * shader->variants[0].info.sizedwords); } - pipeline_feedback.duration = os_time_get_nano() - pipeline_start; - if (creation_feedback) { - *creation_feedback->pPipelineCreationFeedback = pipeline_feedback; - - for (uint32_t i = 0; i < builder->create_info->stageCount; i++) { - gl_shader_stage s = - vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage); - creation_feedback->pPipelineStageCreationFeedbacks[i] = stage_feedbacks[s]; - } + if (builder->shaders[MESA_SHADER_VERTEX]->has_binning_pass) { + const struct tu_shader *vs = builder->shaders[MESA_SHADER_VERTEX]; + memcpy(bo->map + builder->binning_vs_offset, vs->binning_binary, + sizeof(uint32_t) * vs->variants[1].info.sizedwords); } return VK_SUCCESS; - -fail: - for (gl_shader_stage stage = MESA_SHADER_VERTEX; - stage < ARRAY_SIZE(nir); stage++) { - if (shaders[stage]) { - tu_shader_destroy(builder->device, shaders[stage], builder->alloc); - } - } - - if (compiled_shaders) - vk_pipeline_cache_object_unref(&compiled_shaders->base); - - if (nir_shaders) - vk_pipeline_cache_object_unref(&nir_shaders->base); - - return result; } static void @@ -3589,449 +1485,56 @@ tu_pipeline_builder_parse_dynamic(struct tu_pipeline_builder *builder, const VkPipelineDynamicStateCreateInfo *dynamic_info = builder->create_info->pDynamicState; - pipeline->rast.gras_su_cntl_mask = ~0u; - pipeline->rast.pc_raster_cntl_mask = ~0u; - pipeline->rast.vpc_unknown_9107_mask = ~0u; - pipeline->ds.rb_depth_cntl_mask = ~0u; - pipeline->ds.rb_stencil_cntl_mask = ~0u; - pipeline->blend.sp_blend_cntl_mask = ~0u; - pipeline->blend.rb_blend_cntl_mask = ~0u; - pipeline->blend.rb_mrt_control_mask = ~0u; - if (!dynamic_info) return; for (uint32_t i = 0; i < dynamic_info->dynamicStateCount; i++) { - VkDynamicState state = dynamic_info->pDynamicStates[i]; - switch (state) { - case VK_DYNAMIC_STATE_VIEWPORT ... 
VK_DYNAMIC_STATE_STENCIL_REFERENCE: - if (state == VK_DYNAMIC_STATE_LINE_WIDTH) - pipeline->rast.gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_LINEHALFWIDTH__MASK; - pipeline->dynamic_state_mask |= BIT(state); - break; - case VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT: - pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_SAMPLE_LOCATIONS); - break; - case VK_DYNAMIC_STATE_CULL_MODE: - pipeline->rast.gras_su_cntl_mask &= - ~(A6XX_GRAS_SU_CNTL_CULL_BACK | A6XX_GRAS_SU_CNTL_CULL_FRONT); - pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL); - break; - case VK_DYNAMIC_STATE_FRONT_FACE: - pipeline->rast.gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_FRONT_CW; - pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL); - break; - case VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY: - pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY); - break; - case VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE: - pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_VB_STRIDE); - break; - case VK_DYNAMIC_STATE_VIEWPORT_WITH_COUNT: - pipeline->dynamic_state_mask |= BIT(VK_DYNAMIC_STATE_VIEWPORT); - break; - case VK_DYNAMIC_STATE_SCISSOR_WITH_COUNT: - pipeline->dynamic_state_mask |= BIT(VK_DYNAMIC_STATE_SCISSOR); - break; - case VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE: - pipeline->ds.rb_depth_cntl_mask &= - ~(A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE); - pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL); - break; - case VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE: - pipeline->ds.rb_depth_cntl_mask &= ~A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE; - pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL); - break; - case VK_DYNAMIC_STATE_DEPTH_COMPARE_OP: - pipeline->ds.rb_depth_cntl_mask &= ~A6XX_RB_DEPTH_CNTL_ZFUNC__MASK; - pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL); - break; - case VK_DYNAMIC_STATE_DEPTH_BOUNDS_TEST_ENABLE: - pipeline->ds.rb_depth_cntl_mask &= - ~(A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE); - pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL); - break; - case VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE: - pipeline->ds.rb_stencil_cntl_mask &= ~(A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE | - A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF | - A6XX_RB_STENCIL_CONTROL_STENCIL_READ); - pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_STENCIL_CNTL); - break; - case VK_DYNAMIC_STATE_STENCIL_OP: - pipeline->ds.rb_stencil_cntl_mask &= ~(A6XX_RB_STENCIL_CONTROL_FUNC__MASK | - A6XX_RB_STENCIL_CONTROL_FAIL__MASK | - A6XX_RB_STENCIL_CONTROL_ZPASS__MASK | - A6XX_RB_STENCIL_CONTROL_ZFAIL__MASK | - A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK | - A6XX_RB_STENCIL_CONTROL_FAIL_BF__MASK | - A6XX_RB_STENCIL_CONTROL_ZPASS_BF__MASK | - A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__MASK); - pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_STENCIL_CNTL); - break; - case VK_DYNAMIC_STATE_DEPTH_BIAS_ENABLE: - pipeline->rast.gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_POLY_OFFSET; - pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL); - break; - case VK_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE: - pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE); - break; - case VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE: - pipeline->rast.pc_raster_cntl_mask &= ~A6XX_PC_RASTER_CNTL_DISCARD; - pipeline->rast.vpc_unknown_9107_mask &= ~A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD; - pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RASTERIZER_DISCARD); - break; - case VK_DYNAMIC_STATE_LOGIC_OP_EXT: - 
pipeline->blend.sp_blend_cntl_mask &= ~A6XX_SP_BLEND_CNTL_ENABLE_BLEND__MASK; - pipeline->blend.rb_blend_cntl_mask &= ~A6XX_RB_BLEND_CNTL_ENABLE_BLEND__MASK; - pipeline->blend.rb_mrt_control_mask &= ~A6XX_RB_MRT_CONTROL_ROP_CODE__MASK; - pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_BLEND); - pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_LOGIC_OP); - break; - case VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT: - pipeline->blend.sp_blend_cntl_mask &= ~A6XX_SP_BLEND_CNTL_ENABLE_BLEND__MASK; - pipeline->blend.rb_blend_cntl_mask &= ~A6XX_RB_BLEND_CNTL_ENABLE_BLEND__MASK; - pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_BLEND); - - /* Dynamic color write enable doesn't directly change any of the - * registers, but it causes us to make some of the registers 0, so we - * set this dynamic state instead of making the register dynamic. - */ - pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_COLOR_WRITE_ENABLE); - break; - case VK_DYNAMIC_STATE_VERTEX_INPUT_EXT: - pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_VERTEX_INPUT) | - BIT(TU_DYNAMIC_STATE_VB_STRIDE); - break; - case VK_DYNAMIC_STATE_PATCH_CONTROL_POINTS_EXT: - pipeline->dynamic_state_mask |= - BIT(TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS); - break; - default: - assert(!"unsupported dynamic state"); - break; - } - } -} - -static void -tu_pipeline_builder_parse_libraries(struct tu_pipeline_builder *builder, - struct tu_pipeline *pipeline) -{ - const VkPipelineLibraryCreateInfoKHR *library_info = - vk_find_struct_const(builder->create_info->pNext, - PIPELINE_LIBRARY_CREATE_INFO_KHR); - - if (library_info) { - assert(library_info->libraryCount <= MAX_LIBRARIES); - builder->num_libraries = library_info->libraryCount; - for (unsigned i = 0; i < library_info->libraryCount; i++) { - TU_FROM_HANDLE(tu_pipeline, library, library_info->pLibraries[i]); - builder->libraries[i] = library; - } - } - - /* Merge in the state from libraries. The program state is a bit special - * and is handled separately. 
- */ - pipeline->state = builder->state; - for (unsigned i = 0; i < builder->num_libraries; i++) { - struct tu_pipeline *library = builder->libraries[i]; - pipeline->state |= library->state; - - uint32_t library_dynamic_state = 0; - if (library->state & - VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT) { - pipeline->vi = library->vi; - pipeline->ia = library->ia; - library_dynamic_state |= - BIT(TU_DYNAMIC_STATE_VERTEX_INPUT) | - BIT(TU_DYNAMIC_STATE_VB_STRIDE) | - BIT(TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY) | - BIT(TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE); - pipeline->shared_consts = library->shared_consts; - } - - if (library->state & - VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) { - pipeline->tess = library->tess; - pipeline->rast = library->rast; - pipeline->viewport = library->viewport; - library_dynamic_state |= - BIT(VK_DYNAMIC_STATE_VIEWPORT) | - BIT(VK_DYNAMIC_STATE_SCISSOR) | - BIT(VK_DYNAMIC_STATE_LINE_WIDTH) | - BIT(VK_DYNAMIC_STATE_DEPTH_BIAS) | - BIT(TU_DYNAMIC_STATE_RASTERIZER_DISCARD) | - BIT(TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS); - } - - if (library->state & - VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) { - pipeline->ds = library->ds; - pipeline->lrz.fs = library->lrz.fs; - pipeline->lrz.force_disable_mask |= library->lrz.force_disable_mask; - pipeline->lrz.force_late_z |= library->lrz.force_late_z; - library_dynamic_state |= - BIT(VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK) | - BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK) | - BIT(VK_DYNAMIC_STATE_STENCIL_REFERENCE) | - BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL) | - BIT(TU_DYNAMIC_STATE_RB_STENCIL_CNTL) | - BIT(VK_DYNAMIC_STATE_DEPTH_BOUNDS); - pipeline->shared_consts = library->shared_consts; - } - - if (library->state & - VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT) { - pipeline->blend = library->blend; - pipeline->output = library->output; - pipeline->lrz.force_disable_mask |= library->lrz.force_disable_mask; - pipeline->lrz.force_late_z |= library->lrz.force_late_z; - pipeline->prim_order = library->prim_order; - library_dynamic_state |= - BIT(VK_DYNAMIC_STATE_BLEND_CONSTANTS) | - BIT(TU_DYNAMIC_STATE_SAMPLE_LOCATIONS) | - BIT(TU_DYNAMIC_STATE_BLEND) | - BIT(TU_DYNAMIC_STATE_LOGIC_OP) | - BIT(TU_DYNAMIC_STATE_COLOR_WRITE_ENABLE); - } - - if ((library->state & - VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) && - (library->state & - VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) { - pipeline->prim_order = library->prim_order; - } - - if ((library->state & - VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) && - (library->state & - VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT) && - (library->state & - VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT)) { - pipeline->rast_ds = library->rast_ds; - } - - pipeline->dynamic_state_mask = - (pipeline->dynamic_state_mask & ~library_dynamic_state) | - (library->dynamic_state_mask & library_dynamic_state); - - u_foreach_bit (i, library_dynamic_state & ~library->dynamic_state_mask) { - if (i >= TU_DYNAMIC_STATE_COUNT) - break; - - pipeline->dynamic_state[i] = library->dynamic_state[i]; - } - - if (contains_all_shader_state(library->state)) { - pipeline->program = library->program; - pipeline->load_state = library->load_state; - } + pipeline->dynamic_state.mask |= + tu_dynamic_state_bit(dynamic_info->pDynamicStates[i]); } } static void -tu_pipeline_builder_parse_layout(struct tu_pipeline_builder *builder, - struct tu_pipeline *pipeline) -{ - TU_FROM_HANDLE(tu_pipeline_layout, 
layout, builder->create_info->layout); - - if (layout) { - /* Note: it's still valid to have a layout even if there are libraries. - * This allows the app to e.g. overwrite an INDEPENDENT_SET layout with - * a non-INDEPENDENT_SET layout which may make us use a faster path, - * currently this just affects dynamic offset descriptors. - */ - builder->layout = *layout; - } else { - for (unsigned i = 0; i < builder->num_libraries; i++) { - struct tu_pipeline *library = builder->libraries[i]; - builder->layout.num_sets = MAX2(builder->layout.num_sets, - library->num_sets); - for (unsigned j = 0; j < library->num_sets; j++) { - if (library->layouts[i]) - builder->layout.set[i].layout = library->layouts[i]; - } - - builder->layout.push_constant_size = pipeline->push_constant_size; - builder->layout.independent_sets |= pipeline->independent_sets; - } - - tu_pipeline_layout_init(&builder->layout); - } - - if (builder->create_info->flags & VK_PIPELINE_CREATE_LIBRARY_BIT_KHR) { - pipeline->num_sets = builder->layout.num_sets; - for (unsigned i = 0; i < pipeline->num_sets; i++) { - pipeline->layouts[i] = builder->layout.set[i].layout; - if (pipeline->layouts[i]) - vk_descriptor_set_layout_ref(&pipeline->layouts[i]->vk); - } - pipeline->push_constant_size = builder->layout.push_constant_size; - pipeline->independent_sets = builder->layout.independent_sets; - } -} - -static void -tu_pipeline_set_linkage(struct tu_program_descriptor_linkage *link, - struct tu_const_state *const_state, - struct ir3_shader_variant *v) -{ - link->const_state = *ir3_const_state(v); - link->tu_const_state = *const_state; - link->constlen = v->constlen; -} - -static bool -tu_pipeline_static_state(struct tu_pipeline *pipeline, struct tu_cs *cs, - uint32_t id, uint32_t size) -{ - assert(id < ARRAY_SIZE(pipeline->dynamic_state)); - - if (pipeline->dynamic_state_mask & BIT(id)) - return false; - - pipeline->dynamic_state[id] = tu_cs_draw_state(&pipeline->cs, cs, size); - return true; -} - -static void tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline) { struct tu_cs prog_cs; + tu_cs_begin_sub_stream(builder->device, &pipeline->cs, 512, &prog_cs); + tu6_emit_program(&prog_cs, builder, &pipeline->program.binary_bo, false); + pipeline->program.state_ib = tu_cs_end_sub_stream(&pipeline->cs, &prog_cs); - /* Emit HLSQ_xS_CNTL/HLSQ_SP_xS_CONFIG *first*, before emitting anything - * else that could depend on that state (like push constants) - * - * Note also that this always uses the full VS even in binning pass. The - * binning pass variant has the same const layout as the full VS, and - * the constlen for the VS will be the same or greater than the constlen - * for the binning pass variant. It is required that the constlen state - * matches between binning and draw passes, as some parts of the push - * consts are emitted in state groups that are shared between the binning - * and draw passes. 
- */ - tu_cs_begin_sub_stream(&pipeline->cs, 512, &prog_cs); - tu6_emit_program_config(&prog_cs, builder); - pipeline->program.config_state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs); - - tu_cs_begin_sub_stream(&pipeline->cs, 512 + builder->additional_cs_reserve_size, &prog_cs); - tu6_emit_program(&prog_cs, builder, false, pipeline); - pipeline->program.state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs); - - tu_cs_begin_sub_stream(&pipeline->cs, 512 + builder->additional_cs_reserve_size, &prog_cs); - tu6_emit_program(&prog_cs, builder, true, pipeline); - pipeline->program.binning_state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs); - - for (unsigned i = 0; i < ARRAY_SIZE(builder->variants); i++) { - if (!builder->variants[i]) - continue; - - tu_pipeline_set_linkage(&pipeline->program.link[i], - &builder->const_state[i], - builder->variants[i]); - } - - struct ir3_shader_variant *vs = builder->variants[MESA_SHADER_VERTEX]; - struct ir3_shader_variant *hs = builder->variants[MESA_SHADER_TESS_CTRL]; - if (hs) { - pipeline->program.vs_param_stride = vs->output_size; - pipeline->program.hs_param_stride = hs->output_size; - pipeline->program.hs_vertices_out = hs->tess.tcs_vertices_out; - - const struct ir3_const_state *hs_const = - &pipeline->program.link[MESA_SHADER_TESS_CTRL].const_state; - unsigned hs_constlen = - pipeline->program.link[MESA_SHADER_TESS_CTRL].constlen; - uint32_t hs_base = hs_const->offsets.primitive_param; - pipeline->program.hs_param_dwords = - MIN2((hs_constlen - hs_base) * 4, 8); - - uint32_t state_size = TU6_EMIT_PATCH_CONTROL_POINTS_DWORDS( - pipeline->program.hs_param_dwords); - - struct tu_cs cs; - if (tu_pipeline_static_state(pipeline, &cs, - TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS, - state_size)) { - tu6_emit_patch_control_points(&cs, pipeline, - pipeline->tess.patch_control_points); - } - } + tu_cs_begin_sub_stream(builder->device, &pipeline->cs, 512, &prog_cs); + tu6_emit_program(&prog_cs, builder, &pipeline->program.binary_bo, true); + pipeline->program.binning_state_ib = + tu_cs_end_sub_stream(&pipeline->cs, &prog_cs); } static void tu_pipeline_builder_parse_vertex_input(struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline) { - if (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_VERTEX_INPUT)) - return; - const VkPipelineVertexInputStateCreateInfo *vi_info = builder->create_info->pVertexInputState; + const struct tu_shader *vs = builder->shaders[MESA_SHADER_VERTEX]; - struct tu_cs cs; - if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_VB_STRIDE, - 2 * vi_info->vertexBindingDescriptionCount)) { - for (uint32_t i = 0; i < vi_info->vertexBindingDescriptionCount; i++) { - const VkVertexInputBindingDescription *binding = - &vi_info->pVertexBindingDescriptions[i]; + struct tu_cs vi_cs; + tu_cs_begin_sub_stream(builder->device, &pipeline->cs, + MAX_VERTEX_ATTRIBS * 5 + 2, &vi_cs); + tu6_emit_vertex_input(&vi_cs, &vs->variants[0], vi_info, + pipeline->vi.bindings, pipeline->vi.strides, + pipeline->vi.offsets, &pipeline->vi.count); + pipeline->vi.state_ib = tu_cs_end_sub_stream(&pipeline->cs, &vi_cs); - tu_cs_emit_regs(&cs, - A6XX_VFD_FETCH_STRIDE(binding->binding, binding->stride)); - } - } - - VkVertexInputBindingDescription2EXT bindings[MAX_VBS]; - VkVertexInputAttributeDescription2EXT attrs[MAX_VERTEX_ATTRIBS]; - - for (unsigned i = 0; i < vi_info->vertexBindingDescriptionCount; i++) { - const VkVertexInputBindingDescription *binding = - &vi_info->pVertexBindingDescriptions[i]; - bindings[i] = (VkVertexInputBindingDescription2EXT) 
{ - .sType = VK_STRUCTURE_TYPE_VERTEX_INPUT_BINDING_DESCRIPTION_2_EXT, - .pNext = NULL, - .binding = binding->binding, - .inputRate = binding->inputRate, - .stride = binding->stride, - .divisor = 1, - }; - - /* Bindings may contain holes */ - pipeline->vi.num_vbs = MAX2(pipeline->vi.num_vbs, binding->binding + 1); - } - - const VkPipelineVertexInputDivisorStateCreateInfoEXT *div_state = - vk_find_struct_const(vi_info->pNext, PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT); - if (div_state) { - for (uint32_t i = 0; i < div_state->vertexBindingDivisorCount; i++) { - const VkVertexInputBindingDivisorDescriptionEXT *desc = - &div_state->pVertexBindingDivisors[i]; - bindings[desc->binding].divisor = desc->divisor; - } + if (vs->has_binning_pass) { + tu_cs_begin_sub_stream(builder->device, &pipeline->cs, + MAX_VERTEX_ATTRIBS * 5 + 2, &vi_cs); + tu6_emit_vertex_input( + &vi_cs, &vs->variants[1], vi_info, pipeline->vi.binning_bindings, + pipeline->vi.binning_strides, pipeline->vi.binning_offsets, + &pipeline->vi.binning_count); + pipeline->vi.binning_state_ib = + tu_cs_end_sub_stream(&pipeline->cs, &vi_cs); } - - for (unsigned i = 0; i < vi_info->vertexAttributeDescriptionCount; i++) { - const VkVertexInputAttributeDescription *attr = - &vi_info->pVertexAttributeDescriptions[i]; - attrs[i] = (VkVertexInputAttributeDescription2EXT) { - .sType = VK_STRUCTURE_TYPE_VERTEX_INPUT_ATTRIBUTE_DESCRIPTION_2_EXT, - .pNext = NULL, - .binding = attr->binding, - .location = attr->location, - .offset = attr->offset, - .format = attr->format, - }; - } - - tu_cs_begin_sub_stream(&pipeline->cs, - TU6_EMIT_VERTEX_INPUT_MAX_DWORDS, &cs); - tu6_emit_vertex_input(&cs, - vi_info->vertexBindingDescriptionCount, bindings, - vi_info->vertexAttributeDescriptionCount, attrs); - pipeline->dynamic_state[TU_DYNAMIC_STATE_VERTEX_INPUT] = - tu_cs_end_draw_state(&pipeline->cs, &cs); } static void @@ -4046,29 +1549,6 @@ tu_pipeline_builder_parse_input_assembly(struct tu_pipeline_builder *builder, } static void -tu_pipeline_builder_parse_tessellation(struct tu_pipeline_builder *builder, - struct tu_pipeline *pipeline) -{ - if (!(builder->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) || - !(builder->active_stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT)) - return; - - const VkPipelineTessellationStateCreateInfo *tess_info = - builder->create_info->pTessellationState; - - if (!(pipeline->dynamic_state_mask & - BIT(TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS))) { - assert(tess_info->patchControlPoints <= 32); - pipeline->tess.patch_control_points = tess_info->patchControlPoints; - } - - const VkPipelineTessellationDomainOriginStateCreateInfo *domain_info = - vk_find_struct_const(tess_info->pNext, PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO); - pipeline->tess.upper_left_domain_origin = !domain_info || - domain_info->domainOrigin == VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT; -} - -static void tu_pipeline_builder_parse_viewport(struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline) { @@ -4085,17 +1565,21 @@ tu_pipeline_builder_parse_viewport(struct tu_pipeline_builder *builder, const VkPipelineViewportStateCreateInfo *vp_info = builder->create_info->pViewportState; - const VkPipelineViewportDepthClipControlCreateInfoEXT *depth_clip_info = - vk_find_struct_const(vp_info->pNext, PIPELINE_VIEWPORT_DEPTH_CLIP_CONTROL_CREATE_INFO_EXT); - pipeline->viewport.z_negative_one_to_one = depth_clip_info ? 
depth_clip_info->negativeOneToOne : false; - struct tu_cs cs; + struct tu_cs vp_cs; + tu_cs_begin_sub_stream(builder->device, &pipeline->cs, 15, &vp_cs); - if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_VIEWPORT, 8 + 10 * vp_info->viewportCount)) - tu6_emit_viewport(&cs, vp_info->pViewports, vp_info->viewportCount, pipeline->viewport.z_negative_one_to_one); + if (!(pipeline->dynamic_state.mask & TU_DYNAMIC_VIEWPORT)) { + assert(vp_info->viewportCount == 1); + tu6_emit_viewport(&vp_cs, vp_info->pViewports); + } - if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_SCISSOR, 1 + 2 * vp_info->scissorCount)) - tu6_emit_scissor(&cs, vp_info->pScissors, vp_info->scissorCount); + if (!(pipeline->dynamic_state.mask & TU_DYNAMIC_SCISSOR)) { + assert(vp_info->scissorCount == 1); + tu6_emit_scissor(&vp_cs, vp_info->pScissors); + } + + pipeline->vp.state_ib = tu_cs_end_sub_stream(&pipeline->cs, &vp_cs); } static void @@ -4105,95 +1589,31 @@ tu_pipeline_builder_parse_rasterization(struct tu_pipeline_builder *builder, const VkPipelineRasterizationStateCreateInfo *rast_info = builder->create_info->pRasterizationState; - enum a6xx_polygon_mode mode = tu6_polygon_mode(rast_info->polygonMode); - - bool depth_clip_disable = rast_info->depthClampEnable; - - const VkPipelineRasterizationDepthClipStateCreateInfoEXT *depth_clip_state = - vk_find_struct_const(rast_info, PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT); - if (depth_clip_state) - depth_clip_disable = !depth_clip_state->depthClipEnable; - - pipeline->rast.rb_depth_cntl = - COND(rast_info->depthClampEnable, A6XX_RB_DEPTH_CNTL_Z_CLAMP_ENABLE); - - pipeline->rast.line_mode = RECTANGULAR; - - const VkPipelineRasterizationLineStateCreateInfoEXT *rast_line_state = - vk_find_struct_const(rast_info->pNext, - PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT); - - if (rast_line_state && - rast_line_state->lineRasterizationMode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT) { - pipeline->rast.line_mode = BRESENHAM; - } - - struct tu_cs cs; - uint32_t cs_size = 9 + - (builder->device->physical_device->info->a6xx.has_shading_rate ? 8 : 0); - pipeline->rast.state = tu_cs_draw_state(&pipeline->cs, &cs, cs_size); - - tu_cs_emit_regs(&cs, - A6XX_GRAS_CL_CNTL( - .znear_clip_disable = depth_clip_disable, - .zfar_clip_disable = depth_clip_disable, - .z_clamp_enable = rast_info->depthClampEnable, - .zero_gb_scale_z = pipeline->viewport.z_negative_one_to_one ? 0 : 1, - .vp_clip_code_ignore = 1)); + assert(!rast_info->depthClampEnable); + assert(rast_info->polygonMode == VK_POLYGON_MODE_FILL); - tu_cs_emit_regs(&cs, - A6XX_VPC_POLYGON_MODE(mode)); - - tu_cs_emit_regs(&cs, - A6XX_PC_POLYGON_MODE(mode)); + struct tu_cs rast_cs; + tu_cs_begin_sub_stream(builder->device, &pipeline->cs, 20, &rast_cs); /* move to hw ctx init? */ - tu_cs_emit_regs(&cs, - A6XX_GRAS_SU_POINT_MINMAX(.min = 1.0f / 16.0f, .max = 4092.0f), - A6XX_GRAS_SU_POINT_SIZE(1.0f)); - - if (builder->device->physical_device->info->a6xx.has_shading_rate) { - tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A00()); - tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A10()); - tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A20()); - tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A30()); - } - - const VkPipelineRasterizationStateStreamCreateInfoEXT *stream_info = - vk_find_struct_const(rast_info->pNext, - PIPELINE_RASTERIZATION_STATE_STREAM_CREATE_INFO_EXT); - unsigned stream = stream_info ? 
stream_info->rasterizationStream : 0; - - pipeline->rast.pc_raster_cntl = A6XX_PC_RASTER_CNTL_STREAM(stream); - pipeline->rast.vpc_unknown_9107 = 0; - if (rast_info->rasterizerDiscardEnable) { - pipeline->rast.pc_raster_cntl |= A6XX_PC_RASTER_CNTL_DISCARD; - pipeline->rast.vpc_unknown_9107 |= A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD; - } + tu6_emit_gras_unknowns(&rast_cs); + tu6_emit_point_size(&rast_cs); - if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RASTERIZER_DISCARD, 4)) { - tu_cs_emit_regs(&cs, A6XX_PC_RASTER_CNTL(.dword = pipeline->rast.pc_raster_cntl)); - tu_cs_emit_regs(&cs, A6XX_VPC_UNKNOWN_9107(.dword = pipeline->rast.vpc_unknown_9107)); - } - - pipeline->rast.gras_su_cntl = - tu6_gras_su_cntl(rast_info, pipeline->rast.line_mode, builder->multiview_mask != 0); + const uint32_t gras_su_cntl = + tu6_gras_su_cntl(rast_info, builder->samples); - if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_GRAS_SU_CNTL, 2)) - tu_cs_emit_regs(&cs, A6XX_GRAS_SU_CNTL(.dword = pipeline->rast.gras_su_cntl)); + if (!(pipeline->dynamic_state.mask & TU_DYNAMIC_LINE_WIDTH)) + tu6_emit_gras_su_cntl(&rast_cs, gras_su_cntl, rast_info->lineWidth); - if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_DEPTH_BIAS, 4)) { - tu6_emit_depth_bias(&cs, rast_info->depthBiasConstantFactor, + if (!(pipeline->dynamic_state.mask & TU_DYNAMIC_DEPTH_BIAS)) { + tu6_emit_depth_bias(&rast_cs, rast_info->depthBiasConstantFactor, rast_info->depthBiasClamp, rast_info->depthBiasSlopeFactor); } - const struct VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *provoking_vtx_state = - vk_find_struct_const(rast_info->pNext, PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT); - pipeline->rast.provoking_vertex_last = provoking_vtx_state && - provoking_vtx_state->provokingVertexMode == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT; + pipeline->rast.state_ib = tu_cs_end_sub_stream(&pipeline->cs, &rast_cs); - pipeline->rast.multiview_mask = builder->multiview_mask; + pipeline->rast.gras_su_cntl = gras_su_cntl; } static void @@ -4207,128 +1627,38 @@ tu_pipeline_builder_parse_depth_stencil(struct tu_pipeline_builder *builder, * the pipeline has rasterization disabled or if the subpass of the * render pass the pipeline is created against does not use a * depth/stencil attachment. + * + * We disable both depth and stenil tests in those cases. */ + static const VkPipelineDepthStencilStateCreateInfo dummy_ds_info; const VkPipelineDepthStencilStateCreateInfo *ds_info = - builder->create_info->pDepthStencilState; - uint32_t rb_depth_cntl = 0, rb_stencil_cntl = 0; - struct tu_cs cs; - - if (!builder->attachment_state_valid || - (builder->depth_attachment_format != VK_FORMAT_UNDEFINED && - builder->depth_attachment_format != VK_FORMAT_S8_UINT)) { - if (ds_info->depthTestEnable) { - rb_depth_cntl |= - A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE | - A6XX_RB_DEPTH_CNTL_ZFUNC(tu6_compare_func(ds_info->depthCompareOp)) | - A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE; /* TODO: don't set for ALWAYS/NEVER */ - - if (ds_info->depthWriteEnable) - rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE; - } - - if (ds_info->depthBoundsTestEnable) - rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE; + builder->use_depth_stencil_attachment + ? 
builder->create_info->pDepthStencilState + : &dummy_ds_info; - if (ds_info->depthBoundsTestEnable && !ds_info->depthTestEnable) - tu6_apply_depth_bounds_workaround(builder->device, &rb_depth_cntl); - } - - if (!builder->attachment_state_valid || - builder->depth_attachment_format != VK_FORMAT_UNDEFINED) { - const VkStencilOpState *front = &ds_info->front; - const VkStencilOpState *back = &ds_info->back; - - rb_stencil_cntl |= - A6XX_RB_STENCIL_CONTROL_FUNC(tu6_compare_func(front->compareOp)) | - A6XX_RB_STENCIL_CONTROL_FAIL(tu6_stencil_op(front->failOp)) | - A6XX_RB_STENCIL_CONTROL_ZPASS(tu6_stencil_op(front->passOp)) | - A6XX_RB_STENCIL_CONTROL_ZFAIL(tu6_stencil_op(front->depthFailOp)) | - A6XX_RB_STENCIL_CONTROL_FUNC_BF(tu6_compare_func(back->compareOp)) | - A6XX_RB_STENCIL_CONTROL_FAIL_BF(tu6_stencil_op(back->failOp)) | - A6XX_RB_STENCIL_CONTROL_ZPASS_BF(tu6_stencil_op(back->passOp)) | - A6XX_RB_STENCIL_CONTROL_ZFAIL_BF(tu6_stencil_op(back->depthFailOp)); - - if (ds_info->stencilTestEnable) { - rb_stencil_cntl |= - A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE | - A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF | - A6XX_RB_STENCIL_CONTROL_STENCIL_READ; - } - - pipeline->ds.raster_order_attachment_access = - ds_info->flags & - (VK_PIPELINE_DEPTH_STENCIL_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_DEPTH_ACCESS_BIT_ARM | - VK_PIPELINE_DEPTH_STENCIL_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_STENCIL_ACCESS_BIT_ARM); - - pipeline->ds.write_enable = - ds_info->depthWriteEnable || ds_info->stencilTestEnable; - } - - pipeline->ds.rb_depth_cntl = rb_depth_cntl; - - if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RB_STENCIL_CNTL, 2)) { - tu_cs_emit_pkt4(&cs, REG_A6XX_RB_STENCIL_CONTROL, 1); - tu_cs_emit(&cs, rb_stencil_cntl); - } - pipeline->ds.rb_stencil_cntl = rb_stencil_cntl; + struct tu_cs ds_cs; + tu_cs_begin_sub_stream(builder->device, &pipeline->cs, 12, &ds_cs); - /* the remaining draw states arent used if there is no d/s, leave them empty */ - if (builder->depth_attachment_format == VK_FORMAT_UNDEFINED && - builder->attachment_state_valid) - return; + /* move to hw ctx init? 
*/ + tu6_emit_alpha_control_disable(&ds_cs); - if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_DEPTH_BOUNDS, 3)) { - tu_cs_emit_regs(&cs, - A6XX_RB_Z_BOUNDS_MIN(ds_info->minDepthBounds), - A6XX_RB_Z_BOUNDS_MAX(ds_info->maxDepthBounds)); - } + tu6_emit_depth_control(&ds_cs, ds_info); + tu6_emit_stencil_control(&ds_cs, ds_info); - if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, 2)) { - tu_cs_emit_regs(&cs, A6XX_RB_STENCILMASK(.mask = ds_info->front.compareMask & 0xff, - .bfmask = ds_info->back.compareMask & 0xff)); + if (!(pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_COMPARE_MASK)) { + tu6_emit_stencil_compare_mask(&ds_cs, ds_info->front.compareMask, + ds_info->back.compareMask); } - - if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, 2)) { - update_stencil_mask(&pipeline->ds.stencil_wrmask, VK_STENCIL_FACE_FRONT_BIT, ds_info->front.writeMask); - update_stencil_mask(&pipeline->ds.stencil_wrmask, VK_STENCIL_FACE_BACK_BIT, ds_info->back.writeMask); - tu_cs_emit_regs(&cs, A6XX_RB_STENCILWRMASK(.dword = pipeline->ds.stencil_wrmask)); + if (!(pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_WRITE_MASK)) { + tu6_emit_stencil_write_mask(&ds_cs, ds_info->front.writeMask, + ds_info->back.writeMask); } - - if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_REFERENCE, 2)) { - tu_cs_emit_regs(&cs, A6XX_RB_STENCILREF(.ref = ds_info->front.reference & 0xff, - .bfref = ds_info->back.reference & 0xff)); + if (!(pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_REFERENCE)) { + tu6_emit_stencil_reference(&ds_cs, ds_info->front.reference, + ds_info->back.reference); } - if (builder->variants[MESA_SHADER_FRAGMENT]) { - const struct ir3_shader_variant *fs = builder->variants[MESA_SHADER_FRAGMENT]; - if (fs->has_kill) { - pipeline->lrz.force_disable_mask |= TU_LRZ_FORCE_DISABLE_WRITE; - } - if (fs->no_earlyz || fs->writes_pos) { - pipeline->lrz.force_disable_mask = TU_LRZ_FORCE_DISABLE_LRZ; - } - } -} - -static void -tu_pipeline_builder_parse_rast_ds(struct tu_pipeline_builder *builder, - struct tu_pipeline *pipeline) -{ - if (builder->rasterizer_discard) - return; - - pipeline->rast_ds.rb_depth_cntl = - pipeline->rast.rb_depth_cntl | pipeline->ds.rb_depth_cntl; - pipeline->rast_ds.rb_depth_cntl_mask = pipeline->ds.rb_depth_cntl_mask; - - struct tu_cs cs; - if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RB_DEPTH_CNTL, 2)) { - tu_cs_emit_pkt4(&cs, REG_A6XX_RB_DEPTH_CNTL, 1); - if (pipeline->output.rb_depth_cntl_disable) - tu_cs_emit(&cs, 0); - else - tu_cs_emit(&cs, pipeline->rast_ds.rb_depth_cntl); - } + pipeline->ds.state_ib = tu_cs_end_sub_stream(&pipeline->cs, &ds_cs); } static void @@ -4351,189 +1681,31 @@ tu_pipeline_builder_parse_multisample_and_color_blend( * * We leave the relevant registers stale when rasterization is disabled. */ - if (builder->rasterizer_discard) { - pipeline->output.samples = VK_SAMPLE_COUNT_1_BIT; + if (builder->rasterizer_discard) return; - } - - pipeline->output.feedback_loop_may_involve_textures = - builder->feedback_loop_may_involve_textures; static const VkPipelineColorBlendStateCreateInfo dummy_blend_info; const VkPipelineMultisampleStateCreateInfo *msaa_info = builder->create_info->pMultisampleState; - pipeline->output.samples = msaa_info->rasterizationSamples; - const VkPipelineColorBlendStateCreateInfo *blend_info = builder->use_color_attachments ? 
builder->create_info->pColorBlendState : &dummy_blend_info; - bool no_earlyz = builder->depth_attachment_format == VK_FORMAT_S8_UINT || - /* alpha to coverage can behave like a discard */ - msaa_info->alphaToCoverageEnable; - pipeline->lrz.force_late_z |= no_earlyz; - - pipeline->output.subpass_feedback_loop_color = - builder->subpass_feedback_loop_color; - pipeline->output.subpass_feedback_loop_ds = - builder->subpass_feedback_loop_ds; - - if (builder->use_color_attachments) { - pipeline->blend.raster_order_attachment_access = - blend_info->flags & - VK_PIPELINE_COLOR_BLEND_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_ACCESS_BIT_ARM; - } - - const enum pipe_format ds_pipe_format = - vk_format_to_pipe_format(builder->depth_attachment_format); + struct tu_cs blend_cs; + tu_cs_begin_sub_stream(builder->device, &pipeline->cs, MAX_RTS * 3 + 9, + &blend_cs); - if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED && - builder->depth_attachment_format != VK_FORMAT_S8_UINT) { - pipeline->output.depth_cpp_per_sample = util_format_get_component_bits( - ds_pipe_format, UTIL_FORMAT_COLORSPACE_ZS, 0) / 8; - } else { - /* We need to make sure RB_DEPTH_CNTL is set to 0 when this pipeline is - * used, regardless of whether it's linked with a fragment shader - * pipeline that has an enabled depth test or if RB_DEPTH_CNTL is set - * dynamically. - */ - pipeline->output.rb_depth_cntl_disable = true; - } - - if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED) { - pipeline->output.stencil_cpp_per_sample = util_format_get_component_bits( - ds_pipe_format, UTIL_FORMAT_COLORSPACE_ZS, 1) / 8; - } - - struct tu_cs cs; - tu6_emit_rb_mrt_controls(pipeline, blend_info, + uint32_t blend_enable_mask; + tu6_emit_rb_mrt_controls(&blend_cs, blend_info, builder->color_attachment_formats, - &pipeline->blend.rop_reads_dst, - &pipeline->output.color_bandwidth_per_sample); - - if (msaa_info->alphaToCoverageEnable && pipeline->blend.num_rts == 0) { - /* In addition to changing the *_OUTPUT_CNTL1 registers, this will also - * make sure we disable memory writes for MRT0 rather than using - * whatever setting was leftover. - */ - pipeline->blend.num_rts = 1; - } - - uint32_t blend_enable_mask = - pipeline->blend.rop_reads_dst ? - pipeline->blend.color_write_enable : - pipeline->blend.blend_enable; - tu6_emit_blend_control(pipeline, blend_enable_mask, - tu_blend_state_is_dual_src(blend_info), msaa_info); - - if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_BLEND, - pipeline->blend.num_rts * 3 + 8)) { - tu6_emit_blend(&cs, pipeline); - assert(cs.cur == cs.end); /* validate draw state size */ - } - - /* Disable LRZ writes when blend or logic op that reads the destination is - * enabled, since the resulting pixel value from the blend-draw depends on - * an earlier draw, which LRZ in the draw pass could early-reject if the - * previous blend-enabled draw wrote LRZ. - * - * TODO: We need to disable LRZ writes only for the binning pass. - * Therefore, we need to emit it in a separate draw state. We keep - * it disabled for sysmem path as well for the moment. - */ - if (blend_enable_mask) - pipeline->lrz.force_disable_mask |= TU_LRZ_FORCE_DISABLE_WRITE; - - for (int i = 0; i < blend_info->attachmentCount; i++) { - VkPipelineColorBlendAttachmentState blendAttachment = blend_info->pAttachments[i]; - /* From the PoV of LRZ, having masked color channels is - * the same as having blend enabled, in that the draw will - * care about the fragments from an earlier draw. 
- */ - VkFormat format = builder->color_attachment_formats[i]; - unsigned mask = MASK(vk_format_get_nr_components(format)); - if (format != VK_FORMAT_UNDEFINED && - ((blendAttachment.colorWriteMask & mask) != mask || - !(pipeline->blend.color_write_enable & BIT(i)))) { - pipeline->lrz.force_disable_mask |= TU_LRZ_FORCE_DISABLE_WRITE; - } - } - - if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_BLEND_CONSTANTS, 5)) { - tu_cs_emit_pkt4(&cs, REG_A6XX_RB_BLEND_RED_F32, 4); - tu_cs_emit_array(&cs, (const uint32_t *) blend_info->blendConstants, 4); - } + &blend_enable_mask); - const struct VkPipelineSampleLocationsStateCreateInfoEXT *sample_locations = - vk_find_struct_const(msaa_info->pNext, PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT); - const VkSampleLocationsInfoEXT *samp_loc = NULL; + if (!(pipeline->dynamic_state.mask & TU_DYNAMIC_BLEND_CONSTANTS)) + tu6_emit_blend_constants(&blend_cs, blend_info->blendConstants); - if (sample_locations && sample_locations->sampleLocationsEnable) - samp_loc = &sample_locations->sampleLocationsInfo; + tu6_emit_blend_control(&blend_cs, blend_enable_mask, msaa_info); - if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_SAMPLE_LOCATIONS, - samp_loc ? 9 : 6)) { - tu6_emit_sample_locations(&cs, samp_loc); - } -} - -static void -tu_pipeline_builder_parse_rasterization_order( - struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline) -{ - if (builder->rasterizer_discard) - return; - - bool raster_order_attachment_access = - pipeline->blend.raster_order_attachment_access || - pipeline->ds.raster_order_attachment_access || - unlikely(builder->device->physical_device->instance->debug_flags & TU_DEBUG_RAST_ORDER); - - /* VK_EXT_blend_operation_advanced would also require ordered access - * when implemented in the future. - */ - - uint32_t sysmem_prim_mode = NO_FLUSH; - uint32_t gmem_prim_mode = NO_FLUSH; - - if (raster_order_attachment_access) { - /* VK_EXT_rasterization_order_attachment_access: - * - * This extension allow access to framebuffer attachments when used as - * both input and color attachments from one fragment to the next, - * in rasterization order, without explicit synchronization. - */ - sysmem_prim_mode = FLUSH_PER_OVERLAP_AND_OVERWRITE; - gmem_prim_mode = FLUSH_PER_OVERLAP; - pipeline->prim_order.sysmem_single_prim_mode = true; - } else { - /* If there is a feedback loop, then the shader can read the previous value - * of a pixel being written out. It can also write some components and then - * read different components without a barrier in between. This is a - * problem in sysmem mode with UBWC, because the main buffer and flags - * buffer can get out-of-sync if only one is flushed. We fix this by - * setting the SINGLE_PRIM_MODE field to the same value that the blob does - * for advanced_blend in sysmem mode if a feedback loop is detected. 
- */ - if (pipeline->output.subpass_feedback_loop_color || - (pipeline->output.subpass_feedback_loop_ds && - pipeline->ds.write_enable)) { - sysmem_prim_mode = FLUSH_PER_OVERLAP_AND_OVERWRITE; - pipeline->prim_order.sysmem_single_prim_mode = true; - } - } - - struct tu_cs cs; - - pipeline->prim_order.state_gmem = tu_cs_draw_state(&pipeline->cs, &cs, 2); - tu_cs_emit_write_reg(&cs, REG_A6XX_GRAS_SC_CNTL, - A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2) | - A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE(gmem_prim_mode)); - - pipeline->prim_order.state_sysmem = tu_cs_draw_state(&pipeline->cs, &cs, 2); - tu_cs_emit_write_reg(&cs, REG_A6XX_GRAS_SC_CNTL, - A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2) | - A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE(sysmem_prim_mode)); + pipeline->blend.state_ib = tu_cs_end_sub_stream(&pipeline->cs, &blend_cs); } static void @@ -4541,161 +1713,45 @@ tu_pipeline_finish(struct tu_pipeline *pipeline, struct tu_device *dev, const VkAllocationCallbacks *alloc) { - tu_cs_finish(&pipeline->cs); - pthread_mutex_lock(&dev->pipeline_mutex); - tu_suballoc_bo_free(&dev->pipeline_suballoc, &pipeline->bo); - pthread_mutex_unlock(&dev->pipeline_mutex); - - if (pipeline->pvtmem_bo) - tu_bo_finish(dev, pipeline->pvtmem_bo); - - if (pipeline->compiled_shaders) - vk_pipeline_cache_object_unref(&pipeline->compiled_shaders->base); - - if (pipeline->nir_shaders) - vk_pipeline_cache_object_unref(&pipeline->nir_shaders->base); + tu_cs_finish(dev, &pipeline->cs); - for (unsigned i = 0; i < pipeline->num_sets; i++) { - if (pipeline->layouts[i]) - vk_descriptor_set_layout_unref(&dev->vk, &pipeline->layouts[i]->vk); - } - - ralloc_free(pipeline->executables_mem_ctx); + if (pipeline->program.binary_bo.gem_handle) + tu_bo_finish(dev, &pipeline->program.binary_bo); } - static VkResult tu_pipeline_builder_build(struct tu_pipeline_builder *builder, struct tu_pipeline **pipeline) { - VkResult result; - - *pipeline = vk_object_zalloc(&builder->device->vk, builder->alloc, - sizeof(**pipeline), VK_OBJECT_TYPE_PIPELINE); - if (!*pipeline) - return VK_ERROR_OUT_OF_HOST_MEMORY; - - (*pipeline)->executables_mem_ctx = ralloc_context(NULL); - util_dynarray_init(&(*pipeline)->executables, (*pipeline)->executables_mem_ctx); - - tu_pipeline_builder_parse_dynamic(builder, *pipeline); - tu_pipeline_builder_parse_libraries(builder, *pipeline); - - VkShaderStageFlags stages = 0; - for (unsigned i = 0; i < builder->create_info->stageCount; i++) { - stages |= builder->create_info->pStages[i].stage; - } - builder->active_stages = stages; - - (*pipeline)->active_stages = stages; - for (unsigned i = 0; i < builder->num_libraries; i++) - (*pipeline)->active_stages |= builder->libraries[i]->active_stages; + VkResult result = tu_pipeline_builder_create_pipeline(builder, pipeline); + if (result != VK_SUCCESS) + return result; - /* Compile and upload shaders unless a library has already done that. 
*/ - if ((*pipeline)->program.state.size == 0) { - tu_pipeline_builder_parse_layout(builder, *pipeline); + /* compile and upload shaders */ + result = tu_pipeline_builder_compile_shaders(builder); + if (result == VK_SUCCESS) + result = tu_pipeline_builder_upload_shaders(builder, *pipeline); + if (result != VK_SUCCESS) { + tu_pipeline_finish(*pipeline, builder->device, builder->alloc); + vk_free2(&builder->device->alloc, builder->alloc, *pipeline); + *pipeline = VK_NULL_HANDLE; - result = tu_pipeline_builder_compile_shaders(builder, *pipeline); - if (result != VK_SUCCESS) { - vk_object_free(&builder->device->vk, builder->alloc, *pipeline); - return result; - } + return result; } - result = tu_pipeline_allocate_cs(builder->device, *pipeline, - &builder->layout, builder, NULL); - - - /* This has to come before emitting the program so that - * pipeline->tess.patch_control_points and pipeline->rast.multiview_mask - * are always set. + tu_pipeline_builder_parse_dynamic(builder, *pipeline); + tu_pipeline_builder_parse_shader_stages(builder, *pipeline); + tu_pipeline_builder_parse_vertex_input(builder, *pipeline); + tu_pipeline_builder_parse_input_assembly(builder, *pipeline); + tu_pipeline_builder_parse_viewport(builder, *pipeline); + tu_pipeline_builder_parse_rasterization(builder, *pipeline); + tu_pipeline_builder_parse_depth_stencil(builder, *pipeline); + tu_pipeline_builder_parse_multisample_and_color_blend(builder, *pipeline); + + /* we should have reserved enough space upfront such that the CS never + * grows */ - if (builder->state & - VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) { - tu_pipeline_builder_parse_tessellation(builder, *pipeline); - (*pipeline)->rast.multiview_mask = builder->multiview_mask; - } - - if (set_combined_state(builder, *pipeline, - VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT | - VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) { - if (result != VK_SUCCESS) { - vk_object_free(&builder->device->vk, builder->alloc, *pipeline); - return result; - } - - for (uint32_t i = 0; i < ARRAY_SIZE(builder->shader_iova); i++) - builder->shader_iova[i] = - tu_upload_variant(*pipeline, builder->variants[i]); - - builder->binning_vs_iova = - tu_upload_variant(*pipeline, builder->binning_variant); - - /* Setup private memory. Note that because we're sharing the same private - * memory for all stages, all stages must use the same config, or else - * fibers from one stage might overwrite fibers in another. 
- */ - - uint32_t pvtmem_size = 0; - bool per_wave = true; - for (uint32_t i = 0; i < ARRAY_SIZE(builder->variants); i++) { - if (builder->variants[i]) { - pvtmem_size = MAX2(pvtmem_size, builder->variants[i]->pvtmem_size); - if (!builder->variants[i]->pvtmem_per_wave) - per_wave = false; - } - } - - if (builder->binning_variant) { - pvtmem_size = MAX2(pvtmem_size, builder->binning_variant->pvtmem_size); - if (!builder->binning_variant->pvtmem_per_wave) - per_wave = false; - } - - result = tu_setup_pvtmem(builder->device, *pipeline, &builder->pvtmem, - pvtmem_size, per_wave); - if (result != VK_SUCCESS) { - vk_object_free(&builder->device->vk, builder->alloc, *pipeline); - return result; - } - - tu_pipeline_builder_parse_shader_stages(builder, *pipeline); - tu6_emit_load_state(*pipeline, &builder->layout); - } - - if (builder->state & - VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT) { - tu_pipeline_builder_parse_vertex_input(builder, *pipeline); - tu_pipeline_builder_parse_input_assembly(builder, *pipeline); - } - - if (builder->state & - VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) { - tu_pipeline_builder_parse_viewport(builder, *pipeline); - tu_pipeline_builder_parse_rasterization(builder, *pipeline); - } - - if (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) { - tu_pipeline_builder_parse_depth_stencil(builder, *pipeline); - } - - if (builder->state & - VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT) { - tu_pipeline_builder_parse_multisample_and_color_blend(builder, *pipeline); - } - - if (set_combined_state(builder, *pipeline, - VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT | - VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) { - tu_pipeline_builder_parse_rasterization_order(builder, *pipeline); - } - - if (set_combined_state(builder, *pipeline, - VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT | - VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT | - VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) { - tu_pipeline_builder_parse_rast_ds(builder, *pipeline); - } + assert((*pipeline)->cs.bo_count == 1); return VK_SUCCESS; } @@ -4703,231 +1759,59 @@ tu_pipeline_builder_build(struct tu_pipeline_builder *builder, static void tu_pipeline_builder_finish(struct tu_pipeline_builder *builder) { - if (builder->compiled_shaders) - vk_pipeline_cache_object_unref(&builder->compiled_shaders->base); - ralloc_free(builder->mem_ctx); + for (uint32_t i = 0; i < MESA_SHADER_STAGES; i++) { + if (!builder->shaders[i]) + continue; + tu_shader_destroy(builder->device, builder->shaders[i], builder->alloc); + } } static void tu_pipeline_builder_init_graphics( struct tu_pipeline_builder *builder, struct tu_device *dev, - struct vk_pipeline_cache *cache, + struct tu_pipeline_cache *cache, const VkGraphicsPipelineCreateInfo *create_info, const VkAllocationCallbacks *alloc) { *builder = (struct tu_pipeline_builder) { .device = dev, - .mem_ctx = ralloc_context(NULL), .cache = cache, .create_info = create_info, .alloc = alloc, }; - const VkGraphicsPipelineLibraryCreateInfoEXT *gpl_info = - vk_find_struct_const(builder->create_info->pNext, - GRAPHICS_PIPELINE_LIBRARY_CREATE_INFO_EXT); - - const VkPipelineLibraryCreateInfoKHR *library_info = - vk_find_struct_const(builder->create_info->pNext, - PIPELINE_LIBRARY_CREATE_INFO_KHR); - - if (gpl_info) { - builder->state = gpl_info->flags; - } else { - /* Implement this bit of spec text: - * - * If this structure is omitted, and either - * 
VkGraphicsPipelineCreateInfo::flags includes - * VK_PIPELINE_CREATE_LIBRARY_BIT_KHR or the - * VkGraphicsPipelineCreateInfo::pNext chain includes a - * VkPipelineLibraryCreateInfoKHR structure with a libraryCount - * greater than 0, it is as if flags is 0. Otherwise if this - * structure is omitted, it is as if flags includes all possible - * subsets of the graphics pipeline (i.e. a complete graphics - * pipeline). - */ - if ((library_info && library_info->libraryCount > 0) || - (builder->create_info->flags & VK_PIPELINE_CREATE_LIBRARY_BIT_KHR)) { - builder->state = 0; - } else { - builder->state = - VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT | - VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT | - VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT | - VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT; - } - } - - bool rasterizer_discard_dynamic = false; - if (create_info->pDynamicState) { - for (uint32_t i = 0; i < create_info->pDynamicState->dynamicStateCount; i++) { - if (create_info->pDynamicState->pDynamicStates[i] == - VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE) { - rasterizer_discard_dynamic = true; - break; - } - } - } - builder->rasterizer_discard = - (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) && - builder->create_info->pRasterizationState->rasterizerDiscardEnable && - !rasterizer_discard_dynamic; - - if (builder->state & - (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT | - VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT | - VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) { - const VkPipelineRenderingCreateInfo *rendering_info = - vk_find_struct_const(create_info->pNext, PIPELINE_RENDERING_CREATE_INFO); - - if (unlikely(dev->instance->debug_flags & TU_DEBUG_DYNAMIC) && !rendering_info) - rendering_info = vk_get_pipeline_rendering_create_info(create_info); - - /* Get multiview_mask, which is only used for shaders */ - if (builder->state & - (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT | - VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) { - if (rendering_info) { - builder->multiview_mask = rendering_info->viewMask; - } else { - const struct tu_render_pass *pass = - tu_render_pass_from_handle(create_info->renderPass); - const struct tu_subpass *subpass = - &pass->subpasses[create_info->subpass]; - builder->multiview_mask = subpass->multiview_mask; - } - } - - /* Get the attachment state. This is valid: - * - * - With classic renderpasses, when either fragment shader or fragment - * output interface state is being compiled. This includes when we - * emulate classic renderpasses with dynamic rendering with the debug - * flag. - * - With dynamic rendering (renderPass is NULL) only when compiling the - * output interface state. - * - * We only actually need this for the fragment output interface state, - * but the spec also requires us to skip parsing depth/stencil state - * when the attachment state is defined *and* no depth/stencil - * attachment is not used, so we have to parse it for fragment shader - * state when possible. Life is pain. 
- */ - if (((builder->state & - VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT) || - ((builder->state & - VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) && - builder->create_info->renderPass)) && - rendering_info) { - builder->subpass_raster_order_attachment_access = false; - builder->subpass_feedback_loop_ds = false; - builder->subpass_feedback_loop_color = false; - - const VkRenderingSelfDependencyInfoMESA *self_dependency = - vk_find_struct_const(rendering_info->pNext, RENDERING_SELF_DEPENDENCY_INFO_MESA); - - if (self_dependency) { - builder->subpass_feedback_loop_ds = - self_dependency->depthSelfDependency || - self_dependency->stencilSelfDependency; - builder->subpass_feedback_loop_color = - self_dependency->colorSelfDependencies; - } - - if (!builder->rasterizer_discard) { - builder->depth_attachment_format = - rendering_info->depthAttachmentFormat == VK_FORMAT_UNDEFINED ? - rendering_info->stencilAttachmentFormat : - rendering_info->depthAttachmentFormat; - - for (unsigned i = 0; i < rendering_info->colorAttachmentCount; i++) { - builder->color_attachment_formats[i] = - rendering_info->pColorAttachmentFormats[i]; - if (builder->color_attachment_formats[i] != VK_FORMAT_UNDEFINED) { - builder->use_color_attachments = true; - } - } - } + create_info->pRasterizationState->rasterizerDiscardEnable; - builder->attachment_state_valid = true; - } else if ((builder->state & - (VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT | - VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) && - create_info->renderPass != VK_NULL_HANDLE) { - const struct tu_render_pass *pass = - tu_render_pass_from_handle(create_info->renderPass); - const struct tu_subpass *subpass = - &pass->subpasses[create_info->subpass]; - - builder->subpass_raster_order_attachment_access = - subpass->raster_order_attachment_access; - builder->subpass_feedback_loop_color = subpass->feedback_loop_color; - builder->subpass_feedback_loop_ds = subpass->feedback_loop_ds; - - if (!builder->rasterizer_discard) { - const uint32_t a = subpass->depth_stencil_attachment.attachment; - builder->depth_attachment_format = (a != VK_ATTACHMENT_UNUSED) ? 
- pass->attachments[a].format : VK_FORMAT_UNDEFINED; - - assert(subpass->color_count == 0 || - !create_info->pColorBlendState || - subpass->color_count == create_info->pColorBlendState->attachmentCount); - for (uint32_t i = 0; i < subpass->color_count; i++) { - const uint32_t a = subpass->color_attachments[i].attachment; - if (a == VK_ATTACHMENT_UNUSED) - continue; - - builder->color_attachment_formats[i] = pass->attachments[a].format; - builder->use_color_attachments = true; - } - } + if (builder->rasterizer_discard) { + builder->samples = VK_SAMPLE_COUNT_1_BIT; + } else { + builder->samples = create_info->pMultisampleState->rasterizationSamples; + + const struct tu_render_pass *pass = + tu_render_pass_from_handle(create_info->renderPass); + const struct tu_subpass *subpass = + &pass->subpasses[create_info->subpass]; + + builder->use_depth_stencil_attachment = + subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED; + + assert(subpass->color_count == + create_info->pColorBlendState->attachmentCount); + builder->color_attachment_count = subpass->color_count; + for (uint32_t i = 0; i < subpass->color_count; i++) { + const uint32_t a = subpass->color_attachments[i].attachment; + if (a == VK_ATTACHMENT_UNUSED) + continue; - builder->attachment_state_valid = true; + builder->color_attachment_formats[i] = pass->attachments[a].format; + builder->use_color_attachments = true; } } - - if (builder->create_info->flags & VK_PIPELINE_CREATE_COLOR_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT) { - builder->subpass_feedback_loop_color = true; - builder->feedback_loop_may_involve_textures = true; - } - - if (builder->create_info->flags & VK_PIPELINE_CREATE_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT) { - builder->subpass_feedback_loop_ds = true; - builder->feedback_loop_may_involve_textures = true; - } } -static VkResult -tu_graphics_pipeline_create(VkDevice device, - VkPipelineCache pipelineCache, - const VkGraphicsPipelineCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, - VkPipeline *pPipeline) -{ - TU_FROM_HANDLE(tu_device, dev, device); - TU_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache); - - cache = cache ? 
cache : dev->mem_cache; - - struct tu_pipeline_builder builder; - tu_pipeline_builder_init_graphics(&builder, dev, cache, - pCreateInfo, pAllocator); - - struct tu_pipeline *pipeline = NULL; - VkResult result = tu_pipeline_builder_build(&builder, &pipeline); - tu_pipeline_builder_finish(&builder); - - if (result == VK_SUCCESS) - *pPipeline = tu_pipeline_to_handle(pipeline); - else - *pPipeline = VK_NULL_HANDLE; - - return result; -} - -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_CreateGraphicsPipelines(VkDevice device, VkPipelineCache pipelineCache, uint32_t count, @@ -4935,242 +1819,68 @@ tu_CreateGraphicsPipelines(VkDevice device, const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines) { - MESA_TRACE_FUNC(); - VkResult final_result = VK_SUCCESS; - uint32_t i = 0; + TU_FROM_HANDLE(tu_device, dev, device); + TU_FROM_HANDLE(tu_pipeline_cache, cache, pipelineCache); - for (; i < count; i++) { - VkResult result = tu_graphics_pipeline_create(device, pipelineCache, - &pCreateInfos[i], pAllocator, - &pPipelines[i]); + for (uint32_t i = 0; i < count; i++) { + struct tu_pipeline_builder builder; + tu_pipeline_builder_init_graphics(&builder, dev, cache, + &pCreateInfos[i], pAllocator); + + struct tu_pipeline *pipeline; + VkResult result = tu_pipeline_builder_build(&builder, &pipeline); + tu_pipeline_builder_finish(&builder); if (result != VK_SUCCESS) { - final_result = result; - pPipelines[i] = VK_NULL_HANDLE; + for (uint32_t j = 0; j < i; j++) { + tu_DestroyPipeline(device, pPipelines[j], pAllocator); + pPipelines[j] = VK_NULL_HANDLE; + } - if (pCreateInfos[i].flags & - VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT) - break; + return result; } - } - for (; i < count; i++) - pPipelines[i] = VK_NULL_HANDLE; + pPipelines[i] = tu_pipeline_to_handle(pipeline); + } - return final_result; + return VK_SUCCESS; } static VkResult -tu_compute_pipeline_create(VkDevice device, - VkPipelineCache pipelineCache, +tu_compute_pipeline_create(VkDevice _device, + VkPipelineCache _cache, const VkComputePipelineCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkPipeline *pPipeline) { - TU_FROM_HANDLE(tu_device, dev, device); - TU_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache); - TU_FROM_HANDLE(tu_pipeline_layout, layout, pCreateInfo->layout); - const VkPipelineShaderStageCreateInfo *stage_info = &pCreateInfo->stage; - VkResult result; - - cache = cache ? 
cache : dev->mem_cache; - - struct tu_pipeline *pipeline; - - *pPipeline = VK_NULL_HANDLE; - - VkPipelineCreationFeedback pipeline_feedback = { - .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT, - }; - - const VkPipelineCreationFeedbackCreateInfo *creation_feedback = - vk_find_struct_const(pCreateInfo->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO); - - int64_t pipeline_start = os_time_get_nano(); - - pipeline = vk_object_zalloc(&dev->vk, pAllocator, sizeof(*pipeline), - VK_OBJECT_TYPE_PIPELINE); - if (!pipeline) - return VK_ERROR_OUT_OF_HOST_MEMORY; - - pipeline->executables_mem_ctx = ralloc_context(NULL); - util_dynarray_init(&pipeline->executables, pipeline->executables_mem_ctx); - pipeline->active_stages = VK_SHADER_STAGE_COMPUTE_BIT; - - struct tu_shader_key key = { }; - tu_shader_key_init(&key, stage_info, dev); - - void *pipeline_mem_ctx = ralloc_context(NULL); - - unsigned char pipeline_sha1[20]; - tu_hash_compute(pipeline_sha1, stage_info, layout, &key, dev->compiler); - - struct tu_compiled_shaders *compiled = NULL; - - const bool executable_info = pCreateInfo->flags & - VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR; - - bool application_cache_hit = false; - - if (!executable_info) { - compiled = - tu_pipeline_cache_lookup(cache, pipeline_sha1, sizeof(pipeline_sha1), - &application_cache_hit); - } - - if (application_cache_hit && cache != dev->mem_cache) { - pipeline_feedback.flags |= - VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT; - } - - if (tu6_shared_constants_enable(layout, dev->compiler)) { - pipeline->shared_consts = (struct tu_push_constant_range) { - .lo = 0, - .dwords = layout->push_constant_size / 4, - }; - } - - char *nir_initial_disasm = NULL; - - if (!compiled) { - if (pCreateInfo->flags & - VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT) { - result = VK_PIPELINE_COMPILE_REQUIRED; - goto fail; - } - - struct ir3_shader_key ir3_key = {}; - - nir_shader *nir = tu_spirv_to_nir(dev, pipeline_mem_ctx, stage_info, - MESA_SHADER_COMPUTE); - - nir_initial_disasm = executable_info ? 
- nir_shader_as_str(nir, pipeline->executables_mem_ctx) : NULL; - - struct tu_shader *shader = - tu_shader_create(dev, nir, &key, layout, pAllocator); - if (!shader) { - result = VK_ERROR_OUT_OF_HOST_MEMORY; - goto fail; - } - - compiled = tu_shaders_init(dev, &pipeline_sha1, sizeof(pipeline_sha1)); - if (!compiled) { - tu_shader_destroy(dev, shader, pAllocator); - result = VK_ERROR_OUT_OF_HOST_MEMORY; - goto fail; - } - - compiled->active_desc_sets = shader->active_desc_sets; - compiled->const_state[MESA_SHADER_COMPUTE] = shader->const_state; - - struct ir3_shader_variant *v = - ir3_shader_create_variant(shader->ir3_shader, &ir3_key, executable_info); - - tu_shader_destroy(dev, shader, pAllocator); - - if (!v) { - result = VK_ERROR_OUT_OF_HOST_MEMORY; - goto fail; - } - - compiled->variants[MESA_SHADER_COMPUTE] = v; - - compiled = tu_pipeline_cache_insert(cache, compiled); - } - - pipeline_feedback.duration = os_time_get_nano() - pipeline_start; - - if (creation_feedback) { - *creation_feedback->pPipelineCreationFeedback = pipeline_feedback; - assert(creation_feedback->pipelineStageCreationFeedbackCount == 1); - creation_feedback->pPipelineStageCreationFeedbacks[0] = pipeline_feedback; - } - - pipeline->active_desc_sets = compiled->active_desc_sets; - - struct ir3_shader_variant *v = compiled->variants[MESA_SHADER_COMPUTE]; - - tu_pipeline_set_linkage(&pipeline->program.link[MESA_SHADER_COMPUTE], - &compiled->const_state[MESA_SHADER_COMPUTE], v); - - result = tu_pipeline_allocate_cs(dev, pipeline, layout, NULL, v); - if (result != VK_SUCCESS) - goto fail; - - uint64_t shader_iova = tu_upload_variant(pipeline, v); - - struct tu_pvtmem_config pvtmem; - tu_setup_pvtmem(dev, pipeline, &pvtmem, v->pvtmem_size, v->pvtmem_per_wave); - - for (int i = 0; i < 3; i++) - pipeline->compute.local_size[i] = v->local_size[i]; - - pipeline->compute.subgroup_size = v->info.double_threadsize ? 
128 : 64; - - struct tu_cs prog_cs; - uint32_t additional_reserve_size = tu_xs_get_additional_cs_size_dwords(v); - tu_cs_begin_sub_stream(&pipeline->cs, 64 + additional_reserve_size, &prog_cs); - tu6_emit_cs_config(&prog_cs, v, &pvtmem, shader_iova); - pipeline->program.state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs); - - tu6_emit_load_state(pipeline, layout); - - tu_append_executable(pipeline, v, nir_initial_disasm); - - pipeline->program.cs_instrlen = v->instrlen; - - vk_pipeline_cache_object_unref(&compiled->base); - ralloc_free(pipeline_mem_ctx); - - *pPipeline = tu_pipeline_to_handle(pipeline); - return VK_SUCCESS; - -fail: - if (compiled) - vk_pipeline_cache_object_unref(&compiled->base); - - ralloc_free(pipeline_mem_ctx); - - vk_object_free(&dev->vk, pAllocator, pipeline); - - return result; } -VKAPI_ATTR VkResult VKAPI_CALL -tu_CreateComputePipelines(VkDevice device, +VkResult +tu_CreateComputePipelines(VkDevice _device, VkPipelineCache pipelineCache, uint32_t count, const VkComputePipelineCreateInfo *pCreateInfos, const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines) { - MESA_TRACE_FUNC(); - VkResult final_result = VK_SUCCESS; - uint32_t i = 0; + VkResult result = VK_SUCCESS; + unsigned i = 0; for (; i < count; i++) { - VkResult result = tu_compute_pipeline_create(device, pipelineCache, - &pCreateInfos[i], - pAllocator, &pPipelines[i]); - if (result != VK_SUCCESS) { - final_result = result; + VkResult r; + r = tu_compute_pipeline_create(_device, pipelineCache, &pCreateInfos[i], + pAllocator, &pPipelines[i]); + if (r != VK_SUCCESS) { + result = r; pPipelines[i] = VK_NULL_HANDLE; - - if (pCreateInfos[i].flags & - VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT) - break; } } - for (; i < count; i++) - pPipelines[i] = VK_NULL_HANDLE; - - return final_result; + return result; } -VKAPI_ATTR void VKAPI_CALL +void tu_DestroyPipeline(VkDevice _device, VkPipeline _pipeline, const VkAllocationCallbacks *pAllocator) @@ -5182,274 +1892,5 @@ tu_DestroyPipeline(VkDevice _device, return; tu_pipeline_finish(pipeline, dev, pAllocator); - vk_object_free(&dev->vk, pAllocator, pipeline); -} - -#define WRITE_STR(field, ...) 
({ \ - memset(field, 0, sizeof(field)); \ - UNUSED int _i = snprintf(field, sizeof(field), __VA_ARGS__); \ - assert(_i > 0 && _i < sizeof(field)); \ -}) - -static const struct tu_pipeline_executable * -tu_pipeline_get_executable(struct tu_pipeline *pipeline, uint32_t index) -{ - assert(index < util_dynarray_num_elements(&pipeline->executables, - struct tu_pipeline_executable)); - return util_dynarray_element( - &pipeline->executables, struct tu_pipeline_executable, index); -} - -VKAPI_ATTR VkResult VKAPI_CALL -tu_GetPipelineExecutablePropertiesKHR( - VkDevice _device, - const VkPipelineInfoKHR* pPipelineInfo, - uint32_t* pExecutableCount, - VkPipelineExecutablePropertiesKHR* pProperties) -{ - TU_FROM_HANDLE(tu_device, dev, _device); - TU_FROM_HANDLE(tu_pipeline, pipeline, pPipelineInfo->pipeline); - VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out, - pProperties, pExecutableCount); - - util_dynarray_foreach (&pipeline->executables, struct tu_pipeline_executable, exe) { - vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) { - gl_shader_stage stage = exe->stage; - props->stages = mesa_to_vk_shader_stage(stage); - - if (!exe->is_binning) - WRITE_STR(props->name, "%s", _mesa_shader_stage_to_abbrev(stage)); - else - WRITE_STR(props->name, "Binning VS"); - - WRITE_STR(props->description, "%s", _mesa_shader_stage_to_string(stage)); - - props->subgroupSize = - dev->compiler->threadsize_base * (exe->stats.double_threadsize ? 2 : 1); - } - } - - return vk_outarray_status(&out); -} - -VKAPI_ATTR VkResult VKAPI_CALL -tu_GetPipelineExecutableStatisticsKHR( - VkDevice _device, - const VkPipelineExecutableInfoKHR* pExecutableInfo, - uint32_t* pStatisticCount, - VkPipelineExecutableStatisticKHR* pStatistics) -{ - TU_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline); - VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out, - pStatistics, pStatisticCount); - - const struct tu_pipeline_executable *exe = - tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex); - - vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { - WRITE_STR(stat->name, "Max Waves Per Core"); - WRITE_STR(stat->description, - "Maximum number of simultaneous waves per core."); - stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; - stat->value.u64 = exe->stats.max_waves; - } - - vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { - WRITE_STR(stat->name, "Instruction Count"); - WRITE_STR(stat->description, - "Total number of IR3 instructions in the final generated " - "shader executable."); - stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; - stat->value.u64 = exe->stats.instrs_count; - } - - vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { - WRITE_STR(stat->name, "Code size"); - WRITE_STR(stat->description, - "Total number of dwords in the final generated " - "shader executable."); - stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; - stat->value.u64 = exe->stats.sizedwords; - } - - vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { - WRITE_STR(stat->name, "NOPs Count"); - WRITE_STR(stat->description, - "Number of NOP instructions in the final generated " - "shader executable."); - stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; - stat->value.u64 = exe->stats.nops_count; - } - - vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { - WRITE_STR(stat->name, "MOV Count"); - WRITE_STR(stat->description, 
- "Number of MOV instructions in the final generated " - "shader executable."); - stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; - stat->value.u64 = exe->stats.mov_count; - } - - vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { - WRITE_STR(stat->name, "COV Count"); - WRITE_STR(stat->description, - "Number of COV instructions in the final generated " - "shader executable."); - stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; - stat->value.u64 = exe->stats.cov_count; - } - - vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { - WRITE_STR(stat->name, "Registers used"); - WRITE_STR(stat->description, - "Number of registers used in the final generated " - "shader executable."); - stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; - stat->value.u64 = exe->stats.max_reg + 1; - } - - vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { - WRITE_STR(stat->name, "Half-registers used"); - WRITE_STR(stat->description, - "Number of half-registers used in the final generated " - "shader executable."); - stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; - stat->value.u64 = exe->stats.max_half_reg + 1; - } - - vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { - WRITE_STR(stat->name, "Instructions with SS sync bit"); - WRITE_STR(stat->description, - "SS bit is set for instructions which depend on a result " - "of \"long\" instructions to prevent RAW hazard."); - stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; - stat->value.u64 = exe->stats.ss; - } - - vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { - WRITE_STR(stat->name, "Instructions with SY sync bit"); - WRITE_STR(stat->description, - "SY bit is set for instructions which depend on a result " - "of loads from global memory to prevent RAW hazard."); - stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; - stat->value.u64 = exe->stats.sy; - } - - vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { - WRITE_STR(stat->name, "Estimated cycles stalled on SS"); - WRITE_STR(stat->description, - "A better metric to estimate the impact of SS syncs."); - stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; - stat->value.u64 = exe->stats.sstall; - } - - vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { - WRITE_STR(stat->name, "Estimated cycles stalled on SY"); - WRITE_STR(stat->description, - "A better metric to estimate the impact of SY syncs."); - stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; - stat->value.u64 = exe->stats.systall; - } - - for (int i = 0; i < ARRAY_SIZE(exe->stats.instrs_per_cat); i++) { - vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { - WRITE_STR(stat->name, "cat%d instructions", i); - WRITE_STR(stat->description, - "Number of cat%d instructions.", i); - stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; - stat->value.u64 = exe->stats.instrs_per_cat[i]; - } - } - - vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { - WRITE_STR(stat->name, "STP Count"); - WRITE_STR(stat->description, - "Number of STore Private instructions in the final generated " - "shader executable."); - stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; - stat->value.u64 = exe->stats.stp_count; - } - - vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { - WRITE_STR(stat->name, "LDP 
Count"); - WRITE_STR(stat->description, - "Number of LoaD Private instructions in the final generated " - "shader executable."); - stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; - stat->value.u64 = exe->stats.ldp_count; - } - - return vk_outarray_status(&out); -} - -static bool -write_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir, - const char *data) -{ - ir->isText = VK_TRUE; - - size_t data_len = strlen(data) + 1; - - if (ir->pData == NULL) { - ir->dataSize = data_len; - return true; - } - - strncpy(ir->pData, data, ir->dataSize); - if (ir->dataSize < data_len) - return false; - - ir->dataSize = data_len; - return true; -} - -VKAPI_ATTR VkResult VKAPI_CALL -tu_GetPipelineExecutableInternalRepresentationsKHR( - VkDevice _device, - const VkPipelineExecutableInfoKHR* pExecutableInfo, - uint32_t* pInternalRepresentationCount, - VkPipelineExecutableInternalRepresentationKHR* pInternalRepresentations) -{ - TU_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline); - VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out, - pInternalRepresentations, pInternalRepresentationCount); - bool incomplete_text = false; - - const struct tu_pipeline_executable *exe = - tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex); - - if (exe->nir_from_spirv) { - vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) { - WRITE_STR(ir->name, "NIR from SPIRV"); - WRITE_STR(ir->description, - "Initial NIR before any optimizations"); - - if (!write_ir_text(ir, exe->nir_from_spirv)) - incomplete_text = true; - } - } - - if (exe->nir_final) { - vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) { - WRITE_STR(ir->name, "Final NIR"); - WRITE_STR(ir->description, - "Final NIR before going into the back-end compiler"); - - if (!write_ir_text(ir, exe->nir_final)) - incomplete_text = true; - } - } - - if (exe->disasm) { - vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) { - WRITE_STR(ir->name, "IR3 Assembly"); - WRITE_STR(ir->description, - "Final IR3 assembly for the generated shader binary"); - - if (!write_ir_text(ir, exe->disasm)) - incomplete_text = true; - } - } - - return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out); + vk_free2(&dev->alloc, pAllocator, pipeline); } diff --git a/lib/mesa/src/freedreno/vulkan/tu_query.c b/lib/mesa/src/freedreno/vulkan/tu_query.c index 6da5102cc..2cb710fb1 100644 --- a/lib/mesa/src/freedreno/vulkan/tu_query.c +++ b/lib/mesa/src/freedreno/vulkan/tu_query.c @@ -1,339 +1,57 @@ /* * Copyrigh 2016 Red Hat Inc. - * SPDX-License-Identifier: MIT - * * Based on anv: * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. */ -#include "tu_query.h" +#include "tu_private.h" +#include <assert.h> #include <fcntl.h> +#include <stdbool.h> +#include <string.h> +#include <unistd.h> #include "nir/nir_builder.h" -#include "util/os_time.h" - -#include "vk_util.h" - -#include "tu_cmd_buffer.h" -#include "tu_cs.h" -#include "tu_device.h" - -#define NSEC_PER_SEC 1000000000ull -#define WAIT_TIMEOUT 5 -#define STAT_COUNT ((REG_A6XX_RBBM_PRIMCTR_10_LO - REG_A6XX_RBBM_PRIMCTR_0_LO) / 2 + 1) - -struct PACKED query_slot { - uint64_t available; -}; - -struct PACKED occlusion_slot_value { - /* Seems sample counters are placed to be 16-byte aligned - * even though this query needs an 8-byte slot. */ - uint64_t value; - uint64_t _padding; -}; - -struct PACKED occlusion_query_slot { - struct query_slot common; - uint64_t result; - - struct occlusion_slot_value begin; - struct occlusion_slot_value end; -}; - -struct PACKED timestamp_query_slot { - struct query_slot common; - uint64_t result; -}; - -struct PACKED primitive_slot_value { - uint64_t values[2]; -}; - -struct PACKED pipeline_stat_query_slot { - struct query_slot common; - uint64_t results[STAT_COUNT]; - - uint64_t begin[STAT_COUNT]; - uint64_t end[STAT_COUNT]; -}; - -struct PACKED primitive_query_slot { - struct query_slot common; - /* The result of transform feedback queries is two integer values: - * results[0] is the count of primitives written, - * results[1] is the count of primitives generated. - * Also a result for each stream is stored at 4 slots respectively. - */ - uint64_t results[2]; - - /* Primitive counters also need to be 16-byte aligned. */ - uint64_t _padding; - - struct primitive_slot_value begin[4]; - struct primitive_slot_value end[4]; -}; - -struct PACKED perfcntr_query_slot { - uint64_t result; - uint64_t begin; - uint64_t end; -}; - -struct PACKED perf_query_slot { - struct query_slot common; - struct perfcntr_query_slot perfcntr; -}; - -struct PACKED primitives_generated_query_slot { - struct query_slot common; - uint64_t result; - uint64_t begin; - uint64_t end; -}; - -/* Returns the IOVA of a given uint64_t field in a given slot of a query - * pool. 
*/ -#define query_iova(type, pool, query, field) \ - pool->bo->iova + pool->stride * (query) + offsetof(type, field) - -#define occlusion_query_iova(pool, query, field) \ - query_iova(struct occlusion_query_slot, pool, query, field) - -#define pipeline_stat_query_iova(pool, query, field) \ - pool->bo->iova + pool->stride * (query) + \ - offsetof(struct pipeline_stat_query_slot, field) - -#define primitive_query_iova(pool, query, field, i) \ - query_iova(struct primitive_query_slot, pool, query, field) + \ - offsetof(struct primitive_slot_value, values[i]) - -#define perf_query_iova(pool, query, field, i) \ - pool->bo->iova + pool->stride * (query) + \ - sizeof(struct query_slot) + \ - sizeof(struct perfcntr_query_slot) * (i) + \ - offsetof(struct perfcntr_query_slot, field) - -#define primitives_generated_query_iova(pool, query, field) \ - query_iova(struct primitives_generated_query_slot, pool, query, field) - -#define query_available_iova(pool, query) \ - query_iova(struct query_slot, pool, query, available) - -#define query_result_iova(pool, query, type, i) \ - pool->bo->iova + pool->stride * (query) + \ - sizeof(struct query_slot) + sizeof(type) * (i) - -#define query_result_addr(pool, query, type, i) \ - pool->bo->map + pool->stride * (query) + \ - sizeof(struct query_slot) + sizeof(type) * (i) - -#define query_is_available(slot) slot->available - -static const VkPerformanceCounterUnitKHR -fd_perfcntr_type_to_vk_unit[] = { - [FD_PERFCNTR_TYPE_UINT] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, - [FD_PERFCNTR_TYPE_UINT64] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, - [FD_PERFCNTR_TYPE_FLOAT] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, - [FD_PERFCNTR_TYPE_PERCENTAGE] = VK_PERFORMANCE_COUNTER_UNIT_PERCENTAGE_KHR, - [FD_PERFCNTR_TYPE_BYTES] = VK_PERFORMANCE_COUNTER_UNIT_BYTES_KHR, - /* TODO. can be UNIT_NANOSECONDS_KHR with a logic to compute */ - [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, - [FD_PERFCNTR_TYPE_HZ] = VK_PERFORMANCE_COUNTER_UNIT_HERTZ_KHR, - [FD_PERFCNTR_TYPE_DBM] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, - [FD_PERFCNTR_TYPE_TEMPERATURE] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, - [FD_PERFCNTR_TYPE_VOLTS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, - [FD_PERFCNTR_TYPE_AMPS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, - [FD_PERFCNTR_TYPE_WATTS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, -}; - -/* TODO. Basically this comes from the freedreno implementation where - * only UINT64 is used. We'd better confirm this by the blob vulkan driver - * when it starts supporting perf query. 
- */ -static const VkPerformanceCounterStorageKHR -fd_perfcntr_type_to_vk_storage[] = { - [FD_PERFCNTR_TYPE_UINT] = VK_PERFORMANCE_COUNTER_STORAGE_UINT32_KHR, - [FD_PERFCNTR_TYPE_UINT64] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR, - [FD_PERFCNTR_TYPE_FLOAT] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR, - [FD_PERFCNTR_TYPE_PERCENTAGE] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR, - [FD_PERFCNTR_TYPE_BYTES] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR, - [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR, - [FD_PERFCNTR_TYPE_HZ] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR, - [FD_PERFCNTR_TYPE_DBM] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR, - [FD_PERFCNTR_TYPE_TEMPERATURE] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR, - [FD_PERFCNTR_TYPE_VOLTS] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR, - [FD_PERFCNTR_TYPE_AMPS] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR, - [FD_PERFCNTR_TYPE_WATTS] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR, -}; - -/* - * Returns a pointer to a given slot in a query pool. - */ -static void* slot_address(struct tu_query_pool *pool, uint32_t query) -{ - return (char*)pool->bo->map + query * pool->stride; -} - -static void -perfcntr_index(const struct fd_perfcntr_group *group, uint32_t group_count, - uint32_t index, uint32_t *gid, uint32_t *cid) - -{ - uint32_t i; - - for (i = 0; i < group_count; i++) { - if (group[i].num_countables > index) { - *gid = i; - *cid = index; - break; - } - index -= group[i].num_countables; - } - - assert(i < group_count); -} -static int -compare_perfcntr_pass(const void *a, const void *b) -{ - return ((struct tu_perf_query_data *)a)->pass - - ((struct tu_perf_query_data *)b)->pass; -} - -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_CreateQueryPool(VkDevice _device, const VkQueryPoolCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkQueryPool *pQueryPool) { TU_FROM_HANDLE(tu_device, device, _device); - assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO); - assert(pCreateInfo->queryCount > 0); - - uint32_t pool_size, slot_size; - const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL; - - pool_size = sizeof(struct tu_query_pool); - - switch (pCreateInfo->queryType) { - case VK_QUERY_TYPE_OCCLUSION: - slot_size = sizeof(struct occlusion_query_slot); - break; - case VK_QUERY_TYPE_TIMESTAMP: - slot_size = sizeof(struct timestamp_query_slot); - break; - case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: - slot_size = sizeof(struct primitive_query_slot); - break; - case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: - slot_size = sizeof(struct primitives_generated_query_slot); - break; - case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { - perf_query_info = - vk_find_struct_const(pCreateInfo->pNext, - QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR); - assert(perf_query_info); - - slot_size = sizeof(struct perf_query_slot) + - sizeof(struct perfcntr_query_slot) * - (perf_query_info->counterIndexCount - 1); - - /* Size of the array pool->tu_perf_query_data */ - pool_size += sizeof(struct tu_perf_query_data) * - perf_query_info->counterIndexCount; - break; - } - case VK_QUERY_TYPE_PIPELINE_STATISTICS: - slot_size = sizeof(struct pipeline_stat_query_slot); - break; - default: - unreachable("Invalid query type"); - } - struct tu_query_pool *pool = - vk_object_alloc(&device->vk, pAllocator, pool_size, - VK_OBJECT_TYPE_QUERY_POOL); - if (!pool) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - - if (pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { - pool->perf_group = 
fd_perfcntrs(&device->physical_device->dev_id, - &pool->perf_group_count); - - pool->counter_index_count = perf_query_info->counterIndexCount; - - /* Build all perf counters data that is requested, so we could get - * correct group id, countable id, counter register and pass index with - * only a counter index provided by applications at each command submit. - * - * Also, since this built data will be sorted by pass index later, we - * should keep the original indices and store perfcntrs results according - * to them so apps can get correct results with their own indices. - */ - uint32_t regs[pool->perf_group_count], pass[pool->perf_group_count]; - memset(regs, 0x00, pool->perf_group_count * sizeof(regs[0])); - memset(pass, 0x00, pool->perf_group_count * sizeof(pass[0])); - - for (uint32_t i = 0; i < pool->counter_index_count; i++) { - uint32_t gid = 0, cid = 0; - - perfcntr_index(pool->perf_group, pool->perf_group_count, - perf_query_info->pCounterIndices[i], &gid, &cid); - - pool->perf_query_data[i].gid = gid; - pool->perf_query_data[i].cid = cid; - pool->perf_query_data[i].app_idx = i; - - /* When a counter register is over the capacity(num_counters), - * reset it for next pass. - */ - if (regs[gid] < pool->perf_group[gid].num_counters) { - pool->perf_query_data[i].cntr_reg = regs[gid]++; - pool->perf_query_data[i].pass = pass[gid]; - } else { - pool->perf_query_data[i].pass = ++pass[gid]; - pool->perf_query_data[i].cntr_reg = regs[gid] = 0; - regs[gid]++; - } - } + vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - /* Sort by pass index so we could easily prepare a command stream - * with the ascending order of pass index. - */ - qsort(pool->perf_query_data, pool->counter_index_count, - sizeof(pool->perf_query_data[0]), - compare_perfcntr_pass); - } - - VkResult result = tu_bo_init_new(device, &pool->bo, - pCreateInfo->queryCount * slot_size, TU_BO_ALLOC_NO_FLAGS, "query pool"); - if (result != VK_SUCCESS) { - vk_object_free(&device->vk, pAllocator, pool); - return result; - } - - result = tu_bo_map(device, pool->bo); - if (result != VK_SUCCESS) { - tu_bo_finish(device, pool->bo); - vk_object_free(&device->vk, pAllocator, pool); - return result; - } - - /* Initialize all query statuses to unavailable */ - memset(pool->bo->map, 0, pool->bo->size); + if (!pool) + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - pool->type = pCreateInfo->queryType; - pool->stride = slot_size; - pool->size = pCreateInfo->queryCount; - pool->pipeline_statistics = pCreateInfo->pipelineStatistics; *pQueryPool = tu_query_pool_to_handle(pool); - return VK_SUCCESS; } -VKAPI_ATTR void VKAPI_CALL +void tu_DestroyQueryPool(VkDevice _device, VkQueryPool _pool, const VkAllocationCallbacks *pAllocator) @@ -344,211 +62,10 @@ tu_DestroyQueryPool(VkDevice _device, if (!pool) return; - tu_bo_finish(device, pool->bo); - vk_object_free(&device->vk, pAllocator, pool); -} - -static uint32_t -get_result_count(struct tu_query_pool *pool) -{ - switch (pool->type) { - /* Occulusion and timestamp queries write one integer value */ - case VK_QUERY_TYPE_OCCLUSION: - case VK_QUERY_TYPE_TIMESTAMP: - case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: - return 1; - /* Transform feedback queries write two integer values */ - case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: - return 2; - case VK_QUERY_TYPE_PIPELINE_STATISTICS: - return util_bitcount(pool->pipeline_statistics); - case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: - return pool->counter_index_count; - default: - 
assert(!"Invalid query type"); - return 0; - } -} - -static uint32_t -statistics_index(uint32_t *statistics) -{ - uint32_t stat; - stat = u_bit_scan(statistics); - - switch (1 << stat) { - case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT: - case VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT: - return 0; - case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT: - return 1; - case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT: - return 2; - case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT: - return 4; - case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT: - return 5; - case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT: - return 6; - case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT: - return 7; - case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT: - return 8; - case VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT: - return 9; - case VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT: - return 10; - default: - return 0; - } -} - -static bool -is_pipeline_query_with_vertex_stage(uint32_t pipeline_statistics) -{ - return pipeline_statistics & - (VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT | - VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT | - VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT | - VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT | - VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT | - VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT | - VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT | - VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT | - VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT); -} - -static bool -is_pipeline_query_with_fragment_stage(uint32_t pipeline_statistics) -{ - return pipeline_statistics & - VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT; -} - -static bool -is_pipeline_query_with_compute_stage(uint32_t pipeline_statistics) -{ - return pipeline_statistics & - VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT; -} - -/* Wait on the the availability status of a query up until a timeout. */ -static VkResult -wait_for_available(struct tu_device *device, struct tu_query_pool *pool, - uint32_t query) -{ - /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a - * scheduler friendly way instead of busy polling once the patch has landed - * upstream. */ - struct query_slot *slot = slot_address(pool, query); - uint64_t abs_timeout = os_time_get_absolute_timeout( - WAIT_TIMEOUT * NSEC_PER_SEC); - while(os_time_get_nano() < abs_timeout) { - if (query_is_available(slot)) - return VK_SUCCESS; - } - return vk_error(device, VK_TIMEOUT); + vk_free2(&device->alloc, pAllocator, pool); } -/* Writes a query value to a buffer from the CPU. 
*/ -static void -write_query_value_cpu(char* base, - uint32_t offset, - uint64_t value, - VkQueryResultFlags flags) -{ - if (flags & VK_QUERY_RESULT_64_BIT) { - *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value; - } else { - *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value; - } -} - -static VkResult -get_query_pool_results(struct tu_device *device, - struct tu_query_pool *pool, - uint32_t firstQuery, - uint32_t queryCount, - size_t dataSize, - void *pData, - VkDeviceSize stride, - VkQueryResultFlags flags) -{ - assert(dataSize >= stride * queryCount); - - char *result_base = pData; - VkResult result = VK_SUCCESS; - for (uint32_t i = 0; i < queryCount; i++) { - uint32_t query = firstQuery + i; - struct query_slot *slot = slot_address(pool, query); - bool available = query_is_available(slot); - uint32_t result_count = get_result_count(pool); - uint32_t statistics = pool->pipeline_statistics; - - if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) { - VkResult wait_result = wait_for_available(device, pool, query); - if (wait_result != VK_SUCCESS) - return wait_result; - available = true; - } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) { - /* From the Vulkan 1.1.130 spec: - * - * If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are - * both not set then no result values are written to pData for - * queries that are in the unavailable state at the time of the - * call, and vkGetQueryPoolResults returns VK_NOT_READY. However, - * availability state is still written to pData for those queries - * if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set. - */ - result = VK_NOT_READY; - if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) { - result_base += stride; - continue; - } - } - - for (uint32_t k = 0; k < result_count; k++) { - if (available) { - uint64_t *result; - - if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) { - uint32_t stat_idx = statistics_index(&statistics); - result = query_result_addr(pool, query, uint64_t, stat_idx); - } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { - result = query_result_addr(pool, query, struct perfcntr_query_slot, k); - } else { - result = query_result_addr(pool, query, uint64_t, k); - } - - write_query_value_cpu(result_base, k, *result, flags); - } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT) - /* From the Vulkan 1.1.130 spec: - * - * If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT - * is not set, and the query’s status is unavailable, an - * intermediate result value between zero and the final result - * value is written to pData for that query. - * - * Just return 0 here for simplicity since it's a valid result. - */ - write_query_value_cpu(result_base, k, 0, flags); - } - - if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) - /* From the Vulkan 1.1.130 spec: - * - * If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final - * integer value written for each query is non-zero if the query’s - * status was available or zero if the status was unavailable. 
- */ - write_query_value_cpu(result_base, result_count, available, flags); - - result_base += stride; - } - return result; -} - -VKAPI_ATTR VkResult VKAPI_CALL +VkResult tu_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery, @@ -558,140 +75,10 @@ tu_GetQueryPoolResults(VkDevice _device, VkDeviceSize stride, VkQueryResultFlags flags) { - TU_FROM_HANDLE(tu_device, device, _device); - TU_FROM_HANDLE(tu_query_pool, pool, queryPool); - assert(firstQuery + queryCount <= pool->size); - - if (vk_device_is_lost(&device->vk)) - return VK_ERROR_DEVICE_LOST; - - switch (pool->type) { - case VK_QUERY_TYPE_OCCLUSION: - case VK_QUERY_TYPE_TIMESTAMP: - case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: - case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: - case VK_QUERY_TYPE_PIPELINE_STATISTICS: - case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: - return get_query_pool_results(device, pool, firstQuery, queryCount, - dataSize, pData, stride, flags); - default: - assert(!"Invalid query type"); - } return VK_SUCCESS; } -/* Copies a query value from one buffer to another from the GPU. */ -static void -copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf, - struct tu_cs *cs, - uint64_t src_iova, - uint64_t base_write_iova, - uint32_t offset, - VkQueryResultFlags flags) { - uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ? - sizeof(uint64_t) : sizeof(uint32_t); - uint64_t write_iova = base_write_iova + (offset * element_size); - - tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5); - uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ? - CP_MEM_TO_MEM_0_DOUBLE : 0; - tu_cs_emit(cs, mem_to_mem_flags); - tu_cs_emit_qw(cs, write_iova); - tu_cs_emit_qw(cs, src_iova); -} - -static void -emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf, - struct tu_cs *cs, - struct tu_query_pool *pool, - uint32_t firstQuery, - uint32_t queryCount, - struct tu_buffer *buffer, - VkDeviceSize dstOffset, - VkDeviceSize stride, - VkQueryResultFlags flags) -{ - /* From the Vulkan 1.1.130 spec: - * - * vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous - * uses of vkCmdResetQueryPool in the same queue, without any additional - * synchronization. - * - * To ensure that previous writes to the available bit are coherent, first - * wait for all writes to complete. - */ - tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); - - for (uint32_t i = 0; i < queryCount; i++) { - uint32_t query = firstQuery + i; - uint64_t available_iova = query_available_iova(pool, query); - uint64_t buffer_iova = buffer->iova + dstOffset + i * stride; - uint32_t result_count = get_result_count(pool); - uint32_t statistics = pool->pipeline_statistics; - - /* Wait for the available bit to be set if executed with the - * VK_QUERY_RESULT_WAIT_BIT flag. 
*/ - if (flags & VK_QUERY_RESULT_WAIT_BIT) { - tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6); - tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) | - CP_WAIT_REG_MEM_0_POLL_MEMORY); - tu_cs_emit_qw(cs, available_iova); - tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1)); - tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0)); - tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16)); - } - - for (uint32_t k = 0; k < result_count; k++) { - uint64_t result_iova; - - if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) { - uint32_t stat_idx = statistics_index(&statistics); - result_iova = query_result_iova(pool, query, uint64_t, stat_idx); - } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { - result_iova = query_result_iova(pool, query, - struct perfcntr_query_slot, k); - } else { - result_iova = query_result_iova(pool, query, uint64_t, k); - } - - if (flags & VK_QUERY_RESULT_PARTIAL_BIT) { - /* Unconditionally copying the bo->result into the buffer here is - * valid because we only set bo->result on vkCmdEndQuery. Thus, even - * if the query is unavailable, this will copy the correct partial - * value of 0. - */ - copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova, - k /* offset */, flags); - } else { - /* Conditionally copy bo->result into the buffer based on whether the - * query is available. - * - * NOTE: For the conditional packets to be executed, CP_COND_EXEC - * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests - * that 0 < available < 2, aka available == 1. - */ - tu_cs_reserve(cs, 7 + 6); - tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6); - tu_cs_emit_qw(cs, available_iova); - tu_cs_emit_qw(cs, available_iova); - tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2)); - tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */ - - /* Start of conditional execution */ - copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova, - k /* offset */, flags); - /* End of conditional execution */ - } - } - - if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { - copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova, - result_count /* offset */, flags); - } - } -} - -VKAPI_ATTR void VKAPI_CALL +void tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery, @@ -701,1032 +88,35 @@ tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkDeviceSize stride, VkQueryResultFlags flags) { - TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); - TU_FROM_HANDLE(tu_query_pool, pool, queryPool); - TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer); - struct tu_cs *cs = &cmdbuf->cs; - assert(firstQuery + queryCount <= pool->size); - - switch (pool->type) { - case VK_QUERY_TYPE_OCCLUSION: - case VK_QUERY_TYPE_TIMESTAMP: - case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: - case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: - case VK_QUERY_TYPE_PIPELINE_STATISTICS: - return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery, - queryCount, buffer, dstOffset, stride, flags); - case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: - unreachable("allowCommandBufferQueryCopies is false"); - default: - assert(!"Invalid query type"); - } -} - -static void -emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf, - struct tu_query_pool *pool, - uint32_t firstQuery, - uint32_t queryCount) -{ - struct tu_cs *cs = &cmdbuf->cs; - - for (uint32_t i = 0; i < queryCount; i++) { - uint32_t query = firstQuery + i; - uint32_t statistics = pool->pipeline_statistics; - - tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4); - tu_cs_emit_qw(cs, query_available_iova(pool, query)); - tu_cs_emit_qw(cs, 0x0); - - for 
(uint32_t k = 0; k < get_result_count(pool); k++) { - uint64_t result_iova; - - if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) { - uint32_t stat_idx = statistics_index(&statistics); - result_iova = query_result_iova(pool, query, uint64_t, stat_idx); - } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { - result_iova = query_result_iova(pool, query, - struct perfcntr_query_slot, k); - } else { - result_iova = query_result_iova(pool, query, uint64_t, k); - } - - tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4); - tu_cs_emit_qw(cs, result_iova); - tu_cs_emit_qw(cs, 0x0); - } - } - } -VKAPI_ATTR void VKAPI_CALL +void tu_CmdResetQueryPool(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount) { - TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); - TU_FROM_HANDLE(tu_query_pool, pool, queryPool); - - switch (pool->type) { - case VK_QUERY_TYPE_TIMESTAMP: - case VK_QUERY_TYPE_OCCLUSION: - case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: - case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: - case VK_QUERY_TYPE_PIPELINE_STATISTICS: - case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: - emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount); - break; - default: - assert(!"Invalid query type"); - } } -VKAPI_ATTR void VKAPI_CALL -tu_ResetQueryPool(VkDevice device, - VkQueryPool queryPool, - uint32_t firstQuery, - uint32_t queryCount) -{ - TU_FROM_HANDLE(tu_query_pool, pool, queryPool); - - for (uint32_t i = 0; i < queryCount; i++) { - struct query_slot *slot = slot_address(pool, i + firstQuery); - slot->available = 0; - - for (uint32_t k = 0; k < get_result_count(pool); k++) { - uint64_t *res; - - if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { - res = query_result_addr(pool, i + firstQuery, - struct perfcntr_query_slot, k); - } else { - res = query_result_addr(pool, i + firstQuery, uint64_t, k); - } - - *res = 0; - } - } -} - -static void -emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf, - struct tu_query_pool *pool, - uint32_t query) -{ - /* From the Vulkan 1.1.130 spec: - * - * A query must begin and end inside the same subpass of a render pass - * instance, or must both begin and end outside of a render pass - * instance. - * - * Unlike on an immediate-mode renderer, Turnip renders all tiles on - * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a - * query begins/ends inside the same subpass of a render pass, we need to - * record the packets on the secondary draw command stream. cmdbuf->draw_cs - * is then run on every tile during render, so we just need to accumulate - * sample counts in slot->result to compute the query result. - */ - struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs; - - uint64_t begin_iova = occlusion_query_iova(pool, query, begin); - - tu_cs_emit_regs(cs, - A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true)); - - tu_cs_emit_regs(cs, - A6XX_RB_SAMPLE_COUNT_ADDR(.qword = begin_iova)); - - tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1); - tu_cs_emit(cs, ZPASS_DONE); -} - -static void -emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf, - struct tu_query_pool *pool, - uint32_t query) -{ - struct tu_cs *cs = cmdbuf->state.pass ? 
&cmdbuf->draw_cs : &cmdbuf->cs; - uint64_t begin_iova = pipeline_stat_query_iova(pool, query, begin); - - if (is_pipeline_query_with_vertex_stage(pool->pipeline_statistics)) { - bool need_cond_exec = cmdbuf->state.pass && cmdbuf->state.prim_counters_running; - cmdbuf->state.prim_counters_running++; - - /* Prevent starting primitive counters when it is supposed to be stopped - * for outer VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT query. - */ - if (need_cond_exec) { - tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) | - CP_COND_REG_EXEC_0_SYSMEM | - CP_COND_REG_EXEC_0_BINNING); - } - - tu6_emit_event_write(cmdbuf, cs, START_PRIMITIVE_CTRS); - - tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3); - tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running)); - tu_cs_emit(cs, 0); - - if (need_cond_exec) { - tu_cond_exec_end(cs); - } - } - - if (is_pipeline_query_with_fragment_stage(pool->pipeline_statistics)) { - tu6_emit_event_write(cmdbuf, cs, START_FRAGMENT_CTRS); - } - - if (is_pipeline_query_with_compute_stage(pool->pipeline_statistics)) { - tu6_emit_event_write(cmdbuf, cs, START_COMPUTE_CTRS); - } - - tu_cs_emit_wfi(cs); - - tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); - tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) | - CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) | - CP_REG_TO_MEM_0_64B); - tu_cs_emit_qw(cs, begin_iova); -} - -static void -emit_perfcntrs_pass_start(struct tu_cs *cs, uint32_t pass) -{ - tu_cs_emit_pkt7(cs, CP_REG_TEST, 1); - tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG( - REG_A6XX_CP_SCRATCH_REG(PERF_CNTRS_REG)) | - A6XX_CP_REG_TEST_0_BIT(pass) | - A6XX_CP_REG_TEST_0_WAIT_FOR_ME); - tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST)); -} - -static void -emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf, - struct tu_query_pool *pool, - uint32_t query) -{ - struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs; - uint32_t last_pass = ~0; - - if (cmdbuf->state.pass) { - cmdbuf->state.rp.draw_cs_writes_to_cond_pred = true; - } - - /* Querying perf counters happens in these steps: - * - * 0) There's a scratch reg to set a pass index for perf counters query. - * Prepare cmd streams to set each pass index to the reg at device - * creation time. See tu_CreateDevice in tu_device.c - * 1) Emit command streams to read all requested perf counters at all - * passes in begin/end query with CP_REG_TEST/CP_COND_REG_EXEC, which - * reads the scratch reg where pass index is set. - * See emit_perfcntrs_pass_start. - * 2) Pick the right cs setting proper pass index to the reg and prepend - * it to the command buffer at each submit time. - * See tu_QueueSubmit in tu_drm.c - * 3) If the pass index in the reg is true, then executes the command - * stream below CP_COND_REG_EXEC. 
- */ - - tu_cs_emit_wfi(cs); - - for (uint32_t i = 0; i < pool->counter_index_count; i++) { - struct tu_perf_query_data *data = &pool->perf_query_data[i]; - - if (last_pass != data->pass) { - last_pass = data->pass; - - if (data->pass != 0) - tu_cond_exec_end(cs); - emit_perfcntrs_pass_start(cs, data->pass); - } - - const struct fd_perfcntr_counter *counter = - &pool->perf_group[data->gid].counters[data->cntr_reg]; - const struct fd_perfcntr_countable *countable = - &pool->perf_group[data->gid].countables[data->cid]; - - tu_cs_emit_pkt4(cs, counter->select_reg, 1); - tu_cs_emit(cs, countable->selector); - } - tu_cond_exec_end(cs); - - last_pass = ~0; - tu_cs_emit_wfi(cs); - - for (uint32_t i = 0; i < pool->counter_index_count; i++) { - struct tu_perf_query_data *data = &pool->perf_query_data[i]; - - if (last_pass != data->pass) { - last_pass = data->pass; - - if (data->pass != 0) - tu_cond_exec_end(cs); - emit_perfcntrs_pass_start(cs, data->pass); - } - - const struct fd_perfcntr_counter *counter = - &pool->perf_group[data->gid].counters[data->cntr_reg]; - - uint64_t begin_iova = perf_query_iova(pool, 0, begin, data->app_idx); - - tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); - tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) | - CP_REG_TO_MEM_0_64B); - tu_cs_emit_qw(cs, begin_iova); - } - tu_cond_exec_end(cs); -} - -static void -emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf, - struct tu_query_pool *pool, - uint32_t query, - uint32_t stream_id) -{ - struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs; - uint64_t begin_iova = primitive_query_iova(pool, query, begin[0], 0); - - tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = begin_iova)); - tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS); -} - -static void -emit_begin_prim_generated_query(struct tu_cmd_buffer *cmdbuf, - struct tu_query_pool *pool, - uint32_t query) -{ - struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs; - uint64_t begin_iova = primitives_generated_query_iova(pool, query, begin); - - if (cmdbuf->state.pass) { - cmdbuf->state.rp.has_prim_generated_query_in_rp = true; - } else { - cmdbuf->state.prim_generated_query_running_before_rp = true; - } - - cmdbuf->state.prim_counters_running++; - - if (cmdbuf->state.pass) { - /* Primitives that passed all tests are still counted in in each - * tile even with HW binning beforehand. Do not permit it. - */ - tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) | - CP_COND_REG_EXEC_0_SYSMEM | - CP_COND_REG_EXEC_0_BINNING); - } - - tu6_emit_event_write(cmdbuf, cs, START_PRIMITIVE_CTRS); - - tu_cs_emit_wfi(cs); - - tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); - tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_7_LO) | - CP_REG_TO_MEM_0_CNT(2) | - CP_REG_TO_MEM_0_64B); - tu_cs_emit_qw(cs, begin_iova); - - if (cmdbuf->state.pass) { - tu_cond_exec_end(cs); - } -} - -VKAPI_ATTR void VKAPI_CALL +void tu_CmdBeginQuery(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query, VkQueryControlFlags flags) { - TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); - TU_FROM_HANDLE(tu_query_pool, pool, queryPool); - assert(query < pool->size); - - switch (pool->type) { - case VK_QUERY_TYPE_OCCLUSION: - /* In freedreno, there is no implementation difference between - * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly - * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here. 
- */ - emit_begin_occlusion_query(cmdbuf, pool, query); - break; - case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: - emit_begin_xfb_query(cmdbuf, pool, query, 0); - break; - case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: - emit_begin_prim_generated_query(cmdbuf, pool, query); - break; - case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: - emit_begin_perf_query(cmdbuf, pool, query); - break; - case VK_QUERY_TYPE_PIPELINE_STATISTICS: - emit_begin_stat_query(cmdbuf, pool, query); - break; - case VK_QUERY_TYPE_TIMESTAMP: - unreachable("Unimplemented query type"); - default: - assert(!"Invalid query type"); - } -} - -VKAPI_ATTR void VKAPI_CALL -tu_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer, - VkQueryPool queryPool, - uint32_t query, - VkQueryControlFlags flags, - uint32_t index) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); - TU_FROM_HANDLE(tu_query_pool, pool, queryPool); - assert(query < pool->size); - - switch (pool->type) { - case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: - emit_begin_xfb_query(cmdbuf, pool, query, index); - break; - case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: - emit_begin_prim_generated_query(cmdbuf, pool, query); - break; - default: - assert(!"Invalid query type"); - } -} - -static void -emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf, - struct tu_query_pool *pool, - uint32_t query) -{ - /* Ending an occlusion query happens in a few steps: - * 1) Set the slot->end to UINT64_MAX. - * 2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to - * write the current sample count value into slot->end. - * 3) Since (2) is asynchronous, wait until slot->end is not equal to - * UINT64_MAX before continuing via CP_WAIT_REG_MEM. - * 4) Accumulate the results of the query (slot->end - slot->begin) into - * slot->result. - * 5) If vkCmdEndQuery is *not* called from within the scope of a render - * pass, set the slot's available bit since the query is now done. - * 6) If vkCmdEndQuery *is* called from within the scope of a render - * pass, we cannot mark as available yet since the commands in - * draw_cs are not run until vkCmdEndRenderPass. - */ - const struct tu_render_pass *pass = cmdbuf->state.pass; - struct tu_cs *cs = pass ? 
&cmdbuf->draw_cs : &cmdbuf->cs; - - uint64_t available_iova = query_available_iova(pool, query); - uint64_t begin_iova = occlusion_query_iova(pool, query, begin); - uint64_t end_iova = occlusion_query_iova(pool, query, end); - uint64_t result_iova = query_result_iova(pool, query, uint64_t, 0); - tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4); - tu_cs_emit_qw(cs, end_iova); - tu_cs_emit_qw(cs, 0xffffffffffffffffull); - - tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); - - tu_cs_emit_regs(cs, - A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true)); - - tu_cs_emit_regs(cs, - A6XX_RB_SAMPLE_COUNT_ADDR(.qword = end_iova)); - - tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1); - tu_cs_emit(cs, ZPASS_DONE); - - tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6); - tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) | - CP_WAIT_REG_MEM_0_POLL_MEMORY); - tu_cs_emit_qw(cs, end_iova); - tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff)); - tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0)); - tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16)); - - /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */ - tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9); - tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C); - tu_cs_emit_qw(cs, result_iova); - tu_cs_emit_qw(cs, result_iova); - tu_cs_emit_qw(cs, end_iova); - tu_cs_emit_qw(cs, begin_iova); - - tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); - - if (pass) - /* Technically, queries should be tracked per-subpass, but here we track - * at the render pass level to simply the code a bit. This is safe - * because the only commands that use the available bit are - * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which - * cannot be invoked from inside a render pass scope. - */ - cs = &cmdbuf->draw_epilogue_cs; - - tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4); - tu_cs_emit_qw(cs, available_iova); - tu_cs_emit_qw(cs, 0x1); -} - -/* PRIMITIVE_CTRS is used for two distinct queries: - * - VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT - * - VK_QUERY_TYPE_PIPELINE_STATISTICS - * If one is nested inside other - STOP_PRIMITIVE_CTRS should be emitted - * only for outer query. - * - * Also, pipeline stat query could run outside of renderpass and prim gen - * query inside of secondary cmd buffer - for such case we ought to track - * the status of pipeline stats query. - */ -static void -emit_stop_primitive_ctrs(struct tu_cmd_buffer *cmdbuf, - struct tu_cs *cs, - enum VkQueryType query_type) -{ - bool is_secondary = cmdbuf->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY; - cmdbuf->state.prim_counters_running--; - if (cmdbuf->state.prim_counters_running == 0) { - bool need_cond_exec = - is_secondary && - query_type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT && - is_pipeline_query_with_vertex_stage(cmdbuf->inherited_pipeline_statistics); - - if (!need_cond_exec) { - tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS); - } else { - tu_cs_reserve(cs, 7 + 2); - /* Check that pipeline stats query is not running, only then - * we count stop the counter. 
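/*
 * Illustrative sketch (not part of the imported Mesa source): a CPU-side
 * model of the sentinel/wait/accumulate sequence that emit_end_occlusion_query
 * above programs with CP packets, to make the numbered steps easier to follow.
 * The slot layout and the sample-counter callback are simplified stand-ins
 * for the real query pool memory and the ZPASS_DONE event.
 */
#include <stdint.h>

struct occlusion_slot_model {
   uint64_t begin;   /* sample count captured at vkCmdBeginQuery */
   uint64_t end;     /* sample count captured at vkCmdEndQuery */
   uint64_t result;  /* accumulated samples-passed value */
};

static void
end_occlusion_query_model(struct occlusion_slot_model *slot,
                          uint64_t (*read_sample_counter)(void))
{
   /* 1) Poison slot->end so a later wait can tell when it was rewritten. */
   slot->end = UINT64_MAX;

   /* 2) ZPASS_DONE asynchronously stores the current sample counter; the
    *    model just reads it directly.
    */
   slot->end = read_sample_counter();

   /* 3) CP_WAIT_REG_MEM polls until slot->end != UINT64_MAX; omitted here
    *    because the model is synchronous.
    */

   /* 4) result (dst) = result (srcA) + end (srcB) - begin (srcC). */
   slot->result += slot->end - slot->begin;
}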
- */ - tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6); - tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running)); - tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running)); - tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2)); - tu_cs_emit(cs, 2); /* Cond execute the next 2 DWORDS */ - - tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS); - } - } - - if (query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) { - tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3); - tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running)); - tu_cs_emit(cs, 1); - } } -static void -emit_end_stat_query(struct tu_cmd_buffer *cmdbuf, - struct tu_query_pool *pool, - uint32_t query) -{ - struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs; - uint64_t end_iova = pipeline_stat_query_iova(pool, query, end); - uint64_t available_iova = query_available_iova(pool, query); - uint64_t result_iova; - uint64_t stat_start_iova; - uint64_t stat_stop_iova; - - if (is_pipeline_query_with_vertex_stage(pool->pipeline_statistics)) { - /* No need to conditionally execute STOP_PRIMITIVE_CTRS when - * we are inside VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT inside of a - * renderpass, because it is already stopped. - */ - emit_stop_primitive_ctrs(cmdbuf, cs, VK_QUERY_TYPE_PIPELINE_STATISTICS); - } - - if (is_pipeline_query_with_fragment_stage(pool->pipeline_statistics)) { - tu6_emit_event_write(cmdbuf, cs, STOP_FRAGMENT_CTRS); - } - - if (is_pipeline_query_with_compute_stage(pool->pipeline_statistics)) { - tu6_emit_event_write(cmdbuf, cs, STOP_COMPUTE_CTRS); - } - - tu_cs_emit_wfi(cs); - - tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); - tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) | - CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) | - CP_REG_TO_MEM_0_64B); - tu_cs_emit_qw(cs, end_iova); - - for (int i = 0; i < STAT_COUNT; i++) { - result_iova = query_result_iova(pool, query, uint64_t, i); - stat_start_iova = pipeline_stat_query_iova(pool, query, begin[i]); - stat_stop_iova = pipeline_stat_query_iova(pool, query, end[i]); - - tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9); - tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | - CP_MEM_TO_MEM_0_DOUBLE | - CP_MEM_TO_MEM_0_NEG_C); - - tu_cs_emit_qw(cs, result_iova); - tu_cs_emit_qw(cs, result_iova); - tu_cs_emit_qw(cs, stat_stop_iova); - tu_cs_emit_qw(cs, stat_start_iova); - } - - tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); - - if (cmdbuf->state.pass) - cs = &cmdbuf->draw_epilogue_cs; - - /* Set the availability to 1 */ - tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4); - tu_cs_emit_qw(cs, available_iova); - tu_cs_emit_qw(cs, 0x1); -} - -static void -emit_end_perf_query(struct tu_cmd_buffer *cmdbuf, - struct tu_query_pool *pool, - uint32_t query) -{ - struct tu_cs *cs = cmdbuf->state.pass ? 
&cmdbuf->draw_cs : &cmdbuf->cs; - uint64_t available_iova = query_available_iova(pool, query); - uint64_t end_iova; - uint64_t begin_iova; - uint64_t result_iova; - uint32_t last_pass = ~0; - - for (uint32_t i = 0; i < pool->counter_index_count; i++) { - struct tu_perf_query_data *data = &pool->perf_query_data[i]; - - if (last_pass != data->pass) { - last_pass = data->pass; - - if (data->pass != 0) - tu_cond_exec_end(cs); - emit_perfcntrs_pass_start(cs, data->pass); - } - - const struct fd_perfcntr_counter *counter = - &pool->perf_group[data->gid].counters[data->cntr_reg]; - - end_iova = perf_query_iova(pool, 0, end, data->app_idx); - - tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); - tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) | - CP_REG_TO_MEM_0_64B); - tu_cs_emit_qw(cs, end_iova); - } - tu_cond_exec_end(cs); - - last_pass = ~0; - tu_cs_emit_wfi(cs); - - for (uint32_t i = 0; i < pool->counter_index_count; i++) { - struct tu_perf_query_data *data = &pool->perf_query_data[i]; - - if (last_pass != data->pass) { - last_pass = data->pass; - - - if (data->pass != 0) - tu_cond_exec_end(cs); - emit_perfcntrs_pass_start(cs, data->pass); - } - - result_iova = query_result_iova(pool, 0, struct perfcntr_query_slot, - data->app_idx); - begin_iova = perf_query_iova(pool, 0, begin, data->app_idx); - end_iova = perf_query_iova(pool, 0, end, data->app_idx); - - /* result += end - begin */ - tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9); - tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | - CP_MEM_TO_MEM_0_DOUBLE | - CP_MEM_TO_MEM_0_NEG_C); - - tu_cs_emit_qw(cs, result_iova); - tu_cs_emit_qw(cs, result_iova); - tu_cs_emit_qw(cs, end_iova); - tu_cs_emit_qw(cs, begin_iova); - } - tu_cond_exec_end(cs); - - tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); - - if (cmdbuf->state.pass) - cs = &cmdbuf->draw_epilogue_cs; - - /* Set the availability to 1 */ - tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4); - tu_cs_emit_qw(cs, available_iova); - tu_cs_emit_qw(cs, 0x1); -} - -static void -emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf, - struct tu_query_pool *pool, - uint32_t query, - uint32_t stream_id) -{ - struct tu_cs *cs = cmdbuf->state.pass ? 
&cmdbuf->draw_cs : &cmdbuf->cs; - - uint64_t end_iova = primitive_query_iova(pool, query, end[0], 0); - uint64_t result_written_iova = query_result_iova(pool, query, uint64_t, 0); - uint64_t result_generated_iova = query_result_iova(pool, query, uint64_t, 1); - uint64_t begin_written_iova = primitive_query_iova(pool, query, begin[stream_id], 0); - uint64_t begin_generated_iova = primitive_query_iova(pool, query, begin[stream_id], 1); - uint64_t end_written_iova = primitive_query_iova(pool, query, end[stream_id], 0); - uint64_t end_generated_iova = primitive_query_iova(pool, query, end[stream_id], 1); - uint64_t available_iova = query_available_iova(pool, query); - - tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = end_iova)); - tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS); - - tu_cs_emit_wfi(cs); - tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS); - - /* Set the count of written primitives */ - tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9); - tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C | - CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000); - tu_cs_emit_qw(cs, result_written_iova); - tu_cs_emit_qw(cs, result_written_iova); - tu_cs_emit_qw(cs, end_written_iova); - tu_cs_emit_qw(cs, begin_written_iova); - - tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS); - - /* Set the count of generated primitives */ - tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9); - tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C | - CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000); - tu_cs_emit_qw(cs, result_generated_iova); - tu_cs_emit_qw(cs, result_generated_iova); - tu_cs_emit_qw(cs, end_generated_iova); - tu_cs_emit_qw(cs, begin_generated_iova); - - /* Set the availability to 1 */ - tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4); - tu_cs_emit_qw(cs, available_iova); - tu_cs_emit_qw(cs, 0x1); -} - -static void -emit_end_prim_generated_query(struct tu_cmd_buffer *cmdbuf, - struct tu_query_pool *pool, - uint32_t query) -{ - struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs; - - if (!cmdbuf->state.pass) { - cmdbuf->state.prim_generated_query_running_before_rp = false; - } - - uint64_t begin_iova = primitives_generated_query_iova(pool, query, begin); - uint64_t end_iova = primitives_generated_query_iova(pool, query, end); - uint64_t result_iova = primitives_generated_query_iova(pool, query, result); - uint64_t available_iova = query_available_iova(pool, query); - - if (cmdbuf->state.pass) { - tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) | - CP_COND_REG_EXEC_0_SYSMEM | - CP_COND_REG_EXEC_0_BINNING); - } - - tu_cs_emit_wfi(cs); - - tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); - tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_7_LO) | - CP_REG_TO_MEM_0_CNT(2) | - CP_REG_TO_MEM_0_64B); - tu_cs_emit_qw(cs, end_iova); - - tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9); - tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C | - CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES); - tu_cs_emit_qw(cs, result_iova); - tu_cs_emit_qw(cs, result_iova); - tu_cs_emit_qw(cs, end_iova); - tu_cs_emit_qw(cs, begin_iova); - - tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); - - /* Should be after waiting for mem writes to have up to date info - * about which query is running. 
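/*
 * Illustrative sketch (not part of the imported Mesa source): reading back
 * the two-component result that emit_end_xfb_query above produces for
 * VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT.  Per VK_EXT_transform_feedback
 * the result is a pair of integers: primitives actually written to the
 * transform feedback buffer, then primitives that would have been needed,
 * which lets the application detect buffer overflow.
 */
#include <stdbool.h>
#include <vulkan/vulkan.h>

static bool
xfb_stream_overflowed(VkDevice dev, VkQueryPool xfb_pool, uint32_t query)
{
   uint64_t counts[2]; /* [0] = written, [1] = needed (generated) */

   VkResult r = vkGetQueryPoolResults(dev, xfb_pool, query, 1, sizeof(counts),
                                      counts, sizeof(counts),
                                      VK_QUERY_RESULT_64_BIT |
                                      VK_QUERY_RESULT_WAIT_BIT);
   if (r != VK_SUCCESS)
      return false;

   /* More primitives were generated than fit in the bound buffer. */
   return counts[1] > counts[0];
}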
- */ - emit_stop_primitive_ctrs(cmdbuf, cs, VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT); - - if (cmdbuf->state.pass) { - tu_cond_exec_end(cs); - } - - if (cmdbuf->state.pass) - cs = &cmdbuf->draw_epilogue_cs; - - /* Set the availability to 1 */ - tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4); - tu_cs_emit_qw(cs, available_iova); - tu_cs_emit_qw(cs, 0x1); -} - -/* Implement this bit of spec text from section 17.2 "Query Operation": - * - * If queries are used while executing a render pass instance that has - * multiview enabled, the query uses N consecutive query indices in the - * query pool (starting at query) where N is the number of bits set in the - * view mask in the subpass the query is used in. How the numerical - * results of the query are distributed among the queries is - * implementation-dependent. For example, some implementations may write - * each view’s results to a distinct query, while other implementations - * may write the total result to the first query and write zero to the - * other queries. However, the sum of the results in all the queries must - * accurately reflect the total result of the query summed over all views. - * Applications can sum the results from all the queries to compute the - * total result. - * - * Since we execute all views at once, we write zero to the other queries. - * Furthermore, because queries must be reset before use, and we set the - * result to 0 in vkCmdResetQueryPool(), we just need to mark it as available. - */ - -static void -handle_multiview_queries(struct tu_cmd_buffer *cmd, - struct tu_query_pool *pool, - uint32_t query) -{ - if (!cmd->state.pass || !cmd->state.subpass->multiview_mask) - return; - - unsigned views = util_bitcount(cmd->state.subpass->multiview_mask); - struct tu_cs *cs = &cmd->draw_epilogue_cs; - - for (uint32_t i = 1; i < views; i++) { - tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4); - tu_cs_emit_qw(cs, query_available_iova(pool, query + i)); - tu_cs_emit_qw(cs, 0x1); - } -} - -VKAPI_ATTR void VKAPI_CALL +void tu_CmdEndQuery(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t query) { - TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); - TU_FROM_HANDLE(tu_query_pool, pool, queryPool); - assert(query < pool->size); - - switch (pool->type) { - case VK_QUERY_TYPE_OCCLUSION: - emit_end_occlusion_query(cmdbuf, pool, query); - break; - case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: - emit_end_xfb_query(cmdbuf, pool, query, 0); - break; - case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: - emit_end_prim_generated_query(cmdbuf, pool, query); - break; - case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: - emit_end_perf_query(cmdbuf, pool, query); - break; - case VK_QUERY_TYPE_PIPELINE_STATISTICS: - emit_end_stat_query(cmdbuf, pool, query); - break; - case VK_QUERY_TYPE_TIMESTAMP: - unreachable("Unimplemented query type"); - default: - assert(!"Invalid query type"); - } - - handle_multiview_queries(cmdbuf, pool, query); -} - -VKAPI_ATTR void VKAPI_CALL -tu_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer, - VkQueryPool queryPool, - uint32_t query, - uint32_t index) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer); - TU_FROM_HANDLE(tu_query_pool, pool, queryPool); - assert(query < pool->size); - - switch (pool->type) { - case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: - assert(index <= 4); - emit_end_xfb_query(cmdbuf, pool, query, index); - break; - case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: - emit_end_prim_generated_query(cmdbuf, pool, query); - break; - default: - assert(!"Invalid query type"); - } -} - -VKAPI_ATTR void 
VKAPI_CALL -tu_CmdWriteTimestamp2(VkCommandBuffer commandBuffer, - VkPipelineStageFlagBits2 pipelineStage, - VkQueryPool queryPool, - uint32_t query) -{ - TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); - TU_FROM_HANDLE(tu_query_pool, pool, queryPool); - - /* Inside a render pass, just write the timestamp multiple times so that - * the user gets the last one if we use GMEM. There isn't really much - * better we can do, and this seems to be what the blob does too. - */ - struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs; - - /* Stages that will already have been executed by the time the CP executes - * the REG_TO_MEM. DrawIndirect parameters are read by the CP, so the draw - * indirect stage counts as top-of-pipe too. - */ - VkPipelineStageFlags2 top_of_pipe_flags = - VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT | - VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT; - - if (pipelineStage & ~top_of_pipe_flags) { - /* Execute a WFI so that all commands complete. Note that CP_REG_TO_MEM - * does CP_WAIT_FOR_ME internally, which will wait for the WFI to - * complete. - * - * Stalling the CP like this is really unfortunate, but I don't think - * there's a better solution that allows all 48 bits of precision - * because CP_EVENT_WRITE doesn't support 64-bit timestamps. - */ - tu_cs_emit_wfi(cs); - } - - tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); - tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER) | - CP_REG_TO_MEM_0_CNT(2) | - CP_REG_TO_MEM_0_64B); - tu_cs_emit_qw(cs, query_result_iova(pool, query, uint64_t, 0)); - - /* Only flag availability once the entire renderpass is done, similar to - * the begin/end path. - */ - cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs; - - tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4); - tu_cs_emit_qw(cs, query_available_iova(pool, query)); - tu_cs_emit_qw(cs, 0x1); - - /* From the spec for vkCmdWriteTimestamp: - * - * If vkCmdWriteTimestamp is called while executing a render pass - * instance that has multiview enabled, the timestamp uses N consecutive - * query indices in the query pool (starting at query) where N is the - * number of bits set in the view mask of the subpass the command is - * executed in. The resulting query values are determined by an - * implementation-dependent choice of one of the following behaviors: - * - * - The first query is a timestamp value and (if more than one bit is - * set in the view mask) zero is written to the remaining queries. - * If two timestamps are written in the same subpass, the sum of the - * execution time of all views between those commands is the - * difference between the first query written by each command. - * - * - All N queries are timestamp values. If two timestamps are written - * in the same subpass, the sum of the execution time of all views - * between those commands is the sum of the difference between - * corresponding queries written by each command. The difference - * between corresponding queries may be the execution time of a - * single view. - * - * We execute all views in the same draw call, so we implement the first - * option, the same as regular queries. 
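/*
 * Illustrative sketch (not part of the imported Mesa source): converting the
 * raw always-on counter values written by tu_CmdWriteTimestamp2 above into
 * nanoseconds on the application side.  timestampPeriod comes from
 * VkPhysicalDeviceLimits and timestampValidBits from the queue family
 * properties; both lookups are assumed to have been done by the caller.
 */
#include <stdint.h>

static uint64_t
timestamp_delta_to_ns(uint64_t begin_ticks, uint64_t end_ticks,
                      uint32_t timestamp_valid_bits, float timestamp_period)
{
   /* Only timestampValidBits of each value are meaningful. */
   uint64_t mask = (timestamp_valid_bits >= 64)
                      ? UINT64_MAX
                      : ((UINT64_C(1) << timestamp_valid_bits) - 1);

   uint64_t delta = ((end_ticks & mask) - (begin_ticks & mask)) & mask;

   /* timestampPeriod is the number of nanoseconds per tick. */
   return (uint64_t)(delta * (double)timestamp_period);
}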
- */ - handle_multiview_queries(cmd, pool, query); -} - -VKAPI_ATTR VkResult VKAPI_CALL -tu_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR( - VkPhysicalDevice physicalDevice, - uint32_t queueFamilyIndex, - uint32_t* pCounterCount, - VkPerformanceCounterKHR* pCounters, - VkPerformanceCounterDescriptionKHR* pCounterDescriptions) -{ - TU_FROM_HANDLE(tu_physical_device, phydev, physicalDevice); - - uint32_t desc_count = *pCounterCount; - uint32_t group_count; - const struct fd_perfcntr_group *group = - fd_perfcntrs(&phydev->dev_id, &group_count); - - VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, out, pCounters, pCounterCount); - VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, out_desc, - pCounterDescriptions, &desc_count); - - for (int i = 0; i < group_count; i++) { - for (int j = 0; j < group[i].num_countables; j++) { - - vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) { - counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_BUFFER_KHR; - counter->unit = - fd_perfcntr_type_to_vk_unit[group[i].countables[j].query_type]; - counter->storage = - fd_perfcntr_type_to_vk_storage[group[i].countables[j].query_type]; - - unsigned char sha1_result[20]; - _mesa_sha1_compute(group[i].countables[j].name, - strlen(group[i].countables[j].name), - sha1_result); - memcpy(counter->uuid, sha1_result, sizeof(counter->uuid)); - } - - vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, &out_desc, desc) { - desc->flags = 0; - - snprintf(desc->name, sizeof(desc->name), - "%s", group[i].countables[j].name); - snprintf(desc->category, sizeof(desc->category), "%s", group[i].name); - snprintf(desc->description, sizeof(desc->description), - "%s: %s performance counter", - group[i].name, group[i].countables[j].name); - } - } - } - - return vk_outarray_status(&out); -} - -VKAPI_ATTR void VKAPI_CALL -tu_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR( - VkPhysicalDevice physicalDevice, - const VkQueryPoolPerformanceCreateInfoKHR* pPerformanceQueryCreateInfo, - uint32_t* pNumPasses) -{ - TU_FROM_HANDLE(tu_physical_device, phydev, physicalDevice); - uint32_t group_count = 0; - uint32_t gid = 0, cid = 0, n_passes; - const struct fd_perfcntr_group *group = - fd_perfcntrs(&phydev->dev_id, &group_count); - - uint32_t counters_requested[group_count]; - memset(counters_requested, 0x0, sizeof(counters_requested)); - *pNumPasses = 1; - - for (unsigned i = 0; i < pPerformanceQueryCreateInfo->counterIndexCount; i++) { - perfcntr_index(group, group_count, - pPerformanceQueryCreateInfo->pCounterIndices[i], - &gid, &cid); - - counters_requested[gid]++; - } - - for (uint32_t i = 0; i < group_count; i++) { - n_passes = DIV_ROUND_UP(counters_requested[i], group[i].num_counters); - *pNumPasses = MAX2(*pNumPasses, n_passes); - } } -VKAPI_ATTR VkResult VKAPI_CALL -tu_AcquireProfilingLockKHR(VkDevice device, - const VkAcquireProfilingLockInfoKHR* pInfo) -{ - /* TODO. Probably there's something to do for kgsl. */ - return VK_SUCCESS; -} - -VKAPI_ATTR void VKAPI_CALL -tu_ReleaseProfilingLockKHR(VkDevice device) +void +tu_CmdWriteTimestamp(VkCommandBuffer commandBuffer, + VkPipelineStageFlagBits pipelineStage, + VkQueryPool queryPool, + uint32_t query) { - /* TODO. Probably there's something to do for kgsl. 
*/ - return; } diff --git a/lib/mesa/src/freedreno/vulkan/tu_shader.c b/lib/mesa/src/freedreno/vulkan/tu_shader.c index e485f8f5c..f6e13d7c4 100644 --- a/lib/mesa/src/freedreno/vulkan/tu_shader.c +++ b/lib/mesa/src/freedreno/vulkan/tu_shader.c @@ -1,894 +1,336 @@ /* * Copyright © 2019 Google LLC - * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. */ -#include "tu_shader.h" +#include "tu_private.h" #include "spirv/nir_spirv.h" #include "util/mesa-sha1.h" -#include "nir/nir_xfb_info.h" -#include "nir/nir_vulkan.h" -#include "vk_pipeline.h" -#include "vk_util.h" #include "ir3/ir3_nir.h" -#include "tu_device.h" -#include "tu_descriptor_set.h" -#include "tu_pipeline.h" - -nir_shader * -tu_spirv_to_nir(struct tu_device *dev, - void *mem_ctx, - const VkPipelineShaderStageCreateInfo *stage_info, - gl_shader_stage stage) +static nir_shader * +tu_spirv_to_nir(struct ir3_compiler *compiler, + const uint32_t *words, + size_t word_count, + gl_shader_stage stage, + const char *entry_point_name, + const VkSpecializationInfo *spec_info) { /* TODO these are made-up */ const struct spirv_to_nir_options spirv_options = { - .ubo_addr_format = nir_address_format_vec2_index_32bit_offset, - .ssbo_addr_format = nir_address_format_vec2_index_32bit_offset, - - /* Accessed via stg/ldg */ - .phys_ssbo_addr_format = nir_address_format_64bit_global, - - /* Accessed via the const register file */ - .push_const_addr_format = nir_address_format_logical, - - /* Accessed via ldl/stl */ - .shared_addr_format = nir_address_format_32bit_offset, - - /* Accessed via stg/ldg (not used with Vulkan?) 
*/ - .global_addr_format = nir_address_format_64bit_global, - - /* Use 16-bit math for RelaxedPrecision ALU ops */ - .mediump_16bit_alu = true, - - /* ViewID is a sysval in geometry stages and an input in the FS */ - .view_index_is_input = stage == MESA_SHADER_FRAGMENT, - .caps = { - .transform_feedback = true, - .tessellation = true, - .draw_parameters = true, - .image_read_without_format = true, - .image_write_without_format = true, - .variable_pointers = true, - .stencil_export = true, - .multiview = true, - .shader_viewport_index_layer = true, - .geometry_streams = true, - .device_group = true, - .descriptor_indexing = true, - .descriptor_array_dynamic_indexing = true, - .descriptor_array_non_uniform_indexing = true, - .runtime_descriptor_array = true, - .float_controls = true, - .float16 = true, - .int16 = true, - .storage_16bit = dev->physical_device->info->a6xx.storage_16bit, - .demote_to_helper_invocation = true, - .vk_memory_model = true, - .vk_memory_model_device_scope = true, - .subgroup_basic = true, - .subgroup_ballot = true, - .subgroup_vote = true, - .subgroup_quad = true, - .subgroup_shuffle = true, - .subgroup_arithmetic = true, - .physical_storage_buffer_address = true, - }, + .lower_ubo_ssbo_access_to_offsets = true, + .caps = { false }, }; - const nir_shader_compiler_options *nir_options = - ir3_get_compiler_options(dev->compiler); - - nir_shader *nir; - VkResult result = - vk_pipeline_shader_stage_to_nir(&dev->vk, stage_info, &spirv_options, - nir_options, mem_ctx, &nir); - if (result != VK_SUCCESS) - return NULL; - - if (unlikely(dev->physical_device->instance->debug_flags & TU_DEBUG_NIR)) { - fprintf(stderr, "translated nir:\n"); - nir_print_shader(nir, stderr); - } - - const struct nir_lower_sysvals_to_varyings_options sysvals_to_varyings = { - .point_coord = true, - }; - NIR_PASS_V(nir, nir_lower_sysvals_to_varyings, &sysvals_to_varyings); - - NIR_PASS_V(nir, nir_lower_global_vars_to_local); - - /* Older glslang missing bf6efd0316d8 ("SPV: Fix #2293: keep relaxed - * precision on arg passed to relaxed param") will pass function args through - * a highp temporary, so we need the nir_opt_find_array_copies() and a copy - * prop before we lower mediump vars, or you'll be unable to optimize out - * array copies after lowering. We do this before splitting copies, since - * that works against nir_opt_find_array_copies(). - * */ - NIR_PASS_V(nir, nir_opt_find_array_copies); - NIR_PASS_V(nir, nir_opt_copy_prop_vars); - NIR_PASS_V(nir, nir_opt_dce); - - NIR_PASS_V(nir, nir_split_var_copies); - NIR_PASS_V(nir, nir_lower_var_copies); - - NIR_PASS_V(nir, nir_lower_mediump_vars, nir_var_function_temp | nir_var_shader_temp | nir_var_mem_shared); - NIR_PASS_V(nir, nir_opt_copy_prop_vars); - NIR_PASS_V(nir, nir_opt_combine_stores, nir_var_all); - - NIR_PASS_V(nir, nir_lower_is_helper_invocation); - - NIR_PASS_V(nir, nir_lower_system_values); - - NIR_PASS_V(nir, nir_lower_frexp); - - ir3_optimize_loop(dev->compiler, nir); - - NIR_PASS_V(nir, nir_opt_conditional_discard); - - return nir; -} - -static void -lower_load_push_constant(struct tu_device *dev, - nir_builder *b, - nir_intrinsic_instr *instr, - struct tu_shader *shader, - const struct tu_pipeline_layout *layout) -{ - uint32_t base = nir_intrinsic_base(instr); - assert(base % 4 == 0); - - if (tu6_shared_constants_enable(layout, dev->compiler)) { - /* All stages share the same range. We could potentially add - * push_constant_offset to layout and apply it, but this is good for - * now. 
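/*
 * Illustrative sketch (not part of the imported Mesa source): the
 * application-side view of the push constants that lower_load_push_constant
 * above rewrites into const-file loads.  The 16-byte block layout and the
 * vertex-only stage mask are hypothetical examples; they would have to match
 * a VkPushConstantRange declared in the pipeline layout.
 */
#include <vulkan/vulkan.h>

struct draw_params {
   float scale[2];
   float offset[2];
};

static void
push_draw_params(VkCommandBuffer cmd, VkPipelineLayout layout,
                 const struct draw_params *params)
{
   /* The byte offset and size used here become the base/range of the
    * nir_intrinsic_load_push_constant intrinsics seen by the lowering pass.
    */
   vkCmdPushConstants(cmd, layout, VK_SHADER_STAGE_VERTEX_BIT, 0,
                      sizeof(*params), params);
}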
- */ - base += dev->compiler->shared_consts_base_offset * 4; - } else { - assert(base >= shader->const_state.push_consts.lo * 4); - base -= shader->const_state.push_consts.lo * 4; - } - - nir_ssa_def *load = - nir_load_uniform(b, instr->num_components, - instr->dest.ssa.bit_size, - nir_ushr(b, instr->src[0].ssa, nir_imm_int(b, 2)), - .base = base); - - nir_ssa_def_rewrite_uses(&instr->dest.ssa, load); - - nir_instr_remove(&instr->instr); -} - -static void -lower_vulkan_resource_index(nir_builder *b, nir_intrinsic_instr *instr, - struct tu_shader *shader, - const struct tu_pipeline_layout *layout) -{ - nir_ssa_def *vulkan_idx = instr->src[0].ssa; - - unsigned set = nir_intrinsic_desc_set(instr); - unsigned binding = nir_intrinsic_binding(instr); - struct tu_descriptor_set_layout *set_layout = layout->set[set].layout; - struct tu_descriptor_set_binding_layout *binding_layout = - &set_layout->binding[binding]; - nir_ssa_def *base; - - shader->active_desc_sets |= 1u << set; - - switch (binding_layout->type) { - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: - if (layout->independent_sets) { - /* With independent sets, we don't know - * layout->set[set].dynamic_offset_start until after link time which - * with fast linking means after the shader is compiled. We have to - * get it from the const file instead. - */ - base = nir_imm_int(b, binding_layout->dynamic_offset_offset / (4 * A6XX_TEX_CONST_DWORDS)); - nir_ssa_def *dynamic_offset_start = - nir_load_uniform(b, 1, 32, nir_imm_int(b, 0), - .base = shader->const_state.dynamic_offset_loc + set); - base = nir_iadd(b, base, dynamic_offset_start); - } else { - base = nir_imm_int(b, (layout->set[set].dynamic_offset_start + - binding_layout->dynamic_offset_offset) / (4 * A6XX_TEX_CONST_DWORDS)); + ir3_get_compiler_options(compiler); + + /* convert VkSpecializationInfo */ + struct nir_spirv_specialization *spec = NULL; + uint32_t num_spec = 0; + if (spec_info && spec_info->mapEntryCount) { + spec = malloc(sizeof(*spec) * spec_info->mapEntryCount); + if (!spec) + return NULL; + + for (uint32_t i = 0; i < spec_info->mapEntryCount; i++) { + const VkSpecializationMapEntry *entry = &spec_info->pMapEntries[i]; + const void *data = spec_info->pData + entry->offset; + assert(data + entry->size <= spec_info->pData + spec_info->dataSize); + spec[i].id = entry->constantID; + if (entry->size == 8) + spec[i].data64 = *(const uint64_t *) data; + else + spec[i].data32 = *(const uint32_t *) data; + spec[i].defined_on_module = false; } - set = MAX_SETS; - break; - default: - base = nir_imm_int(b, binding_layout->offset / (4 * A6XX_TEX_CONST_DWORDS)); - break; - } - - nir_ssa_def *shift; - if (binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { - /* Inline uniform blocks cannot have arrays so the stride is unused */ - shift = nir_imm_int(b, 0); - } else { - unsigned stride = binding_layout->size / (4 * A6XX_TEX_CONST_DWORDS); - assert(util_is_power_of_two_nonzero(stride)); - shift = nir_imm_int(b, util_logbase2(stride)); + num_spec = spec_info->mapEntryCount; } - nir_ssa_def *def = nir_vec3(b, nir_imm_int(b, set), - nir_iadd(b, base, - nir_ishl(b, vulkan_idx, shift)), - shift); + nir_shader *nir = + spirv_to_nir(words, word_count, spec, num_spec, stage, entry_point_name, + &spirv_options, nir_options); - nir_ssa_def_rewrite_uses(&instr->dest.ssa, def); - nir_instr_remove(&instr->instr); -} + free(spec); -static void -lower_vulkan_resource_reindex(nir_builder *b, nir_intrinsic_instr *instr) -{ - 
nir_ssa_def *old_index = instr->src[0].ssa; - nir_ssa_def *delta = instr->src[1].ssa; - nir_ssa_def *shift = nir_channel(b, old_index, 2); - - nir_ssa_def *new_index = - nir_vec3(b, nir_channel(b, old_index, 0), - nir_iadd(b, nir_channel(b, old_index, 1), - nir_ishl(b, delta, shift)), - shift); - - nir_ssa_def_rewrite_uses(&instr->dest.ssa, new_index); - nir_instr_remove(&instr->instr); -} + assert(nir->info.stage == stage); + nir_validate_shader(nir, "after spirv_to_nir"); -static void -lower_load_vulkan_descriptor(nir_builder *b, nir_intrinsic_instr *intrin) -{ - nir_ssa_def *old_index = intrin->src[0].ssa; - /* Loading the descriptor happens as part of the load/store instruction so - * this is a no-op. We just need to turn the shift into an offset of 0. - */ - nir_ssa_def *new_index = - nir_vec3(b, nir_channel(b, old_index, 0), - nir_channel(b, old_index, 1), - nir_imm_int(b, 0)); - nir_ssa_def_rewrite_uses(&intrin->dest.ssa, new_index); - nir_instr_remove(&intrin->instr); + return nir; } static void -lower_ssbo_ubo_intrinsic(struct tu_device *dev, - nir_builder *b, nir_intrinsic_instr *intrin) +tu_sort_variables_by_location(struct exec_list *variables) { - const nir_intrinsic_info *info = &nir_intrinsic_infos[intrin->intrinsic]; - - /* The bindless base is part of the instruction, which means that part of - * the "pointer" has to be constant. We solve this in the same way the blob - * does, by generating a bunch of if-statements. In the usual case where - * the descriptor set is constant we can skip that, though). - */ - - unsigned buffer_src; - if (intrin->intrinsic == nir_intrinsic_store_ssbo) { - /* This has the value first */ - buffer_src = 1; - } else { - buffer_src = 0; - } - - nir_ssa_scalar scalar_idx = nir_ssa_scalar_resolved(intrin->src[buffer_src].ssa, 0); - nir_ssa_def *descriptor_idx = nir_channel(b, intrin->src[buffer_src].ssa, 1); - - /* For isam, we need to use the appropriate descriptor if 16-bit storage is - * enabled. Descriptor 0 is the 16-bit one, descriptor 1 is the 32-bit one. - */ - if (dev->physical_device->info->a6xx.storage_16bit && - intrin->intrinsic == nir_intrinsic_load_ssbo && - (nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER) && - intrin->dest.ssa.bit_size > 16) { - descriptor_idx = nir_iadd(b, descriptor_idx, nir_imm_int(b, 1)); - } - - nir_ssa_def *results[MAX_SETS + 1] = { NULL }; - - if (nir_ssa_scalar_is_const(scalar_idx)) { - nir_ssa_def *bindless = - nir_bindless_resource_ir3(b, 32, descriptor_idx, .desc_set = nir_ssa_scalar_as_uint(scalar_idx)); - nir_instr_rewrite_src_ssa(&intrin->instr, &intrin->src[buffer_src], bindless); - return; - } - - nir_ssa_def *base_idx = nir_channel(b, scalar_idx.def, scalar_idx.comp); - for (unsigned i = 0; i < MAX_SETS + 1; i++) { - /* if (base_idx == i) { ... 
*/ - nir_if *nif = nir_push_if(b, nir_ieq_imm(b, base_idx, i)); - - nir_ssa_def *bindless = - nir_bindless_resource_ir3(b, 32, descriptor_idx, .desc_set = i); - - nir_intrinsic_instr *copy = - nir_intrinsic_instr_create(b->shader, intrin->intrinsic); - - copy->num_components = intrin->num_components; - - for (unsigned src = 0; src < info->num_srcs; src++) { - if (src == buffer_src) - copy->src[src] = nir_src_for_ssa(bindless); - else - copy->src[src] = nir_src_for_ssa(intrin->src[src].ssa); - } - - for (unsigned idx = 0; idx < info->num_indices; idx++) { - copy->const_index[idx] = intrin->const_index[idx]; - } - - if (info->has_dest) { - nir_ssa_dest_init(©->instr, ©->dest, - intrin->dest.ssa.num_components, - intrin->dest.ssa.bit_size, - NULL); - results[i] = ©->dest.ssa; + struct exec_list sorted; + exec_list_make_empty(&sorted); + + nir_foreach_variable_safe(var, variables) + { + exec_node_remove(&var->node); + + /* insert the variable into the sorted list */ + nir_variable *next = NULL; + nir_foreach_variable(tmp, &sorted) + { + if (var->data.location < tmp->data.location) { + next = tmp; + break; + } } - - nir_builder_instr_insert(b, ©->instr); - - /* } else { ... */ - nir_push_else(b, nif); + if (next) + exec_node_insert_node_before(&next->node, &var->node); + else + exec_list_push_tail(&sorted, &var->node); } - nir_ssa_def *result = - nir_ssa_undef(b, intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size); - for (int i = MAX_SETS; i >= 0; i--) { - nir_pop_if(b, NULL); - if (info->has_dest) - result = nir_if_phi(b, results[i], result); - } - - if (info->has_dest) - nir_ssa_def_rewrite_uses(&intrin->dest.ssa, result); - nir_instr_remove(&intrin->instr); + exec_list_move_nodes_to(&sorted, variables); } -static nir_ssa_def * -build_bindless(struct tu_device *dev, nir_builder *b, - nir_deref_instr *deref, bool is_sampler, - struct tu_shader *shader, - const struct tu_pipeline_layout *layout) -{ - nir_variable *var = nir_deref_instr_get_variable(deref); - - unsigned set = var->data.descriptor_set; - unsigned binding = var->data.binding; - const struct tu_descriptor_set_binding_layout *bind_layout = - &layout->set[set].layout->binding[binding]; - - /* input attachments use non bindless workaround */ - if (bind_layout->type == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT && - likely(!(dev->instance->debug_flags & TU_DEBUG_DYNAMIC))) { - const struct glsl_type *glsl_type = glsl_without_array(var->type); - uint32_t idx = var->data.index * 2; - - BITSET_SET_RANGE_INSIDE_WORD(b->shader->info.textures_used, idx, (idx + bind_layout->array_size * 2) - 1); - - /* D24S8 workaround: stencil of D24S8 will be sampled as uint */ - if (glsl_get_sampler_result_type(glsl_type) == GLSL_TYPE_UINT) - idx += 1; - - if (deref->deref_type == nir_deref_type_var) - return nir_imm_int(b, idx); - - nir_ssa_def *arr_index = nir_ssa_for_src(b, deref->arr.index, 1); - return nir_iadd(b, nir_imm_int(b, idx), - nir_imul_imm(b, arr_index, 2)); - } - - shader->active_desc_sets |= 1u << set; - - nir_ssa_def *desc_offset; - unsigned descriptor_stride; - unsigned offset = 0; - /* Samplers come second in combined image/sampler descriptors, see - * write_combined_image_sampler_descriptor(). 
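/*
 * Illustrative sketch (not part of the imported Mesa source): a scalar C
 * analogue of the if-ladder generated by lower_ssbo_ubo_intrinsic above.
 * The bindless descriptor-set base must be an immediate in the instruction,
 * so a dynamic set index is lowered to one guarded access per possible set,
 * with the results merged afterwards.  Here the loop plays the role of the
 * compiler emitting one `if` per set; in the generated code each branch
 * carries the literal set number.  EXAMPLE_MAX_SETS and the access callback
 * are illustrative stand-ins.
 */
#include <stdint.h>

#define EXAMPLE_MAX_SETS 4

static uint32_t
access_with_constant_set(uint32_t dynamic_set, uint32_t descriptor_idx,
                         uint32_t (*access)(uint32_t set_imm, uint32_t idx))
{
   uint32_t result = 0;

   for (uint32_t i = 0; i < EXAMPLE_MAX_SETS; i++) {
      if (dynamic_set == i)
         result = access(i, descriptor_idx); /* set index is constant here */
   }
   return result;
}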
- */ - if (is_sampler && bind_layout->type == - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) { - offset = 1; - } - desc_offset = - nir_imm_int(b, (bind_layout->offset / (4 * A6XX_TEX_CONST_DWORDS)) + - offset); - descriptor_stride = bind_layout->size / (4 * A6XX_TEX_CONST_DWORDS); - - if (deref->deref_type != nir_deref_type_var) { - assert(deref->deref_type == nir_deref_type_array); - - nir_ssa_def *arr_index = nir_ssa_for_src(b, deref->arr.index, 1); - desc_offset = nir_iadd(b, desc_offset, - nir_imul_imm(b, arr_index, descriptor_stride)); - } - - return nir_bindless_resource_ir3(b, 32, desc_offset, .desc_set = set); -} - -static void -lower_image_deref(struct tu_device *dev, nir_builder *b, - nir_intrinsic_instr *instr, struct tu_shader *shader, - const struct tu_pipeline_layout *layout) +struct tu_shader * +tu_shader_create(struct tu_device *dev, + gl_shader_stage stage, + const VkPipelineShaderStageCreateInfo *stage_info, + const VkAllocationCallbacks *alloc) { - nir_deref_instr *deref = nir_src_as_deref(instr->src[0]); - nir_ssa_def *bindless = build_bindless(dev, b, deref, false, shader, layout); - nir_rewrite_image_intrinsic(instr, bindless, true); -} + const struct tu_shader_module *module = + tu_shader_module_from_handle(stage_info->module); + struct tu_shader *shader; -static bool -lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr, - struct tu_device *dev, - struct tu_shader *shader, - const struct tu_pipeline_layout *layout) -{ - switch (instr->intrinsic) { - case nir_intrinsic_load_push_constant: - lower_load_push_constant(dev, b, instr, shader, layout); - return true; - - case nir_intrinsic_load_vulkan_descriptor: - lower_load_vulkan_descriptor(b, instr); - return true; - - case nir_intrinsic_vulkan_resource_index: - lower_vulkan_resource_index(b, instr, shader, layout); - return true; - case nir_intrinsic_vulkan_resource_reindex: - lower_vulkan_resource_reindex(b, instr); - return true; - - case nir_intrinsic_load_ubo: - case nir_intrinsic_load_ssbo: - case nir_intrinsic_store_ssbo: - case nir_intrinsic_ssbo_atomic_add: - case nir_intrinsic_ssbo_atomic_imin: - case nir_intrinsic_ssbo_atomic_umin: - case nir_intrinsic_ssbo_atomic_imax: - case nir_intrinsic_ssbo_atomic_umax: - case nir_intrinsic_ssbo_atomic_and: - case nir_intrinsic_ssbo_atomic_or: - case nir_intrinsic_ssbo_atomic_xor: - case nir_intrinsic_ssbo_atomic_exchange: - case nir_intrinsic_ssbo_atomic_comp_swap: - case nir_intrinsic_ssbo_atomic_fadd: - case nir_intrinsic_ssbo_atomic_fmin: - case nir_intrinsic_ssbo_atomic_fmax: - case nir_intrinsic_ssbo_atomic_fcomp_swap: - case nir_intrinsic_get_ssbo_size: - lower_ssbo_ubo_intrinsic(dev, b, instr); - return true; - - case nir_intrinsic_image_deref_load: - case nir_intrinsic_image_deref_store: - case nir_intrinsic_image_deref_atomic_add: - case nir_intrinsic_image_deref_atomic_imin: - case nir_intrinsic_image_deref_atomic_umin: - case nir_intrinsic_image_deref_atomic_imax: - case nir_intrinsic_image_deref_atomic_umax: - case nir_intrinsic_image_deref_atomic_and: - case nir_intrinsic_image_deref_atomic_or: - case nir_intrinsic_image_deref_atomic_xor: - case nir_intrinsic_image_deref_atomic_exchange: - case nir_intrinsic_image_deref_atomic_comp_swap: - case nir_intrinsic_image_deref_size: - case nir_intrinsic_image_deref_samples: - lower_image_deref(dev, b, instr, shader, layout); - return true; + const uint32_t max_variant_count = (stage == MESA_SHADER_VERTEX) ? 
2 : 1; + shader = vk_zalloc2( + &dev->alloc, alloc, + sizeof(*shader) + sizeof(struct ir3_shader_variant) * max_variant_count, + 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + if (!shader) + return NULL; - default: - return false; + /* translate SPIR-V to NIR */ + assert(module->code_size % 4 == 0); + nir_shader *nir = tu_spirv_to_nir( + dev->compiler, (const uint32_t *) module->code, module->code_size / 4, + stage, stage_info->pName, stage_info->pSpecializationInfo); + if (!nir) { + vk_free2(&dev->alloc, alloc, shader); + return NULL; } -} -static void -lower_tex_ycbcr(const struct tu_pipeline_layout *layout, - nir_builder *builder, - nir_tex_instr *tex) -{ - int deref_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref); - assert(deref_src_idx >= 0); - nir_deref_instr *deref = nir_src_as_deref(tex->src[deref_src_idx].src); - - nir_variable *var = nir_deref_instr_get_variable(deref); - const struct tu_descriptor_set_layout *set_layout = - layout->set[var->data.descriptor_set].layout; - const struct tu_descriptor_set_binding_layout *binding = - &set_layout->binding[var->data.binding]; - const struct tu_sampler_ycbcr_conversion *ycbcr_samplers = - tu_immutable_ycbcr_samplers(set_layout, binding); - - if (!ycbcr_samplers) - return; - - /* For the following instructions, we don't apply any change */ - if (tex->op == nir_texop_txs || - tex->op == nir_texop_query_levels || - tex->op == nir_texop_lod) - return; - - assert(tex->texture_index == 0); - unsigned array_index = 0; - if (deref->deref_type != nir_deref_type_var) { - assert(deref->deref_type == nir_deref_type_array); - if (!nir_src_is_const(deref->arr.index)) - return; - array_index = nir_src_as_uint(deref->arr.index); - array_index = MIN2(array_index, binding->array_size - 1); + if (unlikely(dev->physical_device->instance->debug_flags & TU_DEBUG_NIR)) { + fprintf(stderr, "translated nir:\n"); + nir_print_shader(nir, stderr); } - const struct tu_sampler_ycbcr_conversion *ycbcr_sampler = ycbcr_samplers + array_index; - - if (ycbcr_sampler->ycbcr_model == VK_SAMPLER_YCBCR_MODEL_CONVERSION_RGB_IDENTITY) - return; - builder->cursor = nir_after_instr(&tex->instr); + /* TODO what needs to happen? */ - uint8_t bits = vk_format_get_component_bits(ycbcr_sampler->format, - UTIL_FORMAT_COLORSPACE_RGB, - PIPE_SWIZZLE_X); - - switch (ycbcr_sampler->format) { - case VK_FORMAT_G8B8G8R8_422_UNORM: - case VK_FORMAT_B8G8R8G8_422_UNORM: - case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM: - case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM: - /* util_format_get_component_bits doesn't return what we want */ - bits = 8; + switch (stage) { + case MESA_SHADER_VERTEX: + tu_sort_variables_by_location(&nir->outputs); + break; + case MESA_SHADER_TESS_CTRL: + case MESA_SHADER_TESS_EVAL: + case MESA_SHADER_GEOMETRY: + tu_sort_variables_by_location(&nir->inputs); + tu_sort_variables_by_location(&nir->outputs); + break; + case MESA_SHADER_FRAGMENT: + tu_sort_variables_by_location(&nir->inputs); + break; + case MESA_SHADER_COMPUTE: break; default: + unreachable("invalid gl_shader_stage"); break; } - uint32_t bpcs[3] = {bits, bits, bits}; /* TODO: use right bpc for each channel ? 
*/ - nir_ssa_def *result = nir_convert_ycbcr_to_rgb(builder, - ycbcr_sampler->ycbcr_model, - ycbcr_sampler->ycbcr_range, - &tex->dest.ssa, - bpcs); - nir_ssa_def_rewrite_uses_after(&tex->dest.ssa, result, - result->parent_instr); + nir_assign_var_locations(&nir->inputs, &nir->num_inputs, + ir3_glsl_type_size); + nir_assign_var_locations(&nir->outputs, &nir->num_outputs, + ir3_glsl_type_size); + nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms, + ir3_glsl_type_size); - builder->cursor = nir_before_instr(&tex->instr); -} + NIR_PASS_V(nir, nir_lower_system_values); + NIR_PASS_V(nir, nir_lower_frexp); + NIR_PASS_V(nir, nir_lower_io, nir_var_all, ir3_glsl_type_size, 0); -static bool -lower_tex(nir_builder *b, nir_tex_instr *tex, struct tu_device *dev, - struct tu_shader *shader, const struct tu_pipeline_layout *layout) -{ - lower_tex_ycbcr(layout, b, tex); - - int sampler_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref); - if (sampler_src_idx >= 0) { - nir_deref_instr *deref = nir_src_as_deref(tex->src[sampler_src_idx].src); - nir_ssa_def *bindless = build_bindless(dev, b, deref, true, shader, layout); - nir_instr_rewrite_src(&tex->instr, &tex->src[sampler_src_idx].src, - nir_src_for_ssa(bindless)); - tex->src[sampler_src_idx].src_type = nir_tex_src_sampler_handle; - } + nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); - int tex_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref); - if (tex_src_idx >= 0) { - nir_deref_instr *deref = nir_src_as_deref(tex->src[tex_src_idx].src); - nir_ssa_def *bindless = build_bindless(dev, b, deref, false, shader, layout); - nir_instr_rewrite_src(&tex->instr, &tex->src[tex_src_idx].src, - nir_src_for_ssa(bindless)); - tex->src[tex_src_idx].src_type = nir_tex_src_texture_handle; - - /* for the input attachment case: */ - if (bindless->parent_instr->type != nir_instr_type_intrinsic) - tex->src[tex_src_idx].src_type = nir_tex_src_texture_offset; - } + shader->ir3_shader.compiler = dev->compiler; + shader->ir3_shader.type = stage; + shader->ir3_shader.nir = nir; - return true; + return shader; } -struct lower_instr_params { - struct tu_device *dev; - struct tu_shader *shader; - const struct tu_pipeline_layout *layout; -}; - -static bool -lower_instr(nir_builder *b, nir_instr *instr, void *cb_data) +void +tu_shader_destroy(struct tu_device *dev, + struct tu_shader *shader, + const VkAllocationCallbacks *alloc) { - struct lower_instr_params *params = cb_data; - b->cursor = nir_before_instr(instr); - switch (instr->type) { - case nir_instr_type_tex: - return lower_tex(b, nir_instr_as_tex(instr), params->dev, params->shader, params->layout); - case nir_instr_type_intrinsic: - return lower_intrinsic(b, nir_instr_as_intrinsic(instr), params->dev, params->shader, params->layout); - default: - return false; - } -} + if (shader->ir3_shader.nir) + ralloc_free(shader->ir3_shader.nir); -/* Figure out the range of push constants that we're actually going to push to - * the shader, and tell the backend to reserve this range when pushing UBO - * constants. 
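/*
 * Illustrative sketch (not part of the imported Mesa source): the interval
 * computation performed by gather_push_constants, defined just below, shown
 * on a plain array of (base, range) pairs instead of NIR intrinsics.  Only
 * the bytes actually loaded end up being uploaded; the 16-dword alignment the
 * driver then applies to the offset is left out of this model.
 */
#include <stdint.h>

struct push_const_load {
   uint32_t base;  /* byte offset of a load_push_constant */
   uint32_t range; /* byte range of that load */
};

static void
gather_push_const_interval(const struct push_const_load *loads, uint32_t count,
                           uint32_t *lo_bytes, uint32_t *size_bytes)
{
   uint32_t min = UINT32_MAX, max = 0;

   for (uint32_t i = 0; i < count; i++) {
      if (loads[i].base < min)
         min = loads[i].base;
      if (loads[i].base + loads[i].range > max)
         max = loads[i].base + loads[i].range;
   }

   if (min >= max) {
      /* No push constant loads: push nothing. */
      *lo_bytes = 0;
      *size_bytes = 0;
      return;
   }

   *lo_bytes = min;
   *size_bytes = max - min;
}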
- */ - -static void -gather_push_constants(nir_shader *shader, struct tu_shader *tu_shader) -{ - uint32_t min = UINT32_MAX, max = 0; - nir_foreach_function(function, shader) { - if (!function->impl) - continue; - - nir_foreach_block(block, function->impl) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); - if (intrin->intrinsic != nir_intrinsic_load_push_constant) - continue; - - uint32_t base = nir_intrinsic_base(intrin); - uint32_t range = nir_intrinsic_range(intrin); - min = MIN2(min, base); - max = MAX2(max, base + range); - break; - } - } + for (uint32_t i = 0; i < 1 + shader->has_binning_pass; i++) { + if (shader->variants[i].ir) + ir3_destroy(shader->variants[i].ir); } - if (min >= max) { - tu_shader->const_state.push_consts.lo = 0; - tu_shader->const_state.push_consts.dwords = 0; - return; - } + if (shader->ir3_shader.const_state.immediates) + free(shader->ir3_shader.const_state.immediates); + if (shader->binary) + free(shader->binary); + if (shader->binning_binary) + free(shader->binning_binary); - /* CP_LOAD_STATE OFFSET and NUM_UNIT for SHARED_CONSTS are in units of - * dwords while loading regular consts is in units of vec4's. - * So we unify the unit here as dwords for tu_push_constant_range, then - * we should consider correct unit when emitting. - * - * Note there's an alignment requirement of 16 dwords on OFFSET. Expand - * the range and change units accordingly. - */ - tu_shader->const_state.push_consts.lo = (min / 4) / 4 * 4; - tu_shader->const_state.push_consts.dwords = - align(max, 16) / 4 - tu_shader->const_state.push_consts.lo; + vk_free2(&dev->alloc, alloc, shader); } -static bool -tu_lower_io(nir_shader *shader, struct tu_device *dev, - struct tu_shader *tu_shader, - const struct tu_pipeline_layout *layout) +void +tu_shader_compile_options_init( + struct tu_shader_compile_options *options, + const VkGraphicsPipelineCreateInfo *pipeline_info) { - if (!tu6_shared_constants_enable(layout, dev->compiler)) - gather_push_constants(shader, tu_shader); - - struct tu_const_state *const_state = &tu_shader->const_state; - unsigned reserved_consts_vec4 = - align(DIV_ROUND_UP(const_state->push_consts.dwords, 4), - dev->compiler->const_upload_unit); - - if (layout->independent_sets) { - const_state->dynamic_offset_loc = reserved_consts_vec4 * 4; - reserved_consts_vec4 += DIV_ROUND_UP(MAX_SETS, 4); - } else { - const_state->dynamic_offset_loc = UINT32_MAX; - } - - tu_shader->reserved_user_consts_vec4 = reserved_consts_vec4; + *options = (struct tu_shader_compile_options) { + /* TODO ir3_key */ - struct lower_instr_params params = { - .dev = dev, - .shader = tu_shader, - .layout = layout, + .optimize = !(pipeline_info->flags & + VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT), + .include_binning_pass = true, }; - - bool progress = nir_shader_instructions_pass(shader, - lower_instr, - nir_metadata_none, - ¶ms); - - /* Remove now-unused variables so that when we gather the shader info later - * they won't be counted. 
- */ - - if (progress) - nir_opt_dce(shader); - - progress |= - nir_remove_dead_variables(shader, - nir_var_uniform | nir_var_mem_ubo | nir_var_mem_ssbo, - NULL); - - return progress; } -static void -shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align) +static uint32_t * +tu_compile_shader_variant(struct ir3_shader *shader, + const struct ir3_shader_key *key, + bool binning_pass, + struct ir3_shader_variant *variant) { - assert(glsl_type_is_vector_or_scalar(type)); - - unsigned comp_size = - glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8; - unsigned length = glsl_get_vector_elements(type); - *size = comp_size * length; - *align = comp_size; -} - -static void -tu_gather_xfb_info(nir_shader *nir, struct ir3_stream_output_info *info) -{ - nir_shader_gather_xfb_info(nir); - - if (!nir->xfb_info) - return; - - nir_xfb_info *xfb = nir->xfb_info; + variant->shader = shader; + variant->type = shader->type; + variant->key = *key; + variant->binning_pass = binning_pass; - uint8_t output_map[VARYING_SLOT_TESS_MAX]; - memset(output_map, 0, sizeof(output_map)); - - nir_foreach_shader_out_variable(var, nir) { - unsigned slots = - var->data.compact ? DIV_ROUND_UP(glsl_get_length(var->type), 4) - : glsl_count_attribute_slots(var->type, false); - for (unsigned i = 0; i < slots; i++) - output_map[var->data.location + i] = var->data.driver_location + i; - } - - assert(xfb->output_count <= IR3_MAX_SO_OUTPUTS); - info->num_outputs = xfb->output_count; - - for (int i = 0; i < IR3_MAX_SO_BUFFERS; i++) { - info->stride[i] = xfb->buffers[i].stride / 4; - info->buffer_to_stream[i] = xfb->buffer_to_stream[i]; - } - - info->streams_written = xfb->streams_written; + int ret = ir3_compile_shader_nir(shader->compiler, variant); + if (ret) + return NULL; - for (int i = 0; i < xfb->output_count; i++) { - info->output[i].register_index = output_map[xfb->outputs[i].location]; - info->output[i].start_component = xfb->outputs[i].component_offset; - info->output[i].num_components = - util_bitcount(xfb->outputs[i].component_mask); - info->output[i].output_buffer = xfb->outputs[i].buffer; - info->output[i].dst_offset = xfb->outputs[i].offset / 4; - info->output[i].stream = xfb->buffer_to_stream[xfb->outputs[i].buffer]; - } + /* when assemble fails, we rely on tu_shader_destroy to clean up the + * variant + */ + return ir3_shader_assemble(variant, shader->compiler->gpu_id); } -struct tu_shader * -tu_shader_create(struct tu_device *dev, - nir_shader *nir, - const struct tu_shader_key *key, - struct tu_pipeline_layout *layout, - const VkAllocationCallbacks *alloc) +VkResult +tu_shader_compile(struct tu_device *dev, + struct tu_shader *shader, + const struct tu_shader *next_stage, + const struct tu_shader_compile_options *options, + const VkAllocationCallbacks *alloc) { - struct tu_shader *shader; - - shader = vk_zalloc2( - &dev->vk.alloc, alloc, - sizeof(*shader), - 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (!shader) - return NULL; - - NIR_PASS_V(nir, nir_opt_access, &(nir_opt_access_options) { - .is_vulkan = true, - }); - - if (nir->info.stage == MESA_SHADER_FRAGMENT) { - NIR_PASS_V(nir, nir_lower_input_attachments, - &(nir_input_attachment_options) { - .use_fragcoord_sysval = true, - .use_layer_id_sysval = false, - /* When using multiview rendering, we must use - * gl_ViewIndex as the layer id to pass to the texture - * sampling function. gl_Layer doesn't work when - * multiview is enabled. 
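/*
 * Illustrative sketch (not part of the imported Mesa source): where the
 * multiview mask consumed just below (key->multiview_mask) comes from on the
 * API side.  A view mask with bits 0 and 1 set makes every draw in the
 * subpass render to two views.  The render pass description is assumed to be
 * filled in elsewhere and to contain a single subpass; overwriting its pNext
 * chain here is a simplification.
 */
#include <vulkan/vulkan.h>

static VkResult
create_two_view_render_pass(VkDevice dev,
                            VkRenderPassCreateInfo *rp_info, /* pre-filled */
                            VkRenderPass *out_rp)
{
   static const uint32_t view_mask = 0x3;        /* render views 0 and 1 */
   static const uint32_t correlation_mask = 0x3; /* views may share tiling */

   const VkRenderPassMultiviewCreateInfo multiview = {
      .sType = VK_STRUCTURE_TYPE_RENDER_PASS_MULTIVIEW_CREATE_INFO,
      .subpassCount = 1,
      .pViewMasks = &view_mask,
      .correlationMaskCount = 1,
      .pCorrelationMasks = &correlation_mask,
   };

   rp_info->pNext = &multiview;
   return vkCreateRenderPass(dev, rp_info, NULL, out_rp);
}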
- */ - .use_view_id_for_layer = key->multiview_mask != 0, - }); + if (options->optimize) { + /* ignore the key for the first pass of optimization */ + ir3_optimize_nir(&shader->ir3_shader, shader->ir3_shader.nir, NULL); + + if (unlikely(dev->physical_device->instance->debug_flags & + TU_DEBUG_NIR)) { + fprintf(stderr, "optimized nir:\n"); + nir_print_shader(shader->ir3_shader.nir, stderr); + } } - /* This needs to happen before multiview lowering which rewrites store - * instructions of the position variable, so that we can just rewrite one - * store at the end instead of having to rewrite every store specified by - * the user. - */ - ir3_nir_lower_io_to_temporaries(nir); + shader->binary = tu_compile_shader_variant( + &shader->ir3_shader, &options->key, false, &shader->variants[0]); + if (!shader->binary) + return VK_ERROR_OUT_OF_HOST_MEMORY; - if (nir->info.stage == MESA_SHADER_VERTEX && key->multiview_mask) { - tu_nir_lower_multiview(nir, key->multiview_mask, dev); - } + /* compile another variant for the binning pass */ + if (options->include_binning_pass && + shader->ir3_shader.type == MESA_SHADER_VERTEX) { + shader->binning_binary = tu_compile_shader_variant( + &shader->ir3_shader, &options->key, true, &shader->variants[1]); + if (!shader->binning_binary) + return VK_ERROR_OUT_OF_HOST_MEMORY; - if (nir->info.stage == MESA_SHADER_FRAGMENT && key->force_sample_interp) { - nir_foreach_shader_in_variable(var, nir) { - if (!var->data.centroid) - var->data.sample = true; - } + shader->has_binning_pass = true; } - NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_push_const, - nir_address_format_32bit_offset); - - NIR_PASS_V(nir, nir_lower_explicit_io, - nir_var_mem_ubo | nir_var_mem_ssbo, - nir_address_format_vec2_index_32bit_offset); - - NIR_PASS_V(nir, nir_lower_explicit_io, - nir_var_mem_global, - nir_address_format_64bit_global); - - if (nir->info.stage == MESA_SHADER_COMPUTE) { - NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, - nir_var_mem_shared, shared_type_info); - NIR_PASS_V(nir, nir_lower_explicit_io, - nir_var_mem_shared, - nir_address_format_32bit_offset); - - if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) { - const unsigned chunk_size = 16; /* max single store size */ - /* Shared memory is allocated in 1024b chunks in HW, but the zero-init - * extension only requires us to initialize the memory that the shader - * is allocated at the API level, and it's up to the user to ensure - * that accesses are limited to those bounds. 
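/*
 * Illustrative sketch (not part of the imported Mesa source): the chunked
 * clearing requested just below via nir_zero_initialize_shared_memory,
 * modelled as a plain loop.  CHUNK_SIZE matches the 16-byte maximum single
 * store mentioned above; the byte buffer stands in for workgroup shared
 * memory, and only the API-declared size (rounded up to a whole chunk) is
 * cleared.
 */
#include <stdint.h>
#include <string.h>

#define CHUNK_SIZE 16u
#define ALIGN_POT(x, a) (((x) + (a) - 1) & ~((a) - 1))

static void
zero_init_shared_model(uint8_t *shared, uint32_t api_shared_size)
{
   uint32_t size = ALIGN_POT(api_shared_size, CHUNK_SIZE);

   for (uint32_t offset = 0; offset < size; offset += CHUNK_SIZE)
      memset(shared + offset, 0, CHUNK_SIZE); /* one full-width store each */
}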
- */ - const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size); - NIR_PASS_V(nir, nir_zero_initialize_shared_memory, shared_size, chunk_size); + if (unlikely(dev->physical_device->instance->debug_flags & TU_DEBUG_IR3)) { + fprintf(stderr, "disassembled ir3:\n"); + fprintf(stderr, "shader: %s\n", + gl_shader_stage_name(shader->ir3_shader.type)); + ir3_shader_disasm(&shader->variants[0], shader->binary, stderr); + + if (shader->has_binning_pass) { + fprintf(stderr, "disassembled ir3:\n"); + fprintf(stderr, "shader: %s (binning)\n", + gl_shader_stage_name(shader->ir3_shader.type)); + ir3_shader_disasm(&shader->variants[1], shader->binning_binary, + stderr); } - - const struct nir_lower_compute_system_values_options compute_sysval_options = { - .has_base_workgroup_id = true, - }; - NIR_PASS_V(nir, nir_lower_compute_system_values, &compute_sysval_options); } - nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, nir->info.stage); - nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, nir->info.stage); + return VK_SUCCESS; +} - /* Gather information for transform feedback. This should be called after: - * - nir_split_per_member_structs. - * - nir_remove_dead_variables with varyings, so that we could align - * stream outputs correctly. - * - nir_assign_io_var_locations - to have valid driver_location - */ - struct ir3_stream_output_info so_info = {}; - if (nir->info.stage == MESA_SHADER_VERTEX || - nir->info.stage == MESA_SHADER_TESS_EVAL || - nir->info.stage == MESA_SHADER_GEOMETRY) - tu_gather_xfb_info(nir, &so_info); +VkResult +tu_CreateShaderModule(VkDevice _device, + const VkShaderModuleCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkShaderModule *pShaderModule) +{ + TU_FROM_HANDLE(tu_device, device, _device); + struct tu_shader_module *module; - NIR_PASS_V(nir, tu_lower_io, dev, shader, layout); + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO); + assert(pCreateInfo->flags == 0); + assert(pCreateInfo->codeSize % 4 == 0); - nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); + module = vk_alloc2(&device->alloc, pAllocator, + sizeof(*module) + pCreateInfo->codeSize, 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (module == NULL) + return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - ir3_finalize_nir(dev->compiler, nir); + module->code_size = pCreateInfo->codeSize; + memcpy(module->code, pCreateInfo->pCode, pCreateInfo->codeSize); - bool shared_consts_enable = tu6_shared_constants_enable(layout, dev->compiler); - if (shared_consts_enable) - assert(!shader->const_state.push_consts.dwords); + _mesa_sha1_compute(module->code, module->code_size, module->sha1); - shader->ir3_shader = - ir3_shader_from_nir(dev->compiler, nir, &(struct ir3_shader_options) { - .reserved_user_consts = shader->reserved_user_consts_vec4, - .shared_consts_enable = shared_consts_enable, - .api_wavesize = key->api_wavesize, - .real_wavesize = key->real_wavesize, - }, &so_info); + *pShaderModule = tu_shader_module_to_handle(module); - return shader; + return VK_SUCCESS; } void -tu_shader_destroy(struct tu_device *dev, - struct tu_shader *shader, - const VkAllocationCallbacks *alloc) +tu_DestroyShaderModule(VkDevice _device, + VkShaderModule _module, + const VkAllocationCallbacks *pAllocator) { - ir3_shader_destroy(shader->ir3_shader); + TU_FROM_HANDLE(tu_device, device, _device); + TU_FROM_HANDLE(tu_shader_module, module, _module); + + if (!module) + return; - vk_free2(&dev->vk.alloc, alloc, shader); + 
vk_free2(&device->alloc, pAllocator, module); } diff --git a/lib/mesa/src/freedreno/vulkan/tu_util.c b/lib/mesa/src/freedreno/vulkan/tu_util.c index 9b0b9a420..e630460fb 100644 --- a/lib/mesa/src/freedreno/vulkan/tu_util.c +++ b/lib/mesa/src/freedreno/vulkan/tu_util.c @@ -1,21 +1,79 @@ /* * Copyright © 2015 Intel Corporation - * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. */ -#include "tu_util.h" +#include "tu_private.h" +#include <assert.h> #include <errno.h> #include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> #include "util/u_math.h" -#include "util/timespec.h" #include "vk_enum_to_str.h" -#include "tu_device.h" -#include "tu_pass.h" +/* TODO: Add Android support to tu_log funcs */ -void PRINTFLIKE(3, 4) +/** Log an error message. */ +void tu_printflike(1, 2) tu_loge(const char *format, ...) +{ + va_list va; + + va_start(va, format); + tu_loge_v(format, va); + va_end(va); +} + +/** \see tu_loge() */ +void +tu_loge_v(const char *format, va_list va) +{ + fprintf(stderr, "vk: error: "); + vfprintf(stderr, format, va); + fprintf(stderr, "\n"); +} + +/** Log an error message. */ +void tu_printflike(1, 2) tu_logi(const char *format, ...) +{ + va_list va; + + va_start(va, format); + tu_logi_v(format, va); + va_end(va); +} + +/** \see tu_logi() */ +void +tu_logi_v(const char *format, va_list va) +{ + fprintf(stderr, "tu: info: "); + vfprintf(stderr, format, va); + fprintf(stderr, "\n"); +} + +void tu_printflike(3, 4) __tu_finishme(const char *file, int line, const char *format, ...) { va_list ap; @@ -25,17 +83,16 @@ void PRINTFLIKE(3, 4) vsnprintf(buffer, sizeof(buffer), format, ap); va_end(ap); - mesa_loge("%s:%d: FINISHME: %s\n", file, line, buffer); + fprintf(stderr, "%s:%d: FINISHME: %s\n", file, line, buffer); } VkResult -__vk_startup_errorf(struct tu_instance *instance, - VkResult error, - bool always_print, - const char *file, - int line, - const char *format, - ...) +__vk_errorf(struct tu_instance *instance, + VkResult error, + const char *file, + int line, + const char *format, + ...) 
{ va_list ap; char buffer[256]; @@ -43,8 +100,7 @@ __vk_startup_errorf(struct tu_instance *instance, const char *error_str = vk_Result_to_str(error); #ifndef DEBUG - if (!always_print) - return error; + return error; #endif if (format) { @@ -52,236 +108,10 @@ __vk_startup_errorf(struct tu_instance *instance, vsnprintf(buffer, sizeof(buffer), format, ap); va_end(ap); - mesa_loge("%s:%d: %s (%s)\n", file, line, buffer, error_str); + fprintf(stderr, "%s:%d: %s (%s)\n", file, line, buffer, error_str); } else { - mesa_loge("%s:%d: %s\n", file, line, error_str); + fprintf(stderr, "%s:%d: %s\n", file, line, error_str); } return error; } - -static void -tu_tiling_config_update_tile_layout(struct tu_framebuffer *fb, - const struct tu_device *dev, - const struct tu_render_pass *pass, - enum tu_gmem_layout gmem_layout) -{ - const uint32_t tile_align_w = pass->tile_align_w; - const uint32_t tile_align_h = dev->physical_device->info->tile_align_h; - const uint32_t max_tile_width = dev->physical_device->info->tile_max_w; - const uint32_t max_tile_height = dev->physical_device->info->tile_max_h; - struct tu_tiling_config *tiling = &fb->tiling[gmem_layout]; - - /* start from 1 tile */ - tiling->tile_count = (VkExtent2D) { - .width = 1, - .height = 1, - }; - tiling->tile0 = (VkExtent2D) { - .width = util_align_npot(fb->width, tile_align_w), - .height = align(fb->height, tile_align_h), - }; - - /* will force to sysmem, don't bother trying to have a valid tile config - * TODO: just skip all GMEM stuff when sysmem is forced? - */ - if (!pass->gmem_pixels[gmem_layout]) - return; - - if (unlikely(dev->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN)) { - /* start with 2x2 tiles */ - tiling->tile_count.width = 2; - tiling->tile_count.height = 2; - tiling->tile0.width = util_align_npot(DIV_ROUND_UP(fb->width, 2), tile_align_w); - tiling->tile0.height = align(DIV_ROUND_UP(fb->height, 2), tile_align_h); - } - - /* do not exceed max tile width */ - while (tiling->tile0.width > max_tile_width) { - tiling->tile_count.width++; - tiling->tile0.width = - util_align_npot(DIV_ROUND_UP(fb->width, tiling->tile_count.width), tile_align_w); - } - - /* do not exceed max tile height */ - while (tiling->tile0.height > max_tile_height) { - tiling->tile_count.height++; - tiling->tile0.height = - util_align_npot(DIV_ROUND_UP(fb->height, tiling->tile_count.height), tile_align_h); - } - - /* do not exceed gmem size */ - while (tiling->tile0.width * tiling->tile0.height > pass->gmem_pixels[gmem_layout]) { - if (tiling->tile0.width > MAX2(tile_align_w, tiling->tile0.height)) { - tiling->tile_count.width++; - tiling->tile0.width = - util_align_npot(DIV_ROUND_UP(fb->width, tiling->tile_count.width), tile_align_w); - } else { - /* if this assert fails then layout is impossible.. 
*/ - assert(tiling->tile0.height > tile_align_h); - tiling->tile_count.height++; - tiling->tile0.height = - align(DIV_ROUND_UP(fb->height, tiling->tile_count.height), tile_align_h); - } - } -} - -static void -tu_tiling_config_update_pipe_layout(struct tu_tiling_config *tiling, - const struct tu_device *dev) -{ - const uint32_t max_pipe_count = 32; /* A6xx */ - - /* start from 1 tile per pipe */ - tiling->pipe0 = (VkExtent2D) { - .width = 1, - .height = 1, - }; - tiling->pipe_count = tiling->tile_count; - - while (tiling->pipe_count.width * tiling->pipe_count.height > max_pipe_count) { - if (tiling->pipe0.width < tiling->pipe0.height) { - tiling->pipe0.width += 1; - tiling->pipe_count.width = - DIV_ROUND_UP(tiling->tile_count.width, tiling->pipe0.width); - } else { - tiling->pipe0.height += 1; - tiling->pipe_count.height = - DIV_ROUND_UP(tiling->tile_count.height, tiling->pipe0.height); - } - } -} - -static void -tu_tiling_config_update_pipes(struct tu_tiling_config *tiling, - const struct tu_device *dev) -{ - const uint32_t max_pipe_count = 32; /* A6xx */ - const uint32_t used_pipe_count = - tiling->pipe_count.width * tiling->pipe_count.height; - const VkExtent2D last_pipe = { - .width = (tiling->tile_count.width - 1) % tiling->pipe0.width + 1, - .height = (tiling->tile_count.height - 1) % tiling->pipe0.height + 1, - }; - - assert(used_pipe_count <= max_pipe_count); - assert(max_pipe_count <= ARRAY_SIZE(tiling->pipe_config)); - - for (uint32_t y = 0; y < tiling->pipe_count.height; y++) { - for (uint32_t x = 0; x < tiling->pipe_count.width; x++) { - const uint32_t pipe_x = tiling->pipe0.width * x; - const uint32_t pipe_y = tiling->pipe0.height * y; - const uint32_t pipe_w = (x == tiling->pipe_count.width - 1) - ? last_pipe.width - : tiling->pipe0.width; - const uint32_t pipe_h = (y == tiling->pipe_count.height - 1) - ? last_pipe.height - : tiling->pipe0.height; - const uint32_t n = tiling->pipe_count.width * y + x; - - tiling->pipe_config[n] = A6XX_VSC_PIPE_CONFIG_REG_X(pipe_x) | - A6XX_VSC_PIPE_CONFIG_REG_Y(pipe_y) | - A6XX_VSC_PIPE_CONFIG_REG_W(pipe_w) | - A6XX_VSC_PIPE_CONFIG_REG_H(pipe_h); - tiling->pipe_sizes[n] = CP_SET_BIN_DATA5_0_VSC_SIZE(pipe_w * pipe_h); - } - } - - memset(tiling->pipe_config + used_pipe_count, 0, - sizeof(uint32_t) * (max_pipe_count - used_pipe_count)); -} - -static bool -is_hw_binning_possible(const struct tu_tiling_config *tiling) -{ - /* Similar to older gens, # of tiles per pipe cannot be more than 32. - * But there are no hangs with 16 or more tiles per pipe in either - * X or Y direction, so that limit does not seem to apply. 
- */
-   uint32_t tiles_per_pipe = tiling->pipe0.width * tiling->pipe0.height;
-   return tiles_per_pipe <= 32;
-}
-
-static void
-tu_tiling_config_update_binning(struct tu_tiling_config *tiling, const struct tu_device *device)
-{
-   tiling->binning_possible = is_hw_binning_possible(tiling);
-
-   if (tiling->binning_possible) {
-      tiling->binning = (tiling->tile_count.width * tiling->tile_count.height) > 2;
-
-      if (unlikely(device->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN))
-         tiling->binning = true;
-      if (unlikely(device->physical_device->instance->debug_flags &
-                   TU_DEBUG_NOBIN))
-         tiling->binning = false;
-   } else {
-      tiling->binning = false;
-   }
-}
-
-void
-tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
-                             const struct tu_device *device,
-                             const struct tu_render_pass *pass)
-{
-   for (int gmem_layout = 0; gmem_layout < TU_GMEM_LAYOUT_COUNT; gmem_layout++) {
-      struct tu_tiling_config *tiling = &fb->tiling[gmem_layout];
-      tu_tiling_config_update_tile_layout(fb, device, pass, gmem_layout);
-      tu_tiling_config_update_pipe_layout(tiling, device);
-      tu_tiling_config_update_pipes(tiling, device);
-      tu_tiling_config_update_binning(tiling, device);
-   }
-}
-
-void
-tu_dbg_log_gmem_load_store_skips(struct tu_device *device)
-{
-   static uint32_t last_skipped_loads = 0;
-   static uint32_t last_skipped_stores = 0;
-   static uint32_t last_total_loads = 0;
-   static uint32_t last_total_stores = 0;
-   static struct timespec last_time = {};
-
-   pthread_mutex_lock(&device->submit_mutex);
-
-   struct timespec current_time;
-   clock_gettime(CLOCK_MONOTONIC, &current_time);
-
-   if (timespec_sub_to_nsec(&current_time, &last_time) > 1000 * 1000 * 1000) {
-      last_time = current_time;
-   } else {
-      pthread_mutex_unlock(&device->submit_mutex);
-      return;
-   }
-
-   struct tu6_global *global = device->global_bo->map;
-
-   uint32_t current_taken_loads = global->dbg_gmem_taken_loads;
-   uint32_t current_taken_stores = global->dbg_gmem_taken_stores;
-   uint32_t current_total_loads = global->dbg_gmem_total_loads;
-   uint32_t current_total_stores = global->dbg_gmem_total_stores;
-
-   uint32_t skipped_loads = current_total_loads - current_taken_loads;
-   uint32_t skipped_stores = current_total_stores - current_taken_stores;
-
-   uint32_t current_time_frame_skipped_loads = skipped_loads - last_skipped_loads;
-   uint32_t current_time_frame_skipped_stores = skipped_stores - last_skipped_stores;
-
-   uint32_t current_time_frame_total_loads = current_total_loads - last_total_loads;
-   uint32_t current_time_frame_total_stores = current_total_stores - last_total_stores;
-
-   mesa_logi("[GMEM] loads total: %u skipped: %.1f%%\n",
-             current_time_frame_total_loads,
-             current_time_frame_skipped_loads / (float) current_time_frame_total_loads * 100.f);
-   mesa_logi("[GMEM] stores total: %u skipped: %.1f%%\n",
-             current_time_frame_total_stores,
-             current_time_frame_skipped_stores / (float) current_time_frame_total_stores * 100.f);
-
-   last_skipped_loads = skipped_loads;
-   last_skipped_stores = skipped_stores;
-   last_total_loads = current_total_loads;
-   last_total_stores = current_total_stores;
-
-   pthread_mutex_unlock(&device->submit_mutex);
-}
diff --git a/lib/mesa/src/freedreno/vulkan/tu_wsi.c b/lib/mesa/src/freedreno/vulkan/tu_wsi.c
index cf09cf9b6..21466108b 100644
--- a/lib/mesa/src/freedreno/vulkan/tu_wsi.c
+++ b/lib/mesa/src/freedreno/vulkan/tu_wsi.c
@@ -1,62 +1,272 @@
 /*
  * Copyright © 2016 Red Hat
- * SPDX-License-Identifier: MIT
- *
  * based on intel anv code:
  * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of 
charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. */ -#include "tu_wsi.h" +#include "tu_private.h" #include "vk_util.h" -#include "wsi_common_drm.h" -#include "drm-uapi/drm_fourcc.h" - -#include "tu_device.h" +#include "wsi_common.h" -static VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL +static PFN_vkVoidFunction tu_wsi_proc_addr(VkPhysicalDevice physicalDevice, const char *pName) { - TU_FROM_HANDLE(tu_physical_device, pdevice, physicalDevice); - return vk_instance_get_proc_addr_unchecked(&pdevice->instance->vk, pName); + return tu_lookup_entrypoint_unchecked(pName); } -static bool -tu_wsi_can_present_on_device(VkPhysicalDevice physicalDevice, int fd) +VkResult +tu_wsi_init(struct tu_physical_device *physical_device) { - TU_FROM_HANDLE(tu_physical_device, pdevice, physicalDevice); + return wsi_device_init(&physical_device->wsi_device, + tu_physical_device_to_handle(physical_device), + tu_wsi_proc_addr, &physical_device->instance->alloc, + physical_device->master_fd, NULL); +} - return wsi_common_drm_devices_equal(fd, pdevice->local_fd); +void +tu_wsi_finish(struct tu_physical_device *physical_device) +{ + wsi_device_finish(&physical_device->wsi_device, + &physical_device->instance->alloc); +} + +void +tu_DestroySurfaceKHR(VkInstance _instance, + VkSurfaceKHR _surface, + const VkAllocationCallbacks *pAllocator) +{ + TU_FROM_HANDLE(tu_instance, instance, _instance); + ICD_FROM_HANDLE(VkIcdSurfaceBase, surface, _surface); + + vk_free2(&instance->alloc, pAllocator, surface); } VkResult -tu_wsi_init(struct tu_physical_device *physical_device) +tu_GetPhysicalDeviceSurfaceSupportKHR(VkPhysicalDevice physicalDevice, + uint32_t queueFamilyIndex, + VkSurfaceKHR surface, + VkBool32 *pSupported) { - VkResult result; + TU_FROM_HANDLE(tu_physical_device, device, physicalDevice); - result = wsi_device_init(&physical_device->wsi_device, - tu_physical_device_to_handle(physical_device), - tu_wsi_proc_addr, - &physical_device->instance->vk.alloc, - physical_device->master_fd, - &physical_device->instance->dri_options, - false); - if (result != VK_SUCCESS) - return result; + return wsi_common_get_surface_support( + &device->wsi_device, queueFamilyIndex, surface, pSupported); +} - physical_device->wsi_device.supports_modifiers = true; - physical_device->wsi_device.can_present_on_device = - tu_wsi_can_present_on_device; +VkResult +tu_GetPhysicalDeviceSurfaceCapabilitiesKHR( + VkPhysicalDevice physicalDevice, + VkSurfaceKHR surface, + VkSurfaceCapabilitiesKHR *pSurfaceCapabilities) +{ + 
TU_FROM_HANDLE(tu_physical_device, device, physicalDevice); - physical_device->vk.wsi_device = &physical_device->wsi_device; + return wsi_common_get_surface_capabilities(&device->wsi_device, surface, + pSurfaceCapabilities); +} - return VK_SUCCESS; +VkResult +tu_GetPhysicalDeviceSurfaceCapabilities2KHR( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceSurfaceInfo2KHR *pSurfaceInfo, + VkSurfaceCapabilities2KHR *pSurfaceCapabilities) +{ + TU_FROM_HANDLE(tu_physical_device, device, physicalDevice); + + return wsi_common_get_surface_capabilities2( + &device->wsi_device, pSurfaceInfo, pSurfaceCapabilities); +} + +VkResult +tu_GetPhysicalDeviceSurfaceCapabilities2EXT( + VkPhysicalDevice physicalDevice, + VkSurfaceKHR surface, + VkSurfaceCapabilities2EXT *pSurfaceCapabilities) +{ + TU_FROM_HANDLE(tu_physical_device, device, physicalDevice); + + return wsi_common_get_surface_capabilities2ext( + &device->wsi_device, surface, pSurfaceCapabilities); +} + +VkResult +tu_GetPhysicalDeviceSurfaceFormatsKHR(VkPhysicalDevice physicalDevice, + VkSurfaceKHR surface, + uint32_t *pSurfaceFormatCount, + VkSurfaceFormatKHR *pSurfaceFormats) +{ + TU_FROM_HANDLE(tu_physical_device, device, physicalDevice); + + return wsi_common_get_surface_formats( + &device->wsi_device, surface, pSurfaceFormatCount, pSurfaceFormats); +} + +VkResult +tu_GetPhysicalDeviceSurfaceFormats2KHR( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceSurfaceInfo2KHR *pSurfaceInfo, + uint32_t *pSurfaceFormatCount, + VkSurfaceFormat2KHR *pSurfaceFormats) +{ + TU_FROM_HANDLE(tu_physical_device, device, physicalDevice); + + return wsi_common_get_surface_formats2(&device->wsi_device, pSurfaceInfo, + pSurfaceFormatCount, + pSurfaceFormats); +} + +VkResult +tu_GetPhysicalDeviceSurfacePresentModesKHR(VkPhysicalDevice physicalDevice, + VkSurfaceKHR surface, + uint32_t *pPresentModeCount, + VkPresentModeKHR *pPresentModes) +{ + TU_FROM_HANDLE(tu_physical_device, device, physicalDevice); + + return wsi_common_get_surface_present_modes( + &device->wsi_device, surface, pPresentModeCount, pPresentModes); +} + +VkResult +tu_CreateSwapchainKHR(VkDevice _device, + const VkSwapchainCreateInfoKHR *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkSwapchainKHR *pSwapchain) +{ + TU_FROM_HANDLE(tu_device, device, _device); + const VkAllocationCallbacks *alloc; + if (pAllocator) + alloc = pAllocator; + else + alloc = &device->alloc; + + return wsi_common_create_swapchain(&device->physical_device->wsi_device, + tu_device_to_handle(device), + pCreateInfo, alloc, pSwapchain); } void -tu_wsi_finish(struct tu_physical_device *physical_device) +tu_DestroySwapchainKHR(VkDevice _device, + VkSwapchainKHR swapchain, + const VkAllocationCallbacks *pAllocator) { - physical_device->vk.wsi_device = NULL; - wsi_device_finish(&physical_device->wsi_device, - &physical_device->instance->vk.alloc); + TU_FROM_HANDLE(tu_device, device, _device); + const VkAllocationCallbacks *alloc; + + if (pAllocator) + alloc = pAllocator; + else + alloc = &device->alloc; + + wsi_common_destroy_swapchain(_device, swapchain, alloc); +} + +VkResult +tu_GetSwapchainImagesKHR(VkDevice device, + VkSwapchainKHR swapchain, + uint32_t *pSwapchainImageCount, + VkImage *pSwapchainImages) +{ + return wsi_common_get_images(swapchain, pSwapchainImageCount, + pSwapchainImages); +} + +VkResult +tu_AcquireNextImageKHR(VkDevice device, + VkSwapchainKHR swapchain, + uint64_t timeout, + VkSemaphore semaphore, + VkFence fence, + uint32_t *pImageIndex) +{ + VkAcquireNextImageInfoKHR 
acquire_info = { + .sType = VK_STRUCTURE_TYPE_ACQUIRE_NEXT_IMAGE_INFO_KHR, + .swapchain = swapchain, + .timeout = timeout, + .semaphore = semaphore, + .fence = fence, + .deviceMask = 0, + }; + + return tu_AcquireNextImage2KHR(device, &acquire_info, pImageIndex); +} + +VkResult +tu_AcquireNextImage2KHR(VkDevice _device, + const VkAcquireNextImageInfoKHR *pAcquireInfo, + uint32_t *pImageIndex) +{ + TU_FROM_HANDLE(tu_device, device, _device); + struct tu_physical_device *pdevice = device->physical_device; + + VkResult result = wsi_common_acquire_next_image2( + &pdevice->wsi_device, _device, pAcquireInfo, pImageIndex); + + /* TODO signal fence and semaphore */ + + return result; +} + +VkResult +tu_QueuePresentKHR(VkQueue _queue, const VkPresentInfoKHR *pPresentInfo) +{ + TU_FROM_HANDLE(tu_queue, queue, _queue); + return wsi_common_queue_present( + &queue->device->physical_device->wsi_device, + tu_device_to_handle(queue->device), _queue, queue->queue_family_index, + pPresentInfo); +} + +VkResult +tu_GetDeviceGroupPresentCapabilitiesKHR( + VkDevice device, VkDeviceGroupPresentCapabilitiesKHR *pCapabilities) +{ + memset(pCapabilities->presentMask, 0, sizeof(pCapabilities->presentMask)); + pCapabilities->presentMask[0] = 0x1; + pCapabilities->modes = VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_BIT_KHR; + + return VK_SUCCESS; +} + +VkResult +tu_GetDeviceGroupSurfacePresentModesKHR( + VkDevice device, + VkSurfaceKHR surface, + VkDeviceGroupPresentModeFlagsKHR *pModes) +{ + *pModes = VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_BIT_KHR; + + return VK_SUCCESS; +} + +VkResult +tu_GetPhysicalDevicePresentRectanglesKHR(VkPhysicalDevice physicalDevice, + VkSurfaceKHR surface, + uint32_t *pRectCount, + VkRect2D *pRects) +{ + TU_FROM_HANDLE(tu_physical_device, device, physicalDevice); + + return wsi_common_get_present_rectangles(&device->wsi_device, surface, + pRectCount, pRects); } |
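For context on the tu_wsi.c hunk above: tu_AcquireNextImageKHR wraps its arguments in a VkAcquireNextImageInfoKHR and forwards to tu_AcquireNextImage2KHR, and tu_QueuePresentKHR forwards to wsi_common_queue_present. A loader would typically reach these driver entry points through the standard vkAcquireNextImageKHR/vkQueuePresentKHR calls. A minimal application-side sketch follows; present_one_frame is a hypothetical helper, and the device, swapchain, queue, and semaphores are assumed to have been created elsewhere.

/* Hypothetical helper; not part of the driver sources above. */
#include <vulkan/vulkan.h>

VkResult
present_one_frame(VkDevice device, VkSwapchainKHR swapchain, VkQueue queue,
                  VkSemaphore image_available, VkSemaphore render_done)
{
   uint32_t image_index;

   /* Dispatches to the driver's tu_AcquireNextImageKHR, which builds a
    * VkAcquireNextImageInfoKHR and calls tu_AcquireNextImage2KHR. */
   VkResult result = vkAcquireNextImageKHR(device, swapchain, UINT64_MAX,
                                           image_available, VK_NULL_HANDLE,
                                           &image_index);
   if (result != VK_SUCCESS && result != VK_SUBOPTIMAL_KHR)
      return result;

   /* ... record and submit rendering that waits on image_available and
    * signals render_done ... */

   /* Dispatches to tu_QueuePresentKHR, which hands the request to
    * wsi_common_queue_present. */
   const VkPresentInfoKHR present_info = {
      .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR,
      .waitSemaphoreCount = 1,
      .pWaitSemaphores = &render_done,
      .swapchainCount = 1,
      .pSwapchains = &swapchain,
      .pImageIndices = &image_index,
   };
   return vkQueuePresentKHR(queue, &present_info);
}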