author    Jonathan Gray <jsg@cvs.openbsd.org>    2024-04-02 09:30:07 +0000
committer Jonathan Gray <jsg@cvs.openbsd.org>    2024-04-02 09:30:07 +0000
commit    f54e142455cb3c9d1662dae7e096a32a47e5409b (patch)
tree      440ecd46269f0eac25e349e1ed58f246490c5e26 /lib/mesa/src/broadcom/vulkan
parent    36d8503c27530f68d655d3ef77a6eaa4dfd8ad65 (diff)
Import Mesa 23.3.6
Diffstat (limited to 'lib/mesa/src/broadcom/vulkan')
29 files changed, 3111 insertions, 1866 deletions
diff --git a/lib/mesa/src/broadcom/vulkan/meson.build b/lib/mesa/src/broadcom/vulkan/meson.build
index c3595cf73..3605e0965 100644
--- a/lib/mesa/src/broadcom/vulkan/meson.build
+++ b/lib/mesa/src/broadcom/vulkan/meson.build
@@ -25,7 +25,9 @@ v3dv_entrypoints = custom_target(
  command : [
    prog_python, '@INPUT0@', '--xml', '@INPUT1@',
    '--proto', '--weak', '--out-h', '@OUTPUT0@', '--out-c', '@OUTPUT1@', '--prefix', 'v3dv',
+   '--beta', with_vulkan_beta.to_string(),
    '--device-prefix', 'ver42',
+   '--device-prefix', 'ver71',
  ],
  depend_files : vk_entrypoints_gen_depend_files,
)
@@ -63,13 +65,11 @@ files_per_version = files(
  'v3dvx_pipeline.c',
  'v3dvx_meta_common.c',
  'v3dvx_pipeline.c',
+ 'v3dvx_query.c',
  'v3dvx_queue.c',
)

-# The vulkan driver only supports version >= 42, which is the version present in
-# Rpi4. We need to explicitly set it as we are reusing pieces from the GL v3d
-# driver.
-v3d_versions = ['42']
+v3d_versions = ['42', '71']

 v3dv_flags = []
@@ -100,7 +100,7 @@ if with_platform_wayland
 endif

 if with_platform_android
-  v3dv_deps += dep_android
+  v3dv_deps += [dep_android, idep_u_gralloc]
   v3dv_flags += '-DVK_USE_PLATFORM_ANDROID_KHR'
   libv3dv_files += files('v3dv_android.c')
 endif
@@ -112,7 +112,7 @@ foreach ver : v3d_versions
    [files_per_version, v3d_xml_pack, v3dv_entrypoints[0]],
    include_directories : [
      inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_broadcom,
-     inc_compiler, inc_util,
+     inc_util,
    ],
    c_args : [v3dv_flags, '-DV3D_VERSION=' + ver],
    gnu_symbol_visibility : 'hidden',
@@ -124,7 +124,7 @@ libvulkan_broadcom = shared_library(
  'vulkan_broadcom',
  [libv3dv_files, v3dv_entrypoints, sha1_h],
  include_directories : [
-   inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom, inc_compiler, inc_util,
+   inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom, inc_util,
  ],
  link_with : [
    libbroadcom_cle,
@@ -165,6 +165,7 @@ broadcom_icd = custom_target(
  ],
  build_by_default : true,
  install_dir : with_vulkan_icd_dir,
+ install_tag : 'runtime',
  install : true,
)
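The switch to v3d_versions = ['42', '71'] means the per-version sources now get compiled once per hardware generation, each object with its own -DV3D_VERSION, and the driver selects implementations at runtime. A minimal sketch of that dispatch pattern, with hypothetical function names (the real driver uses its V3DX()/v3dv_X() macros rather than hand-written switches):

#include <assert.h>

/* Each per-version object is built with a different -DV3D_VERSION, so the
 * same source expands to ver42_* or ver71_* symbols; a runtime switch on
 * devinfo.ver then picks the right entry point. Names are illustrative. */
struct v3d_device_info { int ver; };

void ver42_emit_viewport(struct v3d_device_info *devinfo);
void ver71_emit_viewport(struct v3d_device_info *devinfo);

static void
emit_viewport(struct v3d_device_info *devinfo)
{
   switch (devinfo->ver) {
   case 42:
      ver42_emit_viewport(devinfo);
      break;
   case 71:
      ver71_emit_viewport(devinfo);
      break;
   default:
      assert(!"unsupported V3D version");
   }
}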
diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_android.c b/lib/mesa/src/broadcom/vulkan/v3dv_android.c
index d217aaf11..6c49e5d71 100644
--- a/lib/mesa/src/broadcom/vulkan/v3dv_android.c
+++ b/lib/mesa/src/broadcom/vulkan/v3dv_android.c
@@ -35,6 +35,9 @@
 #include <vulkan/vk_android_native_buffer.h>
 #include <vulkan/vk_icd.h>

+#include "vk_android.h"
+#include "vulkan/util/vk_enum_defines.h"
+
 #include "util/libsync.h"
 #include "util/log.h"
 #include "util/os_file.h"
@@ -112,117 +115,55 @@ v3dv_hal_close(struct hw_device_t *dev)
    return -1;
 }

-static int
-get_format_bpp(int native)
-{
-   int bpp;
-
-   switch (native) {
-   case HAL_PIXEL_FORMAT_RGBA_FP16:
-      bpp = 8;
-      break;
-   case HAL_PIXEL_FORMAT_RGBA_8888:
-   case HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED:
-   case HAL_PIXEL_FORMAT_RGBX_8888:
-   case HAL_PIXEL_FORMAT_BGRA_8888:
-   case HAL_PIXEL_FORMAT_RGBA_1010102:
-      bpp = 4;
-      break;
-   case HAL_PIXEL_FORMAT_RGB_565:
-      bpp = 2;
-      break;
-   default:
-      bpp = 0;
-      break;
-   }
-
-   return bpp;
-}
-
-/* get buffer info from VkNativeBufferANDROID */
-static VkResult
-v3dv_gralloc_info_other(struct v3dv_device *device,
-                        const VkNativeBufferANDROID *native_buffer,
-                        int *out_stride,
-                        uint64_t *out_modifier)
-{
-   *out_stride = native_buffer->stride /*in pixels*/ *
-                 get_format_bpp(native_buffer->format);
-   *out_modifier = DRM_FORMAT_MOD_LINEAR;
-   return VK_SUCCESS;
-}
-
-static const char cros_gralloc_module_name[] = "CrOS Gralloc";
-
-#define CROS_GRALLOC_DRM_GET_BUFFER_INFO 4
-
-struct cros_gralloc0_buffer_info
-{
-   uint32_t drm_fourcc;
-   int num_fds;
-   int fds[4];
-   uint64_t modifier;
-   int offset[4];
-   int stride[4];
-};
-
-static VkResult
-v3dv_gralloc_info_cros(struct v3dv_device *device,
-                       const VkNativeBufferANDROID *native_buffer,
-                       int *out_stride,
-                       uint64_t *out_modifier)
+VkResult
+v3dv_gralloc_to_drm_explicit_layout(struct u_gralloc *gralloc,
+                                    struct u_gralloc_buffer_handle *in_hnd,
+                                    VkImageDrmFormatModifierExplicitCreateInfoEXT *out,
+                                    VkSubresourceLayout *out_layouts,
+                                    int max_planes)
 {
-   const gralloc_module_t *gralloc = device->gralloc;
-   struct cros_gralloc0_buffer_info info;
-   int ret;
+   struct u_gralloc_buffer_basic_info info;

-   ret = gralloc->perform(gralloc, CROS_GRALLOC_DRM_GET_BUFFER_INFO,
-                          native_buffer->handle, &info);
-   if (ret)
+   if (u_gralloc_get_buffer_basic_info(gralloc, in_hnd, &info) != 0)
       return VK_ERROR_INVALID_EXTERNAL_HANDLE;

-   *out_stride = info.stride[0];
-   *out_modifier = info.modifier;
+   if (info.num_planes > max_planes)
+      return VK_ERROR_INVALID_EXTERNAL_HANDLE;

-   return VK_SUCCESS;
-}
+   bool is_disjoint = false;
+   for (int i = 1; i < info.num_planes; i++) {
+      if (info.offsets[i] == 0) {
+         is_disjoint = true;
+         break;
+      }
+   }

-VkResult
-v3dv_gralloc_info(struct v3dv_device *device,
-                  const VkNativeBufferANDROID *native_buffer,
-                  int *out_dmabuf,
-                  int *out_stride,
-                  int *out_size,
-                  uint64_t *out_modifier)
-{
-   if (device->gralloc_type == V3DV_GRALLOC_UNKNOWN) {
-      /* get gralloc module for gralloc buffer info query */
-      int err = hw_get_module(GRALLOC_HARDWARE_MODULE_ID,
-                              (const hw_module_t **) &device->gralloc);
+   if (is_disjoint) {
+      /* We don't support disjoint planes yet */
+      return VK_ERROR_INVALID_EXTERNAL_HANDLE;
+   }

-      device->gralloc_type = V3DV_GRALLOC_OTHER;
+   memset(out_layouts, 0, sizeof(*out_layouts) * info.num_planes);
+   memset(out, 0, sizeof(*out));

-      if (err == 0) {
-         const gralloc_module_t *gralloc = device->gralloc;
-         mesa_logi("opened gralloc module name: %s", gralloc->common.name);
+   out->sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT;
+   out->pPlaneLayouts = out_layouts;

-         if (strcmp(gralloc->common.name, cros_gralloc_module_name) == 0 &&
-             gralloc->perform) {
-            device->gralloc_type = V3DV_GRALLOC_CROS;
-         }
-      }
+   out->drmFormatModifier = info.modifier;
+   out->drmFormatModifierPlaneCount = info.num_planes;
+   for (int i = 0; i < info.num_planes; i++) {
+      out_layouts[i].offset = info.offsets[i];
+      out_layouts[i].rowPitch = info.strides[i];
    }

-   *out_dmabuf = native_buffer->handle->data[0];
-   *out_size = lseek(*out_dmabuf, 0, SEEK_END);
-
-   if (device->gralloc_type == V3DV_GRALLOC_CROS) {
-      return v3dv_gralloc_info_cros(device, native_buffer, out_stride,
-                                    out_modifier);
-   } else {
-      return v3dv_gralloc_info_other(device, native_buffer, out_stride,
-                                     out_modifier);
+   if (info.drm_fourcc == DRM_FORMAT_YVU420) {
+      /* Swap the U and V planes to match the VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM */
+      VkSubresourceLayout tmp = out_layouts[1];
+      out_layouts[1] = out_layouts[2];
+      out_layouts[2] = tmp;
    }
+
+   return VK_SUCCESS;
 }

 VkResult
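The new v3dv_gralloc_to_drm_explicit_layout() replaces the CrOS-specific perform() query with util's generic u_gralloc and hands back the explicit DRM-modifier layout Vulkan expects. A hedged usage sketch of how a caller might chain the result into image creation; the wrapper function is illustrative and the include path is assumed, only the translated helper and the u_gralloc types come from this import:

#include <vulkan/vulkan.h>
#include "util/u_gralloc/u_gralloc.h"   /* assumed path for u_gralloc types */

/* Sketch: build a VkImage for an existing gralloc buffer by translating its
 * layout into VkImageDrmFormatModifierExplicitCreateInfoEXT. Error handling
 * beyond the translation step is elided. */
static VkResult
create_image_for_gralloc_buffer(VkDevice dev, struct u_gralloc *gralloc,
                                struct u_gralloc_buffer_handle *gr_handle,
                                VkImageCreateInfo *base_info, VkImage *out_image)
{
   VkSubresourceLayout layouts[4];
   VkImageDrmFormatModifierExplicitCreateInfoEXT mod_info;

   VkResult result =
      v3dv_gralloc_to_drm_explicit_layout(gralloc, gr_handle, &mod_info,
                                          layouts, 4);
   if (result != VK_SUCCESS)
      return result;

   base_info->pNext = &mod_info;   /* assumes no other pNext chain entries */
   base_info->tiling = VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT;
   return vkCreateImage(dev, base_info, NULL, out_image);
}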
@@ -231,11 +172,8 @@ v3dv_import_native_buffer_fd(VkDevice device_h,
                              const VkAllocationCallbacks *alloc,
                              VkImage image_h)
 {
-   struct v3dv_image *image = NULL;
    VkResult result;

-   image = v3dv_image_from_handle(image_h);
-
    VkDeviceMemory memory_h;

    const VkMemoryDedicatedAllocateInfo ded_alloc = {
@@ -252,13 +190,12 @@ v3dv_import_native_buffer_fd(VkDevice device_h,
       .fd = os_dupfd_cloexec(native_buffer_fd),
    };

-   assert(image->plane_count == 1);
    result =
       v3dv_AllocateMemory(device_h,
                           &(VkMemoryAllocateInfo) {
                              .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
                              .pNext = &import_info,
-                             .allocationSize = image->planes[0].size,
+                             .allocationSize = lseek(native_buffer_fd, 0, SEEK_END),
                              .memoryTypeIndex = 0,
                           },
                           alloc, &memory_h);
@@ -274,8 +211,6 @@ v3dv_import_native_buffer_fd(VkDevice device_h,
    };
    v3dv_BindImageMemory2(device_h, 1, &bind_info);

-   image->is_native_buffer_memory = true;
-
    return VK_SUCCESS;

 fail_create_image:
@@ -417,6 +352,193 @@ v3dv_GetSwapchainGrallocUsage2ANDROID(
       *grallocConsumerUsage |= GRALLOC1_CONSUMER_USAGE_HWCOMPOSER;
    }

+   if (swapchainImageUsage & VK_SWAPCHAIN_IMAGE_USAGE_SHARED_BIT_ANDROID) {
+      uint64_t front_rendering_usage = 0;
+      u_gralloc_get_front_rendering_usage(device->gralloc, &front_rendering_usage);
+      *grallocProducerUsage |= front_rendering_usage;
+   }
+
    return VK_SUCCESS;
 }

 #endif
+
+/* ----------------------------- AHardwareBuffer --------------------------- */
+
+static VkResult
+get_ahb_buffer_format_properties2(VkDevice device_h, const struct AHardwareBuffer *buffer,
+                                  VkAndroidHardwareBufferFormatProperties2ANDROID *pProperties)
+{
+   V3DV_FROM_HANDLE(v3dv_device, device, device_h);
+
+   /* Get a description of buffer contents . */
+   AHardwareBuffer_Desc desc;
+   AHardwareBuffer_describe(buffer, &desc);
+
+   /* Verify description. */
+   const uint64_t gpu_usage = AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE |
+                              AHARDWAREBUFFER_USAGE_GPU_COLOR_OUTPUT |
+                              AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER;
+
+   /* "Buffer must be a valid Android hardware buffer object with at least
+    * one of the AHARDWAREBUFFER_USAGE_GPU_* usage flags."
+    */
+   if (!(desc.usage & (gpu_usage)))
+      return VK_ERROR_INVALID_EXTERNAL_HANDLE;
+
+   /* Fill properties fields based on description. */
+   VkAndroidHardwareBufferFormatProperties2ANDROID *p = pProperties;
+
+   p->samplerYcbcrConversionComponents.r = VK_COMPONENT_SWIZZLE_IDENTITY;
+   p->samplerYcbcrConversionComponents.g = VK_COMPONENT_SWIZZLE_IDENTITY;
+   p->samplerYcbcrConversionComponents.b = VK_COMPONENT_SWIZZLE_IDENTITY;
+   p->samplerYcbcrConversionComponents.a = VK_COMPONENT_SWIZZLE_IDENTITY;
+
+   p->suggestedYcbcrModel = VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_601;
+   p->suggestedYcbcrRange = VK_SAMPLER_YCBCR_RANGE_ITU_FULL;
+
+   p->suggestedXChromaOffset = VK_CHROMA_LOCATION_MIDPOINT;
+   p->suggestedYChromaOffset = VK_CHROMA_LOCATION_MIDPOINT;
+
+   VkFormatProperties2 format_properties = {.sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2};
+
+   p->format = vk_ahb_format_to_image_format(desc.format);
+
+   VkFormat external_format = p->format;
+
+   if (p->format != VK_FORMAT_UNDEFINED)
+      goto finish;
+
+   /* External format only case
+    *
+    * From vkGetAndroidHardwareBufferPropertiesANDROID spec:
+    * "If the Android hardware buffer has one of the formats listed in the Format
+    * Equivalence table (see spec.), then format must have the equivalent Vulkan
+    * format listed in the table. Otherwise, format may be VK_FORMAT_UNDEFINED,
+    * indicating the Android hardware buffer can only be used with an external format."
+    *
+    * From SKIA source code analysis: p->format MUST be VK_FORMAT_UNDEFINED, if the
+    * format is not in the Equivalence table.
+    */
+
+   struct u_gralloc_buffer_handle gr_handle = {
+      .handle = AHardwareBuffer_getNativeHandle(buffer),
+      .pixel_stride = desc.stride,
+      .hal_format = desc.format,
+   };
+
+   struct u_gralloc_buffer_basic_info info;
+
+   if (u_gralloc_get_buffer_basic_info(device->gralloc, &gr_handle, &info) != 0)
+      return VK_ERROR_INVALID_EXTERNAL_HANDLE;
+
+   switch (info.drm_fourcc) {
+   case DRM_FORMAT_YVU420:
+      /* Assuming that U and V planes are swapped earlier */
+      external_format = VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM;
+      break;
+   case DRM_FORMAT_NV12:
+      external_format = VK_FORMAT_G8_B8R8_2PLANE_420_UNORM;
+      break;
+   default:;
+      mesa_loge("Unsupported external DRM format: %d", info.drm_fourcc);
+      return VK_ERROR_INVALID_EXTERNAL_HANDLE;
+   }
+
+   struct u_gralloc_buffer_color_info color_info;
+   if (u_gralloc_get_buffer_color_info(device->gralloc, &gr_handle, &color_info) == 0) {
+      switch (color_info.yuv_color_space) {
+      case __DRI_YUV_COLOR_SPACE_ITU_REC601:
+         p->suggestedYcbcrModel = VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_601;
+         break;
+      case __DRI_YUV_COLOR_SPACE_ITU_REC709:
+         p->suggestedYcbcrModel = VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_709;
+         break;
+      case __DRI_YUV_COLOR_SPACE_ITU_REC2020:
+         p->suggestedYcbcrModel = VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_2020;
+         break;
+      default:
+         break;
+      }
+
+      p->suggestedYcbcrRange = (color_info.sample_range == __DRI_YUV_NARROW_RANGE) ?
+         VK_SAMPLER_YCBCR_RANGE_ITU_NARROW : VK_SAMPLER_YCBCR_RANGE_ITU_FULL;
+      p->suggestedXChromaOffset = (color_info.horizontal_siting == __DRI_YUV_CHROMA_SITING_0_5) ?
+         VK_CHROMA_LOCATION_MIDPOINT : VK_CHROMA_LOCATION_COSITED_EVEN;
+      p->suggestedYChromaOffset = (color_info.vertical_siting == __DRI_YUV_CHROMA_SITING_0_5) ?
+         VK_CHROMA_LOCATION_MIDPOINT : VK_CHROMA_LOCATION_COSITED_EVEN;
+   }
+
+finish:
+
+   v3dv_GetPhysicalDeviceFormatProperties2(v3dv_physical_device_to_handle(device->pdevice),
+                                           external_format, &format_properties);
+
+   /* v3dv doesn't support direct sampling from linear images but has a logic to copy
+    * from linear to tiled images implicitly before sampling. Therefore expose optimal
+    * features for both linear and optimal tiling.
+    */
+   p->formatFeatures = format_properties.formatProperties.optimalTilingFeatures;
+   p->externalFormat = external_format;
+
+   /* From vkGetAndroidHardwareBufferPropertiesANDROID spec:
+    * "The formatFeatures member *must* include
+    *  VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT and at least one of
+    *  VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT or
+    *  VK_FORMAT_FEATURE_2_COSITED_CHROMA_SAMPLES_BIT"
+    */
+   p->formatFeatures |= VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT_KHR;
+
+   return VK_SUCCESS;
+}
+
+VkResult
+v3dv_GetAndroidHardwareBufferPropertiesANDROID(VkDevice device_h,
+                                               const struct AHardwareBuffer *buffer,
+                                               VkAndroidHardwareBufferPropertiesANDROID *pProperties)
+{
+   V3DV_FROM_HANDLE(v3dv_device, dev, device_h);
+   struct v3dv_physical_device *pdevice = dev->pdevice;
+
+   VkResult result;
+
+   VkAndroidHardwareBufferFormatPropertiesANDROID *format_prop =
+      vk_find_struct(pProperties->pNext, ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_ANDROID);
+
+   /* Fill format properties of an Android hardware buffer. */
+   if (format_prop) {
+      VkAndroidHardwareBufferFormatProperties2ANDROID format_prop2 = {
+         .sType = VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_2_ANDROID,
+      };
+      result = get_ahb_buffer_format_properties2(device_h, buffer, &format_prop2);
+      if (result != VK_SUCCESS)
+         return result;
+
+      format_prop->format = format_prop2.format;
+      format_prop->externalFormat = format_prop2.externalFormat;
+      format_prop->formatFeatures =
+         vk_format_features2_to_features(format_prop2.formatFeatures);
+      format_prop->samplerYcbcrConversionComponents =
+         format_prop2.samplerYcbcrConversionComponents;
+      format_prop->suggestedYcbcrModel = format_prop2.suggestedYcbcrModel;
+      format_prop->suggestedYcbcrRange = format_prop2.suggestedYcbcrRange;
+      format_prop->suggestedXChromaOffset = format_prop2.suggestedXChromaOffset;
+      format_prop->suggestedYChromaOffset = format_prop2.suggestedYChromaOffset;
+   }
+
+   VkAndroidHardwareBufferFormatProperties2ANDROID *format_prop2 =
+      vk_find_struct(pProperties->pNext, ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_2_ANDROID);
+   if (format_prop2) {
+      result = get_ahb_buffer_format_properties2(device_h, buffer, format_prop2);
+      if (result != VK_SUCCESS)
+         return result;
+   }
+
+   const native_handle_t *handle = AHardwareBuffer_getNativeHandle(buffer);
+   assert(handle && handle->numFds > 0);
+   pProperties->allocationSize = lseek(handle->data[0], 0, SEEK_END);
+
+   /* All memory types. */
+   pProperties->memoryTypeBits = (1u << pdevice->memory.memoryTypeCount) - 1;
+
+   return VK_SUCCESS;
+}
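Both the native-buffer import path and the AHardwareBuffer query above size the allocation by seeking to the end of the dmabuf rather than trusting caller-provided metadata. A small sketch of that idiom, with a hypothetical helper name:

#include <sys/types.h>
#include <unistd.h>

/* Sketch: a dmabuf fd reports its size via lseek(SEEK_END); rewind
 * afterwards so later users see the offset they expect. Returns -1 on
 * error (e.g. the fd is not seekable). */
static off_t
dmabuf_size(int fd)
{
   off_t size = lseek(fd, 0, SEEK_END);
   lseek(fd, 0, SEEK_SET);
   return size;
}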
diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_bo.c b/lib/mesa/src/broadcom/vulkan/v3dv_bo.c
index 9f1bf423a..2728a9233 100644
--- a/lib/mesa/src/broadcom/vulkan/v3dv_bo.c
+++ b/lib/mesa/src/broadcom/vulkan/v3dv_bo.c
@@ -339,7 +339,7 @@ v3dv_bo_map(struct v3dv_device *device, struct v3dv_bo *bo, uint32_t size)
    if (!ok)
       return false;

-   ok = v3dv_bo_wait(device, bo, PIPE_TIMEOUT_INFINITE);
+   ok = v3dv_bo_wait(device, bo, OS_TIMEOUT_INFINITE);
    if (!ok) {
       fprintf(stderr, "memory wait for map failed\n");
       return false;
@@ -359,7 +359,7 @@ v3dv_bo_unmap(struct v3dv_device *device, struct v3dv_bo *bo)
    bo->map_size = 0;
 }

-static boolean
+static bool
 reallocate_size_list(struct v3dv_bo_cache *cache,
                      struct v3dv_device *device,
                      uint32_t size)
diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_cl.h b/lib/mesa/src/broadcom/vulkan/v3dv_cl.h
index db1dfb08c..7e17ac395 100644
--- a/lib/mesa/src/broadcom/vulkan/v3dv_cl.h
+++ b/lib/mesa/src/broadcom/vulkan/v3dv_cl.h
@@ -27,6 +27,7 @@
 #include "broadcom/cle/v3d_packet_helpers.h"

 #include "util/list.h"
+#include "util/macros.h"

 struct v3dv_bo;
 struct v3dv_job;
@@ -150,15 +151,9 @@ cl_aligned_reloc(struct v3dv_cl *cl,
 uint32_t v3dv_cl_ensure_space(struct v3dv_cl *cl, uint32_t space, uint32_t alignment);
 void v3dv_cl_ensure_space_with_branch(struct v3dv_cl *cl, uint32_t space);

-/* We redefine ALIGN as a macro as we want to use cl_aligned_packet_length for
- * struct fields
- */
-#define ALIGN(value, alignment) \
-   (((value) + (alignment) - 1) & ~((alignment) - 1))
-
 #define cl_packet_header(packet) V3DX(packet ## _header)
 #define cl_packet_length(packet) V3DX(packet ## _length)
-#define cl_aligned_packet_length(packet, alignment) ALIGN(cl_packet_length(packet), alignment)
+#define cl_aligned_packet_length(packet, alignment) ALIGN_POT(cl_packet_length(packet), alignment)
 #define cl_packet_pack(packet) V3DX(packet ## _pack)
 #define cl_packet_struct(packet) V3DX(packet)
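The local ALIGN macro is dropped in favour of util's ALIGN_POT, which uses the same mask trick and therefore only works for power-of-two alignments. A worked sketch of the arithmetic:

/* (value + alignment - 1) & ~(alignment - 1) rounds up to a multiple of a
 * power-of-two alignment. Aligning 13 to 8:
 *    13 + 7 = 20
 *    ~(8 - 1) = ~7 clears the low three bits
 *    20 & ~7 = 16
 * A non-power-of-two alignment (say 12) would give wrong results with this
 * formula, hence the POT in the name. */
static unsigned
align_pot(unsigned value, unsigned alignment)
{
   return (value + alignment - 1) & ~(alignment - 1);
}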
diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_cmd_buffer.c b/lib/mesa/src/broadcom/vulkan/v3dv_cmd_buffer.c
index 449e532c6..dc01a0fa0 100644
--- a/lib/mesa/src/broadcom/vulkan/v3dv_cmd_buffer.c
+++ b/lib/mesa/src/broadcom/vulkan/v3dv_cmd_buffer.c
@@ -24,6 +24,7 @@
 #include "v3dv_private.h"
 #include "util/u_pack_color.h"
 #include "vk_util.h"
+#include "vulkan/runtime/vk_common_entrypoints.h"

 void
 v3dv_job_add_bo(struct v3dv_job *job, struct v3dv_bo *bo)
@@ -348,6 +349,7 @@ job_compute_frame_tiling(struct v3dv_job *job,
                          uint32_t layers,
                          uint32_t render_target_count,
                          uint8_t max_internal_bpp,
+                         uint8_t total_color_bpp,
                          bool msaa,
                          bool double_buffer)
 {
@@ -360,13 +362,16 @@ job_compute_frame_tiling(struct v3dv_job *job,
    tiling->render_target_count = render_target_count;
    tiling->msaa = msaa;
    tiling->internal_bpp = max_internal_bpp;
+   tiling->total_color_bpp = total_color_bpp;
    tiling->double_buffer = double_buffer;

    /* Double-buffer is incompatible with MSAA */
    assert(!tiling->msaa || !tiling->double_buffer);

-   v3d_choose_tile_size(render_target_count, max_internal_bpp,
-                        tiling->msaa, tiling->double_buffer,
+   v3d_choose_tile_size(&job->device->devinfo,
+                        render_target_count,
+                        max_internal_bpp, total_color_bpp, msaa,
+                        tiling->double_buffer,
                         &tiling->tile_width, &tiling->tile_height);

    tiling->draw_tiles_x = DIV_ROUND_UP(width, tiling->tile_width);
@@ -457,6 +462,7 @@ v3dv_job_start_frame(struct v3dv_job *job,
                      bool allocate_tile_state_now,
                      uint32_t render_target_count,
                      uint8_t max_internal_bpp,
+                     uint8_t total_color_bpp,
                      bool msaa)
 {
    assert(job);
@@ -467,7 +473,7 @@ v3dv_job_start_frame(struct v3dv_job *job,
    const struct v3dv_frame_tiling *tiling =
       job_compute_frame_tiling(job, width, height, layers,
                                render_target_count, max_internal_bpp,
-                               msaa, false);
+                               total_color_bpp, msaa, false);

    v3dv_cl_ensure_space_with_branch(&job->bcl, 256);
    v3dv_return_if_oom(NULL, job);
@@ -528,6 +534,7 @@ cmd_buffer_end_render_pass_frame(struct v3dv_cmd_buffer *cmd_buffer)
                         job->frame_tiling.layers,
                         job->frame_tiling.render_target_count,
                         job->frame_tiling.internal_bpp,
+                        job->frame_tiling.total_color_bpp,
                         job->frame_tiling.msaa,
                         true);
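job_compute_frame_tiling() now also tracks the combined color bpp so v3d_choose_tile_size() can pick tile geometry per hardware generation. The tile-grid math at the end of the function is a plain round-up; a worked example with assumed numbers (not derived from any particular V3D configuration):

/* DIV_ROUND_UP as used by the driver, applied to a hypothetical
 * 1920x1080 framebuffer with 64x64 tiles:
 *    draw_tiles_x = DIV_ROUND_UP(1920, 64) = 30
 *    draw_tiles_y = DIV_ROUND_UP(1080, 64) = 17   (16.875 rounded up)
 */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

enum { WIDTH = 1920, HEIGHT = 1080, TILE_W = 64, TILE_H = 64 };

static const unsigned draw_tiles_x = DIV_ROUND_UP(WIDTH, TILE_W);   /* 30 */
static const unsigned draw_tiles_y = DIV_ROUND_UP(HEIGHT, TILE_H);  /* 17 */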
@@ -1374,7 +1381,7 @@ cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer)
    }

    uint32_t att_count = 0;
-   VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* 4 color + D/S */
+   VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* +1 for D/S */

    /* We only need to emit subpass clears as draw calls for color attachments
     * if the render area is not aligned to tile boundaries.
     */
@@ -1434,7 +1441,7 @@ cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer)
                  "VK_ATTACHMENT_LOAD_OP_CLEAR.\n");
    } else if (subpass->do_depth_clear_with_draw ||
               subpass->do_stencil_clear_with_draw) {
-      perf_debug("Subpass clears DEPTH but loads STENCIL (or viceversa), "
+      perf_debug("Subpass clears DEPTH but loads STENCIL (or vice versa), "
                  "falling back to vkCmdClearAttachments for "
                  "VK_ATTACHMENT_LOAD_OP_CLEAR.\n");
    }
@@ -1672,10 +1679,11 @@ cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer,

    const struct v3dv_framebuffer *framebuffer = state->framebuffer;

-   uint8_t internal_bpp;
+   uint8_t max_internal_bpp, total_color_bpp;
    bool msaa;
    v3dv_X(job->device, framebuffer_compute_internal_bpp_msaa)
-      (framebuffer, state->attachments, subpass, &internal_bpp, &msaa);
+      (framebuffer, state->attachments, subpass,
+       &max_internal_bpp, &total_color_bpp, &msaa);

    /* From the Vulkan spec:
     *
@@ -1699,7 +1707,8 @@ cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer,
                            layers, true, false,
                            subpass->color_count,
-                           internal_bpp,
+                           max_internal_bpp,
+                           total_color_bpp,
                            msaa);
    }

@@ -2062,6 +2071,14 @@ cmd_buffer_bind_pipeline_static_state(struct v3dv_cmd_buffer *cmd_buffer,
       }
    }

+   if (!(dynamic_mask & V3DV_DYNAMIC_DEPTH_BOUNDS)) {
+      if (memcmp(&dest->depth_bounds, &src->depth_bounds,
+                 sizeof(src->depth_bounds))) {
+         memcpy(&dest->depth_bounds, &src->depth_bounds, sizeof(src->depth_bounds));
+         dirty |= V3DV_CMD_DIRTY_DEPTH_BOUNDS;
+      }
+   }
+
    if (!(dynamic_mask & V3DV_DYNAMIC_LINE_WIDTH)) {
       if (dest->line_width != src->line_width) {
          dest->line_width = src->line_width;
@@ -2131,39 +2148,6 @@ v3dv_CmdBindPipeline(VkCommandBuffer commandBuffer,
    }
 }

-/* FIXME: C&P from radv. tu has similar code. Perhaps common place? */
-void
-v3dv_viewport_compute_xform(const VkViewport *viewport,
-                            float scale[3],
-                            float translate[3])
-{
-   float x = viewport->x;
-   float y = viewport->y;
-   float half_width = 0.5f * viewport->width;
-   float half_height = 0.5f * viewport->height;
-   double n = viewport->minDepth;
-   double f = viewport->maxDepth;
-
-   scale[0] = half_width;
-   translate[0] = half_width + x;
-   scale[1] = half_height;
-   translate[1] = half_height + y;
-
-   scale[2] = (f - n);
-   translate[2] = n;
-
-   /* It seems that if the scale is small enough the hardware won't clip
-    * correctly so we work around this my choosing the smallest scale that
-    * seems to work.
-    *
-    * This case is exercised by CTS:
-    * dEQP-VK.draw.inverted_depth_ranges.nodepthclamp_deltazero
-    */
-   const float min_abs_scale = 0.000009f;
-   if (fabs(scale[2]) < min_abs_scale)
-      scale[2] = min_abs_scale * (scale[2] < 0 ? -1.0f : 1.0f);
-}
-
 /* Considers the pipeline's negative_one_to_one state and applies it to the
  * current viewport transform if needed to produce the resulting Z translate
  * and scale parameters.
  */
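The common copy of v3dv_viewport_compute_xform() is deleted because it now lives behind the per-version v3dv_X() dispatch (see the v3dv_CmdSetViewport hunk below). For reference, a condensed sketch of the transform the removed helper computed, without its small-scale workaround:

#include <vulkan/vulkan.h>

/* Sketch of the removed helper's math: a viewport maps NDC [-1,1] to window
 * coordinates, so X/Y scale by the half extents and translate to the
 * viewport center; Z scales by (maxDepth - minDepth) and translates by
 * minDepth. The real code additionally clamps |scale[2]| to a tiny minimum
 * so near-zero depth ranges still clip correctly. */
static void
viewport_xform(const VkViewport *vp, float scale[3], float translate[3])
{
   scale[0] = 0.5f * vp->width;
   translate[0] = scale[0] + vp->x;
   scale[1] = 0.5f * vp->height;
   translate[1] = scale[1] + vp->y;
   scale[2] = vp->maxDepth - vp->minDepth;
   translate[2] = vp->minDepth;
}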
@@ -2216,9 +2200,10 @@ v3dv_CmdSetViewport(VkCommandBuffer commandBuffer,
           viewportCount * sizeof(*pViewports));

    for (uint32_t i = firstViewport; i < total_count; i++) {
-      v3dv_viewport_compute_xform(&state->dynamic.viewport.viewports[i],
-                                  state->dynamic.viewport.scale[i],
-                                  state->dynamic.viewport.translate[i]);
+      v3dv_X(cmd_buffer->device, viewport_compute_xform)
+         (&state->dynamic.viewport.viewports[i],
+          state->dynamic.viewport.scale[i],
+          state->dynamic.viewport.translate[i]);
    }

    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEWPORT;
@@ -2264,11 +2249,14 @@ emit_scissor(struct v3dv_cmd_buffer *cmd_buffer)
     */
    float *vptranslate = dynamic->viewport.translate[0];
    float *vpscale = dynamic->viewport.scale[0];
+   assert(vpscale[0] >= 0);

-   float vp_minx = -fabsf(vpscale[0]) + vptranslate[0];
-   float vp_maxx = fabsf(vpscale[0]) + vptranslate[0];
-   float vp_miny = -fabsf(vpscale[1]) + vptranslate[1];
-   float vp_maxy = fabsf(vpscale[1]) + vptranslate[1];
+   float vp_minx = vptranslate[0] - vpscale[0];
+   float vp_maxx = vptranslate[0] + vpscale[0];
+
+   /* With KHR_maintenance1 viewport may have negative Y */
+   float vp_miny = vptranslate[1] - fabsf(vpscale[1]);
+   float vp_maxy = vptranslate[1] + fabsf(vpscale[1]);

    /* Quoting from v3dx_emit:
     * "Clip to the scissor if it's enabled, but still clip to the
@@ -2297,11 +2285,6 @@ emit_scissor(struct v3dv_cmd_buffer *cmd_buffer)
       maxy = MIN2(vp_maxy, cmd_buffer->state.render_area.offset.y +
                   cmd_buffer->state.render_area.extent.height);

-   minx = vp_minx;
-   miny = vp_miny;
-   maxx = vp_maxx;
-   maxy = vp_maxy;
-
    /* Clip against user provided scissor if needed.
     *
     * FIXME: right now we only allow one scissor. Below would need to be
@@ -2701,6 +2684,7 @@ cmd_buffer_restart_job_for_msaa_if_needed(struct v3dv_cmd_buffer *cmd_buffer)
                         true, false,
                         old_job->frame_tiling.render_target_count,
                         old_job->frame_tiling.internal_bpp,
+                        old_job->frame_tiling.total_color_bpp,
                         true /* msaa */);

    v3dv_job_destroy(old_job);
@@ -2965,6 +2949,9 @@ v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer,
    if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_DEPTH_BIAS))
       v3dv_X(device, cmd_buffer_emit_depth_bias)(cmd_buffer);

+   if (*dirty & V3DV_CMD_DIRTY_DEPTH_BOUNDS)
+      v3dv_X(device, cmd_buffer_emit_depth_bounds)(cmd_buffer);
+
    if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_BLEND_CONSTANTS))
       v3dv_X(device, cmd_buffer_emit_blend)(cmd_buffer);

@@ -3309,24 +3296,6 @@ v3dv_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VERTEX_BUFFER;
 }

-static uint32_t
-get_index_size(VkIndexType index_type)
-{
-   switch (index_type) {
-   case VK_INDEX_TYPE_UINT8_EXT:
-      return 1;
-      break;
-   case VK_INDEX_TYPE_UINT16:
-      return 2;
-      break;
-   case VK_INDEX_TYPE_UINT32:
-      return 4;
-      break;
-   default:
-      unreachable("Unsupported index type");
-   }
-}
-
 VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
                         VkBuffer buffer,
@@ -3335,7 +3304,7 @@ v3dv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
 {
    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);

-   const uint32_t index_size = get_index_size(indexType);
+   const uint32_t index_size = vk_index_type_to_bytes(indexType);
    if (buffer == cmd_buffer->state.index_buffer.buffer &&
        offset == cmd_buffer->state.index_buffer.offset &&
        index_size == cmd_buffer->state.index_buffer.index_size) {
@@ -3412,9 +3381,11 @@ v3dv_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
                        float minDepthBounds,
                        float maxDepthBounds)
 {
-   /* We do not support depth bounds testing so we just ignore this. We are
-    * already asserting that pipelines don't enable the feature anyway.
-    */
+   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   cmd_buffer->state.dynamic.depth_bounds.min = minDepthBounds;
+   cmd_buffer->state.dynamic.depth_bounds.max = maxDepthBounds;
+   cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DEPTH_BOUNDS;
 }

 VKAPI_ATTR void VKAPI_CALL
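Depth bounds state is now captured instead of ignored: the values land in the dynamic state and the new V3DV_CMD_DIRTY_DEPTH_BOUNDS flag makes the pre-draw path emit them through the per-version dispatch (the feature bit is only exposed on ver >= 71, per the get_features hunk further down). From the API side this enables the standard pattern; a plain usage sketch, not driver code:

#include <vulkan/vulkan.h>

/* Usage sketch: assuming the bound pipeline enabled the depth bounds test
 * and declared VK_DYNAMIC_STATE_DEPTH_BOUNDS, fragments whose stored depth
 * falls outside [0.25, 0.75] are discarded by subsequent draws. */
static void
record_with_depth_bounds(VkCommandBuffer cmd)
{
   vkCmdSetDepthBounds(cmd, 0.25f, 0.75f);
   /* ... vkCmdDraw*() calls affected by the bounds go here ... */
}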
@@ -3435,6 +3406,304 @@ v3dv_CmdSetLineWidth(VkCommandBuffer commandBuffer,
    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_LINE_WIDTH;
 }

+/**
+ * This checks a descriptor set to see if are binding any descriptors that would
+ * involve sampling from a linear image (the hardware only supports this for
+ * 1D images), and if so, attempts to create a tiled copy of the linear image
+ * and rewrite the descriptor set to use that instead.
+ *
+ * This was added to support a scenario with Android where some part of the UI
+ * wanted to show previews of linear swapchain images. For more details:
+ * https://gitlab.freedesktop.org/mesa/mesa/-/issues/9712
+ *
+ * Currently this only supports a linear sampling from a simple 2D image, but
+ * it could be extended to support more cases if necessary.
+ */
+static void
+handle_sample_from_linear_image(struct v3dv_cmd_buffer *cmd_buffer,
+                                struct v3dv_descriptor_set *set,
+                                bool is_compute)
+{
+   for (int32_t i = 0; i < set->layout->binding_count; i++) {
+      const struct v3dv_descriptor_set_binding_layout *blayout =
+         &set->layout->binding[i];
+      if (blayout->type != VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE &&
+          blayout->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
+         continue;
+
+      struct v3dv_descriptor *desc = &set->descriptors[blayout->descriptor_index];
+      if (!desc->image_view)
+         continue;
+
+      struct v3dv_image *image = (struct v3dv_image *) desc->image_view->vk.image;
+      struct v3dv_image_view *view = (struct v3dv_image_view *) desc->image_view;
+      if (image->tiled || view->vk.view_type == VK_IMAGE_VIEW_TYPE_1D ||
+          view->vk.view_type == VK_IMAGE_VIEW_TYPE_1D_ARRAY) {
+         continue;
+      }
+
+      /* FIXME: we can probably handle most of these restrictions too with
+       * a bit of extra effort.
+       */
+      if (view->vk.view_type != VK_IMAGE_VIEW_TYPE_2D ||
+          view->vk.level_count != 1 || view->vk.layer_count != 1 ||
+          blayout->array_size != 1) {
+         fprintf(stderr, "Sampling from linear image is not supported. "
+                 "Expect corruption.\n");
+         continue;
+      }
+
+      /* We are sampling from a linear image. V3D doesn't support this
+       * so we create a tiled copy of the image and rewrite the descriptor
+       * to read from it instead.
+       */
+      perf_debug("Sampling from linear image is not supported natively and "
+                 "requires a copy.\n");
+
+      struct v3dv_device *device = cmd_buffer->device;
+      VkDevice vk_device = v3dv_device_to_handle(device);
+
+      /* Allocate shadow tiled image if needed, we only do this once for
+       * each image, on the first sampling attempt. We need to take a lock
+       * since we may be trying to do the same in another command buffer in
+       * a separate thread.
+       */
+      mtx_lock(&device->meta.mtx);
+      VkResult result;
+      VkImage tiled_image;
+      if (image->shadow) {
+         tiled_image = v3dv_image_to_handle(image->shadow);
+      } else {
+         VkImageCreateInfo image_info = {
+            .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
+            .flags = image->vk.create_flags,
+            .imageType = image->vk.image_type,
+            .format = image->vk.format,
+            .extent = {
+               image->vk.extent.width,
+               image->vk.extent.height,
+               image->vk.extent.depth,
+            },
+            .mipLevels = image->vk.mip_levels,
+            .arrayLayers = image->vk.array_layers,
+            .samples = image->vk.samples,
+            .tiling = VK_IMAGE_TILING_OPTIMAL,
+            .usage = image->vk.usage,
+            .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
+            .queueFamilyIndexCount = 0,
+            .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
+         };
+         result = v3dv_CreateImage(vk_device, &image_info,
+                                   &device->vk.alloc, &tiled_image);
+         if (result != VK_SUCCESS) {
+            fprintf(stderr, "Failed to copy linear 2D image for sampling."
+                    "Expect corruption.\n");
+            mtx_unlock(&device->meta.mtx);
+            continue;
+         }
+
+         bool disjoint = image->vk.create_flags & VK_IMAGE_CREATE_DISJOINT_BIT;
+         VkImageMemoryRequirementsInfo2 reqs_info = {
+            .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2,
+            .image = tiled_image,
+         };
+
+         assert(image->plane_count <= V3DV_MAX_PLANE_COUNT);
+         for (int p = 0; p < (disjoint ? image->plane_count : 1); p++) {
+            VkImageAspectFlagBits plane_aspect = VK_IMAGE_ASPECT_PLANE_0_BIT << p;
+            VkImagePlaneMemoryRequirementsInfo plane_info = {
+               .sType = VK_STRUCTURE_TYPE_IMAGE_PLANE_MEMORY_REQUIREMENTS_INFO,
+               .planeAspect = plane_aspect,
+            };
+            if (disjoint)
+               reqs_info.pNext = &plane_info;
+
+            VkMemoryRequirements2 reqs = {
+               .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2,
+            };
+            v3dv_GetImageMemoryRequirements2(vk_device, &reqs_info, &reqs);
+
+            VkDeviceMemory mem;
+            VkMemoryAllocateInfo alloc_info = {
+               .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+               .allocationSize = reqs.memoryRequirements.size,
+               .memoryTypeIndex = 0,
+            };
+            result = v3dv_AllocateMemory(vk_device, &alloc_info,
+                                         &device->vk.alloc, &mem);
+            if (result != VK_SUCCESS) {
+               fprintf(stderr, "Failed to copy linear 2D image for sampling."
+                       "Expect corruption.\n");
+               v3dv_DestroyImage(vk_device, tiled_image, &device->vk.alloc);
+               mtx_unlock(&device->meta.mtx);
+               continue;
+            }
+
+            VkBindImageMemoryInfo bind_info = {
+               .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO,
+               .image = tiled_image,
+               .memory = mem,
+               .memoryOffset = 0,
+            };
+            VkBindImagePlaneMemoryInfo plane_bind_info = {
+               .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO,
+               .planeAspect = plane_aspect,
+            };
+            if (disjoint)
+               bind_info.pNext = &plane_bind_info;
+            result = v3dv_BindImageMemory2(vk_device, 1, &bind_info);
+            if (result != VK_SUCCESS) {
+               fprintf(stderr, "Failed to copy linear 2D image for sampling."
+ "Expect corruption.\n"); + v3dv_DestroyImage(vk_device, tiled_image, &device->vk.alloc); + v3dv_FreeMemory(vk_device, mem, &device->vk.alloc); + mtx_unlock(&device->meta.mtx); + continue; + } + } + + image->shadow = v3dv_image_from_handle(tiled_image); + } + + /* Create a shadow view that refers to the tiled image if needed */ + VkImageView tiled_view; + if (view->shadow) { + tiled_view = v3dv_image_view_to_handle(view->shadow); + } else { + VkImageViewCreateInfo view_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .flags = view->vk.create_flags, + .image = tiled_image, + .viewType = view->vk.view_type, + .format = view->vk.format, + .components = view->vk.swizzle, + .subresourceRange = { + .aspectMask = view->vk.aspects, + .baseMipLevel = view->vk.base_mip_level, + .levelCount = view->vk.level_count, + .baseArrayLayer = view->vk.base_array_layer, + .layerCount = view->vk.layer_count, + }, + }; + result = v3dv_create_image_view(device, &view_info, &tiled_view); + if (result != VK_SUCCESS) { + fprintf(stderr, "Failed to copy linear 2D image for sampling." + "Expect corruption.\n"); + mtx_unlock(&device->meta.mtx); + continue; + } + } + + view->shadow = v3dv_image_view_from_handle(tiled_view); + + mtx_unlock(&device->meta.mtx); + + /* Rewrite the descriptor to use the shadow view */ + VkDescriptorImageInfo desc_image_info = { + .sampler = v3dv_sampler_to_handle(desc->sampler), + .imageView = tiled_view, + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + VkWriteDescriptorSet write = { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = v3dv_descriptor_set_to_handle(set), + .dstBinding = i, + .dstArrayElement = 0, /* Assumes array_size is 1 */ + .descriptorCount = 1, + .descriptorType = desc->type, + .pImageInfo = &desc_image_info, + }; + v3dv_UpdateDescriptorSets(vk_device, 1, &write, 0, NULL); + + /* Now we need to actually copy the pixel data from the linear image + * into the tiled image storage to ensure it is up-to-date. + * + * FIXME: ideally we would track if the linear image is dirty and skip + * this step otherwise, but that would be a bit of a pain. + * + * Note that we need to place the copy job *before* the current job in + * the command buffer state so we have the tiled image ready to process + * an upcoming draw call in the current job that samples from it. + * + * Also, we need to use the TFU path for this copy, as any other path + * will use the tile buffer and would require a new framebuffer setup, + * thus requiring extra work to stop and resume any in-flight render + * pass. Since we are converting a full 2D texture here the TFU should + * be able to handle this. + */ + for (int p = 0; p < image->plane_count; p++) { + VkImageAspectFlagBits plane_aspect = VK_IMAGE_ASPECT_PLANE_0_BIT << p; + struct VkImageCopy2 copy_region = { + .sType = VK_STRUCTURE_TYPE_IMAGE_COPY_2, + .srcSubresource = { + .aspectMask = image->plane_count == 1 ? + view->vk.aspects : (view->vk.aspects & plane_aspect), + .mipLevel = view->vk.base_mip_level, + .baseArrayLayer = view->vk.base_array_layer, + .layerCount = view->vk.layer_count, + }, + .srcOffset = {0, 0, 0 }, + .dstSubresource = { + .aspectMask = image->plane_count == 1 ? 
+                  view->vk.aspects : (view->vk.aspects & plane_aspect),
+               .mipLevel = view->vk.base_mip_level,
+               .baseArrayLayer = view->vk.base_array_layer,
+               .layerCount = view->vk.layer_count,
+            },
+            .dstOffset = { 0, 0, 0},
+            .extent = {
+               image->planes[p].width,
+               image->planes[p].height,
+               1,
+            },
+         };
+         struct v3dv_image *copy_src = image;
+         struct v3dv_image *copy_dst = v3dv_image_from_handle(tiled_image);
+         bool ok = v3dv_cmd_buffer_copy_image_tfu(cmd_buffer, copy_dst, copy_src,
+                                                  &copy_region);
+         if (ok) {
+            /* This will emit the TFU job right before the current in-flight
+             * job (if any), since in-fight jobs are only added to the list
+             * when finished.
+             */
+            struct v3dv_job *tfu_job =
+               list_last_entry(&cmd_buffer->jobs, struct v3dv_job, list_link);
+            assert(tfu_job->type == V3DV_JOB_TYPE_GPU_TFU);
+            /* Serialize the copy since we don't know who is producing the linear
+             * image and we need the image to be ready by the time the copy
+             * executes.
+             */
+            tfu_job->serialize = V3DV_BARRIER_ALL;
+
+            /* Also, we need to ensure the TFU copy job completes before anyhing
+             * else coming after that may be using the tiled shadow copy.
+             */
+            if (cmd_buffer->state.job) {
+               /* If we already had an in-flight job (i.e. we are in a render
+                * pass) make sure the job waits for the TFU copy.
+                */
+               cmd_buffer->state.job->serialize |= V3DV_BARRIER_TRANSFER_BIT;
+            } else {
+               /* Otherwise, make the the follow-up job syncs with the TFU
+                * job we just added when it is created by adding the
+                * corresponding barrier state.
+                */
+               if (!is_compute) {
+                  cmd_buffer->state.barrier.dst_mask |= V3DV_BARRIER_GRAPHICS_BIT;
+                  cmd_buffer->state.barrier.src_mask_graphics |= V3DV_BARRIER_TRANSFER_BIT;
+               } else {
+                  cmd_buffer->state.barrier.dst_mask |= V3DV_BARRIER_COMPUTE_BIT;
+                  cmd_buffer->state.barrier.src_mask_compute |= V3DV_BARRIER_TRANSFER_BIT;
+               }
+            }
+         } else {
+            fprintf(stderr, "Failed to copy linear 2D image for sampling."
+                    "TFU doesn't support copy. Expect corruption.\n");
+         }
+      }
+   }
+}
+
 VKAPI_ATTR void VKAPI_CALL
 v3dv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
                            VkPipelineBindPoint pipelineBindPoint,
@@ -3468,6 +3737,15 @@ v3dv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
          descriptor_state->descriptor_sets[index] = set;
          dirty_stages |= set->layout->shader_stages;
          descriptor_state_changed = true;
+
+         /* Check if we are sampling from a linear 2D image. This is not
+          * supported in hardware, but may be required for some applications
+          * so we will transparently convert to tiled at the expense of
+          * performance.
+          */
+         handle_sample_from_linear_image(cmd_buffer, set,
+                                         pipelineBindPoint ==
+                                         VK_PIPELINE_BIND_POINT_COMPUTE);
       }

       for (uint32_t j = 0; j < set->layout->dynamic_offset_count; j++, dyn_index++) {
@@ -3846,6 +4124,7 @@ cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer)

 void
 v3dv_cmd_buffer_rewrite_indirect_csd_job(
+   struct v3dv_device *device,
    struct v3dv_csd_indirect_cpu_job_info *info,
    const uint32_t *wg_counts)
 {
@@ -3865,15 +4144,22 @@ v3dv_cmd_buffer_rewrite_indirect_csd_job(
    submit->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
    submit->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT;

-   submit->cfg[4] = DIV_ROUND_UP(info->wg_size, 16) *
-                    (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1;
+   uint32_t num_batches = DIV_ROUND_UP(info->wg_size, 16) *
+                          (wg_counts[0] * wg_counts[1] * wg_counts[2]);
+   /* V3D 7.1.6 and later don't subtract 1 from the number of batches */
+   if (device->devinfo.ver < 71 ||
+       (device->devinfo.ver == 71 && device->devinfo.rev < 6)) {
+      submit->cfg[4] = num_batches - 1;
+   } else {
+      submit->cfg[4] = num_batches;
+   }
    assert(submit->cfg[4] != ~0);

    if (info->needs_wg_uniform_rewrite) {
       /* Make sure the GPU is not currently accessing the indirect CL for this
        * job, since we are about to overwrite some of the uniform data.
        */
-      v3dv_bo_wait(job->device, job->indirect.bo, PIPE_TIMEOUT_INFINITE);
+      v3dv_bo_wait(job->device, job->indirect.bo, OS_TIMEOUT_INFINITE);

       for (uint32_t i = 0; i < 3; i++) {
          if (info->wg_uniform_offsets[i]) {
@@ -3899,6 +4185,7 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
                           uint32_t **wg_uniform_offsets_out,
                           uint32_t *wg_size_out)
 {
+   struct v3dv_device *device = cmd_buffer->device;
    struct v3dv_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
    assert(pipeline && pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]);
    struct v3dv_shader_variant *cs_variant =
@@ -3957,18 +4244,26 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
    if (wg_size_out)
       *wg_size_out = wg_size;

-   submit->cfg[4] = num_batches - 1;
+   /* V3D 7.1.6 and later don't subtract 1 from the number of batches */
+   if (device->devinfo.ver < 71 ||
+       (device->devinfo.ver == 71 && device->devinfo.rev < 6)) {
+      submit->cfg[4] = num_batches - 1;
+   } else {
+      submit->cfg[4] = num_batches;
+   }
    assert(submit->cfg[4] != ~0);

    assert(pipeline->shared_data->assembly_bo);
    struct v3dv_bo *cs_assembly_bo = pipeline->shared_data->assembly_bo;

    submit->cfg[5] = cs_assembly_bo->offset + cs_variant->assembly_offset;
-   submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
    if (cs_variant->prog_data.base->single_seg)
       submit->cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG;
    if (cs_variant->prog_data.base->threads == 4)
       submit->cfg[5] |= V3D_CSD_CFG5_THREADING;
+   /* V3D 7.x has made the PROPAGATE_NANS bit in CFG5 reserved */
+   if (device->devinfo.ver < 71)
+      submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;

    if (cs_variant->prog_data.cs->shared_size > 0) {
       job->csd.shared_memory =
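The compute dispatch config packs the batch count into cfg[4], and the encoding changed in V3D 7.1.6: older hardware expects the count minus one. A worked example of the arithmetic under an assumed dispatch (the values are illustrative, the version check mirrors the hunk above):

/* Assume a compute shader with wg_size = 64 invocations per workgroup,
 * dispatched as an 8x1x1 grid. Batches are groups of 16 invocations:
 *    batches per workgroup = DIV_ROUND_UP(64, 16) = 4
 *    num_batches           = 4 * (8 * 1 * 1)      = 32
 * cfg[4] is then 31 on V3D before 7.1.6 (count minus one) and 32 from
 * 7.1.6 on. */
static unsigned
csd_cfg4(unsigned ver, unsigned rev, unsigned wg_size,
         unsigned wgx, unsigned wgy, unsigned wgz)
{
   unsigned num_batches = ((wg_size + 15) / 16) * (wgx * wgy * wgz);
   if (ver < 71 || (ver == 71 && rev < 6))
      return num_batches - 1;
   return num_batches;
}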
diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_device.c b/lib/mesa/src/broadcom/vulkan/v3dv_device.c
index 0590afeb7..027c35ffe 100644
--- a/lib/mesa/src/broadcom/vulkan/v3dv_device.c
+++ b/lib/mesa/src/broadcom/vulkan/v3dv_device.c
@@ -49,9 +49,14 @@
 #include "git_sha1.h"

 #include "util/build_id.h"
+#include "util/os_file.h"
 #include "util/u_debug.h"
 #include "util/format/u_format.h"

+#ifdef ANDROID
+#include "vk_android.h"
+#endif
+
 #ifdef VK_USE_PLATFORM_XCB_KHR
 #include <xcb/xcb.h>
 #include <xcb/dri3.h>
@@ -63,11 +68,14 @@
 #include "wayland-drm-client-protocol.h"
 #endif

-#ifndef ANDROID
-# define V3DV_API_VERSION VK_MAKE_VERSION(1, 2, VK_HEADER_VERSION)
-#else
-/* Android CDD require additional extensions for API v1.1+ */
-# define V3DV_API_VERSION VK_MAKE_VERSION(1, 0, VK_HEADER_VERSION)
+#define V3DV_API_VERSION VK_MAKE_VERSION(1, 2, VK_HEADER_VERSION)
+
+#ifdef ANDROID
+#if ANDROID_API_LEVEL <= 32
+/* Android 12.1 and lower support only Vulkan API v1.1 */
+#undef V3DV_API_VERSION
+#define V3DV_API_VERSION VK_MAKE_VERSION(1, 1, VK_HEADER_VERSION)
+#endif
 #endif

 VKAPI_ATTR VkResult VKAPI_CALL
@@ -156,9 +164,7 @@ get_device_extensions(const struct v3dv_physical_device *device,
       .KHR_shader_float_controls = true,
       .KHR_shader_non_semantic_info = true,
       .KHR_sampler_mirror_clamp_to_edge = true,
-#ifndef ANDROID
       .KHR_sampler_ycbcr_conversion = true,
-#endif
       .KHR_spirv_1_4 = true,
       .KHR_storage_buffer_storage_class = true,
       .KHR_timeline_semaphore = true,
@@ -202,11 +208,237 @@ get_device_extensions(const struct v3dv_physical_device *device,
       .EXT_tooling_info = true,
       .EXT_vertex_attribute_divisor = true,
 #ifdef ANDROID
+      .ANDROID_external_memory_android_hardware_buffer = true,
       .ANDROID_native_buffer = true,
+      .EXT_queue_family_foreign = true,
 #endif
    };
 }

+static void
+get_features(const struct v3dv_physical_device *physical_device,
+             struct vk_features *features)
+{
+   *features = (struct vk_features) {
+      /* Vulkan 1.0 */
+      .robustBufferAccess = true, /* This feature is mandatory */
+      .fullDrawIndexUint32 = physical_device->devinfo.ver >= 71,
+      .imageCubeArray = true,
+      .independentBlend = true,
+      .geometryShader = true,
+      .tessellationShader = false,
+      .sampleRateShading = true,
+      .dualSrcBlend = false,
+      .logicOp = true,
+      .multiDrawIndirect = false,
+      .drawIndirectFirstInstance = true,
+      .depthClamp = physical_device->devinfo.ver >= 71,
+      .depthBiasClamp = true,
+      .fillModeNonSolid = true,
+      .depthBounds = physical_device->devinfo.ver >= 71,
+      .wideLines = true,
+      .largePoints = true,
+      .alphaToOne = true,
+      .multiViewport = false,
+      .samplerAnisotropy = true,
+      .textureCompressionETC2 = true,
+      .textureCompressionASTC_LDR = true,
+      /* Note that textureCompressionBC requires that the driver support all
+       * the BC formats. V3D 4.2 only support the BC1-3, so we can't claim
+       * that we support it.
+       */
+      .textureCompressionBC = false,
+      .occlusionQueryPrecise = true,
+      .pipelineStatisticsQuery = false,
+      .vertexPipelineStoresAndAtomics = true,
+      .fragmentStoresAndAtomics = true,
+      .shaderTessellationAndGeometryPointSize = true,
+      .shaderImageGatherExtended = true,
+      .shaderStorageImageExtendedFormats = true,
+      .shaderStorageImageMultisample = false,
+      .shaderStorageImageReadWithoutFormat = true,
+      .shaderStorageImageWriteWithoutFormat = false,
+      .shaderUniformBufferArrayDynamicIndexing = false,
+      .shaderSampledImageArrayDynamicIndexing = false,
+      .shaderStorageBufferArrayDynamicIndexing = false,
+      .shaderStorageImageArrayDynamicIndexing = false,
+      .shaderClipDistance = true,
+      .shaderCullDistance = false,
+      .shaderFloat64 = false,
+      .shaderInt64 = false,
+      .shaderInt16 = false,
+      .shaderResourceResidency = false,
+      .shaderResourceMinLod = false,
+      .sparseBinding = false,
+      .sparseResidencyBuffer = false,
+      .sparseResidencyImage2D = false,
+      .sparseResidencyImage3D = false,
+      .sparseResidency2Samples = false,
+      .sparseResidency4Samples = false,
+      .sparseResidency8Samples = false,
+      .sparseResidency16Samples = false,
+      .sparseResidencyAliased = false,
+      .variableMultisampleRate = false,
+      .inheritedQueries = true,
+
+      /* Vulkan 1.1 */
+      .storageBuffer16BitAccess = true,
+      .uniformAndStorageBuffer16BitAccess = true,
+      .storagePushConstant16 = true,
+      .storageInputOutput16 = false,
+      .multiview = true,
+      .multiviewGeometryShader = false,
+      .multiviewTessellationShader = false,
+      .variablePointersStorageBuffer = true,
+      /* FIXME: this needs support for non-constant index on UBO/SSBO */
+      .variablePointers = false,
+      .protectedMemory = false,
+      .samplerYcbcrConversion = true,
+      .shaderDrawParameters = false,
+
+      /* Vulkan 1.2 */
+      .hostQueryReset = true,
+      .uniformAndStorageBuffer8BitAccess = true,
+      .uniformBufferStandardLayout = true,
+      /* V3D 4.2 wraps TMU vector accesses to 16-byte boundaries, so loads and
+       * stores of vectors that cross these boundaries would not work correctly
+       * with scalarBlockLayout and would need to be split into smaller vectors
+       * (and/or scalars) that don't cross these boundaries. For load/stores
+       * with dynamic offsets where we can't identify if the offset is
+       * problematic, we would always have to scalarize. Overall, this would
+       * not lead to best performance so let's just not support it.
+       */
+      .scalarBlockLayout = physical_device->devinfo.ver >= 71,
+      /* This tells applications 2 things:
+       *
+       * 1. If they can select just one aspect for barriers. For us barriers
+       *    decide if we need to split a job and we don't care if it is only
+       *    for one of the aspects of the image or both, so we don't really
+       *    benefit from seeing barriers that select just one aspect.
+       *
+       * 2. If they can program different layouts for each aspect. We
+       *    generally don't care about layouts, so again, we don't get any
+       *    benefits from this to limit the scope of image layout transitions.
+       *
+       * Still, Vulkan 1.2 requires this feature to be supported so we
+       * advertise it even though we don't really take advantage of it.
+       */
+      .separateDepthStencilLayouts = true,
+      .storageBuffer8BitAccess = true,
+      .storagePushConstant8 = true,
+      .imagelessFramebuffer = true,
+      .timelineSemaphore = true,
+
+      .samplerMirrorClampToEdge = true,
+
+      /* These are mandatory by Vulkan 1.2, however, we don't support any of
+       * the optional features affected by them (non 32-bit types for
+       * shaderSubgroupExtendedTypes and additional subgroup ballot for
+       * subgroupBroadcastDynamicId), so in practice setting them to true
+       * doesn't have any implications for us until we implement any of these
+       * optional features.
+       */
+      .shaderSubgroupExtendedTypes = true,
+      .subgroupBroadcastDynamicId = true,
+
+      .vulkanMemoryModel = true,
+      .vulkanMemoryModelDeviceScope = true,
+      .vulkanMemoryModelAvailabilityVisibilityChains = true,
+
+      .bufferDeviceAddress = true,
+      .bufferDeviceAddressCaptureReplay = false,
+      .bufferDeviceAddressMultiDevice = false,
+
+      /* Vulkan 1.3 */
+      .inlineUniformBlock = true,
+      /* Inline buffers work like push constants, so after their are bound
+       * some of their contents may be copied into the uniform stream as soon
+       * as the next draw/dispatch is recorded in the command buffer. This means
+       * that if the client updates the buffer contents after binding it to
+       * a command buffer, the next queue submit of that command buffer may
+       * not use the latest update to the buffer contents, but the data that
+       * was present in the buffer at the time it was bound to the command
+       * buffer.
+       */
+      .descriptorBindingInlineUniformBlockUpdateAfterBind = false,
+      .pipelineCreationCacheControl = true,
+      .privateData = true,
+      .maintenance4 = true,
+      .shaderZeroInitializeWorkgroupMemory = true,
+      .synchronization2 = true,
+      .robustImageAccess = true,
+      .shaderIntegerDotProduct = true,
+
+      /* VK_EXT_4444_formats */
+      .formatA4R4G4B4 = true,
+      .formatA4B4G4R4 = true,
+
+      /* VK_EXT_custom_border_color */
+      .customBorderColors = true,
+      .customBorderColorWithoutFormat = false,
+
+      /* VK_EXT_index_type_uint8 */
+      .indexTypeUint8 = true,
+
+      /* VK_EXT_line_rasterization */
+      .rectangularLines = true,
+      .bresenhamLines = true,
+      .smoothLines = false,
+      .stippledRectangularLines = false,
+      .stippledBresenhamLines = false,
+      .stippledSmoothLines = false,
+
+      /* VK_EXT_color_write_enable */
+      .colorWriteEnable = true,
+
+      /* VK_KHR_pipeline_executable_properties */
+      .pipelineExecutableInfo = true,
+
+      /* VK_EXT_provoking_vertex */
+      .provokingVertexLast = true,
+      /* FIXME: update when supporting EXT_transform_feedback */
+      .transformFeedbackPreservesProvokingVertex = false,
+
+      /* VK_EXT_vertex_attribute_divisor */
+      .vertexAttributeInstanceRateDivisor = true,
+      .vertexAttributeInstanceRateZeroDivisor = false,
+
+      /* VK_KHR_performance_query */
+      .performanceCounterQueryPools = physical_device->caps.perfmon,
+      .performanceCounterMultipleQueryPools = false,
+
+      /* VK_EXT_texel_buffer_alignment */
+      .texelBufferAlignment = true,
+
+      /* VK_KHR_workgroup_memory_explicit_layout */
+      .workgroupMemoryExplicitLayout = true,
+      .workgroupMemoryExplicitLayoutScalarBlockLayout = false,
+      .workgroupMemoryExplicitLayout8BitAccess = true,
+      .workgroupMemoryExplicitLayout16BitAccess = true,
+
+      /* VK_EXT_border_color_swizzle */
+      .borderColorSwizzle = true,
+      .borderColorSwizzleFromImage = true,
+
+      /* VK_EXT_shader_module_identifier */
+      .shaderModuleIdentifier = true,
+
+      /* VK_EXT_depth_clip_control */
+      .depthClipControl = true,
+
+      /* VK_EXT_attachment_feedback_loop_layout */
+      .attachmentFeedbackLoopLayout = true,
+
+      /* VK_EXT_primitive_topology_list_restart */
+      .primitiveTopologyListRestart = true,
+      /* FIXME: we don't support tessellation shaders yet */
+      .primitiveTopologyPatchListRestart = false,
+
+      /* VK_EXT_pipeline_robustness */
+      .pipelineRobustness = true,
+   };
+}
+
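The scalarBlockLayout comment in get_features() is easiest to see with a concrete layout. An illustrative sketch, not taken from the driver:

/* Illustrative layout only. With scalarBlockLayout a 16-byte vector may
 * start at a scalar-aligned offset such as 4 and so straddle the 16-byte
 * TMU access window that V3D 4.2 wraps accesses to (bytes [0,16) and
 * [16,32)); std430 rules would have aligned the vec4 to offset 16. */
struct scalar_layout_example {
   float pad;      /* bytes 0..3 */
   float vec4[4];  /* bytes 4..19: crosses the byte-16 boundary */
};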
 VKAPI_ATTR VkResult VKAPI_CALL
 v3dv_EnumerateInstanceExtensionProperties(const char *pLayerName,
                                           uint32_t *pPropertyCount,
                                           VkExtensionProperties *pProperties)
 {
@@ -321,8 +553,6 @@ physical_device_finish(struct v3dv_physical_device *device)
       close(device->render_fd);
    if (device->display_fd >= 0)
       close(device->display_fd);
-   if (device->master_fd >= 0)
-      close(device->master_fd);

    free(device->name);

@@ -404,273 +634,6 @@ compute_memory_budget(struct v3dv_physical_device *device)
    return MIN2(heap_size, heap_used + heap_available);
 }

-#if !using_v3d_simulator
-#ifdef VK_USE_PLATFORM_XCB_KHR
-static int
-create_display_fd_xcb(VkIcdSurfaceBase *surface)
-{
-   int fd = -1;
-
-   xcb_connection_t *conn;
-   xcb_dri3_open_reply_t *reply = NULL;
-   if (surface) {
-      if (surface->platform == VK_ICD_WSI_PLATFORM_XLIB)
-         conn = XGetXCBConnection(((VkIcdSurfaceXlib *)surface)->dpy);
-      else
-         conn = ((VkIcdSurfaceXcb *)surface)->connection;
-   } else {
-      conn = xcb_connect(NULL, NULL);
-   }
-
-   if (xcb_connection_has_error(conn))
-      goto finish;
-
-   const xcb_setup_t *setup = xcb_get_setup(conn);
-   xcb_screen_iterator_t iter = xcb_setup_roots_iterator(setup);
-   xcb_screen_t *screen = iter.data;
-
-   xcb_dri3_open_cookie_t cookie;
-   cookie = xcb_dri3_open(conn, screen->root, None);
-   reply = xcb_dri3_open_reply(conn, cookie, NULL);
-   if (!reply)
-      goto finish;
-
-   if (reply->nfd != 1)
-      goto finish;
-
-   fd = xcb_dri3_open_reply_fds(conn, reply)[0];
-   fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
-
-finish:
-   if (!surface)
-      xcb_disconnect(conn);
-   if (reply)
-      free(reply);
-
-   return fd;
-}
-#endif
-
-#ifdef VK_USE_PLATFORM_WAYLAND_KHR
-struct v3dv_wayland_info {
-   struct wl_drm *wl_drm;
-   int fd;
-   bool is_set;
-   bool authenticated;
-};
-
-static void
-v3dv_drm_handle_device(void *data, struct wl_drm *drm, const char *device)
-{
-   struct v3dv_wayland_info *info = data;
-   info->fd = open(device, O_RDWR | O_CLOEXEC);
-   info->is_set = info->fd != -1;
-   if (!info->is_set) {
-      fprintf(stderr, "v3dv_drm_handle_device: could not open %s (%s)\n",
-              device, strerror(errno));
-      return;
-   }
-
-   drm_magic_t magic;
-   if (drmGetMagic(info->fd, &magic)) {
-      fprintf(stderr, "v3dv_drm_handle_device: drmGetMagic failed\n");
-      close(info->fd);
-      info->fd = -1;
-      info->is_set = false;
-      return;
-   }
-   wl_drm_authenticate(info->wl_drm, magic);
-}
-
-static void
-v3dv_drm_handle_format(void *data, struct wl_drm *drm, uint32_t format)
-{
-}
-
-static void
-v3dv_drm_handle_authenticated(void *data, struct wl_drm *drm)
-{
-   struct v3dv_wayland_info *info = data;
-   info->authenticated = true;
-}
-
-static void
-v3dv_drm_handle_capabilities(void *data, struct wl_drm *drm, uint32_t value)
-{
-}
-
-struct wl_drm_listener v3dv_drm_listener = {
-   .device = v3dv_drm_handle_device,
-   .format = v3dv_drm_handle_format,
-   .authenticated = v3dv_drm_handle_authenticated,
-   .capabilities = v3dv_drm_handle_capabilities
-};
-
-static void
-v3dv_registry_global(void *data,
-                     struct wl_registry *registry,
-                     uint32_t name,
-                     const char *interface,
-                     uint32_t version)
-{
-   struct v3dv_wayland_info *info = data;
-   if (strcmp(interface, wl_drm_interface.name) == 0) {
-      info->wl_drm = wl_registry_bind(registry, name, &wl_drm_interface,
-                                      MIN2(version, 2));
-      wl_drm_add_listener(info->wl_drm, &v3dv_drm_listener, data);
-   };
-}
-
-static void
-v3dv_registry_global_remove_cb(void *data,
-                               struct wl_registry *registry,
-                               uint32_t name)
-{
-}
-
-static int
-create_display_fd_wayland(VkIcdSurfaceBase *surface)
-{
-   struct wl_display *display;
-   struct wl_registry *registry = NULL;
-
-   struct v3dv_wayland_info info = {
-      .wl_drm = NULL,
-      .fd = -1,
-      .is_set = false,
-      .authenticated = false
-   };
-
-   if (surface)
-      display = ((VkIcdSurfaceWayland *) surface)->display;
-   else
-      display = wl_display_connect(NULL);
-
-   if (!display)
-      return -1;
-
-   registry = wl_display_get_registry(display);
-   if (!registry) {
-      if (!surface)
-         wl_display_disconnect(display);
-      return -1;
-   }
-
-   static const struct wl_registry_listener registry_listener = {
-      v3dv_registry_global,
-      v3dv_registry_global_remove_cb
-   };
-   wl_registry_add_listener(registry, &registry_listener, &info);
-
-   wl_display_roundtrip(display); /* For the registry advertisement */
-   wl_display_roundtrip(display); /* For the DRM device event */
-   wl_display_roundtrip(display); /* For the authentication event */
-
-   wl_drm_destroy(info.wl_drm);
-   wl_registry_destroy(registry);
-
-   if (!surface)
-      wl_display_disconnect(display);
-
-   if (!info.is_set)
-      return -1;
-
-   if (!info.authenticated)
-      return -1;
-
-   return info.fd;
-}
-#endif
-
-/* Acquire an authenticated display fd without a surface reference. This is the
- * case where the application is making WSI allocations outside the Vulkan
- * swapchain context (only Zink, for now). Since we lack information about the
- * underlying surface we just try our best to figure out the correct display
- * and platform to use. It should work in most cases.
- */
-static void
-acquire_display_device_no_surface(struct v3dv_physical_device *pdevice)
-{
-#ifdef VK_USE_PLATFORM_WAYLAND_KHR
-   pdevice->display_fd = create_display_fd_wayland(NULL);
-#endif
-
-#ifdef VK_USE_PLATFORM_XCB_KHR
-   if (pdevice->display_fd == -1)
-      pdevice->display_fd = create_display_fd_xcb(NULL);
-#endif
-
-#ifdef VK_USE_PLATFORM_DISPLAY_KHR
-   if (pdevice->display_fd == -1 && pdevice->master_fd >= 0)
-      pdevice->display_fd = dup(pdevice->master_fd);
-#endif
-}
-
-/* Acquire an authenticated display fd from the surface. This is the regular
- * case where the application is using swapchains to create WSI allocations.
- * In this case we use the surface information to figure out the correct
- * display and platform combination.
- */
-static void
-acquire_display_device_surface(struct v3dv_physical_device *pdevice,
-                               VkIcdSurfaceBase *surface)
-{
-   /* Mesa will set both of VK_USE_PLATFORM_{XCB,XLIB} when building with
-    * platform X11, so only check for XCB and rely on XCB to get an
-    * authenticated device also for Xlib.
-    */
-#ifdef VK_USE_PLATFORM_XCB_KHR
-   if (surface->platform == VK_ICD_WSI_PLATFORM_XCB ||
-       surface->platform == VK_ICD_WSI_PLATFORM_XLIB) {
-      pdevice->display_fd = create_display_fd_xcb(surface);
-   }
-#endif
-
-#ifdef VK_USE_PLATFORM_WAYLAND_KHR
-   if (surface->platform == VK_ICD_WSI_PLATFORM_WAYLAND)
-      pdevice->display_fd = create_display_fd_wayland(surface);
-#endif
-
-#ifdef VK_USE_PLATFORM_DISPLAY_KHR
-   if (surface->platform == VK_ICD_WSI_PLATFORM_DISPLAY &&
-       pdevice->master_fd >= 0) {
-      pdevice->display_fd = dup(pdevice->master_fd);
-   }
-#endif
-}
-#endif /* !using_v3d_simulator */
-
-/* Attempts to get an authenticated display fd from the display server that
- * we can use to allocate BOs for presentable images.
- */ -VkResult -v3dv_physical_device_acquire_display(struct v3dv_physical_device *pdevice, - VkIcdSurfaceBase *surface) -{ - VkResult result = VK_SUCCESS; - mtx_lock(&pdevice->mutex); - - if (pdevice->display_fd != -1) - goto done; - - /* When running on the simulator we do everything on a single render node so - * we don't need to get an authenticated display fd from the display server. - */ -#if !using_v3d_simulator - if (surface) - acquire_display_device_surface(pdevice, surface); - else - acquire_display_device_no_surface(pdevice); - - if (pdevice->display_fd == -1) - result = VK_ERROR_INITIALIZATION_FAILED; -#endif - -done: - mtx_unlock(&pdevice->mutex); - return result; -} - static bool v3d_has_feature(struct v3dv_physical_device *device, enum drm_v3d_param feature) { @@ -763,11 +726,11 @@ v3dv_physical_device_init_disk_cache(struct v3dv_physical_device *device) static VkResult create_physical_device(struct v3dv_instance *instance, - drmDevicePtr drm_render_device, - drmDevicePtr drm_primary_device) + drmDevicePtr gpu_device, + drmDevicePtr display_device) { VkResult result = VK_SUCCESS; - int32_t master_fd = -1; + int32_t display_fd = -1; int32_t render_fd = -1; struct v3dv_physical_device *device = @@ -783,14 +746,14 @@ create_physical_device(struct v3dv_instance *instance, vk_physical_device_dispatch_table_from_entrypoints( &dispatch_table, &wsi_physical_device_entrypoints, false); - result = vk_physical_device_init(&device->vk, &instance->vk, NULL, - &dispatch_table); + result = vk_physical_device_init(&device->vk, &instance->vk, NULL, NULL, + NULL, &dispatch_table); if (result != VK_SUCCESS) goto fail; - assert(drm_render_device); - const char *path = drm_render_device->nodes[DRM_NODE_RENDER]; + assert(gpu_device); + const char *path = gpu_device->nodes[DRM_NODE_RENDER]; render_fd = open(path, O_RDWR | O_CLOEXEC); if (render_fd < 0) { fprintf(stderr, "Opening %s failed: %s\n", path, strerror(errno)); @@ -805,12 +768,12 @@ create_physical_device(struct v3dv_instance *instance, const char *primary_path; #if !using_v3d_simulator - if (drm_primary_device) - primary_path = drm_primary_device->nodes[DRM_NODE_PRIMARY]; + if (display_device) + primary_path = display_device->nodes[DRM_NODE_PRIMARY]; else primary_path = NULL; #else - primary_path = drm_render_device->nodes[DRM_NODE_PRIMARY]; + primary_path = gpu_device->nodes[DRM_NODE_PRIMARY]; #endif struct stat primary_stat = {0}, render_stat = {0}; @@ -837,20 +800,23 @@ create_physical_device(struct v3dv_instance *instance, device->render_devid = render_stat.st_rdev; #if using_v3d_simulator - device->device_id = drm_render_device->deviceinfo.pci->device_id; + device->device_id = gpu_device->deviceinfo.pci->device_id; #endif if (instance->vk.enabled_extensions.KHR_display || + instance->vk.enabled_extensions.KHR_xcb_surface || + instance->vk.enabled_extensions.KHR_xlib_surface || + instance->vk.enabled_extensions.KHR_wayland_surface || instance->vk.enabled_extensions.EXT_acquire_drm_display) { #if !using_v3d_simulator /* Open the primary node on the vc4 display device */ - assert(drm_primary_device); - master_fd = open(primary_path, O_RDWR | O_CLOEXEC); + assert(display_device); + display_fd = open(primary_path, O_RDWR | O_CLOEXEC); #else /* There is only one device with primary and render nodes. * Open its primary node. 
*/ - master_fd = open(primary_path, O_RDWR | O_CLOEXEC); + display_fd = open(primary_path, O_RDWR | O_CLOEXEC); #endif } @@ -859,8 +825,7 @@ create_physical_device(struct v3dv_instance *instance, #endif device->render_fd = render_fd; /* The v3d render node */ - device->display_fd = -1; /* Authenticated vc4 primary node */ - device->master_fd = master_fd; /* Master vc4 primary node */ + device->display_fd = display_fd; /* Master vc4 primary node */ if (!v3d_get_device_info(device->render_fd, &device->devinfo, &v3dv_ioctl)) { result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, @@ -895,8 +860,10 @@ create_physical_device(struct v3dv_instance *instance, device->next_program_id = 0; ASSERTED int len = - asprintf(&device->name, "V3D %d.%d", - device->devinfo.ver / 10, device->devinfo.ver % 10); + asprintf(&device->name, "V3D %d.%d.%d", + device->devinfo.ver / 10, + device->devinfo.ver % 10, + device->devinfo.rev); assert(len != -1); v3dv_physical_device_init_disk_cache(device); @@ -928,36 +895,6 @@ create_physical_device(struct v3dv_instance *instance, */ device->drm_syncobj_type.features &= ~VK_SYNC_FEATURE_TIMELINE; -#if using_v3d_simulator - /* There are CTS tests which do the following: - * - * 1. Create a command buffer with a vkCmdWaitEvents() - * 2. Submit the command buffer - * 3. vkGetSemaphoreFdKHR() to try to get a sync_file - * 4. vkSetEvent() - * - * This deadlocks in the simulator because we have to wait for the syncobj - * to get a real fence in vkGetSemaphoreFdKHR(). This will never happen - * though because the simulator, unlike real hardware, executes ioctls - * synchronously in the same thread, which means that it will try to - * execute the wait for event immediately and never get to emit the - * signaling job that comes after the compute job that implements the wait - * in the command buffer, which would be responsible for creating the fence - * for the signaling semaphore. - * - * This behavior was seemingly allowed in previous Vulkan versions, however, - * this was fixed in the Vulkan 1.3.228 spec. From commit 355367640f2e: - * - * "Clarify that vkCmdWaitEvents must not execute before a vkSetEvent it - * waits on (internal issue 2971)" - * - * Either way, we disable sync file support in the simulator for now, until - * the CTS is fixed. - */ - device->drm_syncobj_type.import_sync_file = NULL; - device->drm_syncobj_type.export_sync_file = NULL; -#endif - /* Multiwait is required for emulated timeline semaphores and is supported * by the v3d kernel interface. */ @@ -978,6 +915,7 @@ create_physical_device(struct v3dv_instance *instance, } get_device_extensions(device, &device->vk.supported_extensions); + get_features(device, &device->vk.supported_features); mtx_init(&device->mutex, mtx_plain); @@ -991,8 +929,8 @@ fail: if (render_fd >= 0) close(render_fd); - if (master_fd >= 0) - close(master_fd); + if (display_fd >= 0) + close(display_fd); return result; } @@ -1035,14 +973,13 @@ enumerate_devices(struct vk_instance *vk_instance) break; } #else - /* On actual hardware, we should have a render node (v3d) - * and a primary node (vc4). We will need to use the primary - * to allocate WSI buffers and share them with the render node - * via prime, but that is a privileged operation so we need the - * primary node to be authenticated, and for that we need the - * display server to provide the device fd (with DRI3), so - * here we only check that the device is present but we don't - * try to open it.
+ /* On actual hardware, we should have a gpu device (v3d) and a display + * device (vc4). We will need to use the display device to allocate WSI + * buffers and share them with the render node via prime, but that is a + * privileged operation so we need to have an authenticated display fd + * and for that we need the display server to provide it (with DRI3), + * so here we only check that the device is present but we don't try to + * open it. */ if (devices[i]->bustype != DRM_BUS_PLATFORM) continue; @@ -1050,7 +987,8 @@ if (devices[i]->available_nodes & 1 << DRM_NODE_RENDER) { char **compat = devices[i]->deviceinfo.platform->compatible; while (*compat) { - if (strncmp(*compat, "brcm,2711-v3d", 13) == 0) { + if (strncmp(*compat, "brcm,2711-v3d", 13) == 0 || + strncmp(*compat, "brcm,2712-v3d", 13) == 0) { v3d_idx = i; break; } @@ -1059,8 +997,9 @@ } else if (devices[i]->available_nodes & 1 << DRM_NODE_PRIMARY) { char **compat = devices[i]->deviceinfo.platform->compatible; while (*compat) { - if (strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 || - strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0 ) { + if (strncmp(*compat, "brcm,bcm2712-vc6", 16) == 0 || + strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 || + strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0) { vc4_idx = i; break; } @@ -1071,9 +1010,10 @@ } #if !using_v3d_simulator - if (v3d_idx != -1 && vc4_idx != -1) { - result = - create_physical_device(instance, devices[v3d_idx], devices[vc4_idx]); + if (v3d_idx != -1) { + drmDevicePtr v3d_device = devices[v3d_idx]; + drmDevicePtr vc4_device = vc4_idx != -1 ? devices[vc4_idx] : NULL; + result = create_physical_device(instance, v3d_device, vc4_device); } #endif @@ -1082,238 +1022,6 @@ return result; } -VKAPI_ATTR void VKAPI_CALL -v3dv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice, - VkPhysicalDeviceFeatures2 *pFeatures) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, physical_device, physicalDevice); - - struct vk_features features = { - /* Vulkan 1.0 */ - .robustBufferAccess = true, /* This feature is mandatory */ - .fullDrawIndexUint32 = false, /* Only available since V3D 4.4.9.1 */ - .imageCubeArray = true, - .independentBlend = true, - .geometryShader = true, - .tessellationShader = false, - .sampleRateShading = true, - .dualSrcBlend = false, - .logicOp = true, - .multiDrawIndirect = false, - .drawIndirectFirstInstance = true, - .depthClamp = false, /* Only available since V3D 4.5.1.1 */ - .depthBiasClamp = true, - .fillModeNonSolid = true, - .depthBounds = false, /* Only available since V3D 4.3.16.2 */ - .wideLines = true, - .largePoints = true, - .alphaToOne = true, - .multiViewport = false, - .samplerAnisotropy = true, - .textureCompressionETC2 = true, - .textureCompressionASTC_LDR = true, - /* Note that textureCompressionBC requires that the driver support all - * the BC formats. V3D 4.2 only supports the BC1-3, so we can't claim - * that we support it.
- */ - .textureCompressionBC = false, - .occlusionQueryPrecise = true, - .pipelineStatisticsQuery = false, - .vertexPipelineStoresAndAtomics = true, - .fragmentStoresAndAtomics = true, - .shaderTessellationAndGeometryPointSize = true, - .shaderImageGatherExtended = false, - .shaderStorageImageExtendedFormats = true, - .shaderStorageImageMultisample = false, - .shaderStorageImageReadWithoutFormat = true, - .shaderStorageImageWriteWithoutFormat = false, - .shaderUniformBufferArrayDynamicIndexing = false, - .shaderSampledImageArrayDynamicIndexing = false, - .shaderStorageBufferArrayDynamicIndexing = false, - .shaderStorageImageArrayDynamicIndexing = false, - .shaderClipDistance = true, - .shaderCullDistance = false, - .shaderFloat64 = false, - .shaderInt64 = false, - .shaderInt16 = false, - .shaderResourceResidency = false, - .shaderResourceMinLod = false, - .sparseBinding = false, - .sparseResidencyBuffer = false, - .sparseResidencyImage2D = false, - .sparseResidencyImage3D = false, - .sparseResidency2Samples = false, - .sparseResidency4Samples = false, - .sparseResidency8Samples = false, - .sparseResidency16Samples = false, - .sparseResidencyAliased = false, - .variableMultisampleRate = false, - .inheritedQueries = true, - - /* Vulkan 1.1 */ - .storageBuffer16BitAccess = true, - .uniformAndStorageBuffer16BitAccess = true, - .storagePushConstant16 = true, - .storageInputOutput16 = false, - .multiview = true, - .multiviewGeometryShader = false, - .multiviewTessellationShader = false, - .variablePointersStorageBuffer = true, - /* FIXME: this needs support for non-constant index on UBO/SSBO */ - .variablePointers = false, - .protectedMemory = false, -#ifdef ANDROID - .samplerYcbcrConversion = false, -#else - .samplerYcbcrConversion = true, -#endif - .shaderDrawParameters = false, - - /* Vulkan 1.2 */ - .hostQueryReset = true, - .uniformAndStorageBuffer8BitAccess = true, - .uniformBufferStandardLayout = true, - /* V3D 4.2 wraps TMU vector accesses to 16-byte boundaries, so loads and - * stores of vectors that cross these boundaries would not work correctly - * with scalarBlockLayout and would need to be split into smaller vectors - * (and/or scalars) that don't cross these boundaries. For load/stores - * with dynamic offsets where we can't identify if the offset is - * problematic, we would always have to scalarize. Overall, this would - * not lead to best performance so let's just not support it. - */ - .scalarBlockLayout = false, - /* This tells applications 2 things: - * - * 1. If they can select just one aspect for barriers. For us barriers - * decide if we need to split a job and we don't care if it is only - * for one of the aspects of the image or both, so we don't really - * benefit from seeing barriers that select just one aspect. - * - * 2. If they can program different layouts for each aspect. We - * generally don't care about layouts, so again, we don't get any - * benefits from this to limit the scope of image layout transitions. - * - * Still, Vulkan 1.2 requires this feature to be supported so we - * advertise it even though we don't really take advantage of it.
- */ - .separateDepthStencilLayouts = true, - .storageBuffer8BitAccess = true, - .storagePushConstant8 = true, - .imagelessFramebuffer = true, - .timelineSemaphore = true, - - .samplerMirrorClampToEdge = true, - - /* These are mandatory by Vulkan 1.2, however, we don't support any of - * the optional features affected by them (non 32-bit types for - * shaderSubgroupExtendedTypes and additional subgroup ballot for - * subgroupBroadcastDynamicId), so in practice setting them to true - * doesn't have any implications for us until we implement any of these - * optional features. - */ - .shaderSubgroupExtendedTypes = true, - .subgroupBroadcastDynamicId = true, - - .vulkanMemoryModel = true, - .vulkanMemoryModelDeviceScope = true, - .vulkanMemoryModelAvailabilityVisibilityChains = true, - - .bufferDeviceAddress = true, - .bufferDeviceAddressCaptureReplay = false, - .bufferDeviceAddressMultiDevice = false, - - /* Vulkan 1.3 */ - .inlineUniformBlock = true, - /* Inline buffers work like push constants, so after they are bound - * some of their contents may be copied into the uniform stream as soon - * as the next draw/dispatch is recorded in the command buffer. This means - * that if the client updates the buffer contents after binding it to - * a command buffer, the next queue submit of that command buffer may - * not use the latest update to the buffer contents, but the data that - * was present in the buffer at the time it was bound to the command - * buffer. - */ - .descriptorBindingInlineUniformBlockUpdateAfterBind = false, - .pipelineCreationCacheControl = true, - .privateData = true, - .maintenance4 = true, - .shaderZeroInitializeWorkgroupMemory = true, - .synchronization2 = true, - .robustImageAccess = true, - .shaderIntegerDotProduct = true, - - /* VK_EXT_4444_formats */ - .formatA4R4G4B4 = true, - .formatA4B4G4R4 = true, - - /* VK_EXT_custom_border_color */ - .customBorderColors = true, - .customBorderColorWithoutFormat = false, - - /* VK_EXT_index_type_uint8 */ - .indexTypeUint8 = true, - - /* VK_EXT_line_rasterization */ - .rectangularLines = true, - .bresenhamLines = true, - .smoothLines = false, - .stippledRectangularLines = false, - .stippledBresenhamLines = false, - .stippledSmoothLines = false, - - /* VK_EXT_color_write_enable */ - .colorWriteEnable = true, - - /* VK_KHR_pipeline_executable_properties */ - .pipelineExecutableInfo = true, - - /* VK_EXT_provoking_vertex */ - .provokingVertexLast = true, - /* FIXME: update when supporting EXT_transform_feedback */ - .transformFeedbackPreservesProvokingVertex = false, - - /* VK_EXT_vertex_attribute_divisor */ - .vertexAttributeInstanceRateDivisor = true, - .vertexAttributeInstanceRateZeroDivisor = false, - - /* VK_KHR_performance_query */ - .performanceCounterQueryPools = physical_device->caps.perfmon, - .performanceCounterMultipleQueryPools = false, - - /* VK_EXT_texel_buffer_alignment */ - .texelBufferAlignment = true, - - /* VK_KHR_workgroup_memory_explicit_layout */ - .workgroupMemoryExplicitLayout = true, - .workgroupMemoryExplicitLayoutScalarBlockLayout = false, - .workgroupMemoryExplicitLayout8BitAccess = true, - .workgroupMemoryExplicitLayout16BitAccess = true, - - /* VK_EXT_border_color_swizzle */ - .borderColorSwizzle = true, - .borderColorSwizzleFromImage = true, - - /* VK_EXT_shader_module_identifier */ - .shaderModuleIdentifier = true, - - /* VK_EXT_depth_clip_control */ - .depthClipControl = true, - - /* VK_EXT_attachment_feedback_loop_layout */ - .attachmentFeedbackLoopLayout = true, - - /*
VK_EXT_primitive_topology_list_restart */ - .primitiveTopologyListRestart = true, - /* FIXME: we don't support tessellation shaders yet */ - .primitiveTopologyPatchListRestart = false, - - /* VK_EXT_pipeline_robustness */ - .pipelineRobustness = true, - }; - - vk_get_physical_device_features(pFeatures, &features); -} - uint32_t v3dv_physical_device_vendor_id(struct v3dv_physical_device *dev) { @@ -1329,6 +1037,8 @@ v3dv_physical_device_device_id(struct v3dv_physical_device *dev) switch (dev->devinfo.ver) { case 42: return 0xBE485FD3; /* Broadcom deviceID for 2711 */ + case 71: + return 0x55701C33; /* Broadcom deviceID for 2712 */ default: unreachable("Unsupported V3D version"); } @@ -1357,6 +1067,8 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, const VkSampleCountFlags supported_sample_counts = VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_4_BIT; + const uint8_t max_rts = V3D_MAX_RENDER_TARGETS(pdevice->devinfo.ver); + struct timespec clock_res; clock_getres(CLOCK_MONOTONIC, &clock_res); const float timestamp_period = @@ -1427,7 +1139,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, .maxFragmentInputComponents = max_varying_components, .maxFragmentOutputAttachments = 4, .maxFragmentDualSrcAttachments = 0, - .maxFragmentCombinedOutputResources = MAX_RENDER_TARGETS + + .maxFragmentCombinedOutputResources = max_rts + MAX_STORAGE_BUFFERS + MAX_STORAGE_IMAGES, @@ -1440,7 +1152,8 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, .subPixelPrecisionBits = V3D_COORD_SHIFT, .subTexelPrecisionBits = 8, .mipmapPrecisionBits = 8, - .maxDrawIndexedIndexValue = 0x00ffffff, + .maxDrawIndexedIndexValue = pdevice->devinfo.ver >= 71 ? + 0xffffffff : 0x00ffffff, .maxDrawIndirectCount = 0x7fffffff, .maxSamplerLodBias = 14.0f, .maxSamplerAnisotropy = 16.0f, @@ -1467,7 +1180,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, .framebufferDepthSampleCounts = supported_sample_counts, .framebufferStencilSampleCounts = supported_sample_counts, .framebufferNoAttachmentsSampleCounts = supported_sample_counts, - .maxColorAttachments = MAX_RENDER_TARGETS, + .maxColorAttachments = max_rts, .sampledImageColorSampleCounts = supported_sample_counts, .sampledImageIntegerSampleCounts = supported_sample_counts, .sampledImageDepthSampleCounts = supported_sample_counts, @@ -1579,8 +1292,8 @@ v3dv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, .driverID = VK_DRIVER_ID_MESA_V3DV, .conformanceVersion = { .major = 1, - .minor = 2, - .subminor = 7, + .minor = 3, + .subminor = 6, .patch = 1, }, .supportedDepthResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT, @@ -1689,6 +1402,24 @@ v3dv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, props->allowCommandBufferQueryCopies = true; break; } +#ifdef ANDROID +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wswitch" + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRESENTATION_PROPERTIES_ANDROID: { + VkPhysicalDevicePresentationPropertiesANDROID *props = + (VkPhysicalDevicePresentationPropertiesANDROID *)ext; + uint64_t front_rendering_usage = 0; + struct u_gralloc *gralloc = u_gralloc_create(U_GRALLOC_TYPE_AUTO); + if (gralloc != NULL) { + u_gralloc_get_front_rendering_usage(gralloc, &front_rendering_usage); + u_gralloc_destroy(&gralloc); + } + props->sharedImage = front_rendering_usage ? 
VK_TRUE + : VK_FALSE; + break; + } +#pragma GCC diagnostic pop +#endif case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRM_PROPERTIES_EXT: { VkPhysicalDeviceDrmPropertiesEXT *props = (VkPhysicalDeviceDrmPropertiesEXT *)ext; @@ -2001,6 +1732,11 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, return vk_error(NULL, result); } +#ifdef ANDROID + device->gralloc = u_gralloc_create(U_GRALLOC_TYPE_AUTO); + assert(device->gralloc); +#endif + device->instance = instance; device->pdevice = physical_device; @@ -2034,7 +1770,7 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, v3dv_pipeline_cache_init(&device->default_pipeline_cache, device, 0, device->instance->default_pipeline_cache_enabled); device->default_attribute_float = - v3dv_pipeline_create_default_attribute_values(device, NULL); + v3dv_X(device, create_default_attribute_values)(device, NULL); device->device_address_mem_ctx = ralloc_context(NULL); util_dynarray_init(&device->device_address_bo_list, @@ -2067,6 +1803,9 @@ fail: v3dv_event_free_resources(device); v3dv_query_free_resources(device); vk_device_finish(&device->vk); +#ifdef ANDROID + u_gralloc_destroy(&device->gralloc); +#endif vk_free(&device->vk.alloc, device); return result; @@ -2105,6 +1844,9 @@ v3dv_DestroyDevice(VkDevice _device, mtx_destroy(&device->query_mutex); vk_device_finish(&device->vk); +#ifdef ANDROID + u_gralloc_destroy(&device->gralloc); +#endif vk_free2(&device->vk.alloc, pAllocator, device); } @@ -2244,18 +1986,8 @@ device_alloc_for_wsi(struct v3dv_device *device, #if using_v3d_simulator return device_alloc(device, mem, size); #else - /* If we are allocating for WSI we should have a swapchain and thus, - * we should've initialized the display device. However, Zink doesn't - * use swapchains, so in that case we can get here without acquiring the - * display device and we need to do it now. - */ VkResult result; struct v3dv_physical_device *pdevice = device->pdevice; - if (unlikely(pdevice->display_fd < 0)) { - result = v3dv_physical_device_acquire_display(pdevice, NULL); - if (result != VK_SUCCESS) - return result; - } assert(pdevice->display_fd != -1); mem->is_for_wsi = true; @@ -2329,7 +2061,7 @@ free_memory(struct v3dv_device *device, device_free(device, mem); - vk_object_free(&device->vk, pAllocator, mem); + vk_device_memory_destroy(&device->vk, pAllocator, &mem->vk); } VKAPI_ATTR void VKAPI_CALL @@ -2354,13 +2086,10 @@ v3dv_AllocateMemory(VkDevice _device, assert(pAllocateInfo->sType == VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO); - /* The Vulkan 1.0.33 spec says "allocationSize must be greater than 0". */ - assert(pAllocateInfo->allocationSize > 0); - /* We always allocate device memory in multiples of a page, so round up * requested size to that. 
*/ - const VkDeviceSize alloc_size = ALIGN(pAllocateInfo->allocationSize, 4096); + const VkDeviceSize alloc_size = align64(pAllocateInfo->allocationSize, 4096); if (unlikely(alloc_size > MAX_MEMORY_ALLOCATION_SIZE)) return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); @@ -2369,8 +2098,8 @@ v3dv_AllocateMemory(VkDevice _device, if (unlikely(heap_used + alloc_size > pdevice->memory.memoryHeaps[0].size)) return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); - mem = vk_object_zalloc(&device->vk, pAllocator, sizeof(*mem), - VK_OBJECT_TYPE_DEVICE_MEMORY); + mem = vk_device_memory_create(&device->vk, pAllocateInfo, + pAllocator, sizeof(*mem)); if (mem == NULL) return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY); @@ -2410,6 +2139,7 @@ v3dv_AllocateMemory(VkDevice _device, } VkResult result; + if (wsi_info) { result = device_alloc_for_wsi(device, pAllocator, mem, alloc_size); } else if (fd_info && fd_info->handleType) { @@ -2419,12 +2149,22 @@ v3dv_AllocateMemory(VkDevice _device, fd_info->fd, alloc_size, &mem->bo); if (result == VK_SUCCESS) close(fd_info->fd); + } else if (mem->vk.ahardware_buffer) { +#ifdef ANDROID + const native_handle_t *handle = AHardwareBuffer_getNativeHandle(mem->vk.ahardware_buffer); + assert(handle->numFds > 0); + size_t size = lseek(handle->data[0], 0, SEEK_END); + result = device_import_bo(device, pAllocator, + handle->data[0], size, &mem->bo); +#else + result = VK_ERROR_FEATURE_NOT_PRESENT; +#endif } else { result = device_alloc(device, mem, alloc_size); } if (result != VK_SUCCESS) { - vk_object_free(&device->vk, pAllocator, mem); + vk_device_memory_destroy(&device->vk, pAllocator, &mem->vk); return vk_error(device, result); } @@ -2436,7 +2176,7 @@ v3dv_AllocateMemory(VkDevice _device, /* If this memory can be used via VK_KHR_buffer_device_address then we * will need to manually add the BO to any job submit that makes use of - * VK_KHR_buffer_device_address, since such jobs may produde buffer + * VK_KHR_buffer_device_address, since such jobs may produce buffer * load/store operations that may access any buffer memory allocated with * this flag and we don't have any means to tell which buffers will be * accessed through this mechanism since they don't even have to be bound @@ -2668,11 +2408,44 @@ v3dv_BindImageMemory2(VkDevice _device, const VkBindImageMemoryInfo *pBindInfos) { for (uint32_t i = 0; i < bindInfoCount; i++) { -#ifndef ANDROID +#ifdef ANDROID + V3DV_FROM_HANDLE(v3dv_device_memory, mem, pBindInfos[i].memory); + V3DV_FROM_HANDLE(v3dv_device, device, _device); + if (mem != NULL && mem->vk.ahardware_buffer) { + AHardwareBuffer_Desc description; + const native_handle_t *handle = AHardwareBuffer_getNativeHandle(mem->vk.ahardware_buffer); + + V3DV_FROM_HANDLE(v3dv_image, image, pBindInfos[i].image); + AHardwareBuffer_describe(mem->vk.ahardware_buffer, &description); + + struct u_gralloc_buffer_handle gr_handle = { + .handle = handle, + .pixel_stride = description.stride, + .hal_format = description.format, + }; + + VkResult result = v3dv_gralloc_to_drm_explicit_layout( + device->gralloc, + &gr_handle, + image->android_explicit_layout, + image->android_plane_layouts, + V3DV_MAX_PLANE_COUNT); + if (result != VK_SUCCESS) + return result; + + result = v3dv_update_image_layout( + device, image, image->android_explicit_layout->drmFormatModifier, + /* disjoint = */ false, image->android_explicit_layout); + if (result != VK_SUCCESS) + return result; + } +#endif + const VkBindImageMemorySwapchainInfoKHR *swapchain_info = vk_find_struct_const(pBindInfos->pNext, 
BIND_IMAGE_MEMORY_SWAPCHAIN_INFO_KHR); if (swapchain_info && swapchain_info->swapchain) { +#ifndef ANDROID struct v3dv_image *swapchain_image = v3dv_wsi_get_image_from_swapchain(swapchain_info->swapchain, swapchain_info->imageIndex); @@ -2685,8 +2458,8 @@ v3dv_BindImageMemory2(VkDevice _device, .memoryOffset = swapchain_image->planes[0].mem_offset, }; bind_image_memory(&swapchain_bind); - } else #endif + } else { bind_image_memory(&pBindInfos[i]); } @@ -2716,6 +2489,18 @@ get_buffer_memory_requirements(struct v3dv_buffer *buffer, .size = align64(buffer->size, buffer->alignment), }; + /* UBO and SSBO may be read using ldunifa, which prefetches the next + * 4 bytes after a read. If the buffer's size is exactly a multiple + * of a page size and the shader reads the last 4 bytes with ldunifa + * the prefetching would read out of bounds and cause an MMU error, + * so we allocate extra space to avoid kernel error spamming. + */ + bool can_ldunifa = buffer->usage & + (VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT); + if (can_ldunifa && (buffer->size % 4096 == 0)) + pMemoryRequirements->memoryRequirements.size += buffer->alignment; + vk_foreach_struct(ext, pMemoryRequirements->pNext) { switch (ext->sType) { case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: { @@ -2978,7 +2763,7 @@ v3dv_CreateSampler(VkDevice _device, } } - v3dv_X(device, pack_sampler_state)(sampler, pCreateInfo, bc_info); + v3dv_X(device, pack_sampler_state)(device, sampler, pCreateInfo, bc_info); *pSampler = v3dv_sampler_to_handle(sampler); @@ -3079,9 +2864,9 @@ vk_icdNegotiateLoaderICDInterfaceVersion(uint32_t* pSupportedVersion) * * - Loader interface v4 differs from v3 in: * - The ICD must implement vk_icdGetPhysicalDeviceProcAddr(). - * + * * - Loader interface v5 differs from v4 in: - * - The ICD must support Vulkan API version 1.1 and must not return + * - The ICD must support Vulkan API version 1.1 and must not return * VK_ERROR_INCOMPATIBLE_DRIVER from vkCreateInstance() unless a * Vulkan Loader with interface v4 or smaller is being used and the * application provides an API version that is greater than 1.0. 
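The ldunifa padding in get_buffer_memory_requirements above is subtle, so here is a minimal standalone sketch of the rule, not driver code: padded_buffer_size and PAGE_SIZE are hypothetical names, and the 4 KiB page size is assumed to match the align64(..., 4096) used elsewhere in this diff.

#include <stdbool.h>
#include <stdint.h>

#define PAGE_SIZE 4096u /* assumed page size */

/* Round the size up to the buffer's alignment; if the shader may read the
 * buffer with ldunifa (uniform/storage usage) and the size is an exact
 * multiple of the page size, reserve one extra alignment's worth of bytes
 * so the 4-byte prefetch past the last word cannot touch an unmapped page.
 */
static uint64_t
padded_buffer_size(uint64_t size, uint64_t alignment, bool can_ldunifa)
{
   uint64_t padded = (size + alignment - 1) / alignment * alignment;
   if (can_ldunifa && size % PAGE_SIZE == 0)
      padded += alignment;
   return padded;
}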
diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_event.c b/lib/mesa/src/broadcom/vulkan/v3dv_event.c index 966392400..a3aad37d9 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_event.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_event.c @@ -33,20 +33,16 @@ get_set_event_cs() nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "set event cs"); - b.shader->info.workgroup_size[0] = 1; - b.shader->info.workgroup_size[1] = 1; - b.shader->info.workgroup_size[2] = 1; - - nir_ssa_def *buf = + nir_def *buf = nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0), .desc_set = 0, .binding = 0, .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); - nir_ssa_def *offset = + nir_def *offset = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4); - nir_ssa_def *value = + nir_def *value = nir_load_push_constant(&b, 1, 8, nir_imm_int(&b, 0), .base = 4, .range = 4); nir_store_ssbo(&b, value, buf, offset, @@ -62,23 +58,19 @@ get_wait_event_cs() nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "wait event cs"); - b.shader->info.workgroup_size[0] = 1; - b.shader->info.workgroup_size[1] = 1; - b.shader->info.workgroup_size[2] = 1; - - nir_ssa_def *buf = + nir_def *buf = nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0), .desc_set = 0, .binding = 0, .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); - nir_ssa_def *offset = + nir_def *offset = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4); nir_loop *loop = nir_push_loop(&b); - nir_ssa_def *load = + nir_def *load = nir_load_ssbo(&b, 1, 8, buf, offset, .access = 0, .align_mul = 4); - nir_ssa_def *value = nir_i2i32(&b, load); + nir_def *value = nir_i2i32(&b, load); nir_if *if_stmt = nir_push_if(&b, nir_ieq_imm(&b, value, 1)); nir_jump(&b, nir_jump_break); diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_formats.c b/lib/mesa/src/broadcom/vulkan/v3dv_formats.c index ecb369963..01be6dcf4 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_formats.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_formats.c @@ -22,13 +22,18 @@ */ #include "v3dv_private.h" -#include "vk_util.h" +#ifdef ANDROID +#include "vk_android.h" +#endif #include "vk_enum_defines.h" +#include "vk_util.h" #include "drm-uapi/drm_fourcc.h" #include "util/format/u_format.h" #include "vulkan/wsi/wsi_common.h" +#include <vulkan/vulkan_android.h> + const uint8_t * v3dv_get_format_swizzle(struct v3dv_device *device, VkFormat f, uint8_t plane) { @@ -169,6 +174,7 @@ image_format_plane_features(struct v3dv_physical_device *pdevice, if (desc->nr_channels == 1 && vk_format_is_int(vk_format)) flags |= VK_FORMAT_FEATURE_2_STORAGE_IMAGE_ATOMIC_BIT; } else if (vk_format == VK_FORMAT_A2B10G10R10_UNORM_PACK32 || + vk_format == VK_FORMAT_A2R10G10B10_UNORM_PACK32 || vk_format == VK_FORMAT_A2B10G10R10_UINT_PACK32 || vk_format == VK_FORMAT_B10G11R11_UFLOAT_PACK32) { /* To comply with shaderStorageImageExtendedFormats */ @@ -291,7 +297,8 @@ buffer_format_features(VkFormat vk_format, const struct v3dv_format *v3dv_format VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_BIT | VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT_KHR; } - } else if (vk_format == VK_FORMAT_A2B10G10R10_UNORM_PACK32) { + } else if (vk_format == VK_FORMAT_A2B10G10R10_UNORM_PACK32 || + vk_format == VK_FORMAT_A2R10G10B10_UNORM_PACK32) { flags |= VK_FORMAT_FEATURE_2_VERTEX_BUFFER_BIT | VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT | VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_BIT; @@ -658,6 +665,7 @@ v3dv_GetPhysicalDeviceImageFormatProperties2(VkPhysicalDevice physicalDevice, 
const VkPhysicalDeviceExternalImageFormatInfo *external_info = NULL; const VkPhysicalDeviceImageDrmFormatModifierInfoEXT *drm_format_mod_info = NULL; VkExternalImageFormatProperties *external_props = NULL; + UNUSED VkAndroidHardwareBufferUsageANDROID *android_usage = NULL; VkSamplerYcbcrConversionImageFormatProperties *ycbcr_props = NULL; VkImageTiling tiling = base_info->tiling; @@ -698,6 +706,9 @@ v3dv_GetPhysicalDeviceImageFormatProperties2(VkPhysicalDevice physicalDevice, case VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES: external_props = (void *) s; break; + case VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_USAGE_ANDROID: + android_usage = (void *)s; + break; case VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_IMAGE_FORMAT_PROPERTIES: ycbcr_props = (void *) s; break; @@ -721,12 +732,28 @@ v3dv_GetPhysicalDeviceImageFormatProperties2(VkPhysicalDevice physicalDevice, if (external_props) external_props->externalMemoryProperties = prime_fd_props; break; +#ifdef ANDROID + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID: + if (external_props) { + external_props->externalMemoryProperties.exportFromImportedHandleTypes = 0; + external_props->externalMemoryProperties.compatibleHandleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID; + external_props->externalMemoryProperties.externalMemoryFeatures = VK_EXTERNAL_MEMORY_FEATURE_DEDICATED_ONLY_BIT | VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT | VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT; + } + break; +#endif default: result = VK_ERROR_FORMAT_NOT_SUPPORTED; break; } } + if (android_usage) { +#ifdef ANDROID + android_usage->androidHardwareBufferUsage = + vk_image_usage_to_ahb_usage(base_info->flags, base_info->usage); +#endif + } + done: return result; } diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_image.c b/lib/mesa/src/broadcom/vulkan/v3dv_image.c index 325cc7ce3..c02516960 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_image.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_image.c @@ -28,6 +28,9 @@ #include "util/u_math.h" #include "vk_util.h" #include "vulkan/wsi/wsi_common.h" +#ifdef ANDROID +#include "vk_android.h" +#endif /** * Computes the HW's UIFblock padding for a given height/cpp. @@ -70,27 +73,38 @@ v3d_get_ub_pad(uint32_t cpp, uint32_t height) return 0; } -static void +/** + * Computes the dimension with required padding for mip levels. + * + * This padding is required for width and height dimensions when the mip + * level is greater than 1, and for the depth dimension when the mip level + * is greater than 0. This function expects to be passed a mip level >= 1. + * + * Note: Hardware documentation seems to suggest that the third argument + * should be the utile dimensions, but through testing it was found that + * the block dimension should be used instead. 
+ */ +static uint32_t +v3d_get_dimension_mpad(uint32_t dimension, uint32_t level, uint32_t block_dimension) +{ + assert(level >= 1); + uint32_t pot_dim = u_minify(dimension, 1); + pot_dim = util_next_power_of_two(DIV_ROUND_UP(pot_dim, block_dimension)); + uint32_t padded_dim = block_dimension * pot_dim; + return u_minify(padded_dim, level - 1); +} + +static bool v3d_setup_plane_slices(struct v3dv_image *image, uint8_t plane, - uint32_t plane_offset) + uint32_t plane_offset, + const VkSubresourceLayout *plane_layouts) { assert(image->planes[plane].cpp > 0); - /* Texture Base Adress needs to be 64-byte aligned */ - assert(plane_offset % 64 == 0); uint32_t width = image->planes[plane].width; uint32_t height = image->planes[plane].height; uint32_t depth = image->vk.extent.depth; - /* Note that power-of-two padding is based on level 1. These are not - * equivalent to just util_next_power_of_two(dimension), because at a - * level 0 dimension of 9, the level 1 power-of-two padded value is 4, - * not 8. - */ - uint32_t pot_width = 2 * util_next_power_of_two(u_minify(width, 1)); - uint32_t pot_height = 2 * util_next_power_of_two(u_minify(height, 1)); - uint32_t pot_depth = 2 * util_next_power_of_two(u_minify(depth, 1)); - uint32_t utile_w = v3d_utile_width(image->planes[plane].cpp); uint32_t utile_h = v3d_utile_height(image->planes[plane].cpp); uint32_t uif_block_w = utile_w * 2; @@ -99,6 +113,21 @@ v3d_setup_plane_slices(struct v3dv_image *image, uint8_t plane, uint32_t block_width = vk_format_get_blockwidth(image->vk.format); uint32_t block_height = vk_format_get_blockheight(image->vk.format); + /* Note that power-of-two padding is based on level 1. These are not + * equivalent to just util_next_power_of_two(dimension), because at a + * level 0 dimension of 9, the level 1 power-of-two padded value is 4, + * not 8. Additionally the pot padding is based on the block size. + */ + uint32_t pot_width = 2 * v3d_get_dimension_mpad(width, + 1, + block_width); + uint32_t pot_height = 2 * v3d_get_dimension_mpad(height, + 1, + block_height); + uint32_t pot_depth = 2 * v3d_get_dimension_mpad(depth, + 1, + 1); + assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT || image->vk.samples == VK_SAMPLE_COUNT_4_BIT); bool msaa = image->vk.samples != VK_SAMPLE_COUNT_1_BIT; @@ -109,14 +138,30 @@ v3d_setup_plane_slices(struct v3dv_image *image, uint8_t plane, assert(depth > 0); assert(image->vk.mip_levels >= 1); - uint32_t offset = plane_offset; + /* Texture Base Address needs to be 64-byte aligned. If we have an explicit + * plane layout we will return false to fail image creation with appropriate + * error code. 
+ */ + uint32_t offset; + if (plane_layouts) { + offset = plane_layouts[plane].offset; + if (offset % 64 != 0) + return false; + } else { + offset = plane_offset; + } + assert(plane_offset % 64 == 0); + for (int32_t i = image->vk.mip_levels - 1; i >= 0; i--) { struct v3d_resource_slice *slice = &image->planes[plane].slices[i]; + slice->width = u_minify(width, i); + slice->height = u_minify(height, i); + uint32_t level_width, level_height, level_depth; if (i < 2) { - level_width = u_minify(width, i); - level_height = u_minify(height, i); + level_width = slice->width; + level_height = slice->height; } else { level_width = u_minify(pot_width, i); level_height = u_minify(pot_height, i); @@ -179,6 +224,18 @@ v3d_setup_plane_slices(struct v3dv_image *image, uint8_t plane, slice->offset = offset; slice->stride = level_width * image->planes[plane].cpp; + + /* We assume that rowPitch in the plane layout refers to level 0 */ + if (plane_layouts && i == 0) { + if (plane_layouts[plane].rowPitch < slice->stride) + return false; + if (plane_layouts[plane].rowPitch % image->planes[plane].cpp) + return false; + if (image->tiled && (plane_layouts[plane].rowPitch % (4 * uif_block_w))) + return false; + slice->stride = plane_layouts[plane].rowPitch; + } + slice->padded_height = level_height; if (slice->tiling == V3D_TILING_UIF_NO_XOR || slice->tiling == V3D_TILING_UIF_XOR) { @@ -222,7 +279,8 @@ v3d_setup_plane_slices(struct v3dv_image *image, uint8_t plane, image->planes[plane].alignment = 4096; } else { image->planes[plane].alignment = - (image->vk.usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) ? 64 : image->planes[plane].cpp; + (image->vk.usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) ? + 64 : image->planes[plane].cpp; } uint32_t align_offset = @@ -243,15 +301,36 @@ v3d_setup_plane_slices(struct v3dv_image *image, uint8_t plane, image->planes[plane].cube_map_stride = align(image->planes[plane].slices[0].offset + image->planes[plane].slices[0].size, 64); + + if (plane_layouts && image->vk.array_layers > 1) { + if (plane_layouts[plane].arrayPitch % 64 != 0) + return false; + if (plane_layouts[plane].arrayPitch < + image->planes[plane].cube_map_stride) { + return false; + } + image->planes[plane].cube_map_stride = plane_layouts[plane].arrayPitch; + } + image->planes[plane].size += image->planes[plane].cube_map_stride * (image->vk.array_layers - 1); } else { image->planes[plane].cube_map_stride = image->planes[plane].slices[0].size; + if (plane_layouts) { + /* We assume that depthPitch in the plane layout refers to level 0 */ + if (plane_layouts[plane].depthPitch != + image->planes[plane].slices[0].size) { + return false; + } + } } + + return true; } -static void -v3d_setup_slices(struct v3dv_image *image, bool disjoint) +static bool +v3d_setup_slices(struct v3dv_image *image, bool disjoint, + const VkSubresourceLayout *plane_layouts) { if (disjoint && image->plane_count == 1) disjoint = false; @@ -259,11 +338,15 @@ v3d_setup_slices(struct v3dv_image *image, bool disjoint) uint32_t offset = 0; for (uint8_t plane = 0; plane < image->plane_count; plane++) { offset = disjoint ? 0 : offset; - v3d_setup_plane_slices(image, plane, offset); + if (!v3d_setup_plane_slices(image, plane, offset, plane_layouts)) { + assert(plane_layouts); + return false; + } offset += align(image->planes[plane].size, 64); } image->non_disjoint_size = disjoint ? 
0 : offset; + return true; } uint32_t @@ -280,6 +363,34 @@ v3dv_layer_offset(const struct v3dv_image *image, uint32_t level, uint32_t layer } VkResult +v3dv_update_image_layout(struct v3dv_device *device, + struct v3dv_image *image, + uint64_t modifier, + bool disjoint, + const VkImageDrmFormatModifierExplicitCreateInfoEXT *explicit_mod_info) +{ + assert(!explicit_mod_info || + image->plane_count == explicit_mod_info->drmFormatModifierPlaneCount); + + assert(!explicit_mod_info || + modifier == explicit_mod_info->drmFormatModifier); + + image->tiled = modifier != DRM_FORMAT_MOD_LINEAR; + + image->vk.drm_format_mod = modifier; + + bool ok = + v3d_setup_slices(image, disjoint, + explicit_mod_info ? explicit_mod_info->pPlaneLayouts : NULL); + if (!ok) { + assert(explicit_mod_info); + return VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT; + } + + return VK_SUCCESS; +} + +VkResult v3dv_image_init(struct v3dv_device *device, const VkImageCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, @@ -296,11 +407,20 @@ v3dv_image_init(struct v3dv_device *device, */ VkImageTiling tiling = pCreateInfo->tiling; uint64_t modifier = DRM_FORMAT_MOD_INVALID; + const VkImageDrmFormatModifierListCreateInfoEXT *mod_info = NULL; + const VkImageDrmFormatModifierExplicitCreateInfoEXT *explicit_mod_info = NULL; +#ifdef ANDROID + if (image->is_native_buffer_memory) { + assert(image->android_explicit_layout); + explicit_mod_info = image->android_explicit_layout; + modifier = explicit_mod_info->drmFormatModifier; + } +#endif if (tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { - const VkImageDrmFormatModifierListCreateInfoEXT *mod_info = + mod_info = vk_find_struct_const(pCreateInfo->pNext, IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT); - const VkImageDrmFormatModifierExplicitCreateInfoEXT *explicit_mod_info = + explicit_mod_info = vk_find_struct_const(pCreateInfo->pNext, IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT); assert(mod_info || explicit_mod_info); @@ -327,35 +447,20 @@ v3dv_image_init(struct v3dv_device *device, tiling = VK_IMAGE_TILING_LINEAR; } -#ifdef ANDROID - const VkNativeBufferANDROID *native_buffer = - vk_find_struct_const(pCreateInfo->pNext, NATIVE_BUFFER_ANDROID); - - int native_buf_fd = -1; - int native_buf_stride = 0; - int native_buf_size = 0; - - if (native_buffer != NULL) { - VkResult result = v3dv_gralloc_info(device, native_buffer, &native_buf_fd, - &native_buf_stride, &native_buf_size, - &modifier); - if (result != VK_SUCCESS) - return result; - - if (modifier != DRM_FORMAT_MOD_BROADCOM_UIF) - tiling = VK_IMAGE_TILING_LINEAR; - } -#endif + if (modifier == DRM_FORMAT_MOD_INVALID) + modifier = (tiling == VK_IMAGE_TILING_OPTIMAL) ? 
DRM_FORMAT_MOD_BROADCOM_UIF + : DRM_FORMAT_MOD_LINEAR; const struct v3dv_format *format = - v3dv_X(device, get_format)(pCreateInfo->format); + v3dv_X(device, get_format)(image->vk.format); v3dv_assert(format != NULL && format->plane_count); assert(pCreateInfo->samples == VK_SAMPLE_COUNT_1_BIT || pCreateInfo->samples == VK_SAMPLE_COUNT_4_BIT); image->format = format; - image->plane_count = vk_format_get_plane_count(pCreateInfo->format); + + image->plane_count = vk_format_get_plane_count(image->vk.format); const struct vk_format_ycbcr_info *ycbcr_info = vk_format_get_ycbcr_info(image->vk.format); @@ -378,12 +483,6 @@ v3dv_image_init(struct v3dv_device *device, ycbcr_info->planes[plane].denominator_scales[1]; } } - image->tiled = tiling == VK_IMAGE_TILING_OPTIMAL || - (tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT && - modifier != DRM_FORMAT_MOD_LINEAR); - - image->vk.tiling = tiling; - image->vk.drm_format_mod = modifier; /* Our meta paths can create image views with compatible formats for any * image, so always set this flag to keep the common Vulkan image code @@ -391,26 +490,18 @@ v3dv_image_init(struct v3dv_device *device, */ image->vk.create_flags |= VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT; - bool disjoint = image->vk.create_flags & VK_IMAGE_CREATE_DISJOINT_BIT; - v3d_setup_slices(image, disjoint); - #ifdef ANDROID - if (native_buffer != NULL) { - assert(image->plane_count == 1); - image->planes[0].slices[0].stride = native_buf_stride; - image->non_disjoint_size = - image->planes[0].slices[0].size = - image->planes[0].size = native_buf_size; - - VkResult result = v3dv_import_native_buffer_fd(v3dv_device_to_handle(device), - native_buf_fd, pAllocator, - v3dv_image_to_handle(image)); - if (result != VK_SUCCESS) - return result; - } + /* At this time, an AHB handle is not yet provided. 
+ * Image layout will be filled up during vkBindImageMemory2 + */ + if (image->is_ahb) + return VK_SUCCESS; #endif - return VK_SUCCESS; + bool disjoint = image->vk.create_flags & VK_IMAGE_CREATE_DISJOINT_BIT; + + return v3dv_update_image_layout(device, image, modifier, disjoint, + explicit_mod_info); } static VkResult @@ -419,21 +510,92 @@ create_image(struct v3dv_device *device, const VkAllocationCallbacks *pAllocator, VkImage *pImage) { + VkResult result; struct v3dv_image *image = NULL; image = vk_image_create(&device->vk, pCreateInfo, pAllocator, sizeof(*image)); if (image == NULL) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - VkResult result = v3dv_image_init(device, pCreateInfo, pAllocator, image); - if (result != VK_SUCCESS) { - vk_image_destroy(&device->vk, pAllocator, &image->vk); - return result; +#ifdef ANDROID + const VkExternalMemoryImageCreateInfo *external_info = + vk_find_struct_const(pCreateInfo->pNext, EXTERNAL_MEMORY_IMAGE_CREATE_INFO); + + const VkNativeBufferANDROID *native_buffer = + vk_find_struct_const(pCreateInfo->pNext, NATIVE_BUFFER_ANDROID); + + if (native_buffer != NULL) + image->is_native_buffer_memory = true; + + image->is_ahb = external_info && (external_info->handleTypes & + VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID); + + assert(!(image->is_ahb && image->is_native_buffer_memory)); + + if (image->is_ahb || image->is_native_buffer_memory) { + image->android_explicit_layout = vk_alloc2(&device->vk.alloc, pAllocator, + sizeof(VkImageDrmFormatModifierExplicitCreateInfoEXT), + 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!image->android_explicit_layout) { + result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail; + } + + image->android_plane_layouts = vk_alloc2(&device->vk.alloc, pAllocator, + sizeof(VkSubresourceLayout) * V3DV_MAX_PLANE_COUNT, + 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!image->android_plane_layouts) { + result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail; + } + } + + if (image->is_native_buffer_memory) { + struct u_gralloc_buffer_handle gr_handle = { + .handle = native_buffer->handle, + .hal_format = native_buffer->format, + .pixel_stride = native_buffer->stride, + }; + + result = v3dv_gralloc_to_drm_explicit_layout(device->gralloc, + &gr_handle, + image->android_explicit_layout, + image->android_plane_layouts, + V3DV_MAX_PLANE_COUNT); + if (result != VK_SUCCESS) + goto fail; + } +#endif + + result = v3dv_image_init(device, pCreateInfo, pAllocator, image); + if (result != VK_SUCCESS) + goto fail; + +#ifdef ANDROID + if (image->is_native_buffer_memory) { + result = v3dv_import_native_buffer_fd(v3dv_device_to_handle(device), + native_buffer->handle->data[0], pAllocator, + v3dv_image_to_handle(image)); + if (result != VK_SUCCESS) + goto fail; } +#endif *pImage = v3dv_image_to_handle(image); return VK_SUCCESS; + +fail: +#ifdef ANDROID + if (image->android_explicit_layout) + vk_free2(&device->vk.alloc, pAllocator, image->android_explicit_layout); + if (image->android_plane_layouts) + vk_free2(&device->vk.alloc, pAllocator, image->android_plane_layouts); +#endif + + vk_image_destroy(&device->vk, pAllocator, &image->vk); + return result; } static VkResult @@ -534,8 +696,10 @@ v3dv_GetImageSubresourceLayout(VkDevice device, v3dv_layer_offset(image, subresource->mipLevel, subresource->arrayLayer, plane) - image->planes[plane].mem_offset; layout->rowPitch = slice->stride; - layout->depthPitch = image->planes[plane].cube_map_stride; - layout->arrayPitch = 
image->planes[plane].cube_map_stride; + layout->depthPitch = image->vk.image_type == VK_IMAGE_TYPE_3D ? + image->planes[plane].cube_map_stride : 0; + layout->arrayPitch = image->vk.array_layers > 1 ? + image->planes[plane].cube_map_stride : 0; if (image->vk.image_type != VK_IMAGE_TYPE_3D) { layout->size = slice->size; @@ -567,12 +731,33 @@ v3dv_DestroyImage(VkDevice _device, if (image == NULL) return; + /* If we have created a shadow tiled image for this image we must also free + * it (along with its memory allocation). + */ + if (image->shadow) { + bool disjoint = image->vk.create_flags & VK_IMAGE_CREATE_DISJOINT_BIT; + for (int i = 0; i < (disjoint ? image->plane_count : 1); i++) { + if (image->shadow->planes[i].mem) { + v3dv_FreeMemory(_device, + v3dv_device_memory_to_handle(image->shadow->planes[i].mem), + pAllocator); + } + } + v3dv_DestroyImage(_device, v3dv_image_to_handle(image->shadow), + pAllocator); + image->shadow = NULL; + } + #ifdef ANDROID - assert(image->plane_count == 1); if (image->is_native_buffer_memory) v3dv_FreeMemory(_device, v3dv_device_memory_to_handle(image->planes[0].mem), pAllocator); + + if (image->android_explicit_layout) + vk_free2(&device->vk.alloc, pAllocator, image->android_explicit_layout); + if (image->android_plane_layouts) + vk_free2(&device->vk.alloc, pAllocator, image->android_plane_layouts); #endif vk_image_destroy(&device->vk, pAllocator, &image->vk); @@ -641,8 +826,7 @@ create_image_view(struct v3dv_device *device, * makes sense to implement swizzle composition using VkSwizzle directly. */ VkFormat format; - uint8_t image_view_swizzle[4]; - if (pCreateInfo->format == VK_FORMAT_D24_UNORM_S8_UINT && + if (image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT && range->aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) { format = VK_FORMAT_R8G8B8A8_UINT; uint8_t stencil_aspect_swizzle[4] = { @@ -652,11 +836,11 @@ create_image_view(struct v3dv_device *device, vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle, view_swizzle); util_format_compose_swizzles(stencil_aspect_swizzle, view_swizzle, - image_view_swizzle); + iview->view_swizzle); } else { - format = pCreateInfo->format; + format = iview->vk.format; vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle, - image_view_swizzle); + iview->view_swizzle); } iview->vk.view_format = format; @@ -681,7 +865,7 @@ create_image_view(struct v3dv_device *device, const uint8_t *format_swizzle = v3dv_get_format_swizzle(device, format, plane); - util_format_compose_swizzles(format_swizzle, image_view_swizzle, + util_format_compose_swizzles(format_swizzle, iview->view_swizzle, iview->planes[plane].swizzle); iview->planes[plane].swap_rb = v3dv_format_swizzle_needs_rb_swap(format_swizzle); @@ -725,6 +909,13 @@ v3dv_DestroyImageView(VkDevice _device, if (image_view == NULL) return; + if (image_view->shadow) { + v3dv_DestroyImageView(_device, + v3dv_image_view_to_handle(image_view->shadow), + pAllocator); + image_view->shadow = NULL; + } + vk_image_view_destroy(&device->vk, pAllocator, &image_view->vk); } diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_limits.h b/lib/mesa/src/broadcom/vulkan/v3dv_limits.h index 9cda9f0d6..4df172e6b 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_limits.h +++ b/lib/mesa/src/broadcom/vulkan/v3dv_limits.h @@ -41,7 +41,7 @@ #define MAX_STORAGE_IMAGES 4 #define MAX_INPUT_ATTACHMENTS 4 -#define MAX_UNIFORM_BUFFERS 12 +#define MAX_UNIFORM_BUFFERS 16 #define MAX_INLINE_UNIFORM_BUFFERS 4 #define MAX_STORAGE_BUFFERS 8 @@ -50,8 +50,6 @@ #define MAX_DYNAMIC_BUFFERS (MAX_DYNAMIC_UNIFORM_BUFFERS + \ 
MAX_DYNAMIC_STORAGE_BUFFERS) -#define MAX_RENDER_TARGETS 4 - #define MAX_MULTIVIEW_VIEW_COUNT 16 /* These are tunable parameters in the HW design, but all the V3D diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_meta_clear.c b/lib/mesa/src/broadcom/vulkan/v3dv_meta_clear.c index 9d7e36928..8eeb03e57 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_meta_clear.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_meta_clear.c @@ -73,7 +73,7 @@ clear_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, * conversion" */ assert(image->plane_count == 1); - if (!v3dv_meta_can_use_tlb(image, 0, &origin, &fb_format)) + if (!v3dv_meta_can_use_tlb(image, 0, 0, &origin, NULL, &fb_format)) return false; uint32_t internal_type, internal_bpp; @@ -127,6 +127,7 @@ clear_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, v3dv_job_start_frame(job, width, height, max_layer, false, true, 1, internal_bpp, + 4 * v3d_internal_bpp_words(internal_bpp), image->vk.samples > VK_SAMPLE_COUNT_1_BIT); struct v3dv_meta_framebuffer framebuffer; @@ -329,7 +330,7 @@ get_clear_rect_vs() nir_variable_create(b.shader, nir_var_shader_out, vec4, "gl_Position"); vs_out_pos->data.location = VARYING_SLOT_POS; - nir_ssa_def *pos = nir_gen_rect_vertices(&b, NULL, NULL); + nir_def *pos = nir_gen_rect_vertices(&b, NULL, NULL); nir_store_var(&b, vs_out_pos, pos, 0xf); return b.shader; @@ -352,8 +353,8 @@ get_clear_rect_gs(uint32_t push_constant_layer_base) nir->info.inputs_read = 1ull << VARYING_SLOT_POS; nir->info.outputs_written = (1ull << VARYING_SLOT_POS) | (1ull << VARYING_SLOT_LAYER); - nir->info.gs.input_primitive = SHADER_PRIM_TRIANGLES; - nir->info.gs.output_primitive = SHADER_PRIM_TRIANGLE_STRIP; + nir->info.gs.input_primitive = MESA_PRIM_TRIANGLES; + nir->info.gs.output_primitive = MESA_PRIM_TRIANGLE_STRIP; nir->info.gs.vertices_in = 3; nir->info.gs.vertices_out = 3; nir->info.gs.invocations = 1; @@ -386,7 +387,7 @@ get_clear_rect_gs(uint32_t push_constant_layer_base) nir_copy_deref(&b, nir_build_deref_var(&b, gs_out_pos), in_pos_i); /* gl_Layer from push constants */ - nir_ssa_def *layer = + nir_def *layer = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = push_constant_layer_base, .range = 4); nir_store_var(&b, gs_out_layer, layer, 0x1); @@ -414,7 +415,7 @@ get_color_clear_rect_fs(uint32_t rt_idx, VkFormat format) nir_variable_create(b.shader, nir_var_shader_out, fs_out_type, "out_color"); fs_out_color->data.location = FRAG_RESULT_DATA0 + rt_idx; - nir_ssa_def *color_load = nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 0, .range = 16); + nir_def *color_load = nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 0, .range = 16); nir_store_var(&b, fs_out_color, color_load, 0xf); return b.shader; @@ -432,7 +433,7 @@ get_depth_clear_rect_fs() "out_depth"); fs_out_depth->data.location = FRAG_RESULT_DEPTH; - nir_ssa_def *depth_load = + nir_def *depth_load = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4); nir_store_var(&b, fs_out_depth, depth_load, 0x1); @@ -747,7 +748,7 @@ get_color_clear_pipeline_cache_key(uint32_t rt_idx, uint32_t bit_offset = 0; key |= rt_idx; - bit_offset += 2; + bit_offset += 3; key |= ((uint64_t) format) << bit_offset; bit_offset += 32; @@ -1189,9 +1190,11 @@ v3dv_CmdClearAttachments(VkCommandBuffer commandBuffer, { V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer); - /* We can only clear attachments in the current subpass */ - assert(attachmentCount <= 5); /* 4 color + D/S */ + /* We can have at most max_color_RTs + 1 D/S attachments */ + 
assert(attachmentCount <= + V3D_MAX_RENDER_TARGETS(cmd_buffer->device->devinfo.ver) + 1); + /* We can only clear attachments in the current subpass */ struct v3dv_render_pass *pass = cmd_buffer->state.pass; assert(cmd_buffer->state.subpass_idx < pass->subpass_count); diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_meta_copy.c b/lib/mesa/src/broadcom/vulkan/v3dv_meta_copy.c index 4d83e5379..f9779bf26 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_meta_copy.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_meta_copy.c @@ -351,18 +351,37 @@ get_compatible_tlb_format(VkFormat format) * Checks if we can implement an image copy or clear operation using the TLB * hardware. * + * The extent and miplevel are only used to validate tile stores (to match the + * region to store against the miplevel dimensions to avoid cases where + * the region to store is not aligned to tile boundaries). If extent is + * NULL no checks are done (which is fine if the image will only be used for a + * TLB load or when we know in advance that the store will be for the entire + * size of the image miplevel). + * * For tlb copies we are doing a per-plane copy, so for multi-plane formats, * the compatible format will be single-plane. */ bool v3dv_meta_can_use_tlb(struct v3dv_image *image, uint8_t plane, + uint8_t miplevel, const VkOffset3D *offset, + const VkExtent3D *extent, VkFormat *compat_format) { if (offset->x != 0 || offset->y != 0) return false; + /* FIXME: this is suboptimal, what we really want to check is that the + * extent of the region to copy is the full slice or a multiple of the + * tile size. + */ + if (extent) { + struct v3d_resource_slice *slice = &image->planes[plane].slices[miplevel]; + if (slice->width != extent->width || slice->height != extent->height) + return false; + } + if (image->format->planes[plane].rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO) { if (compat_format) *compat_format = image->planes[plane].vk_format; @@ -403,8 +422,11 @@ copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer, uint8_t plane = v3dv_plane_from_aspect(region->imageSubresource.aspectMask); assert(plane < image->plane_count); - if (!v3dv_meta_can_use_tlb(image, plane, &region->imageOffset, &fb_format)) + if (!v3dv_meta_can_use_tlb(image, plane, region->imageSubresource.mipLevel, + &region->imageOffset, &region->imageExtent, + &fb_format)) { return false; + } uint32_t internal_type, internal_bpp; v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects) @@ -431,8 +453,9 @@ copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer, const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w); const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h); - v3dv_job_start_frame(job, width, height, num_layers, false, true, - 1, internal_bpp, false); + v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, + internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), + false); struct v3dv_meta_framebuffer framebuffer; v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format, @@ -459,26 +482,89 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, VkFilter filter, bool dst_is_padded_image); + /** - * Returns true if the implementation supports the requested operation (even if - * it failed to process it, for example, due to an out-of-memory error). + * A structure that contains all the information we may need in various + * processes involving image to buffer copies implemented with blit paths.
*/ -static bool -copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_buffer *buffer, - struct v3dv_image *image, - const VkBufferImageCopy2 *region) +struct image_to_buffer_info { + /* Source image info */ + VkFormat src_format; + uint8_t plane; + VkColorComponentFlags cmask; + VkComponentMapping cswizzle; + VkImageAspectFlags src_copy_aspect; + uint32_t block_width; + uint32_t block_height; + + /* Destination buffer info */ + VkFormat dst_format; + uint32_t buf_width; + uint32_t buf_height; + uint32_t buf_bpp; + VkImageAspectFlags dst_copy_aspect; +}; + +static VkImageBlit2 +blit_region_for_image_to_buffer(const VkOffset3D *offset, + const VkExtent3D *extent, + uint32_t mip_level, + uint32_t base_layer, + uint32_t layer_offset, + struct image_to_buffer_info *info) { - bool handled = false; + VkImageBlit2 output = { + .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2, + .srcSubresource = { + .aspectMask = info->src_copy_aspect, + .mipLevel = mip_level, + .baseArrayLayer = base_layer + layer_offset, + .layerCount = 1, + }, + .srcOffsets = { + { + DIV_ROUND_UP(offset->x, info->block_width), + DIV_ROUND_UP(offset->y, info->block_height), + offset->z + layer_offset, + }, + { + DIV_ROUND_UP(offset->x + extent->width, info->block_width), + DIV_ROUND_UP(offset->y + extent->height, info->block_height), + offset->z + layer_offset + 1, + }, + }, + .dstSubresource = { + .aspectMask = info->dst_copy_aspect, + .mipLevel = 0, + .baseArrayLayer = 0, + .layerCount = 1, + }, + .dstOffsets = { + { 0, 0, 0 }, + { + DIV_ROUND_UP(extent->width, info->block_width), + DIV_ROUND_UP(extent->height, info->block_height), + 1 + }, + }, + }; - /* This path uses a shader blit which doesn't support linear images. Return - * early to avoid all the heavy lifting in preparation for the - * blit_shader() call that is bound to fail in that scenario. - */ - if (image->vk.tiling == VK_IMAGE_TILING_LINEAR && - image->vk.image_type != VK_IMAGE_TYPE_1D) { - return handled; - } + return output; +} + +/** + * Produces an image_to_buffer_info struct from a VkBufferImageCopy2 that we can + * use to implement image to buffer copies with blit paths. + * + * Returns false if the copy operation can't be implemented with a blit.
+ */ +static bool +gather_image_to_buffer_info(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_image *image, + const VkBufferImageCopy2 *region, + struct image_to_buffer_info *out_info) +{ + bool supported = false; VkImageAspectFlags dst_copy_aspect = region->imageSubresource.aspectMask; /* For multi-planar images we copy one plane at a time using an image alias @@ -572,7 +658,7 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, break; default: unreachable("unsupported aspect"); - return handled; + return supported; }; break; case 2: @@ -588,7 +674,7 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, break; default: unreachable("unsupported bit-size"); - return handled; + return supported; }; /* The hardware doesn't support linear depth/stencil stores, so we @@ -600,7 +686,7 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, dst_copy_aspect = VK_IMAGE_ASPECT_COLOR_BIT; /* We should be able to handle the blit if we got this far */ - handled = true; + supported = true; /* Obtain the 2D buffer region spec */ uint32_t buf_width, buf_height; @@ -619,98 +705,246 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, vk_format_get_blockwidth(image->planes[plane].vk_format); uint32_t block_height = vk_format_get_blockheight(image->planes[plane].vk_format); - buf_width = buf_width / block_width; - buf_height = buf_height / block_height; + buf_width = DIV_ROUND_UP(buf_width, block_width); + buf_height = DIV_ROUND_UP(buf_height, block_height); + + out_info->src_format = src_format; + out_info->dst_format = dst_format; + out_info->src_copy_aspect = src_copy_aspect; + out_info->dst_copy_aspect = dst_copy_aspect; + out_info->buf_width = buf_width; + out_info->buf_height = buf_height; + out_info->buf_bpp = buffer_bpp; + out_info->block_width = block_width; + out_info->block_height = block_height; + out_info->cmask = cmask; + out_info->cswizzle = cswizzle; + out_info->plane = plane; + + return supported; +} - /* Compute layers to copy */ - uint32_t num_layers; - if (image->vk.image_type != VK_IMAGE_TYPE_3D) - num_layers = region->imageSubresource.layerCount; - else - num_layers = region->imageExtent.depth; - assert(num_layers > 0); +/* Creates a linear image to alias buffer memory. It also includes that image + * as a private object in the cmd_buffer. + * + * This is used for cases where we want to implement an image to buffer copy, + * but we need to rely on a mechanism that uses an image as destination, like + * blitting. + */ +static VkResult +create_image_from_buffer(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_buffer *buffer, + const VkBufferImageCopy2 *region, + struct image_to_buffer_info *info, + uint32_t layer, + VkImage *out_image) +{ + VkImageCreateInfo image_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .imageType = VK_IMAGE_TYPE_2D, + .format = info->dst_format, + .extent = { info->buf_width, info->buf_height, 1 }, + .mipLevels = 1, + .arrayLayers = 1, + .samples = VK_SAMPLE_COUNT_1_BIT, + .tiling = VK_IMAGE_TILING_LINEAR, + .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .initialLayout = VK_IMAGE_LAYOUT_GENERAL, + }; - /* Our blit interface can see the real format of the images to detect - * copies between compressed and uncompressed images and adapt the - * blit region accordingly. 
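The switch from plain division to DIV_ROUND_UP in gather_image_to_buffer_info matters when a block-compressed region is not an exact block multiple. A small runnable example, assuming a 4x4 block format (BC/ETC2 style):

   #include <stdint.h>
   #include <stdio.h>

   #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

   int main(void)
   {
      uint32_t w = 10, h = 6;   /* region in texels */
      uint32_t bw = 4, bh = 4;  /* compressed block size */
      printf("truncated: %ux%u blocks\n", w / bw, h / bh);  /* 2x1: too small */
      printf("rounded:   %ux%u blocks\n",
             DIV_ROUND_UP(w, bw), DIV_ROUND_UP(h, bh));     /* 3x2: correct */
      return 0;
   }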
Here we are just doing a raw copy of - * compressed data, but we are passing an uncompressed view of the - * buffer for the blit destination image (since compressed formats are - * not renderable), so we also want to provide an uncompressed view of - * the source image. - */ VkResult result; struct v3dv_device *device = cmd_buffer->device; VkDevice _device = v3dv_device_to_handle(device); - if (vk_format_is_compressed(image->vk.format)) { - assert(image->plane_count == 1); - VkImage uiview; - VkImageCreateInfo uiview_info = { - .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, - .imageType = VK_IMAGE_TYPE_3D, - .format = dst_format, - .extent = { buf_width, buf_height, image->vk.extent.depth }, - .mipLevels = image->vk.mip_levels, - .arrayLayers = image->vk.array_layers, - .samples = image->vk.samples, - .tiling = image->vk.tiling, - .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT, - .sharingMode = VK_SHARING_MODE_EXCLUSIVE, - .queueFamilyIndexCount = 0, - .initialLayout = VK_IMAGE_LAYOUT_GENERAL, - }; - result = v3dv_CreateImage(_device, &uiview_info, &device->vk.alloc, &uiview); - if (result != VK_SUCCESS) - return handled; - v3dv_cmd_buffer_add_private_obj( - cmd_buffer, (uintptr_t)uiview, - (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage); + VkImage buffer_image; + result = + v3dv_CreateImage(_device, &image_info, &device->vk.alloc, &buffer_image); + if (result != VK_SUCCESS) + return result; - result = - vk_common_BindImageMemory(_device, uiview, - v3dv_device_memory_to_handle(image->planes[plane].mem), - image->planes[plane].mem_offset); - if (result != VK_SUCCESS) - return handled; + *out_image = buffer_image; + + v3dv_cmd_buffer_add_private_obj( + cmd_buffer, (uintptr_t)buffer_image, + (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage); + + /* Bind the buffer memory to the image + */ + VkDeviceSize buffer_offset = buffer->mem_offset + region->bufferOffset + + layer * info->buf_width * info->buf_height * info->buf_bpp; + + result = + vk_common_BindImageMemory(_device, buffer_image, + v3dv_device_memory_to_handle(buffer->mem), + buffer_offset); + return result; +} + +/** + * Creates an image with a single mip level that aliases the memory of a + * mip level in another image, re-interpreting the memory with an uncompressed + * format. The image is added to the command buffer as a private object for + * disposal. + */ +static bool +create_image_mip_level_alias(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_image *image, + VkFormat format, + uint32_t plane, + uint32_t mip_level, + uint32_t layer, + VkImage *alias) +{ + VkResult result; + assert(!vk_format_is_compressed(format)); + + struct v3dv_device *device = cmd_buffer->device; + VkDevice vk_device = v3dv_device_to_handle(device); + uint32_t mip_width = image->planes[plane].slices[mip_level].width; + uint32_t mip_height = image->planes[plane].slices[mip_level].height; + + uint32_t block_width = + vk_format_get_blockwidth(image->planes[plane].vk_format); + uint32_t block_height = + vk_format_get_blockheight(image->planes[plane].vk_format); + + VkImageCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .imageType = image->vk.image_type, + .format = format, + .extent = { DIV_ROUND_UP(mip_width, block_width), + DIV_ROUND_UP(mip_height, block_height), + 1 }, + .mipLevels = 1, + .arrayLayers = 1, + .samples = image->vk.samples, + .tiling = image->tiled ? 
VK_IMAGE_TILING_OPTIMAL : VK_IMAGE_TILING_LINEAR, + .usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .initialLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + result = v3dv_CreateImage(vk_device, &info, &device->vk.alloc, alias); + if (result != VK_SUCCESS) + return false; + + /* The alias we have just created has just one mip, but we may be aliasing + * any mip in the original image. Because the slice setup changes based on + * the mip (particularly, for mips >= 2 it uses power of 2 sizes internally) + * and this can influence the tiling layout selected for the slice, we want + * to make sure we copy the slice description from the actual mip level in + * the original image, and then rewrite any fields that we need for the + * alias. Particularly, we want to make the offset 0 because we are going to + * bind the underlying image memory exactly at the start of the selected mip. + * We also want to relax the image alignment requirements to the minimum + * (the one imposed by the Texture Base Address field) since we may not be + * aliasing a level 0 (for which we typically want a page alignment for + * optimal performance). + */ + V3DV_FROM_HANDLE(v3dv_image, v3dv_alias, *alias); + v3dv_alias->planes[plane].slices[0] = image->planes[plane].slices[mip_level]; + v3dv_alias->planes[plane].slices[0].width = info.extent.width; + v3dv_alias->planes[plane].slices[0].height = info.extent.height; + v3dv_alias->planes[plane].slices[0].offset = 0; + v3dv_alias->planes[plane].alignment = 64; + + v3dv_cmd_buffer_add_private_obj( + cmd_buffer, (uintptr_t)*alias, + (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage); + + result = + vk_common_BindImageMemory(vk_device, *alias, + v3dv_device_memory_to_handle(image->planes[plane].mem), + v3dv_layer_offset(image, mip_level, layer, plane)); + return result == VK_SUCCESS; +} + +/** + * Returns true if the implementation supports the requested operation (even if + * it failed to process it, for example, due to an out-of-memory error). + */ +static bool +copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_buffer *buffer, + struct v3dv_image *image, + const VkBufferImageCopy2 *region) +{ + bool handled = false; + struct image_to_buffer_info info; - image = v3dv_image_from_handle(uiview); + /* This path uses a shader blit which doesn't support linear images. Return + * early to avoid all the heavy lifting in preparation for the + * blit_shader() call that is bound to fail in that scenario. 
+ */ + if (!image->tiled && image->vk.image_type != VK_IMAGE_TYPE_1D) { + return handled; } + handled = gather_image_to_buffer_info(cmd_buffer, image, region, + &info); + + if (!handled) + return handled; + + /* We should be able to handle the blit if we got this far */ + handled = true; + + /* Compute layers to copy */ + uint32_t num_layers; + if (image->vk.image_type != VK_IMAGE_TYPE_3D) + num_layers = region->imageSubresource.layerCount; + else + num_layers = region->imageExtent.depth; + assert(num_layers > 0); + /* Copy requested layers */ + VkResult result; + VkImageBlit2 blit_region; + uint32_t mip_level = region->imageSubresource.mipLevel; + uint32_t base_layer = region->imageSubresource.baseArrayLayer; for (uint32_t i = 0; i < num_layers; i++) { - /* Create the destination blit image from the destination buffer */ - VkImageCreateInfo image_info = { - .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, - .imageType = VK_IMAGE_TYPE_2D, - .format = dst_format, - .extent = { buf_width, buf_height, 1 }, - .mipLevels = 1, - .arrayLayers = 1, - .samples = VK_SAMPLE_COUNT_1_BIT, - .tiling = VK_IMAGE_TILING_LINEAR, - .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT, - .sharingMode = VK_SHARING_MODE_EXCLUSIVE, - .queueFamilyIndexCount = 0, - .initialLayout = VK_IMAGE_LAYOUT_GENERAL, - }; - - VkImage buffer_image; - result = - v3dv_CreateImage(_device, &image_info, &device->vk.alloc, &buffer_image); - if (result != VK_SUCCESS) - return handled; + uint32_t layer_offset = i; + + if (vk_format_is_compressed(image->vk.format)) { + /* Our blit interface can see the real format of the images to detect + * copies between compressed and uncompressed images and adapt the + * blit region accordingly. Here we are just doing a raw copy of + * compressed data, but we are passing an uncompressed view of the + * buffer for the blit destination image (since compressed formats are + * not renderable), so we also want to provide an uncompressed view of + * the source image. + * + * It is important that we create the alias over the selected mip + * level (instead of aliasing the entire image) because an uncompressed + * view of the image won't have the same number of mip levels as the + * original image and the implicit mip size calculations the hw will + * do to sample from a non-zero mip level may not match exactly between + * compressed and uncompressed views. + */ + VkImage alias; + if (!create_image_mip_level_alias(cmd_buffer, image, info.dst_format, + info.plane, mip_level, + base_layer + layer_offset, + &alias)) { + return handled; + } - v3dv_cmd_buffer_add_private_obj( - cmd_buffer, (uintptr_t)buffer_image, - (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage); + /* We are aliasing the selected mip level and layer with a + * single-mip and single-layer image. 
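A worked example of the mip-size mismatch described in the comment above, which is why create_image_mip_level_alias wraps a single mip level instead of the whole image: for a 100x100 image with 4x4 blocks, level 2 computed from the compressed image covers 7 blocks per row, while level 2 of a 25x25 block-sized view would cover only 6.

   #include <stdint.h>
   #include <stdio.h>

   #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

   static uint32_t mip_dim(uint32_t base, uint32_t level)
   {
      uint32_t d = base >> level;
      return d ? d : 1;
   }

   int main(void)
   {
      uint32_t base = 100, block = 4, level = 2;
      /* blocks per row at level 2 of the compressed image: ceil(25/4) = 7 */
      printf("%u\n", DIV_ROUND_UP(mip_dim(base, level), block));
      /* level 2 of a view sized in blocks: (100/4 = 25) >> 2 = 6 */
      printf("%u\n", mip_dim(DIV_ROUND_UP(base, block), level));
      return 0;
   }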
+ */ + image = v3dv_image_from_handle(alias); + mip_level = 0; + base_layer = 0; + layer_offset = 0; + } - /* Bind the buffer memory to the image */ - VkDeviceSize buffer_offset = buffer->mem_offset + region->bufferOffset + - i * buf_width * buf_height * buffer_bpp; + /* Create the destination blit image from the destination buffer */ + VkImage buffer_image; result = - vk_common_BindImageMemory(_device, buffer_image, - v3dv_device_memory_to_handle(buffer->mem), - buffer_offset); + create_image_from_buffer(cmd_buffer, buffer, region, &info, + i, &buffer_image); if (result != VK_SUCCESS) return handled; @@ -722,48 +956,17 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, * image, but that we need to blit to a S8D24 destination (the only * stencil format we support). */ - const VkImageBlit2 blit_region = { - .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2, - .srcSubresource = { - .aspectMask = src_copy_aspect, - .mipLevel = region->imageSubresource.mipLevel, - .baseArrayLayer = region->imageSubresource.baseArrayLayer + i, - .layerCount = 1, - }, - .srcOffsets = { - { - DIV_ROUND_UP(region->imageOffset.x, block_width), - DIV_ROUND_UP(region->imageOffset.y, block_height), - region->imageOffset.z + i, - }, - { - DIV_ROUND_UP(region->imageOffset.x + region->imageExtent.width, - block_width), - DIV_ROUND_UP(region->imageOffset.y + region->imageExtent.height, - block_height), - region->imageOffset.z + i + 1, - }, - }, - .dstSubresource = { - .aspectMask = dst_copy_aspect, - .mipLevel = 0, - .baseArrayLayer = 0, - .layerCount = 1, - }, - .dstOffsets = { - { 0, 0, 0 }, - { - DIV_ROUND_UP(region->imageExtent.width, block_width), - DIV_ROUND_UP(region->imageExtent.height, block_height), - 1 - }, - }, - }; + blit_region = + blit_region_for_image_to_buffer(&region->imageOffset, + &region->imageExtent, + mip_level, base_layer, layer_offset, + &info); handled = blit_shader(cmd_buffer, - v3dv_image_from_handle(buffer_image), dst_format, - image, src_format, - cmask, &cswizzle, + v3dv_image_from_handle(buffer_image), + info.dst_format, + image, info.src_format, + info.cmask, &info.cswizzle, &blit_region, VK_FILTER_NEAREST, false); if (!handled) { /* This is unexpected, we should have a supported blit spec */ @@ -776,6 +979,107 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, return true; } +static bool +copy_image_linear_texel_buffer(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_image *dst, + struct v3dv_image *src, + const VkImageCopy2 *region); + +static VkImageCopy2 +image_copy_region_for_image_to_buffer(const VkBufferImageCopy2 *region, + struct image_to_buffer_info *info, + uint32_t layer) +{ + VkImageCopy2 output = { + .sType = VK_STRUCTURE_TYPE_IMAGE_COPY_2, + .srcSubresource = { + .aspectMask = info->src_copy_aspect, + .mipLevel = region->imageSubresource.mipLevel, + .baseArrayLayer = region->imageSubresource.baseArrayLayer + layer, + .layerCount = 1, + }, + .srcOffset = { + DIV_ROUND_UP(region->imageOffset.x, info->block_width), + DIV_ROUND_UP(region->imageOffset.y, info->block_height), + region->imageOffset.z, + }, + .dstSubresource = { + .aspectMask = info->dst_copy_aspect, + .mipLevel = 0, + .baseArrayLayer = 0, + .layerCount = 1, + }, + .dstOffset = { 0, 0, 0 }, + .extent = { + DIV_ROUND_UP(region->imageExtent.width, info->block_width), + DIV_ROUND_UP(region->imageExtent.height, info->block_height), + 1 + }, + }; + + return output; +} + +/** + * Returns true if the implementation supports the requested operation (even if + * it failed to process it, for example, due to an
out-of-memory error). + */ +static bool +copy_image_to_buffer_texel_buffer(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_buffer *dst_buffer, + struct v3dv_image *src_image, + const VkBufferImageCopy2 *region) +{ + bool handled = false; + VkImage dst_buffer_image; + struct image_to_buffer_info info; + + /* This is a requirement for copy_image_linear_texel_buffer below. We check + * it in advance in order to do an early return + */ + if (src_image->tiled) + return false; + + handled = + gather_image_to_buffer_info(cmd_buffer, src_image, region, + &info); + if (!handled) + return handled; + + /* At this point the implementation should support the copy, any possible + * errors below are for different reasons, like an out-of-memory error + */ + handled = true; + + uint32_t num_layers; + if (src_image->vk.image_type != VK_IMAGE_TYPE_3D) + num_layers = region->imageSubresource.layerCount; + else + num_layers = region->imageExtent.depth; + assert(num_layers > 0); + + VkResult result; + VkImageCopy2 image_region; + for (uint32_t layer = 0; layer < num_layers; layer++) { + /* Create the destination image from the destination buffer */ + result = + create_image_from_buffer(cmd_buffer, dst_buffer, region, &info, + layer, &dst_buffer_image); + if (result != VK_SUCCESS) + return handled; + + image_region = + image_copy_region_for_image_to_buffer(region, &info, layer); + + handled = + copy_image_linear_texel_buffer(cmd_buffer, + v3dv_image_from_handle(dst_buffer_image), + src_image, &image_region); + } + + return handled; +} + VKAPI_ATTR void VKAPI_CALL v3dv_CmdCopyImageToBuffer2KHR(VkCommandBuffer commandBuffer, const VkCopyImageToBufferInfo2 *info) @@ -798,6 +1102,9 @@ v3dv_CmdCopyImageToBuffer2KHR(VkCommandBuffer commandBuffer, if (copy_image_to_buffer_blit(cmd_buffer, buffer, image, region)) continue; + if (copy_image_to_buffer_texel_buffer(cmd_buffer, buffer, image, region)) + continue; + unreachable("Unsupported image to buffer copy."); } cmd_buffer->state.is_transfer = false; @@ -819,7 +1126,7 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, } /* Destination can't be raster format */ - if (dst->vk.tiling == VK_IMAGE_TILING_LINEAR) + if (!dst->tiled) return false; /* We can only do full copies, so if the format is D24S8 both aspects need @@ -947,6 +1254,15 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, return true; } +inline bool +v3dv_cmd_buffer_copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_image *dst, + struct v3dv_image *src, + const VkImageCopy2 *region) +{ + return copy_image_tfu(cmd_buffer, dst, src, region); +} + /** * Returns true if the implementation supports the requested operation (even if * it failed to process it, for example, due to an out-of-memory error).
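v3dv_CmdCopyImageToBuffer2KHR now tries three paths per region, falling through whenever one of them reports the spec as unsupported. A compilable skeleton of that dispatch, with stubs standing in for the driver functions:

   #include <stdbool.h>
   #include <stdlib.h>

   static bool try_tlb_path(void)          { return false; } /* stub */
   static bool try_blit_path(void)         { return false; } /* stub */
   static bool try_texel_buffer_path(void) { return true;  } /* stub */

   static void
   copy_one_region(void)
   {
      if (try_tlb_path())
         return;
      if (try_blit_path())
         return;
      if (try_texel_buffer_path())
         return;
      abort(); /* unreachable("Unsupported image to buffer copy.") */
   }

   int main(void) { copy_one_region(); return 0; }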
@@ -965,9 +1281,12 @@ copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, assert(dst_plane < dst->plane_count); VkFormat fb_format; - if (!v3dv_meta_can_use_tlb(src, src_plane, &region->srcOffset, &fb_format) || - !v3dv_meta_can_use_tlb(dst, dst_plane, &region->dstOffset, &fb_format)) + if (!v3dv_meta_can_use_tlb(src, src_plane, region->srcSubresource.mipLevel, + &region->srcOffset, NULL, &fb_format) || + !v3dv_meta_can_use_tlb(dst, dst_plane, region->dstSubresource.mipLevel, + &region->dstOffset, &region->extent, &fb_format)) { return false; + } /* From the Vulkan spec, VkImageCopy valid usage: * @@ -1013,8 +1332,8 @@ copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w); const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h); - v3dv_job_start_frame(job, width, height, num_layers, - false, true, 1, internal_bpp, + v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, + internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), src->vk.samples > VK_SAMPLE_COUNT_1_BIT); struct v3dv_meta_framebuffer framebuffer; @@ -1066,7 +1385,7 @@ create_image_alias(struct v3dv_cmd_buffer *cmd_buffer, .mipLevels = src->vk.mip_levels, .arrayLayers = src->vk.array_layers, .samples = src->vk.samples, - .tiling = src->vk.tiling, + .tiling = src->tiled ? VK_IMAGE_TILING_OPTIMAL : VK_IMAGE_TILING_LINEAR, .usage = src->vk.usage, }; @@ -1094,8 +1413,7 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *src, const VkImageCopy2 *region) { - if (src->vk.tiling == VK_IMAGE_TILING_LINEAR && - src->vk.image_type != VK_IMAGE_TYPE_1D) + if (!src->tiled && src->vk.image_type != VK_IMAGE_TYPE_1D) return false; uint8_t src_plane = @@ -1207,14 +1525,21 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer, * (since the region dimensions are already specified in terms of the source * image).
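The hunk continuing below rounds the blit region up to a power of two when either side is block-compressed; this appears to line up with the power-of-two slice sizing mentioned earlier for higher mip levels, so the compressed and uncompressed views agree on the copied footprint. A sketch of the helper, assumed to behave like Mesa's util_next_power_of_two for non-zero inputs:

   #include <stdint.h>

   static uint32_t
   next_power_of_two(uint32_t x)   /* assumes x > 0, as region dims are */
   {
      x--;
      x |= x >> 1;  x |= x >> 2;  x |= x >> 4;
      x |= x >> 8;  x |= x >> 16;
      return x + 1;                /* e.g. 24 -> 32, 32 -> 32 */
   }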
*/ + uint32_t region_width = region->extent.width * src_scale_w; + uint32_t region_height = region->extent.height * src_scale_h; + if (src_block_w > 1) + region_width = util_next_power_of_two(region_width); + if (src_block_h > 1) + region_height = util_next_power_of_two(region_height); + const VkOffset3D src_start = { region->srcOffset.x * src_scale_w, region->srcOffset.y * src_scale_h, region->srcOffset.z, }; const VkOffset3D src_end = { - src_start.x + region->extent.width * src_scale_w, - src_start.y + region->extent.height * src_scale_h, + src_start.x + region_width, + src_start.y + region_height, src_start.z + region->extent.depth, }; @@ -1224,8 +1549,8 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer, region->dstOffset.z, }; const VkOffset3D dst_end = { - dst_start.x + region->extent.width * src_scale_w, - dst_start.y + region->extent.height * src_scale_h, + dst_start.x + region_width, + dst_start.y + region_height, dst_start.z + region->extent.depth, }; @@ -1253,7 +1578,7 @@ copy_image_linear_texel_buffer(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *src, const VkImageCopy2 *region) { - if (src->vk.tiling != VK_IMAGE_TILING_LINEAR) + if (src->tiled) return false; /* Implementations are allowed to restrict linear images like this */ @@ -1507,7 +1832,7 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT); /* Destination can't be raster format */ - if (image->vk.tiling == VK_IMAGE_TILING_LINEAR) + if (!image->tiled) return false; /* We can't copy D24S8 because buffer to image copies only copy one aspect @@ -1539,11 +1864,13 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, else height = region->bufferImageHeight; - uint8_t plane = + const uint8_t plane = v3dv_plane_from_aspect(region->imageSubresource.aspectMask); - if (width != image->planes[plane].width || - height != image->planes[plane].height) + const uint32_t mip_level = region->imageSubresource.mipLevel; + const struct v3d_resource_slice *slice = &image->planes[plane].slices[mip_level]; + + if (width != slice->width || height != slice->height) return false; /* Handle region semantics for compressed images */ @@ -1566,9 +1893,6 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, assert(format->plane_count == 1); const struct v3dv_format_plane *format_plane = &format->planes[0]; - const uint32_t mip_level = region->imageSubresource.mipLevel; - const struct v3d_resource_slice *slice = &image->planes[plane].slices[mip_level]; - uint32_t num_layers; if (image->vk.image_type != VK_IMAGE_TYPE_3D) num_layers = region->imageSubresource.layerCount; @@ -1631,8 +1955,11 @@ copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, uint8_t plane = v3dv_plane_from_aspect(region->imageSubresource.aspectMask); assert(plane < image->plane_count); - if (!v3dv_meta_can_use_tlb(image, plane, &region->imageOffset, &fb_format)) + if (!v3dv_meta_can_use_tlb(image, plane, region->imageSubresource.mipLevel, + &region->imageOffset, &region->imageExtent, + &fb_format)) { return false; + } uint32_t internal_type, internal_bpp; v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects) @@ -1659,8 +1986,9 @@ copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w); const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h); - v3dv_job_start_frame(job, width, height, num_layers, false, true, - 1, internal_bpp, false); + v3dv_job_start_frame(job, width, height,
num_layers, false, true, 1, + internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), + false); struct v3dv_meta_framebuffer framebuffer; v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format, @@ -1832,7 +2160,7 @@ get_texel_buffer_copy_vs() glsl_vec4_type(), "gl_Position"); vs_out_pos->data.location = VARYING_SLOT_POS; - nir_ssa_def *pos = nir_gen_rect_vertices(&b, NULL, NULL); + nir_def *pos = nir_gen_rect_vertices(&b, NULL, NULL); nir_store_var(&b, vs_out_pos, pos, 0xf); return b.shader; @@ -1855,8 +2183,8 @@ get_texel_buffer_copy_gs() nir->info.inputs_read = 1ull << VARYING_SLOT_POS; nir->info.outputs_written = (1ull << VARYING_SLOT_POS) | (1ull << VARYING_SLOT_LAYER); - nir->info.gs.input_primitive = SHADER_PRIM_TRIANGLES; - nir->info.gs.output_primitive = SHADER_PRIM_TRIANGLE_STRIP; + nir->info.gs.input_primitive = MESA_PRIM_TRIANGLES; + nir->info.gs.output_primitive = MESA_PRIM_TRIANGLE_STRIP; nir->info.gs.vertices_in = 3; nir->info.gs.vertices_out = 3; nir->info.gs.invocations = 1; @@ -1889,7 +2217,7 @@ get_texel_buffer_copy_gs() nir_copy_deref(&b, nir_build_deref_var(&b, gs_out_pos), in_pos_i); /* gl_Layer from push constants */ - nir_ssa_def *layer = + nir_def *layer = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = TEXEL_BUFFER_COPY_GS_LAYER_PC_OFFSET, .range = 4); @@ -1903,7 +2231,7 @@ get_texel_buffer_copy_gs() return nir; } -static nir_ssa_def * +static nir_def * load_frag_coord(nir_builder *b) { nir_foreach_shader_in_variable(var, b->shader) { @@ -1967,24 +2295,24 @@ get_texel_buffer_copy_fs(struct v3dv_device *device, VkFormat format, /* Load the box describing the pixel region we want to copy from the * texel buffer. */ - nir_ssa_def *box = + nir_def *box = nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = TEXEL_BUFFER_COPY_FS_BOX_PC_OFFSET, .range = 16); /* Load the buffer stride (this comes in texel units) */ - nir_ssa_def *stride = + nir_def *stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = TEXEL_BUFFER_COPY_FS_STRIDE_PC_OFFSET, .range = 4); /* Load the buffer offset (this comes in texel units) */ - nir_ssa_def *offset = + nir_def *offset = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = TEXEL_BUFFER_COPY_FS_OFFSET_PC_OFFSET, .range = 4); - nir_ssa_def *coord = nir_f2i32(&b, load_frag_coord(&b)); + nir_def *coord = nir_f2i32(&b, load_frag_coord(&b)); /* Load pixel data from texel buffer based on the x,y offset of the pixel * within the box. Texel buffers are 1D arrays of texels. @@ -1994,28 +2322,26 @@ get_texel_buffer_copy_fs(struct v3dv_device *device, VkFormat format, * texel buffer should always be within its bounds and we we don't need * to add a check for that here. 
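The NIR emitted below computes a 1D index into the texel buffer from the fragment position. In plain C the address math is simply the pixel's offset inside the copy box plus a row stride, all in texel units:

   #include <stdint.h>

   static uint32_t
   texel_index(uint32_t frag_x, uint32_t frag_y,  /* gl_FragCoord, truncated */
               uint32_t box_x, uint32_t box_y,    /* copy box origin */
               uint32_t offset, uint32_t stride)  /* both in texels */
   {
      return offset + (frag_x - box_x) + (frag_y - box_y) * stride;
   }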
*/ - nir_ssa_def *x_offset = + nir_def *x_offset = nir_isub(&b, nir_channel(&b, coord, 0), nir_channel(&b, box, 0)); - nir_ssa_def *y_offset = + nir_def *y_offset = nir_isub(&b, nir_channel(&b, coord, 1), nir_channel(&b, box, 1)); - nir_ssa_def *texel_offset = + nir_def *texel_offset = nir_iadd(&b, nir_iadd(&b, offset, x_offset), nir_imul(&b, y_offset, stride)); - nir_ssa_def *tex_deref = &nir_build_deref_var(&b, sampler)->dest.ssa; + nir_def *tex_deref = &nir_build_deref_var(&b, sampler)->def; nir_tex_instr *tex = nir_tex_instr_create(b.shader, 2); tex->sampler_dim = GLSL_SAMPLER_DIM_BUF; tex->op = nir_texop_txf; - tex->src[0].src_type = nir_tex_src_coord; - tex->src[0].src = nir_src_for_ssa(texel_offset); - tex->src[1].src_type = nir_tex_src_texture_deref; - tex->src[1].src = nir_src_for_ssa(tex_deref); + tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, texel_offset); + tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_texture_deref, tex_deref); tex->dest_type = nir_type_uint32; tex->is_array = false; tex->coord_components = 1; - nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "texel buffer result"); + nir_def_init(&tex->instr, &tex->def, 4, 32); nir_builder_instr_insert(&b, &tex->instr); uint32_t swiz[4]; @@ -2027,7 +2353,7 @@ get_texel_buffer_copy_fs(struct v3dv_device *device, VkFormat format, component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_B, cswizzle->b); swiz[3] = component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_A, cswizzle->a); - nir_ssa_def *s = nir_swizzle(&b, &tex->dest.ssa, swiz, 4); + nir_def *s = nir_swizzle(&b, &tex->def, swiz, 4); nir_store_var(&b, fs_out_color, s, 0xf); return b.shader; @@ -2883,76 +3209,6 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer, } } -/** - * Returns true if the implementation supports the requested operation (even if - * it failed to process it, for example, due to an out-of-memory error). 
- */ -static bool -copy_buffer_to_image_cpu(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_image *image, - struct v3dv_buffer *buffer, - const VkBufferImageCopy2 *region) -{ - /* FIXME */ - if (vk_format_is_depth_or_stencil(image->vk.format)) - return false; - - if (vk_format_is_compressed(image->vk.format)) - return false; - - if (image->vk.tiling == VK_IMAGE_TILING_LINEAR) - return false; - - uint32_t buffer_width, buffer_height; - if (region->bufferRowLength == 0) - buffer_width = region->imageExtent.width; - else - buffer_width = region->bufferRowLength; - - if (region->bufferImageHeight == 0) - buffer_height = region->imageExtent.height; - else - buffer_height = region->bufferImageHeight; - - uint8_t plane = v3dv_plane_from_aspect(region->imageSubresource.aspectMask); - assert(plane < image->plane_count); - - uint32_t buffer_stride = buffer_width * image->planes[plane].cpp; - uint32_t buffer_layer_stride = buffer_stride * buffer_height; - - uint32_t num_layers; - if (image->vk.image_type != VK_IMAGE_TYPE_3D) - num_layers = region->imageSubresource.layerCount; - else - num_layers = region->imageExtent.depth; - assert(num_layers > 0); - - struct v3dv_job *job = - v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device, - V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE, - cmd_buffer, -1); - if (!job) - return true; - - job->cpu.copy_buffer_to_image.image = image; - job->cpu.copy_buffer_to_image.buffer = buffer; - job->cpu.copy_buffer_to_image.buffer_stride = buffer_stride; - job->cpu.copy_buffer_to_image.buffer_layer_stride = buffer_layer_stride; - job->cpu.copy_buffer_to_image.buffer_offset = region->bufferOffset; - job->cpu.copy_buffer_to_image.image_extent = region->imageExtent; - job->cpu.copy_buffer_to_image.image_offset = region->imageOffset; - job->cpu.copy_buffer_to_image.mip_level = - region->imageSubresource.mipLevel; - job->cpu.copy_buffer_to_image.base_layer = - region->imageSubresource.baseArrayLayer; - job->cpu.copy_buffer_to_image.layer_count = num_layers; - job->cpu.copy_buffer_to_image.plane = plane; - - list_addtail(&job->list_link, &cmd_buffer->jobs); - - return true; -} - VKAPI_ATTR void VKAPI_CALL v3dv_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer, const VkCopyBufferToImageInfo2 *info) @@ -3013,11 +3269,6 @@ v3dv_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer, * slow it might not be worth it and we should instead put more effort * in handling more cases with the other paths. 
*/ - if (copy_buffer_to_image_cpu(cmd_buffer, image, buffer, &info->pRegions[r])) { - batch_size = 1; - goto handled; - } - if (copy_buffer_to_image_shader(cmd_buffer, image, buffer, batch_size, &info->pRegions[r], false)) { goto handled; @@ -3072,7 +3323,7 @@ blit_tfu(struct v3dv_cmd_buffer *cmd_buffer, return false; /* Destination can't be raster format */ - if (dst->vk.tiling == VK_IMAGE_TILING_LINEAR) + if (!dst->tiled) return false; /* Source region must start at (0,0) */ @@ -3301,16 +3552,16 @@ create_blit_render_pass(struct v3dv_device *device, return result == VK_SUCCESS; } -static nir_ssa_def * +static nir_def * gen_tex_coords(nir_builder *b) { - nir_ssa_def *tex_box = + nir_def *tex_box = nir_load_push_constant(b, 4, 32, nir_imm_int(b, 0), .base = 0, .range = 16); - nir_ssa_def *tex_z = + nir_def *tex_z = nir_load_push_constant(b, 1, 32, nir_imm_int(b, 0), .base = 16, .range = 4); - nir_ssa_def *vertex_id = nir_load_vertex_id(b); + nir_def *vertex_id = nir_load_vertex_id(b); /* vertex 0: src0_x, src0_y * vertex 1: src0_x, src1_y @@ -3323,11 +3574,11 @@ gen_tex_coords(nir_builder *b) * channel 1 is vertex id & 1 ? src1_y : src0_y */ - nir_ssa_def *one = nir_imm_int(b, 1); - nir_ssa_def *c0cmp = nir_ilt(b, vertex_id, nir_imm_int(b, 2)); - nir_ssa_def *c1cmp = nir_ieq(b, nir_iand(b, vertex_id, one), one); + nir_def *one = nir_imm_int(b, 1); + nir_def *c0cmp = nir_ilt_imm(b, vertex_id, 2); + nir_def *c1cmp = nir_ieq(b, nir_iand(b, vertex_id, one), one); - nir_ssa_def *comp[4]; + nir_def *comp[4]; comp[0] = nir_bcsel(b, c0cmp, nir_channel(b, tex_box, 0), nir_channel(b, tex_box, 2)); @@ -3340,9 +3591,9 @@ gen_tex_coords(nir_builder *b) return nir_vec(b, comp, 4); } -static nir_ssa_def * +static nir_def * build_nir_tex_op_read(struct nir_builder *b, - nir_ssa_def *tex_pos, + nir_def *tex_pos, enum glsl_base_type tex_type, enum glsl_sampler_dim dim) { @@ -3355,57 +3606,49 @@ build_nir_tex_op_read(struct nir_builder *b, sampler->data.descriptor_set = 0; sampler->data.binding = 0; - nir_ssa_def *tex_deref = &nir_build_deref_var(b, sampler)->dest.ssa; + nir_def *tex_deref = &nir_build_deref_var(b, sampler)->def; nir_tex_instr *tex = nir_tex_instr_create(b->shader, 3); tex->sampler_dim = dim; tex->op = nir_texop_tex; - tex->src[0].src_type = nir_tex_src_coord; - tex->src[0].src = nir_src_for_ssa(tex_pos); - tex->src[1].src_type = nir_tex_src_texture_deref; - tex->src[1].src = nir_src_for_ssa(tex_deref); - tex->src[2].src_type = nir_tex_src_sampler_deref; - tex->src[2].src = nir_src_for_ssa(tex_deref); + tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, tex_pos); + tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_texture_deref, tex_deref); + tex->src[2] = nir_tex_src_for_ssa(nir_tex_src_sampler_deref, tex_deref); tex->dest_type = nir_get_nir_type_for_glsl_base_type(tex_type); tex->is_array = glsl_sampler_type_is_array(sampler_type); tex->coord_components = tex_pos->num_components; - nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex"); + nir_def_init(&tex->instr, &tex->def, 4, 32); nir_builder_instr_insert(b, &tex->instr); - return &tex->dest.ssa; + return &tex->def; } -static nir_ssa_def * +static nir_def * build_nir_tex_op_ms_fetch_sample(struct nir_builder *b, nir_variable *sampler, - nir_ssa_def *tex_deref, + nir_def *tex_deref, enum glsl_base_type tex_type, - nir_ssa_def *tex_pos, - nir_ssa_def *sample_idx) + nir_def *tex_pos, + nir_def *sample_idx) { - nir_tex_instr *tex = nir_tex_instr_create(b->shader, 4); + nir_tex_instr *tex = nir_tex_instr_create(b->shader, 3); tex->sampler_dim 
= GLSL_SAMPLER_DIM_MS; tex->op = nir_texop_txf_ms; - tex->src[0].src_type = nir_tex_src_coord; - tex->src[0].src = nir_src_for_ssa(tex_pos); - tex->src[1].src_type = nir_tex_src_texture_deref; - tex->src[1].src = nir_src_for_ssa(tex_deref); - tex->src[2].src_type = nir_tex_src_sampler_deref; - tex->src[2].src = nir_src_for_ssa(tex_deref); - tex->src[3].src_type = nir_tex_src_ms_index; - tex->src[3].src = nir_src_for_ssa(sample_idx); + tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, tex_pos); + tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_texture_deref, tex_deref); + tex->src[2] = nir_tex_src_for_ssa(nir_tex_src_ms_index, sample_idx); tex->dest_type = nir_get_nir_type_for_glsl_base_type(tex_type); tex->is_array = false; tex->coord_components = tex_pos->num_components; - nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex"); + nir_def_init(&tex->instr, &tex->def, 4, 32); nir_builder_instr_insert(b, &tex->instr); - return &tex->dest.ssa; + return &tex->def; } /* Fetches all samples at the given position and averages them */ -static nir_ssa_def * +static nir_def * build_nir_tex_op_ms_resolve(struct nir_builder *b, - nir_ssa_def *tex_pos, + nir_def *tex_pos, enum glsl_base_type tex_type, VkSampleCountFlagBits src_samples) { @@ -3419,10 +3662,10 @@ build_nir_tex_op_ms_resolve(struct nir_builder *b, const bool is_int = glsl_base_type_is_integer(tex_type); - nir_ssa_def *tmp = NULL; - nir_ssa_def *tex_deref = &nir_build_deref_var(b, sampler)->dest.ssa; + nir_def *tmp = NULL; + nir_def *tex_deref = &nir_build_deref_var(b, sampler)->def; for (uint32_t i = 0; i < src_samples; i++) { - nir_ssa_def *s = + nir_def *s = build_nir_tex_op_ms_fetch_sample(b, sampler, tex_deref, tex_type, tex_pos, nir_imm_int(b, i)); @@ -3437,13 +3680,13 @@ build_nir_tex_op_ms_resolve(struct nir_builder *b, } assert(!is_int); - return nir_fmul(b, tmp, nir_imm_float(b, 1.0f / src_samples)); + return nir_fmul_imm(b, tmp, 1.0f / src_samples); } /* Fetches the current sample (gl_SampleID) at the given position */ -static nir_ssa_def * +static nir_def * build_nir_tex_op_ms_read(struct nir_builder *b, - nir_ssa_def *tex_pos, + nir_def *tex_pos, enum glsl_base_type tex_type) { const struct glsl_type *sampler_type = @@ -3453,17 +3696,17 @@ build_nir_tex_op_ms_read(struct nir_builder *b, sampler->data.descriptor_set = 0; sampler->data.binding = 0; - nir_ssa_def *tex_deref = &nir_build_deref_var(b, sampler)->dest.ssa; + nir_def *tex_deref = &nir_build_deref_var(b, sampler)->def; return build_nir_tex_op_ms_fetch_sample(b, sampler, tex_deref, tex_type, tex_pos, nir_load_sample_id(b)); } -static nir_ssa_def * +static nir_def * build_nir_tex_op(struct nir_builder *b, struct v3dv_device *device, - nir_ssa_def *tex_pos, + nir_def *tex_pos, enum glsl_base_type tex_type, VkSampleCountFlagBits dst_samples, VkSampleCountFlagBits src_samples, @@ -3507,10 +3750,10 @@ get_blit_vs() vs_out_tex_coord->data.location = VARYING_SLOT_VAR0; vs_out_tex_coord->data.interpolation = INTERP_MODE_SMOOTH; - nir_ssa_def *pos = nir_gen_rect_vertices(&b, NULL, NULL); + nir_def *pos = nir_gen_rect_vertices(&b, NULL, NULL); nir_store_var(&b, vs_out_pos, pos, 0xf); - nir_ssa_def *tex_coord = gen_tex_coords(&b); + nir_def *tex_coord = gen_tex_coords(&b); nir_store_var(&b, vs_out_tex_coord, tex_coord, 0xf); return b.shader; @@ -3561,11 +3804,11 @@ get_color_blit_fs(struct v3dv_device *device, nir_variable_create(b.shader, nir_var_shader_out, fs_out_type, "out_color"); fs_out_color->data.location = FRAG_RESULT_DATA0; - nir_ssa_def *tex_coord = nir_load_var(&b, 
fs_in_tex_coord); + nir_def *tex_coord = nir_load_var(&b, fs_in_tex_coord); const uint32_t channel_mask = get_channel_mask_for_sampler_dim(sampler_dim); tex_coord = nir_channels(&b, tex_coord, channel_mask); - nir_ssa_def *color = build_nir_tex_op(&b, device, tex_coord, src_base_type, + nir_def *color = build_nir_tex_op(&b, device, tex_coord, src_base_type, dst_samples, src_samples, sampler_dim); /* For integer textures, if the bit-size of the destination is too small to @@ -3580,7 +3823,7 @@ get_color_blit_fs(struct v3dv_device *device, enum pipe_format src_pformat = vk_format_to_pipe_format(src_format); enum pipe_format dst_pformat = vk_format_to_pipe_format(dst_format); - nir_ssa_def *c[4]; + nir_def *c[4]; for (uint32_t i = 0; i < 4; i++) { c[i] = nir_channel(&b, color, i); @@ -3598,11 +3841,11 @@ get_color_blit_fs(struct v3dv_device *device, assert(dst_bit_size > 0); if (util_format_is_pure_uint(dst_pformat)) { - nir_ssa_def *max = nir_imm_int(&b, (1 << dst_bit_size) - 1); + nir_def *max = nir_imm_int(&b, (1 << dst_bit_size) - 1); c[i] = nir_umin(&b, c[i], max); } else { - nir_ssa_def *max = nir_imm_int(&b, (1 << (dst_bit_size - 1)) - 1); - nir_ssa_def *min = nir_imm_int(&b, -(1 << (dst_bit_size - 1))); + nir_def *max = nir_imm_int(&b, (1 << (dst_bit_size - 1)) - 1); + nir_def *min = nir_imm_int(&b, -(1 << (dst_bit_size - 1))); c[i] = nir_imax(&b, nir_imin(&b, c[i], max), min); } } @@ -4062,12 +4305,10 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, /* We don't support rendering to linear depth/stencil, this should have * been rewritten to a compatible color blit by the caller. */ - assert(dst->vk.tiling != VK_IMAGE_TILING_LINEAR || - !vk_format_is_depth_or_stencil(dst_format)); + assert(dst->tiled || !vk_format_is_depth_or_stencil(dst_format)); /* Can't sample from linear images */ - if (src->vk.tiling == VK_IMAGE_TILING_LINEAR && - src->vk.image_type != VK_IMAGE_TYPE_1D) { + if (!src->tiled && src->vk.image_type != VK_IMAGE_TYPE_1D) { return false; } @@ -4538,8 +4779,10 @@ resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, assert(dst->plane_count == 1); assert(src->plane_count == 1); - if (!v3dv_meta_can_use_tlb(src, 0, &region->srcOffset, NULL) || - !v3dv_meta_can_use_tlb(dst, 0, &region->dstOffset, NULL)) { + if (!v3dv_meta_can_use_tlb(src, 0, region->srcSubresource.mipLevel, + &region->srcOffset, NULL, NULL) || + !v3dv_meta_can_use_tlb(dst, 0, region->dstSubresource.mipLevel, + &region->dstOffset, &region->extent, NULL)) { return false; } @@ -4572,8 +4815,9 @@ resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, (fb_format, region->srcSubresource.aspectMask, &internal_type, &internal_bpp); - v3dv_job_start_frame(job, width, height, num_layers, false, true, - 1, internal_bpp, true); + v3dv_job_start_frame(job, width, height, num_layers, false, true, 1, + internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp), + true); struct v3dv_meta_framebuffer framebuffer; v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format, diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_pass.c b/lib/mesa/src/broadcom/vulkan/v3dv_pass.c index 683acde62..0583faf6f 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_pass.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_pass.c @@ -234,13 +234,15 @@ v3dv_CreateRenderPass2(VkDevice _device, .layout = desc->pDepthStencilAttachment->layout, }; - /* GFXH-1461: if depth is cleared but stencil is loaded (or viceversa), + /* GFXH-1461: if depth is cleared but stencil is loaded (or vice versa), * the clear might get lost.
If a subpass has this then we can't emit - * the clear using the TLB and we have to do it as a draw call. + * the clear using the TLB and we have to do it as a draw call. This + * issue is fixed since V3D 4.3.18. * * FIXME: separate stencil. */ - if (subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) { + if (device->devinfo.ver == 42 && + subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) { struct v3dv_render_pass_attachment *att = &pass->attachments[subpass->ds_attachment.attachment]; if (att->desc.format == VK_FORMAT_D24_UNORM_S8_UINT) { @@ -320,11 +322,12 @@ subpass_get_granularity(struct v3dv_device *device, /* Granularity is defined by the tile size */ assert(subpass_idx < pass->subpass_count); struct v3dv_subpass *subpass = &pass->subpasses[subpass_idx]; - const uint32_t color_attachment_count = subpass->color_count; + const uint32_t color_count = subpass->color_count; bool msaa = false; - uint32_t max_bpp = 0; - for (uint32_t i = 0; i < color_attachment_count; i++) { + uint32_t max_internal_bpp = 0; + uint32_t total_color_bpp = 0; + for (uint32_t i = 0; i < color_count; i++) { uint32_t attachment_idx = subpass->color_attachments[i].attachment; if (attachment_idx == VK_ATTACHMENT_UNUSED) continue; @@ -337,7 +340,8 @@ subpass_get_granularity(struct v3dv_device *device, v3dv_X(device, get_internal_type_bpp_for_output_format) (format->planes[0].rt_type, &internal_type, &internal_bpp); - max_bpp = MAX2(max_bpp, internal_bpp); + max_internal_bpp = MAX2(max_internal_bpp, internal_bpp); + total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp); if (desc->samples > VK_SAMPLE_COUNT_1_BIT) msaa = true; @@ -347,7 +351,8 @@ subpass_get_granularity(struct v3dv_device *device, * heuristics so we choose a conservative granularity here, with it disabled. 
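subpass_get_granularity now feeds v3d_choose_tile_size (called just below) both the widest per-RT bpp and the total color footprint. A sketch of the accumulation, assuming v3d_internal_bpp_words maps the internal bpp enum (32/64/128 bpp) to 1/2/4 32-bit words:

   #include <stdint.h>

   #define MAX2(a, b) ((a) > (b) ? (a) : (b))

   static uint32_t
   internal_bpp_words(uint32_t internal_bpp)  /* assumed mapping */
   {
      return 1u << internal_bpp;              /* 0 -> 1, 1 -> 2, 2 -> 4 */
   }

   static void
   accumulate_color_bpp(const uint32_t *rt_bpp, uint32_t rt_count,
                        uint32_t *max_internal_bpp, uint32_t *total_color_bpp)
   {
      *max_internal_bpp = 0;
      *total_color_bpp = 0;
      for (uint32_t i = 0; i < rt_count; i++) {
         *max_internal_bpp = MAX2(*max_internal_bpp, rt_bpp[i]);
         /* 4 bytes per word, summed over all render targets */
         *total_color_bpp += 4 * internal_bpp_words(rt_bpp[i]);
      }
   }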
*/ uint32_t width, height; - v3d_choose_tile_size(color_attachment_count, max_bpp, msaa, + v3d_choose_tile_size(&device->devinfo, color_count, + max_internal_bpp, total_color_bpp, msaa, false /* double-buffer */, &width, &height); *granularity = (VkExtent2D) { .width = width, diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_pipeline.c b/lib/mesa/src/broadcom/vulkan/v3dv_pipeline.c index 116c0f70f..54a26cb14 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_pipeline.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_pipeline.c @@ -30,13 +30,14 @@ #include "qpu/qpu_disasm.h" #include "compiler/nir/nir_builder.h" -#include "nir/nir_vulkan.h" #include "nir/nir_serialize.h" #include "util/u_atomic.h" #include "util/u_prim.h" #include "util/os_time.h" +#include "util/u_helpers.h" +#include "vk_nir_convert_ycbcr.h" #include "vk_pipeline.h" #include "vulkan/util/vk_format.h" @@ -192,8 +193,8 @@ const nir_shader_compiler_options v3dv_nir_options = { .lower_extract_word = true, .lower_insert_byte = true, .lower_insert_word = true, - .lower_bitfield_insert_to_shifts = true, - .lower_bitfield_extract_to_shifts = true, + .lower_bitfield_insert = true, + .lower_bitfield_extract = true, .lower_bitfield_reverse = true, .lower_bit_count = true, .lower_cs_local_id_to_index = true, @@ -226,10 +227,10 @@ const nir_shader_compiler_options v3dv_nir_options = { .lower_isign = true, .lower_ldexp = true, .lower_mul_high = true, - .lower_wpos_pntc = true, - .lower_rotate = true, + .lower_wpos_pntc = false, .lower_to_scalar = true, .lower_device_index_to_zero = true, + .lower_fquantize2f16 = true, .has_fsub = true, .has_isub = true, .vertex_id_zero_based = false, /* FIXME: to set this to true, the intrinsic @@ -238,7 +239,7 @@ const nir_shader_compiler_options v3dv_nir_options = { .max_unroll_iterations = 16, .force_indirect_unrolling = (nir_var_shader_in | nir_var_function_temp), .divergence_analysis_options = - nir_divergence_multiple_workgroup_per_compute_subgroup + nir_divergence_multiple_workgroup_per_compute_subgroup, }; const nir_shader_compiler_options * @@ -546,7 +547,7 @@ lower_vulkan_resource_index(nir_builder *b, uint32_t start_index = 0; if (binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER || binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) { - start_index = MAX_INLINE_UNIFORM_BUFFERS; + start_index += MAX_INLINE_UNIFORM_BUFFERS; } index = descriptor_map_add(descriptor_map, set, binding, @@ -555,14 +556,6 @@ lower_vulkan_resource_index(nir_builder *b, start_index, 32 /* return_size: doesn't really apply for this case */, 0); - - /* We always reserve index 0 for push constants */ - if (binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER || - binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC || - binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { - index++; - } - break; } @@ -575,7 +568,7 @@ lower_vulkan_resource_index(nir_builder *b, * vulkan_load_descriptor return a vec2 providing an index and * offset. Our backend compiler only cares about the index part. 
*/ - nir_ssa_def_rewrite_uses(&instr->dest.ssa, + nir_def_rewrite_uses(&instr->def, nir_imm_ivec2(b, index, 0)); nir_instr_remove(&instr->instr); } @@ -601,7 +594,7 @@ lower_tex_src(nir_builder *b, unsigned src_idx, struct lower_pipeline_layout_state *state) { - nir_ssa_def *index = NULL; + nir_def *index = NULL; unsigned base_index = 0; unsigned array_elements = 1; nir_tex_src *src = &instr->src[src_idx]; @@ -612,7 +605,6 @@ lower_tex_src(nir_builder *b, /* We compute first the offsets */ nir_deref_instr *deref = nir_instr_as_deref(src->src.ssa->parent_instr); while (deref->deref_type != nir_deref_type_var) { - assert(deref->parent.is_ssa); nir_deref_instr *parent = nir_instr_as_deref(deref->parent.ssa->parent_instr); @@ -629,8 +621,8 @@ lower_tex_src(nir_builder *b, } index = nir_iadd(b, index, - nir_imul(b, nir_imm_int(b, array_elements), - nir_ssa_for_src(b, deref->arr.index, 1))); + nir_imul_imm(b, deref->arr.index.ssa, + array_elements)); } array_elements *= glsl_get_length(parent->type); @@ -645,8 +637,7 @@ lower_tex_src(nir_builder *b, * instr if needed */ if (index) { - nir_instr_rewrite_src(&instr->instr, &src->src, - nir_src_for_ssa(index)); + nir_src_rewrite(&src->src, index); src->src_type = is_sampler ? nir_tex_src_sampler_offset : @@ -658,7 +649,7 @@ lower_tex_src(nir_builder *b, uint32_t set = deref->var->data.descriptor_set; uint32_t binding = deref->var->data.binding; /* FIXME: this is a really simplified check for the precision to be used - * for the sampling. Right now we are ony checking for the variables used + * for the sampling. Right now we are only checking for the variables used * on the operation itself, but there are other cases that we could use to * infer the precision requirement. */ @@ -720,18 +711,20 @@ lower_sampler(nir_builder *b, int sampler_idx = nir_tex_instr_src_index(instr, nir_tex_src_sampler_deref); - if (sampler_idx >= 0) + if (sampler_idx >= 0) { + assert(nir_tex_instr_need_sampler(instr)); lower_tex_src(b, instr, sampler_idx, state); + } if (texture_idx < 0 && sampler_idx < 0) return false; - /* If we don't have a sampler, we assign it the idx we reserve for this - * case, and we ensure that it is using the correct return size. + /* If the instruction doesn't have a sampler (i.e. txf) we use backend_flags + * to bind a default sampler state to configure precision. */ if (sampler_idx < 0) { state->needs_default_sampler_state = true; - instr->sampler_index = return_size == 16 ? + instr->backend_flags = return_size == 16 ? V3DV_NO_SAMPLER_16BIT_IDX : V3DV_NO_SAMPLER_32BIT_IDX; } @@ -745,12 +738,11 @@ lower_image_deref(nir_builder *b, struct lower_pipeline_layout_state *state) { nir_deref_instr *deref = nir_src_as_deref(instr->src[0]); - nir_ssa_def *index = NULL; + nir_def *index = NULL; unsigned array_elements = 1; unsigned base_index = 0; while (deref->deref_type != nir_deref_type_var) { - assert(deref->parent.is_ssa); nir_deref_instr *parent = nir_instr_as_deref(deref->parent.ssa->parent_instr); @@ -767,8 +759,8 @@ lower_image_deref(nir_builder *b, } index = nir_iadd(b, index, - nir_imul(b, nir_imm_int(b, array_elements), - nir_ssa_for_src(b, deref->arr.index, 1))); + nir_imul_imm(b, deref->arr.index.ssa, + array_elements)); } array_elements *= glsl_get_length(parent->type); @@ -833,23 +825,15 @@ lower_intrinsic(nir_builder *b, /* Loading the descriptor happens as part of load/store instructions, * so for us this is a no-op.
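The deref walk in lower_tex_src and lower_image_deref above linearizes nested descriptor-array indices, with nir_imul_imm folding the constant multiplier. The equivalent flattening in plain C:

   /* idx[0] is the innermost deref's index; len[l] is the array length at
    * that level. Mirrors how array_elements grows while walking parents. */
   static unsigned
   flatten_descriptor_index(const unsigned *idx, const unsigned *len,
                            unsigned levels)
   {
      unsigned flat = 0, array_elements = 1;
      for (unsigned l = 0; l < levels; l++) {
         flat += idx[l] * array_elements;
         array_elements *= len[l];
      }
      return flat;
   }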
*/ - nir_ssa_def_rewrite_uses(&instr->dest.ssa, instr->src[0].ssa); + nir_def_rewrite_uses(&instr->def, instr->src[0].ssa); nir_instr_remove(&instr->instr); return true; } case nir_intrinsic_image_deref_load: case nir_intrinsic_image_deref_store: - case nir_intrinsic_image_deref_atomic_add: - case nir_intrinsic_image_deref_atomic_imin: - case nir_intrinsic_image_deref_atomic_umin: - case nir_intrinsic_image_deref_atomic_imax: - case nir_intrinsic_image_deref_atomic_umax: - case nir_intrinsic_image_deref_atomic_and: - case nir_intrinsic_image_deref_atomic_or: - case nir_intrinsic_image_deref_atomic_xor: - case nir_intrinsic_image_deref_atomic_exchange: - case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_deref_atomic: + case nir_intrinsic_image_deref_atomic_swap: case nir_intrinsic_image_deref_size: case nir_intrinsic_image_deref_samples: lower_image_deref(b, instr, state); @@ -907,6 +891,34 @@ lower_pipeline_layout_info(nir_shader *shader, return progress; } +/* This flips gl_PointCoord.y to match Vulkan requirements */ +static bool +lower_point_coord_cb(nir_builder *b, nir_intrinsic_instr *intr, void *_state) +{ + if (intr->intrinsic != nir_intrinsic_load_input) + return false; + + if (nir_intrinsic_io_semantics(intr).location != VARYING_SLOT_PNTC) + return false; + + b->cursor = nir_after_instr(&intr->instr); + nir_def *result = &intr->def; + result = + nir_vector_insert_imm(b, result, + nir_fsub_imm(b, 1.0, nir_channel(b, result, 1)), 1); + nir_def_rewrite_uses_after(&intr->def, + result, result->parent_instr); + return true; +} + +static bool +v3d_nir_lower_point_coord(nir_shader *s) +{ + assert(s->info.stage == MESA_SHADER_FRAGMENT); + return nir_shader_intrinsics_pass(s, lower_point_coord_cb, + nir_metadata_block_index | + nir_metadata_dominance, NULL); +} static void lower_fs_io(nir_shader *nir) @@ -1043,24 +1055,22 @@ pipeline_populate_v3d_key(struct v3d_key *key, p_stage->robustness.storage_buffers == robust_buffer_enabled; key->robust_image_access = p_stage->robustness.images == robust_image_enabled; - - key->environment = V3D_ENVIRONMENT_VULKAN; } /* FIXME: anv maps to hw primitive type. Perhaps eventually we would do the * same. 
For not using prim_mode that is the one already used on v3d */ -static const enum pipe_prim_type vk_to_pipe_prim_type[] = { - [VK_PRIMITIVE_TOPOLOGY_POINT_LIST] = PIPE_PRIM_POINTS, - [VK_PRIMITIVE_TOPOLOGY_LINE_LIST] = PIPE_PRIM_LINES, - [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP] = PIPE_PRIM_LINE_STRIP, - [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST] = PIPE_PRIM_TRIANGLES, - [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP] = PIPE_PRIM_TRIANGLE_STRIP, - [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN] = PIPE_PRIM_TRIANGLE_FAN, - [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY] = PIPE_PRIM_LINES_ADJACENCY, - [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY] = PIPE_PRIM_LINE_STRIP_ADJACENCY, - [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY] = PIPE_PRIM_TRIANGLES_ADJACENCY, - [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY, +static const enum mesa_prim vk_to_mesa_prim[] = { + [VK_PRIMITIVE_TOPOLOGY_POINT_LIST] = MESA_PRIM_POINTS, + [VK_PRIMITIVE_TOPOLOGY_LINE_LIST] = MESA_PRIM_LINES, + [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP] = MESA_PRIM_LINE_STRIP, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST] = MESA_PRIM_TRIANGLES, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP] = MESA_PRIM_TRIANGLE_STRIP, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN] = MESA_PRIM_TRIANGLE_FAN, + [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY] = MESA_PRIM_LINES_ADJACENCY, + [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY] = MESA_PRIM_LINE_STRIP_ADJACENCY, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY] = MESA_PRIM_TRIANGLES_ADJACENCY, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = MESA_PRIM_TRIANGLE_STRIP_ADJACENCY, }; static const enum pipe_logicop vk_to_pipe_logicop[] = { @@ -1100,11 +1110,22 @@ pipeline_populate_v3d_fs_key(struct v3d_fs_key *key, const VkPipelineInputAssemblyStateCreateInfo *ia_info = pCreateInfo->pInputAssemblyState; - uint8_t topology = vk_to_pipe_prim_type[ia_info->topology]; + uint8_t topology = vk_to_mesa_prim[ia_info->topology]; + + key->is_points = (topology == MESA_PRIM_POINTS); + key->is_lines = (topology >= MESA_PRIM_LINES && + topology <= MESA_PRIM_LINE_STRIP); + + if (key->is_points) { + /* This mask represents state for GL_ARB_point_sprite which is not + * relevant to Vulkan. + */ + key->point_sprite_mask = 0; + + /* Vulkan mandates upper left. */ + key->point_coord_upper_left = true; + } - key->is_points = (topology == PIPE_PRIM_POINTS); - key->is_lines = (topology >= PIPE_PRIM_LINES && - topology <= PIPE_PRIM_LINE_STRIP); key->has_gs = has_geometry_shader; const VkPipelineColorBlendStateCreateInfo *cb_info = @@ -1181,16 +1202,6 @@ pipeline_populate_v3d_fs_key(struct v3d_fs_key *key, else if (util_format_is_pure_sint(fb_pipe_format)) key->int_color_rb |= 1 << i; } - - if (key->is_points) { - /* This mask represents state for GL_ARB_point_sprite which is not - * relevant to Vulkan. - */ - key->point_sprite_mask = 0; - - /* Vulkan mandates upper left. 
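The v3d_nir_lower_point_coord pass added by this patch rewrites only the y channel of gl_PointCoord, flipping it so the origin matches the upper-left convention Vulkan mandates. In scalar form the lowering amounts to:

   static void
   flip_point_coord_y(float pc[2])
   {
      pc[1] = 1.0f - pc[1];   /* x is left untouched */
   }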
*/ - key->point_coord_upper_left = true; - } } } @@ -1271,11 +1282,11 @@ pipeline_populate_v3d_vs_key(struct v3d_vs_key *key, */ const VkPipelineInputAssemblyStateCreateInfo *ia_info = pCreateInfo->pInputAssemblyState; - uint8_t topology = vk_to_pipe_prim_type[ia_info->topology]; + uint8_t topology = vk_to_mesa_prim[ia_info->topology]; /* FIXME: PRIM_POINTS is not enough, in gallium the full check is - * PIPE_PRIM_POINTS && v3d->rasterizer->base.point_size_per_vertex */ - key->per_vertex_point_size = (topology == PIPE_PRIM_POINTS); + * MESA_PRIM_POINTS && v3d->rasterizer->base.point_size_per_vertex */ + key->per_vertex_point_size = (topology == MESA_PRIM_POINTS); key->is_coord = broadcom_shader_stage_is_binning(p_stage->stage); @@ -1340,8 +1351,10 @@ pipeline_populate_v3d_vs_key(struct v3d_vs_key *key, const VkVertexInputAttributeDescription *desc = &vi_info->pVertexAttributeDescriptions[i]; assert(desc->location < MAX_VERTEX_ATTRIBS); - if (desc->format == VK_FORMAT_B8G8R8A8_UNORM) + if (desc->format == VK_FORMAT_B8G8R8A8_UNORM || + desc->format == VK_FORMAT_A2R10G10B10_UNORM_PACK32) { key->va_swap_rb_mask |= 1 << (VERT_ATTRIB_GENERIC0 + desc->location); + } } } @@ -1790,7 +1803,7 @@ pipeline_stage_get_nir(struct v3dv_pipeline_stage *p_stage, if (nir) { assert(nir->info.stage == broadcom_shader_stage_to_gl(p_stage->stage)); - /* A NIR cach hit doesn't avoid the large majority of pipeline stage + /* A NIR cache hit doesn't avoid the large majority of pipeline stage * creation so the cache hit is not recorded in the pipeline feedback * flags */ @@ -1910,6 +1923,11 @@ pipeline_compile_fragment_shader(struct v3dv_pipeline *pipeline, p_stage_gs != NULL, get_ucp_enable_mask(p_stage_vs)); + if (key.is_points) { + assert(key.point_coord_upper_left); + NIR_PASS(_, p_stage_fs->nir, v3d_nir_lower_point_coord); + } + VkResult vk_result; pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT] = pipeline_compile_shader_variant(p_stage_fs, &key.base, sizeof(key), @@ -1933,7 +1951,7 @@ pipeline_populate_graphics_key(struct v3dv_pipeline *pipeline, const VkPipelineInputAssemblyStateCreateInfo *ia_info = pCreateInfo->pInputAssemblyState; - key->topology = vk_to_pipe_prim_type[ia_info->topology]; + key->topology = vk_to_mesa_prim[ia_info->topology]; const VkPipelineColorBlendStateCreateInfo *cb_info = raster_enabled ? 
pCreateInfo->pColorBlendState : NULL; @@ -1998,8 +2016,10 @@ pipeline_populate_graphics_key(struct v3dv_pipeline *pipeline, const VkVertexInputAttributeDescription *desc = &vi_info->pVertexAttributeDescriptions[i]; assert(desc->location < MAX_VERTEX_ATTRIBS); - if (desc->format == VK_FORMAT_B8G8R8A8_UNORM) + if (desc->format == VK_FORMAT_B8G8R8A8_UNORM || + desc->format == VK_FORMAT_A2R10G10B10_UNORM_PACK32) { key->va_swap_rb_mask |= 1 << (VERT_ATTRIB_GENERIC0 + desc->location); + } } assert(pipeline->subpass); @@ -2130,19 +2150,19 @@ write_creation_feedback(struct v3dv_pipeline *pipeline, } } -static enum shader_prim +static enum mesa_prim multiview_gs_input_primitive_from_pipeline(struct v3dv_pipeline *pipeline) { switch (pipeline->topology) { - case PIPE_PRIM_POINTS: - return SHADER_PRIM_POINTS; - case PIPE_PRIM_LINES: - case PIPE_PRIM_LINE_STRIP: - return SHADER_PRIM_LINES; - case PIPE_PRIM_TRIANGLES: - case PIPE_PRIM_TRIANGLE_STRIP: - case PIPE_PRIM_TRIANGLE_FAN: - return SHADER_PRIM_TRIANGLES; + case MESA_PRIM_POINTS: + return MESA_PRIM_POINTS; + case MESA_PRIM_LINES: + case MESA_PRIM_LINE_STRIP: + return MESA_PRIM_LINES; + case MESA_PRIM_TRIANGLES: + case MESA_PRIM_TRIANGLE_STRIP: + case MESA_PRIM_TRIANGLE_FAN: + return MESA_PRIM_TRIANGLES; default: /* Since we don't allow GS with multiview, we can only see non-adjacency * primitives. @@ -2151,19 +2171,19 @@ multiview_gs_input_primitive_from_pipeline(struct v3dv_pipeline *pipeline) } } -static enum shader_prim +static enum mesa_prim multiview_gs_output_primitive_from_pipeline(struct v3dv_pipeline *pipeline) { switch (pipeline->topology) { - case PIPE_PRIM_POINTS: - return SHADER_PRIM_POINTS; - case PIPE_PRIM_LINES: - case PIPE_PRIM_LINE_STRIP: - return SHADER_PRIM_LINE_STRIP; - case PIPE_PRIM_TRIANGLES: - case PIPE_PRIM_TRIANGLE_STRIP: - case PIPE_PRIM_TRIANGLE_FAN: - return SHADER_PRIM_TRIANGLE_STRIP; + case MESA_PRIM_POINTS: + return MESA_PRIM_POINTS; + case MESA_PRIM_LINES: + case MESA_PRIM_LINE_STRIP: + return MESA_PRIM_LINE_STRIP; + case MESA_PRIM_TRIANGLES: + case MESA_PRIM_TRIANGLE_STRIP: + case MESA_PRIM_TRIANGLE_FAN: + return MESA_PRIM_TRIANGLE_STRIP; default: /* Since we don't allow GS with multiview, we can only see non-adjacency * primitives. @@ -2232,7 +2252,7 @@ pipeline_add_multiview_gs(struct v3dv_pipeline *pipeline, out_layer->data.location = VARYING_SLOT_LAYER; /* Get the view index value that we will write to gl_Layer */ - nir_ssa_def *layer = + nir_def *layer = nir_load_system_value(&b, nir_intrinsic_load_view_index, 0, 1, 32); /* Emit all output vertices */ @@ -2612,13 +2632,8 @@ v3dv_dynamic_state_mask(VkDynamicState state) return V3DV_DYNAMIC_LINE_WIDTH; case VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT: return V3DV_DYNAMIC_COLOR_WRITE_ENABLE; - - /* Depth bounds testing is not available in in V3D 4.2 so here we are just - * ignoring this dynamic state. We are already asserting at pipeline creation - * time that depth bounds testing is not enabled. 
- */ case VK_DYNAMIC_STATE_DEPTH_BOUNDS: - return 0; + return V3DV_DYNAMIC_DEPTH_BOUNDS; default: unreachable("Unhandled dynamic state"); @@ -2636,6 +2651,7 @@ pipeline_init_dynamic_state( const VkPipelineColorWriteCreateInfoEXT *pColorWriteState) { /* Initialize to default values */ + const struct v3d_device_info *devinfo = &pipeline->device->devinfo; struct v3dv_dynamic_state *dynamic = &pipeline->dynamic_state; memset(dynamic, 0, sizeof(*dynamic)); dynamic->stencil_compare_mask.front = ~0; @@ -2643,7 +2659,9 @@ pipeline_init_dynamic_state( dynamic->stencil_write_mask.front = ~0; dynamic->stencil_write_mask.back = ~0; dynamic->line_width = 1.0f; - dynamic->color_write_enable = (1ull << (4 * V3D_MAX_DRAW_BUFFERS)) - 1; + dynamic->color_write_enable = + (1ull << (4 * V3D_MAX_RENDER_TARGETS(devinfo->ver))) - 1; + dynamic->depth_bounds.max = 1.0f; /* Create a mask of enabled dynamic states */ uint32_t dynamic_states = 0; @@ -2665,9 +2683,10 @@ pipeline_init_dynamic_state( pViewportState->viewportCount); for (uint32_t i = 0; i < dynamic->viewport.count; i++) { - v3dv_viewport_compute_xform(&dynamic->viewport.viewports[i], - dynamic->viewport.scale[i], - dynamic->viewport.translate[i]); + v3dv_X(pipeline->device, viewport_compute_xform) + (&dynamic->viewport.viewports[i], + dynamic->viewport.scale[i], + dynamic->viewport.translate[i]); } } @@ -2695,6 +2714,11 @@ pipeline_init_dynamic_state( dynamic->stencil_reference.front = pDepthStencilState->front.reference; dynamic->stencil_reference.back = pDepthStencilState->back.reference; } + + if (!(dynamic_states & V3DV_DYNAMIC_DEPTH_BOUNDS)) { + dynamic->depth_bounds.min = pDepthStencilState->minDepthBounds; + dynamic->depth_bounds.max = pDepthStencilState->maxDepthBounds; + } } if (pColorBlendState && !(dynamic_states & V3DV_DYNAMIC_BLEND_CONSTANTS)) { @@ -2806,62 +2830,6 @@ pipeline_set_ez_state(struct v3dv_pipeline *pipeline, } } -static bool -pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline) -{ - for (uint8_t i = 0; i < pipeline->va_count; i++) { - if (vk_format_is_int(pipeline->va[i].vk_format)) - return true; - } - return false; -} - -/* @pipeline can be NULL. We assume in that case that all the attributes have - * a float format (we only create an all-float BO once and we reuse it with - * all float pipelines), otherwise we look at the actual type of each - * attribute used with the specific pipeline passed in. - */ -struct v3dv_bo * -v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device, - struct v3dv_pipeline *pipeline) -{ - uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4; - struct v3dv_bo *bo; - - bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true); - - if (!bo) { - fprintf(stderr, "failed to allocate memory for the default " - "attribute values\n"); - return NULL; - } - - bool ok = v3dv_bo_map(device, bo, size); - if (!ok) { - fprintf(stderr, "failed to map default attribute values buffer\n"); - return false; - } - - uint32_t *attrs = bo->map; - uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0; - for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) { - attrs[i * 4 + 0] = 0; - attrs[i * 4 + 1] = 0; - attrs[i * 4 + 2] = 0; - VkFormat attr_format = - pipeline != NULL ? 
pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED; - if (i < va_count && vk_format_is_int(attr_format)) { - attrs[i * 4 + 3] = 1; - } else { - attrs[i * 4 + 3] = fui(1.0); - } - } - - v3dv_bo_unmap(device, bo); - - return bo; -} - static void pipeline_set_sample_mask(struct v3dv_pipeline *pipeline, const VkPipelineMultisampleStateCreateInfo *ms_info) @@ -2909,7 +2877,7 @@ pipeline_init(struct v3dv_pipeline *pipeline, const VkPipelineInputAssemblyStateCreateInfo *ia_info = pCreateInfo->pInputAssemblyState; - pipeline->topology = vk_to_pipe_prim_type[ia_info->topology]; + pipeline->topology = vk_to_mesa_prim[ia_info->topology]; /* If rasterization is not enabled, various CreateInfo structs must be * ignored. @@ -2964,7 +2932,9 @@ pipeline_init(struct v3dv_pipeline *pipeline, /* V3D 4.2 doesn't support depth bounds testing so we don't advertise that * feature and it shouldn't be used by any pipeline. */ - assert(!ds_info || !ds_info->depthBoundsTestEnable); + assert(device->devinfo.ver >= 71 || + !ds_info || !ds_info->depthBoundsTestEnable); + pipeline->depth_bounds_test_enabled = ds_info && ds_info->depthBoundsTestEnable; enable_depth_bias(pipeline, rs_info); @@ -2996,9 +2966,10 @@ pipeline_init(struct v3dv_pipeline *pipeline, v3dv_X(device, pipeline_pack_compile_state)(pipeline, vi_info, vd_info); - if (pipeline_has_integer_vertex_attrib(pipeline)) { + if (v3dv_X(device, pipeline_needs_default_attribute_values)(pipeline)) { pipeline->default_attribute_values = - v3dv_pipeline_create_default_attribute_values(pipeline->device, pipeline); + v3dv_X(pipeline->device, create_default_attribute_values)(pipeline->device, pipeline); + if (!pipeline->default_attribute_values) return VK_ERROR_OUT_OF_DEVICE_MEMORY; } else { @@ -3106,14 +3077,20 @@ shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align) } static void -lower_cs_shared(struct nir_shader *nir) +lower_compute(struct nir_shader *nir) { if (!nir->info.shared_memory_explicit_layout) { NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_mem_shared, shared_type_info); } + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_shared, nir_address_format_32bit_offset); + + struct nir_lower_compute_system_values_options sysval_options = { + .has_base_workgroup_id = true, + }; + NIR_PASS_V(nir, nir_lower_compute_system_values, &sysval_options); } static VkResult @@ -3201,7 +3178,7 @@ pipeline_compile_compute(struct v3dv_pipeline *pipeline, v3d_optimize_nir(NULL, p_stage->nir); pipeline_lower_nir(pipeline, p_stage, pipeline->layout); - lower_cs_shared(p_stage->nir); + lower_compute(p_stage->nir); VkResult result = VK_SUCCESS; diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_pipeline_cache.c b/lib/mesa/src/broadcom/vulkan/v3dv_pipeline_cache.c index bafa8d759..3f58940c7 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_pipeline_cache.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_pipeline_cache.c @@ -542,7 +542,7 @@ shader_variant_create_from_blob(struct v3dv_device *device, if (blob->overrun) return NULL; - uint ulist_data_size = sizeof(uint32_t) * ulist_count; + size_t ulist_data_size = sizeof(uint32_t) * ulist_count; const void *ulist_data_data = blob_read_bytes(blob, ulist_data_size); if (blob->overrun) return NULL; diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_private.h b/lib/mesa/src/broadcom/vulkan/v3dv_private.h index 91c1ec2f6..21934d802 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_private.h +++ b/lib/mesa/src/broadcom/vulkan/v3dv_private.h @@ -38,6 +38,7 @@ #include "vk_descriptor_update_template.h" #include "vk_device.h" 
+#include "vk_device_memory.h" #include "vk_format.h" #include "vk_instance.h" #include "vk_image.h" @@ -64,6 +65,11 @@ #define VG(x) ((void)0) #endif +#ifdef ANDROID +#include <vndk/hardware_buffer.h> +#include "util/u_gralloc/u_gralloc.h" +#endif + #include "v3dv_limits.h" #include "common/v3d_device_info.h" @@ -123,13 +129,15 @@ struct v3d_simulator_file; /* Minimum required by the Vulkan 1.1 spec */ #define MAX_MEMORY_ALLOCATION_SIZE (1ull << 30) +/* Maximum performance counters number */ +#define V3D_MAX_PERFCNT 93 + struct v3dv_physical_device { struct vk_physical_device vk; char *name; int32_t render_fd; int32_t display_fd; - int32_t master_fd; /* We need these because it is not clear how to detect * valid devids in a portable way @@ -168,7 +176,7 @@ struct v3dv_physical_device { const struct v3d_compiler *compiler; uint32_t next_program_id; - uint64_t heap_used; + alignas(8) uint64_t heap_used; /* This array holds all our 'struct v3dv_bo' allocations. We use this * so we can add a refcount to our BOs and check if a particular BO @@ -197,9 +205,6 @@ struct v3dv_physical_device { } caps; }; -VkResult v3dv_physical_device_acquire_display(struct v3dv_physical_device *pdevice, - VkIcdSurfaceBase *surface); - static inline struct v3dv_bo * v3dv_device_lookup_bo(struct v3dv_physical_device *device, uint32_t handle) { @@ -222,7 +227,9 @@ void v3dv_meta_texel_buffer_copy_finish(struct v3dv_device *device); bool v3dv_meta_can_use_tlb(struct v3dv_image *image, uint8_t plane, + uint8_t miplevel, const VkOffset3D *offset, + const VkExtent3D *extent, VkFormat *compat_format); struct v3dv_instance { @@ -579,6 +586,9 @@ struct v3dv_device { * being float being float, allowing us to reuse the same BO for all * pipelines matching this requirement. Pipelines that need integer * attributes will create their own BO. + * + * Note that since v71 the default attribute values are not needed, so this + * can be NULL. */ struct v3dv_bo *default_attribute_float; @@ -586,17 +596,12 @@ struct v3dv_device { struct util_dynarray device_address_bo_list; /* Array of struct v3dv_bo * */ #ifdef ANDROID - const void *gralloc; - enum { - V3DV_GRALLOC_UNKNOWN, - V3DV_GRALLOC_CROS, - V3DV_GRALLOC_OTHER, - } gralloc_type; + struct u_gralloc *gralloc; #endif }; struct v3dv_device_memory { - struct vk_object_base base; + struct vk_device_memory vk; struct v3dv_bo *bo; const VkMemoryType *type; @@ -670,6 +675,8 @@ struct v3d_resource_slice { uint32_t offset; uint32_t stride; uint32_t padded_height; + uint32_t width; + uint32_t height; /* Size of a single pane of the slice. For 3D textures, there will be * a number of panes equal to the minified, power-of-two-aligned * depth. @@ -724,9 +731,18 @@ struct v3dv_image { VkFormat vk_format; } planes[V3DV_MAX_PLANE_COUNT]; + /* Used only when sampling a linear texture (which V3D doesn't support). + * This holds a tiled copy of the image we can use for that purpose. 
+ */ + struct v3dv_image *shadow; + #ifdef ANDROID /* Image is backed by VK_ANDROID_native_buffer, */ bool is_native_buffer_memory; + /* Image is backed by VK_ANDROID_external_memory_android_hardware_buffer */ + bool is_ahb; + VkImageDrmFormatModifierExplicitCreateInfoEXT *android_explicit_layout; + VkSubresourceLayout *android_plane_layouts; #endif }; @@ -768,6 +784,8 @@ struct v3dv_image_view { const struct v3dv_format *format; + uint8_t view_swizzle[4]; + uint8_t plane_count; struct { uint8_t image_plane; @@ -778,8 +796,8 @@ struct v3dv_image_view { uint32_t internal_type; uint32_t offset; - /* Precomputed (composed from createinfo->components and formar swizzle) - * swizzles to pass in to the shader key. + /* Precomputed swizzle (composed from the view swizzle and the format + * swizzle). * * This could be also included on the descriptor bo, but the shader state * packet doesn't need it on a bo, so we can just avoid a memory copy @@ -796,6 +814,11 @@ struct v3dv_image_view { */ uint8_t texture_shader_state[2][V3DV_TEXTURE_SHADER_STATE_LENGTH]; } planes[V3DV_MAX_PLANE_COUNT]; + + /* Used only when sampling a linear texture (which V3D doesn't support). + * This would represent a view over the tiled shadow image. + */ + struct v3dv_image_view *shadow; }; VkResult v3dv_create_image_view(struct v3dv_device *device, @@ -916,7 +939,7 @@ struct v3dv_framebuffer { uint32_t layers; /* Typically, edge tiles in the framebuffer have padding depending on the - * underlying tiling layout. One consequnce of this is that when the + * underlying tiling layout. One consequence of this is that when the * framebuffer dimensions are not aligned to tile boundaries, tile stores * would still write full tiles on the edges and write to the padded area. * If the framebuffer is aliasing a smaller region of a larger image, then @@ -942,6 +965,7 @@ struct v3dv_frame_tiling { uint32_t layers; uint32_t render_target_count; uint32_t internal_bpp; + uint32_t total_color_bpp; bool msaa; bool double_buffer; uint32_t tile_width; @@ -1036,7 +1060,8 @@ enum v3dv_dynamic_state_bits { V3DV_DYNAMIC_DEPTH_BIAS = 1 << 6, V3DV_DYNAMIC_LINE_WIDTH = 1 << 7, V3DV_DYNAMIC_COLOR_WRITE_ENABLE = 1 << 8, - V3DV_DYNAMIC_ALL = (1 << 9) - 1, + V3DV_DYNAMIC_DEPTH_BOUNDS = 1 << 9, + V3DV_DYNAMIC_ALL = (1 << 10) - 1, }; /* Flags for dirty pipeline state. 
@@ -1061,6 +1086,7 @@ enum v3dv_cmd_dirty_bits { V3DV_CMD_DIRTY_LINE_WIDTH = 1 << 16, V3DV_CMD_DIRTY_VIEW_INDEX = 1 << 17, V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE = 1 << 18, + V3DV_CMD_DIRTY_DEPTH_BOUNDS = 1 << 19, }; struct v3dv_dynamic_state { @@ -1097,6 +1123,11 @@ struct v3dv_dynamic_state { float slope_factor; } depth_bias; + struct { + float min; + float max; + } depth_bounds; + float line_width; uint32_t color_write_enable; @@ -1121,7 +1152,6 @@ enum v3dv_job_type { V3DV_JOB_TYPE_CPU_RESET_QUERIES, V3DV_JOB_TYPE_CPU_END_QUERY, V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS, - V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE, V3DV_JOB_TYPE_CPU_CSD_INDIRECT, V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY, }; @@ -1160,20 +1190,6 @@ struct v3dv_submit_sync_info { struct vk_sync_signal *signals; }; -struct v3dv_copy_buffer_to_image_cpu_job_info { - struct v3dv_image *image; - struct v3dv_buffer *buffer; - uint32_t buffer_offset; - uint32_t buffer_stride; - uint32_t buffer_layer_stride; - VkOffset3D image_offset; - VkExtent3D image_extent; - uint32_t mip_level; - uint32_t base_layer; - uint32_t layer_count; - uint8_t plane; -}; - struct v3dv_csd_indirect_cpu_job_info { struct v3dv_buffer *buffer; uint32_t offset; @@ -1192,7 +1208,7 @@ struct v3dv_timestamp_query_cpu_job_info { }; /* Number of perfmons required to handle all supported performance counters */ -#define V3DV_MAX_PERFMONS DIV_ROUND_UP(V3D_PERFCNT_NUM, \ +#define V3DV_MAX_PERFMONS DIV_ROUND_UP(V3D_MAX_PERFCNT, \ DRM_V3D_MAX_PERF_COUNTERS) struct v3dv_perf_query { @@ -1327,7 +1343,6 @@ struct v3dv_job { struct v3dv_reset_query_cpu_job_info query_reset; struct v3dv_end_query_info query_end; struct v3dv_copy_query_results_cpu_job_info query_copy_results; - struct v3dv_copy_buffer_to_image_cpu_job_info copy_buffer_to_image; struct v3dv_csd_indirect_cpu_job_info csd_indirect; struct v3dv_timestamp_query_cpu_job_info query_timestamp; } cpu; @@ -1365,6 +1380,7 @@ void v3dv_job_start_frame(struct v3dv_job *job, bool allocate_tile_state_now, uint32_t render_target_count, uint8_t max_internal_bpp, + uint8_t total_color_bpp, bool msaa); bool v3dv_job_type_is_gpu(struct v3dv_job *job); @@ -1482,7 +1498,7 @@ struct v3dv_cmd_buffer_state { /* FIXME: we have just one client-side BO for the push constants, * independently of the stageFlags in vkCmdPushConstants, and the * pipelineBindPoint in vkCmdBindPipeline. We could probably do more stage - * tunning in the future if it makes sense. + * tuning in the future if it makes sense. 
*/ uint32_t push_constants_size; uint32_t push_constants_data[MAX_PUSH_CONSTANTS_SIZE / 4]; @@ -1663,7 +1679,7 @@ struct v3dv_query_pool { /* Only used with performance queries */ struct { uint32_t ncounters; - uint8_t counters[V3D_PERFCNT_NUM]; + uint8_t counters[V3D_MAX_PERFCNT]; /* V3D has a limit on the number of counters we can track in a * single performance monitor, so if too many counters are requested @@ -1799,7 +1815,8 @@ void v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer, void v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer, struct drm_v3d_submit_tfu *tfu); -void v3dv_cmd_buffer_rewrite_indirect_csd_job(struct v3dv_csd_indirect_cpu_job_info *info, +void v3dv_cmd_buffer_rewrite_indirect_csd_job(struct v3dv_device *device, + struct v3dv_csd_indirect_cpu_job_info *info, const uint32_t *wg_counts); void v3dv_cmd_buffer_add_private_obj(struct v3dv_cmd_buffer *cmd_buffer, @@ -1827,6 +1844,11 @@ bool v3dv_cmd_buffer_check_needs_store(const struct v3dv_cmd_buffer_state *state void v3dv_cmd_buffer_emit_pipeline_barrier(struct v3dv_cmd_buffer *cmd_buffer, const VkDependencyInfoKHR *info); +bool v3dv_cmd_buffer_copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_image *dst, + struct v3dv_image *src, + const VkImageCopy2 *region); + struct v3dv_event { struct vk_object_base base; @@ -2156,32 +2178,6 @@ struct v3dv_sampler { #define V3DV_NO_SAMPLER_16BIT_IDX 0 #define V3DV_NO_SAMPLER_32BIT_IDX 1 -/* - * Following two methods are using on the combined to/from texture/sampler - * indices maps at v3dv_pipeline. - */ -static inline uint32_t -v3dv_pipeline_combined_index_key_create(uint32_t texture_index, - uint32_t sampler_index) -{ - return texture_index << 24 | sampler_index; -} - -static inline void -v3dv_pipeline_combined_index_key_unpack(uint32_t combined_index_key, - uint32_t *texture_index, - uint32_t *sampler_index) -{ - uint32_t texture = combined_index_key >> 24; - uint32_t sampler = combined_index_key & 0xffffff; - - if (texture_index) - *texture_index = texture; - - if (sampler_index) - *sampler_index = sampler; -} - struct v3dv_descriptor_maps { struct v3dv_descriptor_map ubo_map; struct v3dv_descriptor_map ssbo_map; @@ -2277,7 +2273,7 @@ struct v3dv_pipeline { } va[MAX_VERTEX_ATTRIBS]; uint32_t va_count; - enum pipe_prim_type topology; + enum mesa_prim topology; struct v3dv_pipeline_shared_data *shared_data; @@ -2285,7 +2281,8 @@ struct v3dv_pipeline { unsigned char sha1[20]; /* In general we can reuse v3dv_device->default_attribute_float, so note - * that the following can be NULL. + * that the following can be NULL. In 7.x this is not used, so it will + * always be NULL. * * FIXME: the content of this BO will be small, so it could be improved to * be uploaded to a common BO. 
But as in most cases it will be NULL, it is @@ -2319,6 +2316,9 @@ struct v3dv_pipeline { bool is_z16; } depth_bias; + /* Depth bounds */ + bool depth_bounds_test_enabled; + struct { void *mem_ctx; struct util_dynarray data; /* Array of v3dv_pipeline_executable_data */ @@ -2334,6 +2334,13 @@ struct v3dv_pipeline { uint8_t stencil_cfg[2][V3DV_STENCIL_CFG_LENGTH]; }; +static inline bool +v3dv_texture_shader_state_has_rb_swap_reverse_bits(const struct v3dv_device *device) +{ + return device->devinfo.ver > 71 || + (device->devinfo.ver == 71 && device->devinfo.rev >= 5); +} + static inline VkPipelineBindPoint v3dv_pipeline_get_binding_point(struct v3dv_pipeline *pipeline) { @@ -2496,10 +2503,6 @@ void v3dv_pipeline_cache_upload_pipeline(struct v3dv_pipeline *pipeline, struct v3dv_pipeline_cache *cache); -struct v3dv_bo * -v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device, - struct v3dv_pipeline *pipeline); - VkResult v3dv_create_compute_pipeline_from_nir(struct v3dv_device *device, nir_shader *nir, @@ -2522,7 +2525,7 @@ VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_buffer, base, VkBuffer, VK_OBJECT_TYPE_BUFFER) VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_buffer_view, base, VkBufferView, VK_OBJECT_TYPE_BUFFER_VIEW) -VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_device_memory, base, VkDeviceMemory, +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_device_memory, vk.base, VkDeviceMemory, VK_OBJECT_TYPE_DEVICE_MEMORY) VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_pool, base, VkDescriptorPool, VK_OBJECT_TYPE_DESCRIPTOR_POOL) @@ -2604,12 +2607,32 @@ u64_compare(const void *key1, const void *key2) case 42: \ v3d_X_thing = &v3d42_##thing; \ break; \ + case 71: \ + v3d_X_thing = &v3d71_##thing; \ + break; \ default: \ unreachable("Unsupported hardware generation"); \ } \ v3d_X_thing; \ }) +/* Helper to get hw-specific macro values */ +#define V3DV_X(device, thing) ({ \ + __typeof(V3D42_##thing) V3D_X_THING; \ + switch (device->devinfo.ver) { \ + case 42: \ + V3D_X_THING = V3D42_##thing; \ + break; \ + case 71: \ + V3D_X_THING = V3D71_##thing; \ + break; \ + default: \ + unreachable("Unsupported hardware generation"); \ + } \ + V3D_X_THING; \ +}) + + /* v3d_macros from common requires v3dX and V3DX definitions. 
Below we need to * define v3dX for each version supported, because when we compile code that @@ -2622,16 +2645,26 @@ u64_compare(const void *key1, const void *key2) # define v3dX(x) v3d42_##x # include "v3dvx_private.h" # undef v3dX + +# define v3dX(x) v3d71_##x +# include "v3dvx_private.h" +# undef v3dX #endif +VkResult +v3dv_update_image_layout(struct v3dv_device *device, + struct v3dv_image *image, + uint64_t modifier, + bool disjoint, + const VkImageDrmFormatModifierExplicitCreateInfoEXT *explicit_mod_info); + #ifdef ANDROID VkResult -v3dv_gralloc_info(struct v3dv_device *device, - const VkNativeBufferANDROID *gralloc_info, - int *out_dmabuf, - int *out_stride, - int *out_size, - uint64_t *out_modifier); +v3dv_gralloc_to_drm_explicit_layout(struct u_gralloc *gralloc, + struct u_gralloc_buffer_handle *in_hnd, + VkImageDrmFormatModifierExplicitCreateInfoEXT *out, + VkSubresourceLayout *out_layouts, + int max_planes); VkResult v3dv_import_native_buffer_fd(VkDevice device_h, diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_query.c b/lib/mesa/src/broadcom/vulkan/v3dv_query.c index 216dd1567..d6f93466d 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_query.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_query.c @@ -23,7 +23,6 @@ #include "v3dv_private.h" -#include "common/v3d_performance_counters.h" #include "util/timespec.h" #include "compiler/nir/nir_builder.h" @@ -48,7 +47,7 @@ kperfmon_create(struct v3dv_device *device, DRM_IOCTL_V3D_PERFMON_CREATE, &req); if (ret) - fprintf(stderr, "Failed to create perfmon: %s\n", strerror(ret)); + fprintf(stderr, "Failed to create perfmon for query %d: %s\n", query, strerror(ret)); pool->queries[query].perf.kperfmon_ids[i] = req.id; } @@ -303,7 +302,6 @@ v3dv_CreateQueryPool(VkDevice _device, QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR); assert(pq_info); - assert(pq_info->counterIndexCount <= V3D_PERFCNT_NUM); pool->perfmon.ncounters = pq_info->counterIndexCount; for (uint32_t i = 0; i < pq_info->counterIndexCount; i++) @@ -592,7 +590,7 @@ write_performance_query_result(struct v3dv_device *device, assert(pool && pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR); struct v3dv_query *q = &pool->queries[query]; - uint64_t counter_values[V3D_PERFCNT_NUM]; + uint64_t counter_values[V3D_MAX_PERFCNT]; for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) { struct drm_v3d_perfmon_get_values req = { @@ -1284,40 +1282,11 @@ v3dv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR( VkPerformanceCounterKHR *pCounters, VkPerformanceCounterDescriptionKHR *pCounterDescriptions) { - uint32_t desc_count = *pCounterCount; + V3DV_FROM_HANDLE(v3dv_physical_device, pDevice, physicalDevice); - VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, - out, pCounters, pCounterCount); - VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, - out_desc, pCounterDescriptions, &desc_count); - - for (int i = 0; i < ARRAY_SIZE(v3d_performance_counters); i++) { - vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) { - counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR; - counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR; - counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR; - - unsigned char sha1_result[20]; - _mesa_sha1_compute(v3d_performance_counters[i][V3D_PERFCNT_NAME], - strlen(v3d_performance_counters[i][V3D_PERFCNT_NAME]), - sha1_result); - - memcpy(counter->uuid, sha1_result, sizeof(counter->uuid)); - } - - vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, - &out_desc, desc) { - desc->flags = 0; - snprintf(desc->name, 
sizeof(desc->name), "%s", - v3d_performance_counters[i][V3D_PERFCNT_NAME]); - snprintf(desc->category, sizeof(desc->category), "%s", - v3d_performance_counters[i][V3D_PERFCNT_CATEGORY]); - snprintf(desc->description, sizeof(desc->description), "%s", - v3d_performance_counters[i][V3D_PERFCNT_DESCRIPTION]); - } - } - - return vk_outarray_status(&out); + return v3dv_X(pDevice, enumerate_performance_query_counters)(pCounterCount, + pCounters, + pCounterDescriptions); } VKAPI_ATTR void VKAPI_CALL @@ -1345,23 +1314,23 @@ v3dv_ReleaseProfilingLockKHR(VkDevice device) static inline void nir_set_query_availability(nir_builder *b, - nir_ssa_def *buf, - nir_ssa_def *offset, - nir_ssa_def *query_idx, - nir_ssa_def *avail) + nir_def *buf, + nir_def *offset, + nir_def *query_idx, + nir_def *avail) { offset = nir_iadd(b, offset, query_idx); /* we use 1B per query */ nir_store_ssbo(b, avail, buf, offset, .write_mask = 0x1, .align_mul = 1); } -static inline nir_ssa_def * +static inline nir_def * nir_get_query_availability(nir_builder *b, - nir_ssa_def *buf, - nir_ssa_def *offset, - nir_ssa_def *query_idx) + nir_def *buf, + nir_def *offset, + nir_def *query_idx) { offset = nir_iadd(b, offset, query_idx); /* we use 1B per query */ - nir_ssa_def *avail = nir_load_ssbo(b, 1, 8, buf, offset, .align_mul = 1); + nir_def *avail = nir_load_ssbo(b, 1, 8, buf, offset, .align_mul = 1); return nir_i2i32(b, avail); } @@ -1372,12 +1341,7 @@ get_set_query_availability_cs() nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "set query availability cs"); - /* We rely on supergroup packing to maximize SIMD lane occupancy */ - b.shader->info.workgroup_size[0] = 1; - b.shader->info.workgroup_size[1] = 1; - b.shader->info.workgroup_size[2] = 1; - - nir_ssa_def *buf = + nir_def *buf = nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0), .desc_set = 0, .binding = 0, @@ -1387,15 +1351,15 @@ get_set_query_availability_cs() * ever change any of these parameters we need to update how we compute the * query index here. 
*/ - nir_ssa_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b, 32), 0); + nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0); - nir_ssa_def *offset = + nir_def *offset = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4); - nir_ssa_def *query_idx = + nir_def *query_idx = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4); - nir_ssa_def *avail = + nir_def *avail = nir_load_push_constant(&b, 1, 8, nir_imm_int(&b, 0), .base = 8, .range = 1); query_idx = nir_iadd(&b, query_idx, wg_id); @@ -1404,33 +1368,33 @@ get_set_query_availability_cs() return b.shader; } -static inline nir_ssa_def * -nir_get_occlusion_counter_offset(nir_builder *b, nir_ssa_def *query_idx) +static inline nir_def * +nir_get_occlusion_counter_offset(nir_builder *b, nir_def *query_idx) { - nir_ssa_def *query_group = nir_udiv_imm(b, query_idx, 16); - nir_ssa_def *query_group_offset = nir_umod_imm(b, query_idx, 16); - nir_ssa_def *offset = - nir_iadd(b, nir_imul(b, query_group, nir_imm_int(b, 1024)), - nir_imul(b, query_group_offset, nir_imm_int(b, 4))); + nir_def *query_group = nir_udiv_imm(b, query_idx, 16); + nir_def *query_group_offset = nir_umod_imm(b, query_idx, 16); + nir_def *offset = + nir_iadd(b, nir_imul_imm(b, query_group, 1024), + nir_imul_imm(b, query_group_offset, 4)); return offset; } static inline void nir_reset_occlusion_counter(nir_builder *b, - nir_ssa_def *buf, - nir_ssa_def *query_idx) + nir_def *buf, + nir_def *query_idx) { - nir_ssa_def *offset = nir_get_occlusion_counter_offset(b, query_idx); - nir_ssa_def *zero = nir_imm_int(b, 0); + nir_def *offset = nir_get_occlusion_counter_offset(b, query_idx); + nir_def *zero = nir_imm_int(b, 0); nir_store_ssbo(b, zero, buf, offset, .write_mask = 0x1, .align_mul = 4); } -static inline nir_ssa_def * +static inline nir_def * nir_read_occlusion_counter(nir_builder *b, - nir_ssa_def *buf, - nir_ssa_def *query_idx) + nir_def *buf, + nir_def *query_idx) { - nir_ssa_def *offset = nir_get_occlusion_counter_offset(b, query_idx); + nir_def *offset = nir_get_occlusion_counter_offset(b, query_idx); return nir_load_ssbo(b, 1, 32, buf, offset, .access = 0, .align_mul = 4); } @@ -1441,12 +1405,7 @@ get_reset_occlusion_query_cs() nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "reset occlusion query cs"); - /* We rely on supergroup packing to maximize SIMD lane occupancy */ - b.shader->info.workgroup_size[0] = 1; - b.shader->info.workgroup_size[1] = 1; - b.shader->info.workgroup_size[2] = 1; - - nir_ssa_def *buf = + nir_def *buf = nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0), .desc_set = 0, .binding = 0, @@ -1456,15 +1415,15 @@ get_reset_occlusion_query_cs() * ever change any of these parameters we need to update how we compute the * query index here. 
*/ - nir_ssa_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b, 32), 0); + nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0); - nir_ssa_def *avail_offset = + nir_def *avail_offset = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4); - nir_ssa_def *base_query_idx = + nir_def *base_query_idx = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4); - nir_ssa_def *query_idx = nir_iadd(&b, base_query_idx, wg_id); + nir_def *query_idx = nir_iadd(&b, base_query_idx, wg_id); nir_set_query_availability(&b, buf, avail_offset, query_idx, nir_imm_intN_t(&b, 0, 8)); @@ -1475,21 +1434,21 @@ static void write_query_buffer(nir_builder *b, - nir_ssa_def *buf, - nir_ssa_def **offset, - nir_ssa_def *value, + nir_def *buf, + nir_def **offset, + nir_def *value, bool flag_64bit) { if (flag_64bit) { /* Create a 64-bit value using a vec2 with the .Y component set to 0 * so we can write a 64-bit value in a single store. */ - nir_ssa_def *value64 = nir_vec2(b, value, nir_imm_int(b, 0)); + nir_def *value64 = nir_vec2(b, value, nir_imm_int(b, 0)); nir_store_ssbo(b, value64, buf, *offset, .write_mask = 0x3, .align_mul = 8); - *offset = nir_iadd(b, *offset, nir_imm_int(b, 8)); + *offset = nir_iadd_imm(b, *offset, 8); } else { nir_store_ssbo(b, value, buf, *offset, .write_mask = 0x1, .align_mul = 4); - *offset = nir_iadd(b, *offset, nir_imm_int(b, 4)); + *offset = nir_iadd_imm(b, *offset, 4); } } @@ -1504,60 +1463,55 @@ get_copy_query_results_cs(VkQueryResultFlags flags) nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "copy query results cs"); - /* We rely on supergroup packing to maximize SIMD lane occupancy */ - b.shader->info.workgroup_size[0] = 1; - b.shader->info.workgroup_size[1] = 1; - b.shader->info.workgroup_size[2] = 1; - - nir_ssa_def *buf = + nir_def *buf = nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0), .desc_set = 0, .binding = 0, .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); - nir_ssa_def *buf_out = + nir_def *buf_out = nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0), .desc_set = 1, .binding = 0, .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); /* Read push constants */ - nir_ssa_def *avail_offset = + nir_def *avail_offset = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 0, .range = 4); - nir_ssa_def *base_query_idx = + nir_def *base_query_idx = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 4, .range = 4); - nir_ssa_def *base_offset_out = + nir_def *base_offset_out = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 8, .range = 4); - nir_ssa_def *stride = + nir_def *stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 12, .range = 4); /* This assumes a local size of 1 and a horizontal-only dispatch. If we * ever change any of these parameters we need to update how we compute the * query index here. */ - nir_ssa_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b, 32), 0); - nir_ssa_def *query_idx = nir_iadd(&b, base_query_idx, wg_id); + nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0); + nir_def *query_idx = nir_iadd(&b, base_query_idx, wg_id); /* Read query availability if needed */ - nir_ssa_def *avail = NULL; + nir_def *avail = NULL; if (flag_avail || !flag_partial) avail = nir_get_query_availability(&b, buf, avail_offset, query_idx); /* Write occlusion query result... 
*/ - nir_ssa_def *offset = + nir_def *offset = nir_iadd(&b, base_offset_out, nir_imul(&b, wg_id, stride)); /* ...if partial is requested, we always write */ if(flag_partial) { - nir_ssa_def *query_res = nir_read_occlusion_counter(&b, buf, query_idx); + nir_def *query_res = nir_read_occlusion_counter(&b, buf, query_idx); write_query_buffer(&b, buf_out, &offset, query_res, flag_64bit); } else { /*...otherwise, we only write if the query is available */ nir_if *if_stmt = nir_push_if(&b, nir_ine_imm(&b, avail, 0)); - nir_ssa_def *query_res = nir_read_occlusion_counter(&b, buf, query_idx); + nir_def *query_res = nir_read_occlusion_counter(&b, buf, query_idx); write_query_buffer(&b, buf_out, &offset, query_res, flag_64bit); nir_pop_if(&b, if_stmt); } diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_queue.c b/lib/mesa/src/broadcom/vulkan/v3dv_queue.c index 9e1bc702f..a0942cf1c 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_queue.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_queue.c @@ -135,7 +135,7 @@ handle_reset_query_cpu_job(struct v3dv_queue *queue, struct v3dv_job *job, * we handle those in the CPU. */ if (info->pool->query_type == VK_QUERY_TYPE_OCCLUSION) - v3dv_bo_wait(job->device, info->pool->occlusion.bo, PIPE_TIMEOUT_INFINITE); + v3dv_bo_wait(job->device, info->pool->occlusion.bo, OS_TIMEOUT_INFINITE); if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { struct vk_sync_wait waits[info->count]; @@ -296,60 +296,6 @@ handle_copy_query_results_cpu_job(struct v3dv_job *job) } static VkResult -handle_copy_buffer_to_image_cpu_job(struct v3dv_queue *queue, - struct v3dv_job *job, - struct v3dv_submit_sync_info *sync_info) -{ - assert(job->type == V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE); - struct v3dv_copy_buffer_to_image_cpu_job_info *info = - &job->cpu.copy_buffer_to_image; - - /* Wait for all GPU work to finish first, since we may be accessing - * the BOs involved in the operation. 
- */ - VkResult result = queue_wait_idle(queue, sync_info); - if (result != VK_SUCCESS) - return result; - - /* Map BOs */ - struct v3dv_bo *dst_bo = info->image->planes[info->plane].mem->bo; - assert(!dst_bo->map || dst_bo->map_size == dst_bo->size); - if (!dst_bo->map && !v3dv_bo_map(job->device, dst_bo, dst_bo->size)) - return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY); - void *dst_ptr = dst_bo->map; - - struct v3dv_bo *src_bo = info->buffer->mem->bo; - assert(!src_bo->map || src_bo->map_size == src_bo->size); - if (!src_bo->map && !v3dv_bo_map(job->device, src_bo, src_bo->size)) - return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY); - void *src_ptr = src_bo->map; - - const struct v3d_resource_slice *slice = - &info->image->planes[info->plane].slices[info->mip_level]; - - const struct pipe_box box = { - info->image_offset.x, info->image_offset.y, info->base_layer, - info->image_extent.width, info->image_extent.height, info->layer_count, - }; - - /* Copy each layer */ - for (uint32_t i = 0; i < info->layer_count; i++) { - const uint32_t dst_offset = - v3dv_layer_offset(info->image, info->mip_level, - info->base_layer + i, info->plane); - const uint32_t src_offset = - info->buffer->mem_offset + info->buffer_offset + - info->buffer_layer_stride * i; - v3d_store_tiled_image( - dst_ptr + dst_offset, slice->stride, - src_ptr + src_offset, info->buffer_stride, - slice->tiling, info->image->planes[info->plane].cpp, slice->padded_height, &box); - } - - return VK_SUCCESS; -} - -static VkResult handle_timestamp_query_cpu_job(struct v3dv_queue *queue, struct v3dv_job *job, struct v3dv_submit_sync_info *sync_info) { @@ -392,7 +338,7 @@ handle_csd_indirect_cpu_job(struct v3dv_queue *queue, /* Make sure the GPU is no longer using the indirect buffer*/ assert(info->buffer && info->buffer->mem && info->buffer->mem->bo); - v3dv_bo_wait(queue->device, info->buffer->mem->bo, PIPE_TIMEOUT_INFINITE); + v3dv_bo_wait(queue->device, info->buffer->mem->bo, OS_TIMEOUT_INFINITE); /* Map the indirect buffer and read the dispatch parameters */ assert(info->buffer && info->buffer->mem && info->buffer->mem->bo); @@ -408,7 +354,7 @@ handle_csd_indirect_cpu_job(struct v3dv_queue *queue, if (memcmp(group_counts, info->csd_job->csd.wg_count, sizeof(info->csd_job->csd.wg_count)) != 0) { - v3dv_cmd_buffer_rewrite_indirect_csd_job(info, group_counts); + v3dv_cmd_buffer_rewrite_indirect_csd_job(queue->device, info, group_counts); } return VK_SUCCESS; @@ -757,7 +703,7 @@ handle_cl_job(struct v3dv_queue *queue, if (job->tmu_dirty_rcl) submit.flags |= DRM_V3D_SUBMIT_CL_FLUSH_CACHE; - /* If the job uses VK_KHR_buffer_device_addess we need to ensure all + /* If the job uses VK_KHR_buffer_device_address we need to ensure all * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_KHR * are included. */ @@ -923,7 +869,7 @@ handle_csd_job(struct v3dv_queue *queue, struct drm_v3d_submit_csd *submit = &job->csd.submit; - /* If the job uses VK_KHR_buffer_device_addess we need to ensure all + /* If the job uses VK_KHR_buffer_device_address we need to ensure all * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_KHR * are included. 
*/ @@ -1014,8 +960,6 @@ queue_handle_job(struct v3dv_queue *queue, return handle_end_query_cpu_job(job, counter_pass_idx); case V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS: return handle_copy_query_results_cpu_job(job); - case V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE: - return handle_copy_buffer_to_image_cpu_job(queue, job, sync_info); case V3DV_JOB_TYPE_CPU_CSD_INDIRECT: return handle_csd_indirect_cpu_job(queue, job, sync_info); case V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY: diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_uniforms.c b/lib/mesa/src/broadcom/vulkan/v3dv_uniforms.c index f3a98ab7e..098bfb648 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_uniforms.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_uniforms.c @@ -87,7 +87,7 @@ push_constants_bo_free(VkDevice _device, * This method checks if the ubo used for push constants is needed to be * updated or not. * - * push contants ubo is only used for push constants accessed by a non-const + * push constants ubo is only used for push constants accessed by a non-const * index. */ static void @@ -288,9 +288,10 @@ write_ubo_ssbo_uniforms(struct v3dv_cmd_buffer *cmd_buffer, offset + dynamic_offset); } else { if (content == QUNIFORM_UBO_ADDR) { - /* We reserve index 0 for push constants and artificially increase our - * indices by one for that reason, fix that now before accessing the - * descriptor map. + /* We reserve UBO index 0 for push constants in Vulkan (and for the + * constant buffer in GL) so the compiler always adds one to all UBO + * indices, fix it up before we access the descriptor map, since + * indices start from 0 there. */ assert(index > 0); index--; @@ -497,7 +498,6 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_cl_reloc uniform_stream = v3dv_cl_get_address(&job->indirect); struct v3dv_cl_out *uniforms = cl_start(&job->indirect); - for (int i = 0; i < uinfo->count; i++) { uint32_t data = uinfo->data[i]; @@ -519,13 +519,17 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer, cmd_buffer, pipeline, variant->stage); break; - case QUNIFORM_VIEWPORT_X_SCALE: - cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * 256.0f); + case QUNIFORM_VIEWPORT_X_SCALE: { + float clipper_xy_granularity = V3DV_X(cmd_buffer->device, CLIPPER_XY_GRANULARITY); + cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * clipper_xy_granularity); break; + } - case QUNIFORM_VIEWPORT_Y_SCALE: - cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * 256.0f); + case QUNIFORM_VIEWPORT_Y_SCALE: { + float clipper_xy_granularity = V3DV_X(cmd_buffer->device, CLIPPER_XY_GRANULARITY); + cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * clipper_xy_granularity); break; + } case QUNIFORM_VIEWPORT_Z_OFFSET: { float translate_z; diff --git a/lib/mesa/src/broadcom/vulkan/v3dv_wsi.c b/lib/mesa/src/broadcom/vulkan/v3dv_wsi.c index 5efb1ea95..404a64d0e 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dv_wsi.c +++ b/lib/mesa/src/broadcom/vulkan/v3dv_wsi.c @@ -24,8 +24,6 @@ */ #include "v3dv_private.h" -#include "drm-uapi/drm_fourcc.h" -#include "wsi_common_entrypoints.h" #include "vk_util.h" #include "wsi_common.h" #include "wsi_common_drm.h" @@ -41,19 +39,7 @@ static bool v3dv_wsi_can_present_on_device(VkPhysicalDevice _pdevice, int fd) { V3DV_FROM_HANDLE(v3dv_physical_device, pdevice, _pdevice); - - /* There are some instances with direct display extensions where this may be - * called before we have ever tried to create a swapchain, and therefore, - * before we have ever tried to acquire the display device, in which case we - * have to 
do it now. - */ - if (unlikely(pdevice->display_fd < 0 && pdevice->master_fd >= 0)) { - VkResult result = - v3dv_physical_device_acquire_display(pdevice, NULL); - if (result != VK_SUCCESS) - return false; - } - + assert(pdevice->display_fd != -1); return wsi_common_drm_devices_equal(fd, pdevice->display_fd); } @@ -66,7 +52,7 @@ v3dv_wsi_init(struct v3dv_physical_device *physical_device) v3dv_physical_device_to_handle(physical_device), v3dv_wsi_proc_addr, &physical_device->vk.instance->alloc, - physical_device->master_fd, NULL, + physical_device->display_fd, NULL, &(struct wsi_device_options){.sw_device = false}); if (result != VK_SUCCESS) @@ -89,67 +75,6 @@ v3dv_wsi_finish(struct v3dv_physical_device *physical_device) &physical_device->vk.instance->alloc); } -static void -constraint_surface_capabilities(VkSurfaceCapabilitiesKHR *caps) -{ - /* Our display pipeline requires that images are linear, so we cannot - * ensure that our swapchain images can be sampled. If we are running under - * a compositor in windowed mode, the DRM modifier negotiation should - * probably end up selecting an UIF layout for the swapchain images but it - * may still choose linear and send images directly for scanout if the - * surface is in fullscreen mode for example. If we are not running under - * a compositor, then we would always need them to be linear anyway. - */ - caps->supportedUsageFlags &= ~VK_IMAGE_USAGE_SAMPLED_BIT; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetPhysicalDeviceSurfaceCapabilitiesKHR( - VkPhysicalDevice physicalDevice, - VkSurfaceKHR surface, - VkSurfaceCapabilitiesKHR* pSurfaceCapabilities) -{ - VkResult result; - result = wsi_GetPhysicalDeviceSurfaceCapabilitiesKHR(physicalDevice, - surface, - pSurfaceCapabilities); - constraint_surface_capabilities(pSurfaceCapabilities); - return result; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetPhysicalDeviceSurfaceCapabilities2KHR( - VkPhysicalDevice physicalDevice, - const VkPhysicalDeviceSurfaceInfo2KHR* pSurfaceInfo, - VkSurfaceCapabilities2KHR* pSurfaceCapabilities) -{ - VkResult result; - result = wsi_GetPhysicalDeviceSurfaceCapabilities2KHR(physicalDevice, - pSurfaceInfo, - pSurfaceCapabilities); - constraint_surface_capabilities(&pSurfaceCapabilities->surfaceCapabilities); - return result; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_CreateSwapchainKHR( - VkDevice _device, - const VkSwapchainCreateInfoKHR* pCreateInfo, - const VkAllocationCallbacks* pAllocator, - VkSwapchainKHR* pSwapchain) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - struct v3dv_physical_device *pdevice = device->pdevice; - - ICD_FROM_HANDLE(VkIcdSurfaceBase, surface, pCreateInfo->surface); - VkResult result = - v3dv_physical_device_acquire_display(pdevice, surface); - if (result != VK_SUCCESS) - return result; - - return wsi_CreateSwapchainKHR(_device, pCreateInfo, pAllocator, pSwapchain); -} - struct v3dv_image * v3dv_wsi_get_image_from_swapchain(VkSwapchainKHR swapchain, uint32_t index) { diff --git a/lib/mesa/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/lib/mesa/src/broadcom/vulkan/v3dvx_cmd_buffer.c index 0c23a33b5..011f5c8e1 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dvx_cmd_buffer.c +++ b/lib/mesa/src/broadcom/vulkan/v3dvx_cmd_buffer.c @@ -56,10 +56,15 @@ v3dX(job_emit_enable_double_buffer)(struct v3dv_job *job) }; config.width_in_pixels = tiling->width; config.height_in_pixels = tiling->height; +#if V3D_VERSION == 42 config.number_of_render_targets = MAX2(tiling->render_target_count, 1); config.multisample_mode_4x = tiling->msaa; 
config.double_buffer_in_non_ms_mode = tiling->double_buffer; config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; +#endif +#if V3D_VERSION >= 71 + unreachable("HW generation 71 not supported yet."); +#endif uint8_t *rewrite_addr = (uint8_t *)job->bcl_tile_binning_mode_ptr; cl_packet_pack(TILE_BINNING_MODE_CFG)(NULL, rewrite_addr, &config); @@ -82,10 +87,22 @@ v3dX(job_emit_binning_prolog)(struct v3dv_job *job, cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) { config.width_in_pixels = tiling->width; config.height_in_pixels = tiling->height; +#if V3D_VERSION == 42 config.number_of_render_targets = MAX2(tiling->render_target_count, 1); config.multisample_mode_4x = tiling->msaa; config.double_buffer_in_non_ms_mode = tiling->double_buffer; config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; +#endif +#if V3D_VERSION >= 71 + config.log2_tile_width = log2_tile_size(tiling->tile_width); + config.log2_tile_height = log2_tile_size(tiling->tile_height); + /* FIXME: ideally we would like next assert on the packet header (as is + * general, so also applies to GL). We would need to expand + * gen_pack_header for that. + */ + assert(config.log2_tile_width == config.log2_tile_height || + config.log2_tile_width == config.log2_tile_height + 1); +#endif } /* There's definitely nothing in the VCD cache we want. */ @@ -345,6 +362,11 @@ cmd_buffer_render_pass_emit_store(struct v3dv_cmd_buffer *cmd_buffer, iview->vk.base_array_layer + layer, image_plane); + /* The Clear Buffer bit is not supported for Z/Stencil stores in 7.x and it + * is broken in earlier V3D versions. + */ + assert((buffer != Z && buffer != STENCIL && buffer != ZSTENCIL) || !clear); + cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) { store.buffer_to_store = buffer; store.address = v3dv_cl_address(image->planes[image_plane].mem->bo, layer_offset); @@ -467,6 +489,30 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, const VkImageAspectFlags aspects = vk_format_aspects(ds_attachment->desc.format); +#if V3D_VERSION <= 42 + /* GFXH-1689: The per-buffer store command's clear buffer bit is broken + * for depth/stencil. + * + * There used to be some confusion regarding the Clear Tile Buffers + * Z/S bit also being broken, but we confirmed with Broadcom that this + * is not the case, it was just that some other hardware bugs (that we + * need to work around, such as GFXH-1461) could cause this bit to behave + * incorrectly. + * + * There used to be another issue where the RTs bit in the Clear Tile + * Buffers packet also cleared Z/S, but Broadcom confirmed this is + * fixed since V3D 4.1. + * + * So if we have to emit a clear of depth or stencil we don't use + * the per-buffer store clear bit, even if we need to store the buffers, + * instead we always have to use the Clear Tile Buffers Z/S bit. + * If we have configured the job to do early Z/S clearing, then we + * don't want to emit any Clear Tile Buffers command at all here. + * + * Note that GFXH-1689 is not reproduced in the simulator, where + * using the clear buffer bit in depth/stencil stores works fine. + */ + /* Only clear once on the first subpass that uses the attachment */ uint32_t ds_first_subpass = !state->pass->multiview_enabled ? 
ds_attachment->first_subpass : @@ -486,6 +532,17 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, ds_attachment->desc.stencilLoadOp, subpass->do_stencil_clear_with_draw); + use_global_zs_clear = !state->job->early_zs_clear && + (needs_depth_clear || needs_stencil_clear); +#endif +#if V3D_VERSION >= 71 + /* The store command's clear buffer bit cannot be used for Z/S stencil: + * since V3D 4.5.6 Z/S buffers are automatically cleared between tiles, + * so we don't want to emit redundant clears here. + */ + use_global_zs_clear = false; +#endif + /* Skip the last store if it is not required */ uint32_t ds_last_subpass = !pass->multiview_enabled ? ds_attachment->last_subpass : @@ -528,30 +585,6 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, needs_stencil_store = subpass->resolve_stencil; } - /* GFXH-1689: The per-buffer store command's clear buffer bit is broken - * for depth/stencil. - * - * There used to be some confusion regarding the Clear Tile Buffers - * Z/S bit also being broken, but we confirmed with Broadcom that this - * is not the case, it was just that some other hardware bugs (that we - * need to work around, such as GFXH-1461) could cause this bit to behave - * incorrectly. - * - * There used to be another issue where the RTs bit in the Clear Tile - * Buffers packet also cleared Z/S, but Broadcom confirmed this is - * fixed since V3D 4.1. - * - * So if we have to emit a clear of depth or stencil we don't use - * the per-buffer store clear bit, even if we need to store the buffers, - * instead we always have to use the Clear Tile Buffers Z/S bit. - * If we have configured the job to do early Z/S clearing, then we - * don't want to emit any Clear Tile Buffers command at all here. - * - * Note that GFXH-1689 is not reproduced in the simulator, where - * using the clear buffer bit in depth/stencil stores works fine. - */ - use_global_zs_clear = !state->job->early_zs_clear && - (needs_depth_clear || needs_stencil_clear); if (needs_depth_store || needs_stencil_store) { const uint32_t zs_buffer = v3dv_zs_buffer(needs_depth_store, needs_stencil_store); @@ -649,10 +682,15 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer, * bit and instead we have to emit a single clear of all tile buffers. */ if (use_global_zs_clear || use_global_rt_clear) { +#if V3D_VERSION == 42 cl_emit(cl, CLEAR_TILE_BUFFERS, clear) { clear.clear_z_stencil_buffer = use_global_zs_clear; clear.clear_all_render_targets = use_global_rt_clear; } +#endif +#if V3D_VERSION >= 71 + cl_emit(cl, CLEAR_RENDER_TARGETS, clear); +#endif } } @@ -778,6 +816,103 @@ set_rcl_early_z_config(struct v3dv_job *job, } } +/* Note that for v71, render target cfg packets have just one field that + * combines the internal type and the clamp mode. For simplicity we keep + * just one helper. + * + * Note: rt_type is in fact an "enum V3DX(Internal_Type)". + * + * FIXME: for v71 we are not returning all the possible combinations for + * render target internal type and clamp. For example for int types we are + * always using clamp int, and for 16f we are using clamp none or pos (that + * seems to be the equivalent for no-clamp on 4.2), but not pq or hlg. 
In + * summary right now we are just porting what we were doing on 4.2 + */ +uint32_t +v3dX(clamp_for_format_and_type)(uint32_t rt_type, + VkFormat vk_format) +{ +#if V3D_VERSION == 42 + if (vk_format_is_int(vk_format)) + return V3D_RENDER_TARGET_CLAMP_INT; + else if (vk_format_is_srgb(vk_format)) + return V3D_RENDER_TARGET_CLAMP_NORM; + else + return V3D_RENDER_TARGET_CLAMP_NONE; +#endif +#if V3D_VERSION >= 71 + switch (rt_type) { + case V3D_INTERNAL_TYPE_8I: + return V3D_RENDER_TARGET_TYPE_CLAMP_8I_CLAMPED; + case V3D_INTERNAL_TYPE_8UI: + return V3D_RENDER_TARGET_TYPE_CLAMP_8UI_CLAMPED; + case V3D_INTERNAL_TYPE_8: + return V3D_RENDER_TARGET_TYPE_CLAMP_8; + case V3D_INTERNAL_TYPE_16I: + return V3D_RENDER_TARGET_TYPE_CLAMP_16I_CLAMPED; + case V3D_INTERNAL_TYPE_16UI: + return V3D_RENDER_TARGET_TYPE_CLAMP_16UI_CLAMPED; + case V3D_INTERNAL_TYPE_16F: + return vk_format_is_srgb(vk_format) ? + V3D_RENDER_TARGET_TYPE_CLAMP_16F_CLAMP_NORM : + V3D_RENDER_TARGET_TYPE_CLAMP_16F; + case V3D_INTERNAL_TYPE_32I: + return V3D_RENDER_TARGET_TYPE_CLAMP_32I_CLAMPED; + case V3D_INTERNAL_TYPE_32UI: + return V3D_RENDER_TARGET_TYPE_CLAMP_32UI_CLAMPED; + case V3D_INTERNAL_TYPE_32F: + return V3D_RENDER_TARGET_TYPE_CLAMP_32F; + default: + unreachable("Unknown internal render target type"); + } + + return V3D_RENDER_TARGET_TYPE_CLAMP_INVALID; +#endif +} + +static void +cmd_buffer_render_pass_setup_render_target(struct v3dv_cmd_buffer *cmd_buffer, + int rt, + uint32_t *rt_bpp, +#if V3D_VERSION == 42 + uint32_t *rt_type, + uint32_t *rt_clamp) +#else + uint32_t *rt_type_clamp) +#endif +{ + const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; + + assert(state->subpass_idx < state->pass->subpass_count); + const struct v3dv_subpass *subpass = + &state->pass->subpasses[state->subpass_idx]; + + if (rt >= subpass->color_count) + return; + + struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt]; + const uint32_t attachment_idx = attachment->attachment; + if (attachment_idx == VK_ATTACHMENT_UNUSED) + return; + + assert(attachment_idx < state->framebuffer->attachment_count && + attachment_idx < state->attachment_alloc_count); + struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view; + assert(vk_format_is_color(iview->vk.format)); + + assert(iview->plane_count == 1); + *rt_bpp = iview->planes[0].internal_bpp; +#if V3D_VERSION == 42 + *rt_type = iview->planes[0].internal_type; + *rt_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type, + iview->vk.format); +#endif +#if V3D_VERSION >= 71 + *rt_type_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type, + iview->vk.format); +#endif +} + void v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) { @@ -824,7 +959,19 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) config.number_of_render_targets = MAX2(subpass->color_count, 1); config.multisample_mode_4x = tiling->msaa; config.double_buffer_in_non_ms_mode = tiling->double_buffer; +#if V3D_VERSION == 42 config.maximum_bpp_of_all_render_targets = tiling->internal_bpp; +#endif +#if V3D_VERSION >= 71 + config.log2_tile_width = log2_tile_size(tiling->tile_width); + config.log2_tile_height = log2_tile_size(tiling->tile_height); + /* FIXME: ideally we would like next assert on the packet header (as is * general, so also applies to GL). We would need to expand * gen_pack_header for that. 
+ */ + assert(config.log2_tile_width == config.log2_tile_height || + config.log2_tile_width == config.log2_tile_height + 1); +#endif if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) { const struct v3dv_image_view *iview = @@ -851,6 +998,10 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) * Early-Z/S clearing is independent of Early Z/S testing, so it is * possible to enable one but not the other so long as their * respective requirements are met. + * + * From V3D 4.5.6, Z/S buffers are always cleared automatically + * between tiles, but we still want to enable early ZS clears + * when Z/S are not loaded or stored. */ struct v3dv_render_pass_attachment *ds_attachment = &pass->attachments[ds_attachment_idx]; @@ -858,21 +1009,33 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) const VkImageAspectFlags ds_aspects = vk_format_aspects(ds_attachment->desc.format); - bool needs_depth_clear = - check_needs_clear(state, - ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, - ds_attachment->first_subpass, - ds_attachment->desc.loadOp, - subpass->do_depth_clear_with_draw); - bool needs_depth_store = v3dv_cmd_buffer_check_needs_store(state, ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, ds_attachment->last_subpass, ds_attachment->desc.storeOp) || subpass->resolve_depth; +#if V3D_VERSION <= 42 + bool needs_depth_clear = + check_needs_clear(state, + ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ds_attachment->first_subpass, + ds_attachment->desc.loadOp, + subpass->do_depth_clear_with_draw); do_early_zs_clear = needs_depth_clear && !needs_depth_store; +#endif +#if V3D_VERSION >= 71 + bool needs_depth_load = + v3dv_cmd_buffer_check_needs_load(state, + ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ds_attachment->first_subpass, + ds_attachment->desc.loadOp, + ds_attachment->last_subpass, + ds_attachment->desc.storeOp); + do_early_zs_clear = !needs_depth_load && !needs_depth_store; +#endif + if (do_early_zs_clear && vk_format_has_stencil(ds_attachment->desc.format)) { bool needs_stencil_load = @@ -905,10 +1068,20 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) */ job->early_zs_clear = do_early_zs_clear; +#if V3D_VERSION >= 71 + uint32_t base_addr = 0; +#endif for (uint32_t i = 0; i < subpass->color_count; i++) { uint32_t attachment_idx = subpass->color_attachments[i].attachment; - if (attachment_idx == VK_ATTACHMENT_UNUSED) + if (attachment_idx == VK_ATTACHMENT_UNUSED) { +#if V3D_VERSION >= 71 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { + rt.render_target_number = i; + rt.stride = 1; /* Unused */ + } +#endif continue; + } struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view; @@ -920,10 +1093,10 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) const struct v3d_resource_slice *slice = &image->planes[plane].slices[iview->vk.base_mip_level]; - const uint32_t *clear_color = + UNUSED const uint32_t *clear_color = &state->attachments[attachment_idx].clear_value.color[0]; - uint32_t clear_pad = 0; + UNUSED uint32_t clear_pad = 0; if (slice->tiling == V3D_TILING_UIF_NO_XOR || slice->tiling == V3D_TILING_UIF_XOR) { int uif_block_height = v3d_utile_height(image->planes[plane].cpp) * 2; @@ -937,6 +1110,7 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) } } +#if V3D_VERSION == 42 cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) { clear.clear_color_low_32_bits = clear_color[0]; clear.clear_color_next_24_bits = clear_color[1] & 0xffffff; @@ -960,22 
+1134,74 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
            clear.render_target_number = i;
         };
      }
+#endif
+
+#if V3D_VERSION >= 71
+      cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+         rt.clear_color_low_bits = clear_color[0];
+         cmd_buffer_render_pass_setup_render_target(cmd_buffer, i, &rt.internal_bpp,
+                                                    &rt.internal_type_and_clamping);
+         rt.stride =
+            v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width,
+                                                   v3d_internal_bpp_words(rt.internal_bpp));
+         rt.base_address = base_addr;
+         rt.render_target_number = i;
+
+         /* base_addr in multiples of 512 bits. We divide by 8 because stride
+          * is in 128-bit units, but it is packing 2 rows worth of data, so we
+          * need to divide it by 2 so it is only 1 row, and then again by 4 so
+          * it is in 512-bit units.
+          */
+         base_addr += (tiling->tile_height * rt.stride) / 8;
+      }
+
+      if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) {
+         cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) {
+            rt.clear_color_mid_bits = /* 40 bits (32 + 8) */
+               ((uint64_t) clear_color[1]) |
+               (((uint64_t) (clear_color[2] & 0xff)) << 32);
+            rt.render_target_number = i;
+         }
+      }
+
+      if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128) {
+         cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) {
+            rt.clear_color_top_bits = /* 56 bits (24 + 32) */
+               (((uint64_t) (clear_color[2] & 0xffffff00)) >> 8) |
+               (((uint64_t) (clear_color[3])) << 24);
+            rt.render_target_number = i;
+         }
+      }
+#endif
+   }
+
+#if V3D_VERSION >= 71
+   /* If we don't have any color RTs, we still need to emit one and flag
+    * it as unused by setting stride = 1.
+    */
+   if (subpass->color_count == 0) {
+      cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+         rt.stride = 1;
+      }
+   }
+#endif
+#if V3D_VERSION == 42
   cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
-      v3dX(cmd_buffer_render_pass_setup_render_target)
+      cmd_buffer_render_pass_setup_render_target
         (cmd_buffer, 0, &rt.render_target_0_internal_bpp,
          &rt.render_target_0_internal_type, &rt.render_target_0_clamp);
-      v3dX(cmd_buffer_render_pass_setup_render_target)
+      cmd_buffer_render_pass_setup_render_target
         (cmd_buffer, 1, &rt.render_target_1_internal_bpp,
          &rt.render_target_1_internal_type, &rt.render_target_1_clamp);
-      v3dX(cmd_buffer_render_pass_setup_render_target)
+      cmd_buffer_render_pass_setup_render_target
         (cmd_buffer, 2, &rt.render_target_2_internal_bpp,
          &rt.render_target_2_internal_type, &rt.render_target_2_clamp);
-      v3dX(cmd_buffer_render_pass_setup_render_target)
+      cmd_buffer_render_pass_setup_render_target
         (cmd_buffer, 3, &rt.render_target_3_internal_bpp,
          &rt.render_target_3_internal_type, &rt.render_target_3_clamp);
   }
+#endif
   /* Ends rendering mode config.
 */
   if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
@@ -1036,10 +1262,15 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
         }
         if (cmd_buffer->state.tile_aligned_render_area &&
             (i == 0 || v3dv_do_double_initial_tile_clear(tiling))) {
+#if V3D_VERSION == 42
            cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
               clear.clear_z_stencil_buffer = !job->early_zs_clear;
               clear.clear_all_render_targets = true;
            }
+#endif
+#if V3D_VERSION >= 71
+           cl_emit(rcl, CLEAR_RENDER_TARGETS, clear_rt);
+#endif
         }
         cl_emit(rcl, END_OF_TILE_MARKER, end);
      }
@@ -1055,6 +1286,43 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
}

 void
+v3dX(viewport_compute_xform)(const VkViewport *viewport,
+                             float scale[3],
+                             float translate[3])
+{
+   float x = viewport->x;
+   float y = viewport->y;
+   float half_width = 0.5f * viewport->width;
+   float half_height = 0.5f * viewport->height;
+   double n = viewport->minDepth;
+   double f = viewport->maxDepth;
+
+   scale[0] = half_width;
+   translate[0] = half_width + x;
+   scale[1] = half_height;
+   translate[1] = half_height + y;
+
+   scale[2] = (f - n);
+   translate[2] = n;
+
+   /* It seems that if the scale is small enough the hardware won't clip
+    * correctly, so we work around this by choosing the smallest scale that
+    * seems to work.
+    *
+    * This case is exercised by CTS:
+    * dEQP-VK.draw.renderpass.inverted_depth_ranges.nodepthclamp_deltazero
+    *
+    * V3D 7.x fixes this by using the new
+    * CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND.
+    */
+#if V3D_VERSION <= 42
+   const float min_abs_scale = 0.0005f;
+   if (fabs(scale[2]) < min_abs_scale)
+      scale[2] = scale[2] < 0 ? -min_abs_scale : min_abs_scale;
+#endif
+}
+
+void
 v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
 {
   struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
@@ -1078,19 +1346,45 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
   v3dv_cl_ensure_space_with_branch(&job->bcl, required_cl_size);
   v3dv_return_if_oom(cmd_buffer, NULL);

+#if V3D_VERSION == 42
   cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
      clip.viewport_half_width_in_1_256th_of_pixel = vpscale[0] * 256.0f;
      clip.viewport_half_height_in_1_256th_of_pixel = vpscale[1] * 256.0f;
   }
+#endif
+#if V3D_VERSION >= 71
+   cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
+      clip.viewport_half_width_in_1_64th_of_pixel = vpscale[0] * 64.0f;
+      clip.viewport_half_height_in_1_64th_of_pixel = vpscale[1] * 64.0f;
+   }
+#endif

   float translate_z, scale_z;
   v3dv_cmd_buffer_state_get_viewport_z_xform(&cmd_buffer->state, 0,
                                              &translate_z, &scale_z);

+#if V3D_VERSION == 42
   cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
      clip.viewport_z_offset_zc_to_zs = translate_z;
      clip.viewport_z_scale_zc_to_zs = scale_z;
   }
+#endif
+
+#if V3D_VERSION >= 71
+   /* If the Z scale is too small guardband clipping may not clip correctly */
+   if (fabsf(scale_z) < 0.01f) {
+      cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND, clip) {
+         clip.viewport_z_offset_zc_to_zs = translate_z;
+         clip.viewport_z_scale_zc_to_zs = scale_z;
+      }
+   } else {
+      cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
+         clip.viewport_z_offset_zc_to_zs = translate_z;
+         clip.viewport_z_scale_zc_to_zs = scale_z;
+      }
+   }
+#endif
+
   cl_emit(&job->bcl, CLIPPER_Z_MIN_MAX_CLIPPING_PLANES, clip) {
      /* Vulkan's default Z NDC is [0..1]. If 'negative_one_to_one' is enabled,
       * we are using OpenGL's [-1, 1] instead.
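As a sanity check of the transform above, a standalone sketch (plain C, not driver code) that applies the same math to a common full-screen viewport:

#include <assert.h>
#include <stdio.h>

/* Mirrors the viewport transform above for x=0, y=0, width=1920,
 * height=1080, minDepth=0, maxDepth=1. */
int main(void)
{
   float x = 0.0f, y = 0.0f, w = 1920.0f, h = 1080.0f;
   float n = 0.0f, f = 1.0f;
   float scale[3], translate[3];

   scale[0] = 0.5f * w;          /* 960: maps NDC x in [-1,1] to pixels */
   translate[0] = 0.5f * w + x;  /* 960 */
   scale[1] = 0.5f * h;          /* 540 */
   translate[1] = 0.5f * h + y;  /* 540 */
   scale[2] = f - n;             /* 1: maps NDC z to [minDepth,maxDepth] */
   translate[2] = n;             /* 0 */

   /* NDC (-1,-1) lands on the viewport origin, (+1,+1) on its far corner */
   assert(-1.0f * scale[0] + translate[0] == 0.0f);
   assert( 1.0f * scale[0] + translate[0] == w);
   assert( 1.0f * scale[1] + translate[1] == h);
   printf("scale=(%g,%g,%g) translate=(%g,%g,%g)\n",
          scale[0], scale[1], scale[2],
          translate[0], translate[1], translate[2]);
   return 0;
}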
@@ -1103,8 +1397,28 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
   }

   cl_emit(&job->bcl, VIEWPORT_OFFSET, vp) {
-      vp.viewport_centre_x_coordinate = vptranslate[0];
-      vp.viewport_centre_y_coordinate = vptranslate[1];
+      float vp_fine_x = vptranslate[0];
+      float vp_fine_y = vptranslate[1];
+      int32_t vp_coarse_x = 0;
+      int32_t vp_coarse_y = 0;
+
+      /* The fine coordinates must be unsigned, but the coarse ones can be signed */
+      if (unlikely(vp_fine_x < 0)) {
+         int32_t blocks_64 = DIV_ROUND_UP(fabsf(vp_fine_x), 64);
+         vp_fine_x += 64.0f * blocks_64;
+         vp_coarse_x -= blocks_64;
+      }
+
+      if (unlikely(vp_fine_y < 0)) {
+         int32_t blocks_64 = DIV_ROUND_UP(fabsf(vp_fine_y), 64);
+         vp_fine_y += 64.0f * blocks_64;
+         vp_coarse_y -= blocks_64;
+      }
+
+      vp.fine_x = vp_fine_x;
+      vp.fine_y = vp_fine_y;
+      vp.coarse_x = vp_coarse_x;
+      vp.coarse_y = vp_coarse_y;
   }

   cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_VIEWPORT;
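The fine/coarse split in the VIEWPORT_OFFSET packet above can be verified with a quick standalone calculation; ceilf stands in for Mesa's DIV_ROUND_UP macro here:

#include <assert.h>
#include <math.h>
#include <stdint.h>

/* The hardware's fine offset must be non-negative, so for a negative
 * viewport centre we borrow whole 64-pixel blocks from the signed coarse
 * offset, exactly as the emit code above does. */
int main(void)
{
   float vp_fine_x = -100.5f;
   int32_t vp_coarse_x = 0;

   if (vp_fine_x < 0) {
      int32_t blocks_64 = (int32_t)ceilf(fabsf(vp_fine_x) / 64.0f); /* 2 */
      vp_fine_x += 64.0f * blocks_64;  /* -100.5 + 128 = 27.5 */
      vp_coarse_x -= blocks_64;        /* -2 */
   }

   assert(vp_fine_x >= 0.0f);
   /* Recombining recovers the original centre: -2 * 64 + 27.5 = -100.5 */
   assert(vp_coarse_x * 64.0f + vp_fine_x == -100.5f);
   return 0;
}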
@@ -1185,8 +1499,10 @@ v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer)
   cl_emit(&job->bcl, DEPTH_OFFSET, bias) {
      bias.depth_offset_factor = dynamic->depth_bias.slope_factor;
      bias.depth_offset_units = dynamic->depth_bias.constant_factor;
+#if V3D_VERSION <= 42
      if (pipeline->depth_bias.is_z16)
         bias.depth_offset_units *= 256.0f;
+#endif
      bias.limit = dynamic->depth_bias.depth_bias_clamp;
   }
@@ -1194,6 +1510,38 @@ v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer)
}

 void
+v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer)
+{
+   /* No depthBounds support for v42, so this method is empty in that case.
+    *
+    * Note that this method is being called as v3dv_job_init flags all state
+    * as dirty. See FIXME note in v3dv_job_init.
+    */
+
+#if V3D_VERSION >= 71
+   struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
+   assert(pipeline);
+
+   if (!pipeline->depth_bounds_test_enabled)
+      return;
+
+   struct v3dv_job *job = cmd_buffer->state.job;
+   assert(job);
+
+   v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(DEPTH_BOUNDS_TEST_LIMITS));
+   v3dv_return_if_oom(cmd_buffer, NULL);
+
+   struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
+   cl_emit(&job->bcl, DEPTH_BOUNDS_TEST_LIMITS, bounds) {
+      bounds.lower_test_limit = dynamic->depth_bounds.min;
+      bounds.upper_test_limit = dynamic->depth_bounds.max;
+   }
+
+   cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DEPTH_BOUNDS;
+#endif
+}
+
+void
 v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer)
 {
   struct v3dv_job *job = cmd_buffer->state.job;
@@ -1236,10 +1584,13 @@ v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer)
   struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
   assert(pipeline);

+   const struct v3d_device_info *devinfo = &cmd_buffer->device->devinfo;
+   const uint32_t max_color_rts = V3D_MAX_RENDER_TARGETS(devinfo->ver);
+
   const uint32_t blend_packets_size =
      cl_packet_length(BLEND_ENABLES) +
      cl_packet_length(BLEND_CONSTANT_COLOR) +
-      cl_packet_length(BLEND_CFG) * V3D_MAX_DRAW_BUFFERS;
+      cl_packet_length(BLEND_CFG) * max_color_rts;

   v3dv_cl_ensure_space_with_branch(&job->bcl, blend_packets_size);
   v3dv_return_if_oom(cmd_buffer, NULL);
@@ -1251,7 +1602,7 @@ v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer)
      }
   }

-   for (uint32_t i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) {
+   for (uint32_t i = 0; i < max_color_rts; i++) {
      if (pipeline->blend.enables & (1 << i))
         cl_emit_prepacked(&job->bcl, &pipeline->blend.cfg[i]);
   }
@@ -1278,9 +1629,15 @@ v3dX(cmd_buffer_emit_color_write_mask)(struct v3dv_cmd_buffer *cmd_buffer)
   struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
   struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;

+   uint32_t color_write_mask = ~dynamic->color_write_enable |
+                               pipeline->blend.color_write_masks;
+#if V3D_VERSION <= 42
+   /* V3D 4.2 supports only 4 render targets, with 4 mask bits each */
+   color_write_mask &= 0xffff;
+#endif
+
   cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) {
-      mask.mask = (~dynamic->color_write_enable |
-                   pipeline->blend.color_write_masks) & 0xffff;
+      mask.mask = color_write_mask;
   }

   cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE;
@@ -1571,15 +1928,16 @@ v3dX(cmd_buffer_emit_configuration_bits)(struct v3dv_cmd_buffer *cmd_buffer)
   struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
   assert(pipeline);

-   bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer);
-
   v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CFG_BITS));
   v3dv_return_if_oom(cmd_buffer, NULL);

   cl_emit_with_prepacked(&job->bcl, CFG_BITS, pipeline->cfg_bits, config) {
+#if V3D_VERSION == 42
+      bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer);
      config.early_z_enable = enable_ez;
      config.early_z_updates_enable = config.early_z_enable &&
         pipeline->z_updates_enable;
+#endif
   }
}
@@ -1825,7 +2183,9 @@ emit_gs_shader_state_record(struct v3dv_job *job,
         gs_bin->prog_data.gs->base.threads == 4;
      shader.geometry_bin_mode_shader_start_in_final_thread_section =
         gs_bin->prog_data.gs->base.single_seg;
+#if V3D_VERSION <= 42
      shader.geometry_bin_mode_shader_propagate_nans = true;
+#endif
      shader.geometry_bin_mode_shader_uniforms_address =
         gs_bin_uniforms;
@@ -1835,21 +2195,23 @@ emit_gs_shader_state_record(struct v3dv_job *job,
         gs->prog_data.gs->base.threads == 4;
      shader.geometry_render_mode_shader_start_in_final_thread_section =
         gs->prog_data.gs->base.single_seg;
+#if V3D_VERSION <= 42
      shader.geometry_render_mode_shader_propagate_nans = true;
+#endif
      shader.geometry_render_mode_shader_uniforms_address =
         gs_render_uniforms;
   }
}

 static uint8_t
-v3d_gs_output_primitive(enum shader_prim prim_type)
+v3d_gs_output_primitive(enum mesa_prim prim_type)
 {
   switch (prim_type) {
-   case SHADER_PRIM_POINTS:
+   case MESA_PRIM_POINTS:
      return GEOMETRY_SHADER_POINTS;
-   case SHADER_PRIM_LINE_STRIP:
+   case MESA_PRIM_LINE_STRIP:
      return GEOMETRY_SHADER_LINE_STRIP;
-   case SHADER_PRIM_TRIANGLE_STRIP:
+   case MESA_PRIM_TRIANGLE_STRIP:
      return GEOMETRY_SHADER_TRI_STRIP;
   default:
      unreachable("Unsupported primitive type");
@@ -2011,10 +2373,12 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer)
                              pipeline->vpm_cfg.Gv);
   }

+#if V3D_VERSION == 42
   struct v3dv_bo *default_attribute_values =
      pipeline->default_attribute_values != NULL ?
pipeline->default_attribute_values : pipeline->device->default_attribute_float; +#endif cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_RECORD, pipeline->shader_state_record, shader) { @@ -2040,8 +2404,10 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer) shader.vertex_shader_uniforms_address = cmd_buffer->state.uniforms.vs; shader.fragment_shader_uniforms_address = cmd_buffer->state.uniforms.fs; +#if V3D_VERSION == 42 shader.address_of_default_attribute_values = v3dv_cl_address(default_attribute_values, 0); +#endif shader.any_shader_reads_hardware_written_primitive_id = (pipeline->has_gs && prog_data_gs->uses_pid) || prog_data_fs->uses_pid; @@ -2350,40 +2716,3 @@ v3dX(cmd_buffer_emit_indexed_indirect)(struct v3dv_cmd_buffer *cmd_buffer, buffer->mem_offset + offset); } } - -void -v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer, - int rt, - uint32_t *rt_bpp, - uint32_t *rt_type, - uint32_t *rt_clamp) -{ - const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state; - - assert(state->subpass_idx < state->pass->subpass_count); - const struct v3dv_subpass *subpass = - &state->pass->subpasses[state->subpass_idx]; - - if (rt >= subpass->color_count) - return; - - struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt]; - const uint32_t attachment_idx = attachment->attachment; - if (attachment_idx == VK_ATTACHMENT_UNUSED) - return; - - assert(attachment_idx < state->framebuffer->attachment_count && - attachment_idx < state->attachment_alloc_count); - struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view; - assert(vk_format_is_color(iview->vk.format)); - - assert(iview->plane_count == 1); - *rt_bpp = iview->planes[0].internal_bpp; - *rt_type = iview->planes[0].internal_type; - if (vk_format_is_int(iview->vk.view_format)) - *rt_clamp = V3D_RENDER_TARGET_CLAMP_INT; - else if (vk_format_is_srgb(iview->vk.view_format)) - *rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM; - else - *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE; -} diff --git a/lib/mesa/src/broadcom/vulkan/v3dvx_device.c b/lib/mesa/src/broadcom/vulkan/v3dvx_device.c index e23598386..1b50d51e1 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dvx_device.c +++ b/lib/mesa/src/broadcom/vulkan/v3dvx_device.c @@ -49,8 +49,8 @@ vk_to_v3d_compare_func[] = { [VK_COMPARE_OP_ALWAYS] = V3D_COMPARE_FUNC_ALWAYS, }; - static union pipe_color_union encode_border_color( + const struct v3dv_device *device, const VkSamplerCustomBorderColorCreateInfoEXT *bc_info) { const struct util_format_description *desc = @@ -77,12 +77,28 @@ static union pipe_color_union encode_border_color( * colors so we need to fix up the swizzle manually for this case. */ uint8_t swizzle[4]; - if (v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle) && + const bool v3d_has_reverse_swap_rb_bits = + v3dv_texture_shader_state_has_rb_swap_reverse_bits(device); + if (!v3d_has_reverse_swap_rb_bits && + v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle) && v3dv_format_swizzle_needs_rb_swap(format->planes[0].swizzle)) { swizzle[0] = PIPE_SWIZZLE_W; swizzle[1] = PIPE_SWIZZLE_X; swizzle[2] = PIPE_SWIZZLE_Y; swizzle[3] = PIPE_SWIZZLE_Z; + } + /* In v3d 7.x we no longer have a reverse flag for the border color. Instead + * we have to use the new reverse and swap_r/b flags in the texture shader + * state which will apply the format swizzle automatically when sampling + * the border color too and we should not apply it manually here. 
+ */ + else if (v3d_has_reverse_swap_rb_bits && + (v3dv_format_swizzle_needs_rb_swap(format->planes[0].swizzle) || + v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle))) { + swizzle[0] = PIPE_SWIZZLE_X; + swizzle[1] = PIPE_SWIZZLE_Y; + swizzle[2] = PIPE_SWIZZLE_Z; + swizzle[3] = PIPE_SWIZZLE_W; } else { memcpy(swizzle, format->planes[0].swizzle, sizeof (swizzle)); } @@ -118,7 +134,11 @@ static union pipe_color_union encode_border_color( (1 << (desc->channel[i].size - 1)) - 1); } - /* convert from float to expected format */ +#if V3D_VERSION <= 42 + /* The TMU in V3D 7.x always takes 32-bit floats and handles conversions + * for us. In V3D 4.x we need to manually convert floating point color + * values to the expected format. + */ if (vk_format_is_srgb(bc_info->format) || vk_format_is_compressed(bc_info->format)) { for (int i = 0; i < 4; i++) @@ -170,12 +190,14 @@ static union pipe_color_union encode_border_color( } } } +#endif return border; } void -v3dX(pack_sampler_state)(struct v3dv_sampler *sampler, +v3dX(pack_sampler_state)(const struct v3dv_device *device, + struct v3dv_sampler *sampler, const VkSamplerCreateInfo *pCreateInfo, const VkSamplerCustomBorderColorCreateInfoEXT *bc_info) { @@ -217,7 +239,7 @@ v3dX(pack_sampler_state)(struct v3dv_sampler *sampler, s.border_color_mode = border_color_mode; if (s.border_color_mode == V3D_BORDER_COLOR_FOLLOWS) { - union pipe_color_union border = encode_border_color(bc_info); + union pipe_color_union border = encode_border_color(device, bc_info); s.border_color_word_0 = border.ui[0]; s.border_color_word_1 = border.ui[1]; @@ -253,11 +275,13 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( const struct v3dv_framebuffer *framebuffer, const struct v3dv_cmd_buffer_attachment_state *attachments, const struct v3dv_subpass *subpass, - uint8_t *max_bpp, + uint8_t *max_internal_bpp, + uint8_t *total_color_bpp, bool *msaa) { STATIC_ASSERT(V3D_INTERNAL_BPP_32 == 0); - *max_bpp = V3D_INTERNAL_BPP_32; + *max_internal_bpp = V3D_INTERNAL_BPP_32; + *total_color_bpp = 0; *msaa = false; if (subpass) { @@ -270,8 +294,11 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( assert(att); assert(att->plane_count == 1); - if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) - *max_bpp = MAX2(*max_bpp, att->planes[0].internal_bpp); + if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) { + const uint32_t internal_bpp = att->planes[0].internal_bpp; + *max_internal_bpp = MAX2(*max_internal_bpp, internal_bpp); + *total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp); + } if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) *msaa = true; @@ -285,7 +312,6 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) *msaa = true; } - return; } @@ -295,8 +321,11 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( assert(att); assert(att->plane_count == 1); - if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) - *max_bpp = MAX2(*max_bpp, att->planes[0].internal_bpp); + if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) { + const uint32_t internal_bpp = att->planes[0].internal_bpp; + *max_internal_bpp = MAX2(*max_internal_bpp, internal_bpp); + *total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp); + } if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) *msaa = true; diff --git a/lib/mesa/src/broadcom/vulkan/v3dvx_formats.c b/lib/mesa/src/broadcom/vulkan/v3dvx_formats.c index 45a1cf65b..2392e8367 100644 --- a/lib/mesa/src/broadcom/vulkan/v3dvx_formats.c +++ b/lib/mesa/src/broadcom/vulkan/v3dvx_formats.c @@ -155,6 +155,7 @@ static const struct 
v3dv_format format_table[] = {
   FORMAT(A8B8G8R8_SRGB_PACK32,    SRGB8_ALPHA8, RGBA8,   SWIZ_XYZW, 16, true),  /* RGBA8 sRGB */
   FORMAT(A2B10G10R10_UNORM_PACK32,RGB10_A2,     RGB10_A2, SWIZ_XYZW, 16, true),
   FORMAT(A2B10G10R10_UINT_PACK32, RGB10_A2UI,   RGB10_A2UI, SWIZ_XYZW, 16, false),
+   FORMAT(A2R10G10B10_UNORM_PACK32,RGB10_A2,    RGB10_A2, SWIZ_ZYXW, 16, true),
   FORMAT(E5B9G9R9_UFLOAT_PACK32,  NO,           RGB9_E5,  SWIZ_XYZ1, 16, true),
   FORMAT(B10G11R11_UFLOAT_PACK32, R11F_G11F_B10F,R11F_G11F_B10F, SWIZ_XYZ1, 16, true),
diff --git a/lib/mesa/src/broadcom/vulkan/v3dvx_image.c b/lib/mesa/src/broadcom/vulkan/v3dvx_image.c
index 80a3e5bfd..de984e812 100644
--- a/lib/mesa/src/broadcom/vulkan/v3dvx_image.c
+++ b/lib/mesa/src/broadcom/vulkan/v3dvx_image.c
@@ -76,8 +76,6 @@ pack_texture_shader_state_helper(struct v3dv_device *device,
      tex.swizzle_b = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[2]);
      tex.swizzle_a = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[3]);

-      tex.reverse_standard_border_color = image_view->planes[plane].channel_reverse;
-
      tex.texture_type = image_view->format->planes[plane].tex_type;

      if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
@@ -110,8 +108,6 @@ pack_texture_shader_state_helper(struct v3dv_device *device,
         tex.array_stride_64_byte_aligned = image->planes[iplane].cube_map_stride / 64;

-         tex.srgb = vk_format_is_srgb(image_view->vk.view_format);
-
         /* At this point we don't have the job. That's the reason the first
          * parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to
          * add the bo to the job. This also means that we need to add manually
@@ -122,6 +118,51 @@ pack_texture_shader_state_helper(struct v3dv_device *device,
            v3dv_layer_offset(image, 0, image_view->vk.base_array_layer, iplane);
         tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset);
+
+         bool is_srgb = vk_format_is_srgb(image_view->vk.format);
+
+         /* V3D 4.x doesn't have the reverse and swap_r/b bits, so we compose
+          * the reverse and/or swap_r/b swizzle from the format table with the
+          * image view swizzle. This, however, doesn't work for border colors;
+          * for those there is the reverse_standard_border_color.
+          *
+          * In v3d 7.x, however, there is no reverse_standard_border_color bit,
+          * since the reverse and swap_r/b bits also affect border colors. It is
+          * because of this that we absolutely need to use these bits with
+          * reversed and swapped formats, since that's the only way to ensure
+          * correct border colors. In that case we don't want to program the
+          * swizzle to the composition of the format swizzle and the view
+          * swizzle like we do in v3d 4.x, since the format swizzle is applied
+          * via the reverse and swap_r/b bits.
+          */
+#if V3D_VERSION == 42
+         tex.srgb = is_srgb;
+         tex.reverse_standard_border_color =
+            image_view->planes[plane].channel_reverse;
+#endif
+#if V3D_VERSION >= 71
+         tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE;
+
+         tex.reverse = image_view->planes[plane].channel_reverse;
+         tex.r_b_swap = image_view->planes[plane].swap_rb;
+
+         if (tex.reverse || tex.r_b_swap) {
+            tex.swizzle_r =
+               v3d_translate_pipe_swizzle(image_view->view_swizzle[0]);
+            tex.swizzle_g =
+               v3d_translate_pipe_swizzle(image_view->view_swizzle[1]);
+            tex.swizzle_b =
+               v3d_translate_pipe_swizzle(image_view->view_swizzle[2]);
+            tex.swizzle_a =
+               v3d_translate_pipe_swizzle(image_view->view_swizzle[3]);
+         }
+
+         tex.chroma_offset_x = 1;
+         tex.chroma_offset_y = 1;
+         /* See comment in XML field definition for rationale of the shifts */
+         tex.texture_base_pointer_cb = base_offset >> 6;
+         tex.texture_base_pointer_cr = base_offset >> 6;
+#endif
      }
   }
}
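As an aside, a minimal sketch of the swizzle composition the v4.x path described above relies on; compose_swizzle is an illustrative stand-in, not the driver's actual helper:

enum swz { SWZ_X, SWZ_Y, SWZ_Z, SWZ_W, SWZ_0, SWZ_1 };

/* out[i] = fmt[view[i]]: route each channel the view selects through the
 * format swizzle first. A BGRA-style format swizzle (Z,Y,X,W) composed
 * with an identity view swizzle still yields (Z,Y,X,W), and constant
 * selects (0/1) pass through unchanged. */
static void
compose_swizzle(const enum swz fmt[4], const enum swz view[4], enum swz out[4])
{
   for (int i = 0; i < 4; i++)
      out[i] = view[i] <= SWZ_W ? fmt[view[i]] : view[i];
}

On v7.x the format half of this composition is expressed through tex.reverse and tex.r_b_swap instead, which is why the code above falls back to the raw view swizzle whenever either bit is set.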
@@ -166,7 +207,14 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device,
      assert(buffer_view->format->plane_count == 1);
      tex.texture_type = buffer_view->format->planes[0].tex_type;
-      tex.srgb = vk_format_is_srgb(buffer_view->vk_format);
+
+      bool is_srgb = vk_format_is_srgb(buffer_view->vk_format);
+#if V3D_VERSION == 42
+      tex.srgb = is_srgb;
+#endif
+#if V3D_VERSION >= 71
+      tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE;
+#endif

      /* At this point we don't have the job. That's the reason the first
       * parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to
@@ -179,5 +227,13 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device,
         buffer_view->offset;

      tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset);
+
+#if V3D_VERSION >= 71
+      tex.chroma_offset_x = 1;
+      tex.chroma_offset_y = 1;
+      /* See comment in XML field definition for rationale of the shifts */
+      tex.texture_base_pointer_cb = base_offset >> 6;
+      tex.texture_base_pointer_cr = base_offset >> 6;
+#endif
   }
}
diff --git a/lib/mesa/src/broadcom/vulkan/v3dvx_meta_common.c b/lib/mesa/src/broadcom/vulkan/v3dvx_meta_common.c
index 04147b82c..858096f9e 100644
--- a/lib/mesa/src/broadcom/vulkan/v3dvx_meta_common.c
+++ b/lib/mesa/src/broadcom/vulkan/v3dvx_meta_common.c
@@ -26,6 +26,7 @@

 #include "broadcom/common/v3d_macros.h"
 #include "broadcom/common/v3d_tfu.h"
+#include "broadcom/common/v3d_util.h"
 #include "broadcom/cle/v3dx_pack.h"
 #include "broadcom/compiler/v3d_compiler.h"
@@ -58,12 +59,25 @@ emit_rcl_prologue(struct v3dv_job *job,
      config.number_of_render_targets = 1;
      config.multisample_mode_4x = tiling->msaa;
      config.double_buffer_in_non_ms_mode = tiling->double_buffer;
+#if V3D_VERSION == 42
      config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
+#endif
+#if V3D_VERSION >= 71
+      config.log2_tile_width = log2_tile_size(tiling->tile_width);
+      config.log2_tile_height = log2_tile_size(tiling->tile_height);
+      /* FIXME: ideally we would like the next assert on the packet header (as
+       * it is general, so it also applies to GL). We would need to expand
+       * gen_pack_header for that.
+ */ + assert(config.log2_tile_width == config.log2_tile_height || + config.log2_tile_width == config.log2_tile_height + 1); +#endif config.internal_depth_type = fb->internal_depth_type; } + const uint32_t *color = NULL; if (clear_info && (clear_info->aspects & VK_IMAGE_ASPECT_COLOR_BIT)) { - uint32_t clear_pad = 0; + UNUSED uint32_t clear_pad = 0; if (clear_info->image) { const struct v3dv_image *image = clear_info->image; @@ -88,7 +102,9 @@ emit_rcl_prologue(struct v3dv_job *job, } } - const uint32_t *color = &clear_info->clear_value->color[0]; + color = &clear_info->clear_value->color[0]; + +#if V3D_VERSION == 42 cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) { clear.clear_color_low_32_bits = color[0]; clear.clear_color_next_24_bits = color[1] & 0x00ffffff; @@ -112,13 +128,49 @@ emit_rcl_prologue(struct v3dv_job *job, clear.render_target_number = 0; }; } +#endif } +#if V3D_VERSION == 42 cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) { rt.render_target_0_internal_bpp = tiling->internal_bpp; rt.render_target_0_internal_type = fb->internal_type; rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE; } +#endif + +#if V3D_VERSION >= 71 + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) { + if (color) + rt.clear_color_low_bits = color[0]; + rt.internal_bpp = tiling->internal_bpp; + rt.internal_type_and_clamping = v3dX(clamp_for_format_and_type)(fb->internal_type, + fb->vk_format); + rt.stride = + v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width, + v3d_internal_bpp_words(rt.internal_bpp)); + rt.base_address = 0; + rt.render_target_number = 0; + } + + if (color && tiling->internal_bpp >= V3D_INTERNAL_BPP_64) { + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) { + rt.clear_color_mid_bits = /* 40 bits (32 + 8) */ + ((uint64_t) color[1]) | + (((uint64_t) (color[2] & 0xff)) << 32); + rt.render_target_number = 0; + } + } + + if (color && tiling->internal_bpp >= V3D_INTERNAL_BPP_128) { + cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) { + rt.clear_color_top_bits = /* 56 bits (24 + 32) */ + (((uint64_t) (color[2] & 0xffffff00)) >> 8) | + (((uint64_t) (color[3])) << 24); + rt.render_target_number = 0; + } + } +#endif cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) { clear.z_clear_value = clear_info ? 
clear_info->clear_value->z : 1.0f; @@ -179,10 +231,15 @@ emit_frame_setup(struct v3dv_job *job, */ if (clear_value && (i == 0 || v3dv_do_double_initial_tile_clear(tiling))) { +#if V3D_VERSION == 42 cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) { clear.clear_z_stencil_buffer = true; clear.clear_all_render_targets = true; } +#endif +#if V3D_VERSION >= 71 + cl_emit(rcl, CLEAR_RENDER_TARGETS, clear); +#endif } cl_emit(rcl, END_OF_TILE_MARKER, end); } @@ -893,6 +950,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, tfu.iia |= src_offset; +#if V3D_VERSION <= 42 if (src_tiling == V3D_TILING_RASTER) { tfu.icfg = V3D33_TFU_ICFG_FORMAT_RASTER << V3D33_TFU_ICFG_FORMAT_SHIFT; } else { @@ -901,12 +959,46 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, V3D33_TFU_ICFG_FORMAT_SHIFT; } tfu.icfg |= format_plane->tex_type << V3D33_TFU_ICFG_TTYPE_SHIFT; +#endif +#if V3D_VERSION >= 71 + if (src_tiling == V3D_TILING_RASTER) { + tfu.icfg = V3D71_TFU_ICFG_FORMAT_RASTER << V3D71_TFU_ICFG_IFORMAT_SHIFT; + } else { + tfu.icfg = (V3D71_TFU_ICFG_FORMAT_LINEARTILE + + (src_tiling - V3D_TILING_LINEARTILE)) << + V3D71_TFU_ICFG_IFORMAT_SHIFT; + } + tfu.icfg |= format_plane->tex_type << V3D71_TFU_ICFG_OTYPE_SHIFT; +#endif tfu.ioa = dst_offset; +#if V3D_VERSION <= 42 tfu.ioa |= (V3D33_TFU_IOA_FORMAT_LINEARTILE + (dst_tiling - V3D_TILING_LINEARTILE)) << V3D33_TFU_IOA_FORMAT_SHIFT; +#endif + +#if V3D_VERSION >= 71 + tfu.v71.ioc = (V3D71_TFU_IOC_FORMAT_LINEARTILE + + (dst_tiling - V3D_TILING_LINEARTILE)) << + V3D71_TFU_IOC_FORMAT_SHIFT; + + switch (dst_tiling) { + case V3D_TILING_UIF_NO_XOR: + case V3D_TILING_UIF_XOR: + tfu.v71.ioc |= + (dst_padded_height_or_stride / (2 * v3d_utile_height(dst_cpp))) << + V3D71_TFU_IOC_STRIDE_SHIFT; + break; + case V3D_TILING_RASTER: + tfu.v71.ioc |= (dst_padded_height_or_stride / dst_cpp) << + V3D71_TFU_IOC_STRIDE_SHIFT; + break; + default: + break; + } +#endif switch (src_tiling) { case V3D_TILING_UIF_NO_XOR: @@ -923,6 +1015,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, /* The TFU can handle raster sources but always produces UIF results */ assert(dst_tiling != V3D_TILING_RASTER); +#if V3D_VERSION <= 42 /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the * OPAD field for the destination (how many extra UIF blocks beyond * those necessary to cover the height). 
@@ -934,6 +1027,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer,
            uif_block_h;
      tfu.icfg |= icfg << V3D33_TFU_ICFG_OPAD_SHIFT;
   }
+#endif

   v3dv_cmd_buffer_add_tfu_job(cmd_buffer, &tfu);
}
@@ -1314,8 +1408,9 @@ v3dX(meta_copy_buffer)(struct v3dv_cmd_buffer *cmd_buffer,
   uint32_t width, height;
   framebuffer_size_for_pixel_count(num_items, &width, &height);

-   v3dv_job_start_frame(job, width, height, 1, true, true,
-                        1, internal_bpp, false);
+   v3dv_job_start_frame(job, width, height, 1, true, true, 1,
+                        internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
+                        false);

   struct v3dv_meta_framebuffer framebuffer;
   v3dX(meta_framebuffer_init)(&framebuffer, vk_format, internal_type,
@@ -1361,8 +1456,9 @@ v3dX(meta_fill_buffer)(struct v3dv_cmd_buffer *cmd_buffer,
   uint32_t width, height;
   framebuffer_size_for_pixel_count(num_items, &width, &height);

-   v3dv_job_start_frame(job, width, height, 1, true, true,
-                        1, internal_bpp, false);
+   v3dv_job_start_frame(job, width, height, 1, true, true, 1,
+                        internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
+                        false);

   struct v3dv_meta_framebuffer framebuffer;
   v3dX(meta_framebuffer_init)(&framebuffer, VK_FORMAT_R8G8B8A8_UINT,
diff --git a/lib/mesa/src/broadcom/vulkan/v3dvx_pipeline.c b/lib/mesa/src/broadcom/vulkan/v3dvx_pipeline.c
index 45aec2623..ad22add15 100644
--- a/lib/mesa/src/broadcom/vulkan/v3dvx_pipeline.c
+++ b/lib/mesa/src/broadcom/vulkan/v3dvx_pipeline.c
@@ -223,14 +223,49 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline,
         config.depth_test_function = VK_COMPARE_OP_ALWAYS;
      }

-      /* EZ state will be updated at draw time based on bound pipeline state */
-      config.early_z_updates_enable = false;
-      config.early_z_enable = false;
-
      config.stencil_enable =
         ds_info ? ds_info->stencilTestEnable && has_ds_attachment: false;

      pipeline->z_updates_enable = config.z_updates_enable;
+
+#if V3D_VERSION >= 71
+      /* From the Vulkan spec:
+       *
+       *   "depthClampEnable controls whether to clamp the fragment’s depth
+       *    values as described in Depth Test. If the pipeline is not created
+       *    with VkPipelineRasterizationDepthClipStateCreateInfoEXT present
+       *    then enabling depth clamp will also disable clipping primitives to
+       *    the z planes of the frustum as described in Primitive Clipping.
+       *    Otherwise depth clipping is controlled by the state set in
+       *    VkPipelineRasterizationDepthClipStateCreateInfoEXT."
+       *
+       * Note: neither depth clamping nor VK_EXT_depth_clip_enable is actually
+       * supported in the driver yet, so in practice we are always enabling Z
+       * clipping for now.
+       */
+      bool z_clamp_enable = rs_info && rs_info->depthClampEnable;
+      bool z_clip_enable = false;
+      const VkPipelineRasterizationDepthClipStateCreateInfoEXT *clip_info =
+         ds_info ? vk_find_struct_const(ds_info->pNext,
+                                        PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT) :
+                   NULL;
+      if (clip_info)
+         z_clip_enable = clip_info->depthClipEnable;
+      else if (!z_clamp_enable)
+         z_clip_enable = true;
+
+      if (z_clip_enable) {
+         config.z_clipping_mode = pipeline->negative_one_to_one ?
+ V3D_Z_CLIP_MODE_MIN_ONE_TO_ONE : V3D_Z_CLIP_MODE_ZERO_TO_ONE; + } else { + config.z_clipping_mode = V3D_Z_CLIP_MODE_NONE; + } + + config.z_clamp_mode = z_clamp_enable; + + config.depth_bounds_test_enable = + ds_info && ds_info->depthBoundsTestEnable && has_ds_attachment; +#endif }; } @@ -364,7 +399,7 @@ v3dX(pipeline_pack_state)(struct v3dv_pipeline *pipeline, static void pack_shader_state_record(struct v3dv_pipeline *pipeline) { - assert(sizeof(pipeline->shader_state_record) == + assert(sizeof(pipeline->shader_state_record) >= cl_packet_length(GL_SHADER_STATE_RECORD)); struct v3d_fs_prog_data *prog_data_fs = @@ -388,7 +423,7 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline) if (!pipeline->has_gs) { shader.point_size_in_shaded_vertex_data = - pipeline->topology == PIPE_PRIM_POINTS; + pipeline->topology == MESA_PRIM_POINTS; } else { struct v3d_gs_prog_data *prog_data_gs = pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY]->prog_data.gs; @@ -439,15 +474,16 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline) shader.number_of_varyings_in_fragment_shader = prog_data_fs->num_inputs; - shader.coordinate_shader_propagate_nans = true; - shader.vertex_shader_propagate_nans = true; - shader.fragment_shader_propagate_nans = true; - /* Note: see previous note about addresses */ /* shader.coordinate_shader_code_address */ /* shader.vertex_shader_code_address */ /* shader.fragment_shader_code_address */ +#if V3D_VERSION == 42 + shader.coordinate_shader_propagate_nans = true; + shader.vertex_shader_propagate_nans = true; + shader.fragment_shader_propagate_nans = true; + /* FIXME: Use combined input/output size flag in the common case (also * on v3d, see v3dx_draw). */ @@ -455,13 +491,25 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline) prog_data_vs_bin->separate_segments; shader.vertex_shader_has_separate_input_and_output_vpm_blocks = prog_data_vs->separate_segments; - shader.coordinate_shader_input_vpm_segment_size = prog_data_vs_bin->separate_segments ? prog_data_vs_bin->vpm_input_size : 1; shader.vertex_shader_input_vpm_segment_size = prog_data_vs->separate_segments ? prog_data_vs->vpm_input_size : 1; +#endif + + /* On V3D 7.1 there isn't a specific flag to set if we are using + * shared/separate segments or not. We just set the value of + * vpm_input_size to 0, and set output to the max needed. That should be + * already properly set on prog_data_vs_bin + */ +#if V3D_VERSION == 71 + shader.coordinate_shader_input_vpm_segment_size = + prog_data_vs_bin->vpm_input_size; + shader.vertex_shader_input_vpm_segment_size = + prog_data_vs->vpm_input_size; +#endif shader.coordinate_shader_output_vpm_segment_size = prog_data_vs_bin->vpm_output_size; @@ -663,3 +711,76 @@ v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline, } } } + +#if V3D_VERSION == 42 +static bool +pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline) +{ + for (uint8_t i = 0; i < pipeline->va_count; i++) { + if (vk_format_is_int(pipeline->va[i].vk_format)) + return true; + } + return false; +} +#endif + +bool +v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline) +{ +#if V3D_VERSION == 42 + return pipeline_has_integer_vertex_attrib(pipeline); +#endif + + return false; +} + +/* @pipeline can be NULL. In that case we assume the most common case. 
For
+ * example, for v42 we assume that all the attributes have a
+ * float format (we only create an all-float BO once and we reuse it with all
+ * float pipelines); otherwise we look at the actual type of each attribute
+ * used with the specific pipeline passed in.
+ */
+struct v3dv_bo *
+v3dX(create_default_attribute_values)(struct v3dv_device *device,
+                                      struct v3dv_pipeline *pipeline)
+{
+#if V3D_VERSION >= 71
+   return NULL;
+#endif
+
+   uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4;
+   struct v3dv_bo *bo;
+
+   bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true);
+
+   if (!bo) {
+      fprintf(stderr, "failed to allocate memory for the default "
+              "attribute values\n");
+      return NULL;
+   }
+
+   bool ok = v3dv_bo_map(device, bo, size);
+   if (!ok) {
+      fprintf(stderr, "failed to map default attribute values buffer\n");
+      return NULL;
+   }
+
+   uint32_t *attrs = bo->map;
+   uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0;
+   for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) {
+      attrs[i * 4 + 0] = 0;
+      attrs[i * 4 + 1] = 0;
+      attrs[i * 4 + 2] = 0;
+      VkFormat attr_format =
+         pipeline != NULL ? pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED;
+      if (i < va_count && vk_format_is_int(attr_format)) {
+         attrs[i * 4 + 3] = 1;
+      } else {
+         attrs[i * 4 + 3] = fui(1.0);
+      }
+   }
+
+   v3dv_bo_unmap(device, bo);
+
+   return bo;
+}
diff --git a/lib/mesa/src/broadcom/vulkan/v3dvx_private.h b/lib/mesa/src/broadcom/vulkan/v3dvx_private.h
index c693952d0..0f5887eab 100644
--- a/lib/mesa/src/broadcom/vulkan/v3dvx_private.h
+++ b/lib/mesa/src/broadcom/vulkan/v3dvx_private.h
@@ -55,6 +55,9 @@ void
 v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer);

 void
+v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer);
+
+void
 v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer);

 void
@@ -125,17 +128,11 @@ v3dX(get_hw_clear_color)(const VkClearColorValue *color,
                         uint32_t internal_size,
                         uint32_t *hw_color);

-void
-v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer,
-                                                 int rt,
-                                                 uint32_t *rt_bpp,
-                                                 uint32_t *rt_type,
-                                                 uint32_t *rt_clamp);
-
 /* Used at v3dv_device */

 void
-v3dX(pack_sampler_state)(struct v3dv_sampler *sampler,
+v3dX(pack_sampler_state)(const struct v3dv_device *device,
+                         struct v3dv_sampler *sampler,
                          const VkSamplerCreateInfo *pCreateInfo,
                          const VkSamplerCustomBorderColorCreateInfoEXT *bc_info);
@@ -143,7 +140,9 @@ void
 v3dX(framebuffer_compute_internal_bpp_msaa)(const struct v3dv_framebuffer *framebuffer,
                                             const struct v3dv_cmd_buffer_attachment_state *attachments,
                                             const struct v3dv_subpass *subpass,
-                                            uint8_t *max_bpp, bool *msaa);
+                                            uint8_t *max_internal_bpp,
+                                            uint8_t *total_color_bpp,
+                                            bool *msaa);

 #ifdef DEBUG
 void
@@ -165,6 +164,10 @@ v3dX(format_supports_tlb_resolve)(const struct v3dv_format *format);
 bool
 v3dX(format_supports_blending)(const struct v3dv_format *format);

+/* FIXME: tex_format should be `enum V3DX(Texture_Data_Formats)`, but using
+ * that enum type in the header requires including v3dx_pack.h, which triggers
+ * circular include dependency issues, so we're using a `uint32_t` for now.
+ */
 bool
 v3dX(tfu_supports_tex_format)(uint32_t tex_format);
@@ -309,10 +312,24 @@ void v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline,
                                  const VkPipelineVertexInputStateCreateInfo *vi_info,
                                  const VkPipelineVertexInputDivisorStateCreateInfoEXT *vd_info);
+
+bool
+v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline);
+
+struct v3dv_bo *
+v3dX(create_default_attribute_values)(struct v3dv_device *device,
+                                      struct v3dv_pipeline *pipeline);
+
 /* Used at v3dv_queue */
 void
 v3dX(job_emit_noop)(struct v3dv_job *job);

+/* Used at v3dv_query */
+VkResult
+v3dX(enumerate_performance_query_counters)(uint32_t *pCounterCount,
+                                           VkPerformanceCounterKHR *pCounters,
+                                           VkPerformanceCounterDescriptionKHR *pCounterDescriptions);
+
 /* Used at v3dv_descriptor_set, and other descriptor set utils */
 uint32_t v3dX(descriptor_bo_size)(VkDescriptorType type);
@@ -321,3 +338,17 @@ uint32_t v3dX(max_descriptor_bo_size)(void);

 uint32_t v3dX(combined_image_sampler_texture_state_offset)(uint8_t plane);
 uint32_t v3dX(combined_image_sampler_sampler_state_offset)(uint8_t plane);
+
+/* General utils */
+
+uint32_t
+v3dX(clamp_for_format_and_type)(uint32_t rt_type,
+                                VkFormat vk_format);
+
+#define V3D42_CLIPPER_XY_GRANULARITY 256.0f
+#define V3D71_CLIPPER_XY_GRANULARITY 64.0f
+
+void
+v3dX(viewport_compute_xform)(const VkViewport *viewport,
+                             float scale[3],
+                             float translate[3]);
diff --git a/lib/mesa/src/broadcom/vulkan/v3dvx_query.c b/lib/mesa/src/broadcom/vulkan/v3dvx_query.c
new file mode 100644
index 000000000..e59a1e84f
--- /dev/null
+++ b/lib/mesa/src/broadcom/vulkan/v3dvx_query.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright © 2023 Raspberry Pi Ltd
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "v3dv_private.h"
+
+#include "common/v3d_performance_counters.h"
+
+VkResult
+v3dX(enumerate_performance_query_counters)(uint32_t *pCounterCount,
+                                           VkPerformanceCounterKHR *pCounters,
+                                           VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
+{
+   uint32_t desc_count = *pCounterCount;
+
+   VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR,
+                          out, pCounters, pCounterCount);
+   VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR,
+                          out_desc, pCounterDescriptions, &desc_count);
+
+   for (int i = 0; i < ARRAY_SIZE(v3d_performance_counters); i++) {
+      vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
+         counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR;
+         counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR;
+         counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR;
+
+         unsigned char sha1_result[20];
+         _mesa_sha1_compute(v3d_performance_counters[i][V3D_PERFCNT_NAME],
+                            strlen(v3d_performance_counters[i][V3D_PERFCNT_NAME]),
+                            sha1_result);
+
+         memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
+      }
+
+      vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR,
+                               &out_desc, desc) {
+         desc->flags = 0;
+         snprintf(desc->name, sizeof(desc->name), "%s",
+                  v3d_performance_counters[i][V3D_PERFCNT_NAME]);
+         snprintf(desc->category, sizeof(desc->category), "%s",
+                  v3d_performance_counters[i][V3D_PERFCNT_CATEGORY]);
+         snprintf(desc->description, sizeof(desc->description), "%s",
+                  v3d_performance_counters[i][V3D_PERFCNT_DESCRIPTION]);
+      }
+   }
+
+   return vk_outarray_status(&out);
+}
diff --git a/lib/mesa/src/broadcom/vulkan/v3dvx_queue.c b/lib/mesa/src/broadcom/vulkan/v3dvx_queue.c
index efe63de42..6eed2de9d 100644
--- a/lib/mesa/src/broadcom/vulkan/v3dvx_queue.c
+++ b/lib/mesa/src/broadcom/vulkan/v3dvx_queue.c
@@ -29,7 +29,8 @@
 void
 v3dX(job_emit_noop)(struct v3dv_job *job)
 {
-   v3dv_job_start_frame(job, 1, 1, 1, true, true, 1, V3D_INTERNAL_BPP_32, false);
+   v3dv_job_start_frame(job, 1, 1, 1, true, true, 1,
+                        V3D_INTERNAL_BPP_32, 4, false);
   v3dX(job_emit_binning_flush)(job);

   struct v3dv_cl *rcl = &job->rcl;
@@ -42,14 +43,29 @@ v3dX(job_emit_noop)(struct v3dv_job *job)
      config.image_height_pixels = 1;
      config.number_of_render_targets = 1;
      config.multisample_mode_4x = false;
+#if V3D_VERSION == 42
      config.maximum_bpp_of_all_render_targets = V3D_INTERNAL_BPP_32;
+#endif
+#if V3D_VERSION >= 71
+      config.log2_tile_width = 3; /* Tile size 64 */
+      config.log2_tile_height = 3; /* Tile size 64 */
+#endif
   }

+#if V3D_VERSION == 42
   cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
      rt.render_target_0_internal_bpp = V3D_INTERNAL_BPP_32;
      rt.render_target_0_internal_type = V3D_INTERNAL_TYPE_8;
      rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
   }
+#endif
+#if V3D_VERSION >= 71
+   cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+      rt.internal_bpp = V3D_INTERNAL_BPP_32;
+      rt.internal_type_and_clamping = V3D_RENDER_TARGET_TYPE_CLAMP_8;
+      rt.stride = 1; /* Unused RT */
+   }
+#endif

   cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
      clear.z_clear_value = 1.0f;
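For context, the enumeration entry point added in v3dvx_query.c above backs VK_KHR_performance_query's standard two-call idiom. A sketch of the application side (error handling elided; the extension function pointer is assumed to have been resolved with vkGetInstanceProcAddr):

#include <stdlib.h>
#include <vulkan/vulkan.h>

/* List the performance counters of a queue family using the usual
 * count-then-fill pattern that the VK_OUTARRAY helpers above implement. */
static void
list_perf_counters(VkPhysicalDevice pdev, uint32_t queue_family,
                   PFN_vkEnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR enumerate)
{
   uint32_t count = 0;
   enumerate(pdev, queue_family, &count, NULL, NULL);

   VkPerformanceCounterKHR *counters = calloc(count, sizeof(*counters));
   VkPerformanceCounterDescriptionKHR *descs = calloc(count, sizeof(*descs));
   for (uint32_t i = 0; i < count; i++) {
      counters[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_KHR;
      descs[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_DESCRIPTION_KHR;
   }
   enumerate(pdev, queue_family, &count, counters, descs);

   /* descs[i].name/category/description now hold the strings the driver
    * copied out of v3d_performance_counters. */
   free(counters);
   free(descs);
}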