author     Jonathan Gray <jsg@cvs.openbsd.org>  2020-01-22 02:09:34 +0000
committer  Jonathan Gray <jsg@cvs.openbsd.org>  2020-01-22 02:09:34 +0000
commit     53b0736c56ca5142a5722eb827a3675ca08e123d (patch)
tree       52fd72557407af997e5b871b29a378c9bfa58299 /lib
parent     4bb763fef12ec314b7ed27d8c928ee833fddb0a3 (diff)
Import Mesa 19.2.8
Diffstat (limited to 'lib')
-rw-r--r--  lib/mesa/src/amd/vulkan/radv_nir_lower_ycbcr_textures.c     621
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_android.c                  391
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_cmd_buffer.c              6752
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_cs.c                       237
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_descriptor_set.c          1500
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_device.c                  3527
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_drm.c                     1287
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_formats.c                 1143
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_image.c                   1021
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_pass.c                    1294
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_pipeline.c                5917
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_query.c                   1692
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_shader.c                  1050
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_util.c                     318
-rw-r--r--  lib/mesa/src/freedreno/vulkan/tu_wsi.c                      278
-rw-r--r--  lib/mesa/src/gallium/drivers/freedreno/a6xx/fd6_compute.c   334
16 files changed, 7037 insertions, 20325 deletions
diff --git a/lib/mesa/src/amd/vulkan/radv_nir_lower_ycbcr_textures.c b/lib/mesa/src/amd/vulkan/radv_nir_lower_ycbcr_textures.c
index 7d15a78b3..5d771c2fc 100644
--- a/lib/mesa/src/amd/vulkan/radv_nir_lower_ycbcr_textures.c
+++ b/lib/mesa/src/amd/vulkan/radv_nir_lower_ycbcr_textures.c
@@ -21,301 +21,432 @@
* IN THE SOFTWARE.
*/
-#include "nir/nir.h"
-#include "nir/nir_builder.h"
-#include "nir/nir_vulkan.h"
#include "radv_private.h"
#include "radv_shader.h"
#include "vk_format.h"
+#include "nir/nir.h"
+#include "nir/nir_builder.h"
struct ycbcr_state {
- nir_builder *builder;
- nir_ssa_def *image_size;
- nir_tex_instr *origin_tex;
- nir_deref_instr *tex_deref;
- const struct radv_sampler_ycbcr_conversion_state *conversion;
- bool unnormalized_coordinates;
+ nir_builder *builder;
+ nir_ssa_def *image_size;
+ nir_tex_instr *origin_tex;
+ nir_deref_instr *tex_deref;
+ const struct radv_sampler_ycbcr_conversion *conversion;
};
static nir_ssa_def *
-get_texture_size(struct ycbcr_state *state, nir_deref_instr *texture)
+y_range(nir_builder *b,
+ nir_ssa_def *y_channel,
+ int bpc,
+ VkSamplerYcbcrRange range)
{
- nir_builder *b = state->builder;
- const struct glsl_type *type = texture->type;
- nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1);
-
- tex->op = nir_texop_txs;
- tex->sampler_dim = glsl_get_sampler_dim(type);
- tex->is_array = glsl_sampler_type_is_array(type);
- tex->is_shadow = glsl_sampler_type_is_shadow(type);
- tex->dest_type = nir_type_int32;
+ switch (range) {
+ case VK_SAMPLER_YCBCR_RANGE_ITU_FULL:
+ return y_channel;
+ case VK_SAMPLER_YCBCR_RANGE_ITU_NARROW:
+ return nir_fmul(b,
+ nir_fadd(b,
+ nir_fmul(b, y_channel,
+ nir_imm_float(b, pow(2, bpc) - 1)),
+ nir_imm_float(b, -16.0f * pow(2, bpc - 8))),
+ nir_frcp(b, nir_imm_float(b, 219.0f * pow(2, bpc - 8))));
+ default:
+ unreachable("missing Ycbcr range");
+ return NULL;
+ }
+}
- tex->src[0].src_type = nir_tex_src_texture_deref;
- tex->src[0].src = nir_src_for_ssa(&texture->dest.ssa);
+static nir_ssa_def *
+chroma_range(nir_builder *b,
+ nir_ssa_def *chroma_channel,
+ int bpc,
+ VkSamplerYcbcrRange range)
+{
+ switch (range) {
+ case VK_SAMPLER_YCBCR_RANGE_ITU_FULL:
+ return nir_fadd(b, chroma_channel,
+ nir_imm_float(b, -pow(2, bpc - 1) / (pow(2, bpc) - 1.0f)));
+ case VK_SAMPLER_YCBCR_RANGE_ITU_NARROW:
+ return nir_fmul(b,
+ nir_fadd(b,
+ nir_fmul(b, chroma_channel,
+ nir_imm_float(b, pow(2, bpc) - 1)),
+ nir_imm_float(b, -128.0f * pow(2, bpc - 8))),
+ nir_frcp(b, nir_imm_float(b, 224.0f * pow(2, bpc - 8))));
+ default:
+ unreachable("missing Ycbcr range");
+ return NULL;
+ }
+}
- nir_ssa_dest_init(&tex->instr, &tex->dest, nir_tex_instr_dest_size(tex), 32, NULL);
- nir_builder_instr_insert(b, &tex->instr);
+typedef struct nir_const_value_3_4 {
+ nir_const_value v[3][4];
+} nir_const_value_3_4;
- state->builder->shader->info.uses_resource_info_query = true;
+static const nir_const_value_3_4 *
+ycbcr_model_to_rgb_matrix(VkSamplerYcbcrModelConversion model)
+{
+ switch (model) {
+ case VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_601: {
+ static const nir_const_value_3_4 bt601 = { {
+ { { .f32 = 1.402f }, { .f32 = 1.0f }, { .f32 = 0.0f }, { .f32 = 0.0f } },
+ { { .f32 = -0.714136286201022f }, { .f32 = 1.0f }, { .f32 = -0.344136286201022f }, { .f32 = 0.0f } },
+ { { .f32 = 0.0f }, { .f32 = 1.0f }, { .f32 = 1.772f }, { .f32 = 0.0f } },
+ } };
+
+ return &bt601;
+ }
+ case VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_709: {
+ static const nir_const_value_3_4 bt709 = { {
+ { { .f32 = 1.5748031496063f }, { .f32 = 1.0f }, { .f32 = 0.0f }, { .f32 = 0.0f } },
+ { { .f32 = -0.468125209181067f }, { .f32 = 1.0f }, { .f32 = -0.187327487470334f }, { .f32 = 0.0f } },
+ { { .f32 = 0.0f }, { .f32 = 1.0f }, { .f32 = 1.85563184264242f }, { .f32 = 0.0f } },
+ } };
+
+ return &bt709;
+ }
+ case VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_2020: {
+ static const nir_const_value_3_4 bt2020 = { {
+ { { .f32 = 1.4746f }, { .f32 = 1.0f }, { .f32 = 0.0f }, { .f32 = 0.0f } },
+ { { .f32 = -0.571353126843658f }, { .f32 = 1.0f }, { .f32 = -0.164553126843658f }, { .f32 = 0.0f } },
+ { { .f32 = 0.0f }, { .f32 = 1.0f }, { .f32 = 1.8814f }, { .f32 = 0.0f } },
+ } };
+
+ return &bt2020;
+ }
+ default:
+ unreachable("missing Ycbcr model");
+ return NULL;
+ }
+}
- return nir_i2f32(b, &tex->dest.ssa);
+static nir_ssa_def *
+convert_ycbcr(struct ycbcr_state *state,
+ nir_ssa_def *raw_channels,
+ uint8_t bits)
+{
+ nir_builder *b = state->builder;
+ const struct radv_sampler_ycbcr_conversion *conversion = state->conversion;
+
+ nir_ssa_def *expanded_channels =
+ nir_vec4(b,
+ chroma_range(b, nir_channel(b, raw_channels, 0),
+ bits, conversion->ycbcr_range),
+ y_range(b, nir_channel(b, raw_channels, 1),
+ bits, conversion->ycbcr_range),
+ chroma_range(b, nir_channel(b, raw_channels, 2),
+ bits, conversion->ycbcr_range),
+ nir_imm_float(b, 1.0f));
+
+ if (conversion->ycbcr_model == VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_IDENTITY)
+ return expanded_channels;
+
+ const nir_const_value_3_4 *conversion_matrix =
+ ycbcr_model_to_rgb_matrix(conversion->ycbcr_model);
+
+ nir_ssa_def *converted_channels[] = {
+ nir_fdot4(b, expanded_channels, nir_build_imm(b, 4, 32, conversion_matrix->v[0])),
+ nir_fdot4(b, expanded_channels, nir_build_imm(b, 4, 32, conversion_matrix->v[1])),
+ nir_fdot4(b, expanded_channels, nir_build_imm(b, 4, 32, conversion_matrix->v[2]))
+ };
+
+ return nir_vec4(b,
+ converted_channels[0], converted_channels[1],
+ converted_channels[2], nir_imm_float(b, 1.0f));
}
static nir_ssa_def *
-implicit_downsampled_coord(nir_builder *b, nir_ssa_def *value, nir_ssa_def *max_value,
- int div_scale)
+get_texture_size(struct ycbcr_state *state, nir_deref_instr *texture)
{
- return nir_fadd(
- b, value,
- nir_fdiv(b, nir_imm_float(b, 1.0f), nir_fmul(b, nir_imm_float(b, div_scale), max_value)));
+ nir_builder *b = state->builder;
+ const struct glsl_type *type = texture->type;
+ nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1);
+
+ tex->op = nir_texop_txs;
+ tex->sampler_dim = glsl_get_sampler_dim(type);
+ tex->is_array = glsl_sampler_type_is_array(type);
+ tex->is_shadow = glsl_sampler_type_is_shadow(type);
+ tex->dest_type = nir_type_int;
+
+ tex->src[0].src_type = nir_tex_src_texture_deref;
+ tex->src[0].src = nir_src_for_ssa(&texture->dest.ssa);
+
+ nir_ssa_dest_init(&tex->instr, &tex->dest,
+ nir_tex_instr_dest_size(tex), 32, NULL);
+ nir_builder_instr_insert(b, &tex->instr);
+
+ return nir_i2f32(b, &tex->dest.ssa);
}
static nir_ssa_def *
-implicit_downsampled_coord_unnormalized(nir_builder *b, nir_ssa_def *value, int div_scale)
+implicit_downsampled_coord(nir_builder *b,
+ nir_ssa_def *value,
+ nir_ssa_def *max_value,
+ int div_scale)
{
- return nir_fadd(
- b, value,
- nir_imm_float(b, 1.0f / (float)div_scale));
+ return nir_fadd(b,
+ value,
+ nir_fdiv(b,
+ nir_imm_float(b, 1.0f),
+ nir_fmul(b,
+ nir_imm_float(b, div_scale),
+ max_value)));
}
static nir_ssa_def *
-implicit_downsampled_coords(struct ycbcr_state *state, nir_ssa_def *old_coords)
+implicit_downsampled_coords(struct ycbcr_state *state,
+ nir_ssa_def *old_coords)
{
- nir_builder *b = state->builder;
- const struct radv_sampler_ycbcr_conversion_state *conversion = state->conversion;
- nir_ssa_def *image_size = NULL;
- nir_ssa_def *comp[4] = {
- NULL,
- };
- enum pipe_video_chroma_format chroma_format =
- pipe_format_to_chroma_format(vk_format_to_pipe_format(state->conversion->format));
- const unsigned divisors[2] = {chroma_format <= PIPE_VIDEO_CHROMA_FORMAT_422 ? 2 : 1,
- chroma_format <= PIPE_VIDEO_CHROMA_FORMAT_420 ? 2 : 1};
-
- for (int c = 0; c < old_coords->num_components; c++) {
- comp[c] = nir_channel(b, old_coords, c);
-
- if (c < ARRAY_SIZE(divisors) && divisors[c] > 1) {
- if (state->unnormalized_coordinates)
- comp[c] = nir_fdiv(b, comp[c], nir_imm_float(b, divisors[c]));
-
- if (conversion->chroma_offsets[c] == VK_CHROMA_LOCATION_COSITED_EVEN) {
- if (state->unnormalized_coordinates) {
- comp[c] = implicit_downsampled_coord_unnormalized(b, comp[c], divisors[c]);
- } else {
- if (!image_size)
- image_size = get_texture_size(state, state->tex_deref);
-
- comp[c] = implicit_downsampled_coord(b, comp[c], nir_channel(b, image_size, c), divisors[c]);
- }
- }
- }
- }
-
- return nir_vec(b, comp, old_coords->num_components);
+ nir_builder *b = state->builder;
+ const struct radv_sampler_ycbcr_conversion *conversion = state->conversion;
+ nir_ssa_def *image_size = NULL;
+ nir_ssa_def *comp[4] = { NULL, };
+ const struct vk_format_description *fmt_desc = vk_format_description(state->conversion->format);
+ const unsigned divisors[2] = {fmt_desc->width_divisor, fmt_desc->height_divisor};
+
+ for (int c = 0; c < old_coords->num_components; c++) {
+ if (c < ARRAY_SIZE(divisors) && divisors[c] > 1 &&
+ conversion->chroma_offsets[c] == VK_CHROMA_LOCATION_COSITED_EVEN) {
+ if (!image_size)
+ image_size = get_texture_size(state, state->tex_deref);
+
+ comp[c] = implicit_downsampled_coord(b,
+ nir_channel(b, old_coords, c),
+ nir_channel(b, image_size, c),
+ divisors[c]);
+ } else {
+ comp[c] = nir_channel(b, old_coords, c);
+ }
+ }
+
+ return nir_vec(b, comp, old_coords->num_components);
}
static nir_ssa_def *
-create_plane_tex_instr_implicit(struct ycbcr_state *state, uint32_t plane)
+create_plane_tex_instr_implicit(struct ycbcr_state *state,
+ uint32_t plane)
{
- nir_builder *b = state->builder;
- nir_tex_instr *old_tex = state->origin_tex;
- nir_tex_instr *tex = nir_tex_instr_create(b->shader, old_tex->num_srcs + 1);
- for (uint32_t i = 0; i < old_tex->num_srcs; i++) {
- tex->src[i].src_type = old_tex->src[i].src_type;
-
- switch (old_tex->src[i].src_type) {
- case nir_tex_src_coord:
- if (plane && true /*state->conversion->chroma_reconstruction*/) {
- assert(old_tex->src[i].src.is_ssa);
- tex->src[i].src =
- nir_src_for_ssa(implicit_downsampled_coords(state, old_tex->src[i].src.ssa));
- break;
- }
- FALLTHROUGH;
- default:
- nir_src_copy(&tex->src[i].src, &old_tex->src[i].src, &tex->instr);
- break;
- }
- }
-
- tex->src[tex->num_srcs - 1].src = nir_src_for_ssa(nir_imm_int(b, plane));
- tex->src[tex->num_srcs - 1].src_type = nir_tex_src_plane;
-
- tex->sampler_dim = old_tex->sampler_dim;
- tex->dest_type = old_tex->dest_type;
- tex->is_array = old_tex->is_array;
-
- tex->op = old_tex->op;
- tex->coord_components = old_tex->coord_components;
- tex->is_new_style_shadow = old_tex->is_new_style_shadow;
- tex->component = old_tex->component;
-
- tex->texture_index = old_tex->texture_index;
- tex->sampler_index = old_tex->sampler_index;
-
- nir_ssa_dest_init(&tex->instr, &tex->dest, old_tex->dest.ssa.num_components,
- nir_dest_bit_size(old_tex->dest), NULL);
- nir_builder_instr_insert(b, &tex->instr);
-
- return &tex->dest.ssa;
+ nir_builder *b = state->builder;
+ nir_tex_instr *old_tex = state->origin_tex;
+ nir_tex_instr *tex = nir_tex_instr_create(b->shader, old_tex->num_srcs+ 1);
+ for (uint32_t i = 0; i < old_tex->num_srcs; i++) {
+ tex->src[i].src_type = old_tex->src[i].src_type;
+
+ switch (old_tex->src[i].src_type) {
+ case nir_tex_src_coord:
+ if (plane && true/*state->conversion->chroma_reconstruction*/) {
+ assert(old_tex->src[i].src.is_ssa);
+ tex->src[i].src =
+ nir_src_for_ssa(implicit_downsampled_coords(state,
+ old_tex->src[i].src.ssa));
+ break;
+ }
+ /* fall through */
+ default:
+ nir_src_copy(&tex->src[i].src, &old_tex->src[i].src, tex);
+ break;
+ }
+ }
+
+ tex->src[tex->num_srcs - 1].src = nir_src_for_ssa(nir_imm_int(b, plane));
+ tex->src[tex->num_srcs - 1].src_type = nir_tex_src_plane;
+
+ tex->sampler_dim = old_tex->sampler_dim;
+ tex->dest_type = old_tex->dest_type;
+ tex->is_array = old_tex->is_array;
+
+ tex->op = old_tex->op;
+ tex->coord_components = old_tex->coord_components;
+ tex->is_new_style_shadow = old_tex->is_new_style_shadow;
+ tex->component = old_tex->component;
+
+ tex->texture_index = old_tex->texture_index;
+ tex->texture_array_size = old_tex->texture_array_size;
+ tex->sampler_index = old_tex->sampler_index;
+
+ nir_ssa_dest_init(&tex->instr, &tex->dest,
+ old_tex->dest.ssa.num_components,
+ nir_dest_bit_size(old_tex->dest), NULL);
+ nir_builder_instr_insert(b, &tex->instr);
+
+ return &tex->dest.ssa;
}
struct swizzle_info {
- unsigned plane[4];
- unsigned swizzle[4];
+ unsigned plane[4];
+ unsigned swizzle[4];
};
static struct swizzle_info
get_plane_swizzles(VkFormat format)
{
- int planes = vk_format_get_plane_count(format);
- switch (planes) {
- case 3:
- return (struct swizzle_info){{2, 0, 1, 0}, {0, 0, 0, 3}};
- case 2:
- return (struct swizzle_info){{1, 0, 1, 0}, {1, 0, 0, 3}};
- case 1:
- return (struct swizzle_info){{0, 0, 0, 0}, {0, 1, 2, 3}};
- default:
- unreachable("unhandled plane count for ycbcr swizzling");
- }
+ int planes = vk_format_get_plane_count(format);
+ switch (planes) {
+ case 3:
+ return (struct swizzle_info) {
+ {2, 0, 1, 0},
+ {0, 0, 0, 3}
+ };
+ case 2:
+ return (struct swizzle_info) {
+ {1, 0, 1, 0},
+ {1, 0, 0, 3}
+ };
+ case 1:
+ return (struct swizzle_info) {
+ {0, 0, 0, 0},
+ {0, 1, 2, 3}
+ };
+ default:
+ unreachable("unhandled plane count for ycbcr swizzling");
+ }
}
+
static nir_ssa_def *
-build_swizzled_components(nir_builder *builder, VkFormat format, VkComponentMapping mapping,
+build_swizzled_components(nir_builder *builder,
+ VkFormat format,
+ VkComponentMapping mapping,
nir_ssa_def **plane_values)
{
- struct swizzle_info plane_swizzle = get_plane_swizzles(format);
- enum pipe_swizzle swizzles[4];
- nir_ssa_def *values[4];
-
- vk_format_compose_swizzles(&mapping, (const unsigned char[4]){0, 1, 2, 3}, swizzles);
-
- nir_ssa_def *zero = nir_imm_float(builder, 0.0f);
- nir_ssa_def *one = nir_imm_float(builder, 1.0f);
-
- for (unsigned i = 0; i < 4; ++i) {
- switch (swizzles[i]) {
- case PIPE_SWIZZLE_X:
- case PIPE_SWIZZLE_Y:
- case PIPE_SWIZZLE_Z:
- case PIPE_SWIZZLE_W: {
- unsigned channel = swizzles[i] - PIPE_SWIZZLE_X;
- values[i] = nir_channel(builder, plane_values[plane_swizzle.plane[channel]],
- plane_swizzle.swizzle[channel]);
- break;
- }
- case PIPE_SWIZZLE_0:
- values[i] = zero;
- break;
- case PIPE_SWIZZLE_1:
- values[i] = one;
- break;
- default:
- unreachable("unhandled swizzle");
- }
- }
- return nir_vec(builder, values, 4);
+ struct swizzle_info plane_swizzle = get_plane_swizzles(format);
+ enum vk_swizzle swizzles[4];
+ nir_ssa_def *values[4];
+
+ vk_format_compose_swizzles(&mapping, (const unsigned char[4]){0,1,2,3}, swizzles);
+
+ nir_ssa_def *zero = nir_imm_float(builder, 0.0f);
+ nir_ssa_def *one = nir_imm_float(builder, 1.0f);
+
+ for (unsigned i = 0; i < 4; ++i) {
+ switch(swizzles[i]) {
+ case VK_SWIZZLE_X:
+ case VK_SWIZZLE_Y:
+ case VK_SWIZZLE_Z:
+ case VK_SWIZZLE_W: {
+ unsigned channel = swizzles[i] - VK_SWIZZLE_X;
+ values[i] = nir_channel(builder,
+ plane_values[plane_swizzle.plane[channel]],
+ plane_swizzle.swizzle[channel]);
+ break;
+ }
+ case VK_SWIZZLE_0:
+ values[i] = zero;
+ break;
+ case VK_SWIZZLE_1:
+ values[i] = one;
+ break;
+ default:
+ unreachable("unhandled swizzle");
+ }
+ }
+ return nir_vec(builder, values, 4);
}
static bool
-try_lower_tex_ycbcr(const struct radv_pipeline_layout *layout, nir_builder *builder,
+try_lower_tex_ycbcr(const struct radv_pipeline_layout *layout,
+ nir_builder *builder,
nir_tex_instr *tex)
{
- int deref_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref);
- assert(deref_src_idx >= 0);
- nir_deref_instr *deref = nir_src_as_deref(tex->src[deref_src_idx].src);
-
- nir_variable *var = nir_deref_instr_get_variable(deref);
- const struct radv_descriptor_set_layout *set_layout =
- layout->set[var->data.descriptor_set].layout;
- const struct radv_descriptor_set_binding_layout *binding =
- &set_layout->binding[var->data.binding];
- const struct radv_sampler_ycbcr_conversion_state *ycbcr_samplers =
- radv_immutable_ycbcr_samplers(set_layout, var->data.binding);
-
- if (!ycbcr_samplers)
- return false;
-
- assert(binding->immutable_samplers_offset);
- const uint32_t *immutable_samplers =
- radv_immutable_samplers(set_layout, binding);
-
- /* For the following instructions, we don't apply any change and let the
- * instruction apply to the first plane.
- */
- if (tex->op == nir_texop_txs || tex->op == nir_texop_query_levels || tex->op == nir_texop_lod)
- return false;
-
- assert(tex->texture_index == 0);
- unsigned array_index = 0;
- if (deref->deref_type != nir_deref_type_var) {
- assert(deref->deref_type == nir_deref_type_array);
- if (!nir_src_is_const(deref->arr.index))
- return false;
- array_index = nir_src_as_uint(deref->arr.index);
- array_index = MIN2(array_index, binding->array_size - 1);
- }
- const struct radv_sampler_ycbcr_conversion_state *ycbcr_sampler = ycbcr_samplers + array_index;
-
- if (ycbcr_sampler->format == VK_FORMAT_UNDEFINED)
- return false;
-
- bool unnormalized_coordinates = immutable_samplers[4 * array_index + 0] & S_008F30_FORCE_UNNORMALIZED(1);
-
- struct ycbcr_state state = {
- .builder = builder,
- .origin_tex = tex,
- .tex_deref = deref,
- .conversion = ycbcr_sampler,
- .unnormalized_coordinates = unnormalized_coordinates,
- };
-
- builder->cursor = nir_before_instr(&tex->instr);
-
- VkFormat format = state.conversion->format;
- const int plane_count = vk_format_get_plane_count(format);
- nir_ssa_def *plane_values[3];
-
- for (int p = 0; p < plane_count; ++p) {
- plane_values[p] = create_plane_tex_instr_implicit(&state, p);
- }
-
- nir_ssa_def *result =
- build_swizzled_components(builder, format, ycbcr_sampler->components, plane_values);
- if (state.conversion->ycbcr_model != VK_SAMPLER_YCBCR_MODEL_CONVERSION_RGB_IDENTITY) {
- VkFormat first_format = vk_format_get_plane_format(format, 0);
- uint32_t bits =
- vk_format_get_component_bits(first_format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X);
- /* TODO: swizzle and bpcs */
- uint32_t bpcs[3] = {bits, bits, bits};
- result = nir_convert_ycbcr_to_rgb(builder, state.conversion->ycbcr_model,
- state.conversion->ycbcr_range, result, bpcs);
- }
-
- nir_ssa_def_rewrite_uses(&tex->dest.ssa, result);
- nir_instr_remove(&tex->instr);
-
- return true;
+ int deref_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref);
+ assert(deref_src_idx >= 0);
+ nir_deref_instr *deref = nir_src_as_deref(tex->src[deref_src_idx].src);
+
+ nir_variable *var = nir_deref_instr_get_variable(deref);
+ const struct radv_descriptor_set_layout *set_layout =
+ layout->set[var->data.descriptor_set].layout;
+ const struct radv_descriptor_set_binding_layout *binding =
+ &set_layout->binding[var->data.binding];
+ const struct radv_sampler_ycbcr_conversion *ycbcr_samplers =
+ radv_immutable_ycbcr_samplers(set_layout, var->data.binding);
+
+ if (!ycbcr_samplers)
+ return false;
+
+ /* For the following instructions, we don't apply any change and let the
+ * instruction apply to the first plane.
+ */
+ if (tex->op == nir_texop_txs ||
+ tex->op == nir_texop_query_levels ||
+ tex->op == nir_texop_lod)
+ return false;
+
+ assert(tex->texture_index == 0);
+ unsigned array_index = 0;
+ if (deref->deref_type != nir_deref_type_var) {
+ assert(deref->deref_type == nir_deref_type_array);
+ if (!nir_src_is_const(deref->arr.index))
+ return false;
+ array_index = nir_src_as_uint(deref->arr.index);
+ array_index = MIN2(array_index, binding->array_size - 1);
+ }
+ const struct radv_sampler_ycbcr_conversion *ycbcr_sampler = ycbcr_samplers + array_index;
+
+ if (ycbcr_sampler->format == VK_FORMAT_UNDEFINED)
+ return false;
+
+ struct ycbcr_state state = {
+ .builder = builder,
+ .origin_tex = tex,
+ .tex_deref = deref,
+ .conversion = ycbcr_sampler,
+ };
+
+ builder->cursor = nir_before_instr(&tex->instr);
+
+ VkFormat format = state.conversion->format;
+ const int plane_count = vk_format_get_plane_count(format);
+ nir_ssa_def *plane_values[3];
+
+ for (int p = 0; p < plane_count; ++p) {
+ plane_values[p] = create_plane_tex_instr_implicit(&state, p);
+ }
+
+ nir_ssa_def *result = build_swizzled_components(builder, format, ycbcr_sampler->components, plane_values);
+ if (state.conversion->ycbcr_model != VK_SAMPLER_YCBCR_MODEL_CONVERSION_RGB_IDENTITY) {
+ VkFormat first_format = vk_format_get_plane_format(format, 0);
+ result = convert_ycbcr(&state, result, vk_format_get_component_bits(first_format, VK_FORMAT_COLORSPACE_RGB, VK_SWIZZLE_X));
+ }
+
+ nir_ssa_def_rewrite_uses(&tex->dest.ssa, nir_src_for_ssa(result));
+ nir_instr_remove(&tex->instr);
+
+ return true;
}
-static bool
-radv_nir_lower_ycbcr_textures_instr(nir_builder *b, nir_instr *instr, void *layout)
+bool
+radv_nir_lower_ycbcr_textures(nir_shader *shader,
+ const struct radv_pipeline_layout *layout)
{
- if (instr->type != nir_instr_type_tex)
- return false;
+ bool progress = false;
- nir_tex_instr *tex = nir_instr_as_tex(instr);
- return try_lower_tex_ycbcr(layout, b, tex);
-}
+ nir_foreach_function(function, shader) {
+ if (!function->impl)
+ continue;
-bool
-radv_nir_lower_ycbcr_textures(nir_shader *shader, const struct radv_pipeline_layout *layout)
-{
- return nir_shader_instructions_pass(shader,
- radv_nir_lower_ycbcr_textures_instr,
- nir_metadata_block_index |
- nir_metadata_dominance,
- (void *)layout);
+ bool function_progress = false;
+ nir_builder builder;
+ nir_builder_init(&builder, function->impl);
+
+ nir_foreach_block(block, function->impl) {
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_tex)
+ continue;
+
+ nir_tex_instr *tex = nir_instr_as_tex(instr);
+ function_progress |= try_lower_tex_ycbcr(layout, &builder, tex);
+ }
+ }
+
+ if (function_progress) {
+ nir_metadata_preserve(function->impl,
+ nir_metadata_block_index |
+ nir_metadata_dominance);
+ }
+
+ progress |= function_progress;
+ }
+
+ return progress;
}
diff --git a/lib/mesa/src/freedreno/vulkan/tu_android.c b/lib/mesa/src/freedreno/vulkan/tu_android.c
index d1f6bb3ab..1ebc9e726 100644
--- a/lib/mesa/src/freedreno/vulkan/tu_android.c
+++ b/lib/mesa/src/freedreno/vulkan/tu_android.c
@@ -1,26 +1,35 @@
/*
* Copyright © 2017, Google Inc.
- * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
*/
-#include "tu_android.h"
+#include "tu_private.h"
#include <hardware/gralloc.h>
-
-#if ANDROID_API_LEVEL >= 26
-#include <hardware/gralloc1.h>
-#endif
-
#include <hardware/hardware.h>
#include <hardware/hwvulkan.h>
+#include <libsync.h>
-#include "drm-uapi/drm_fourcc.h"
-
-#include "util/libsync.h"
-#include "util/os_file.h"
-
-#include "tu_device.h"
-#include "tu_image.h"
+#include <vulkan/vk_android_native_buffer.h>
+#include <vulkan/vk_icd.h>
static int
tu_hal_open(const struct hw_module_t *mod,
@@ -42,7 +51,7 @@ PUBLIC struct hwvulkan_module_t HAL_MODULE_INFO_SYM = {
.module_api_version = HWVULKAN_MODULE_API_VERSION_0_1,
.hal_api_version = HARDWARE_MAKE_API_VERSION(1, 0),
.id = HWVULKAN_HARDWARE_MODULE_ID,
- .name = "Turnip Vulkan HAL",
+ .name = "AMD Vulkan HAL",
.author = "Google",
.methods =
&(hw_module_methods_t){
@@ -97,161 +106,41 @@ tu_hal_close(struct hw_device_t *dev)
return -1;
}
-/* get dma-buf and modifier from gralloc info */
-static VkResult
-tu_gralloc_info_other(struct tu_device *device,
+VkResult
+tu_image_from_gralloc(VkDevice device_h,
+ const VkImageCreateInfo *base_info,
const VkNativeBufferANDROID *gralloc_info,
- int *dma_buf,
- uint64_t *modifier)
+ const VkAllocationCallbacks *alloc,
+ VkImage *out_image_h)
{
- const uint32_t *handle_fds = (uint32_t *)gralloc_info->handle->data;
- const uint32_t *handle_data = &handle_fds[gralloc_info->handle->numFds];
- bool ubwc = false;
-
- if (gralloc_info->handle->numFds == 1) {
- /* gbm_gralloc. TODO: modifiers support */
- *dma_buf = handle_fds[0];
- } else if (gralloc_info->handle->numFds == 2) {
- /* Qualcomm gralloc, find it at:
- *
- * https://android.googlesource.com/platform/hardware/qcom/display/.
- *
- * The gralloc_info->handle is a pointer to a struct private_handle_t
- * from your platform's gralloc. On msm8996 (a5xx) and newer grallocs
- * that's libgralloc1/gr_priv_handle.h, while previously it was
- * libgralloc/gralloc_priv.h.
- */
-
- if (gralloc_info->handle->numInts < 2) {
- return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
- "VkNativeBufferANDROID::handle::numInts is %d, "
- "expected at least 2 for qcom gralloc",
- gralloc_info->handle->numFds);
- }
+ TU_FROM_HANDLE(tu_device, device, device_h);
+ VkImage image_h = VK_NULL_HANDLE;
+ struct tu_image *image = NULL;
+ struct tu_bo *bo = NULL;
+ VkResult result;
- uint32_t gmsm = ('g' << 24) | ('m' << 16) | ('s' << 8) | 'm';
- if (handle_data[0] != gmsm) {
- return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
- "private_handle_t::magic is %x, expected %x",
- handle_data[0], gmsm);
- }
+ result = tu_image_create(
+ device_h,
+ &(struct tu_image_create_info) {
+ .vk_info = base_info, .scanout = true, .no_metadata_planes = true },
+ alloc, &image_h);
- /* This UBWC flag was introduced in a5xx. */
- ubwc = handle_data[1] & 0x08000000;
+ if (result != VK_SUCCESS)
+ return result;
- /* QCOM gralloc has two fds passed in: the actual GPU buffer, and a buffer
- * of CPU-side metadata. I haven't found any need for the metadata buffer
- * yet. See qdMetaData.h for what's in the metadata fd.
- */
- *dma_buf = handle_fds[0];
- } else {
- return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
+ if (gralloc_info->handle->numFds != 1) {
+ return vk_errorf(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE,
"VkNativeBufferANDROID::handle::numFds is %d, "
- "expected 1 (gbm_gralloc) or 2 (qcom gralloc)",
+ "expected 1",
gralloc_info->handle->numFds);
}
- *modifier = ubwc ? DRM_FORMAT_MOD_QCOM_COMPRESSED : DRM_FORMAT_MOD_LINEAR;
- return VK_SUCCESS;
-}
-
-static const char cros_gralloc_module_name[] = "CrOS Gralloc";
-
-#define CROS_GRALLOC_DRM_GET_BUFFER_INFO 4
-#define CROS_GRALLOC_DRM_GET_USAGE 5
-#define CROS_GRALLOC_DRM_GET_USAGE_FRONT_RENDERING_BIT 0x1
-
-struct cros_gralloc0_buffer_info {
- uint32_t drm_fourcc;
- int num_fds;
- int fds[4];
- uint64_t modifier;
- int offset[4];
- int stride[4];
-};
-
-static VkResult
-tu_gralloc_info_cros(struct tu_device *device,
- const VkNativeBufferANDROID *gralloc_info,
- int *dma_buf,
- uint64_t *modifier)
-
-{
- const gralloc_module_t *gralloc = device->gralloc;
- struct cros_gralloc0_buffer_info info;
- int ret;
-
- ret = gralloc->perform(gralloc, CROS_GRALLOC_DRM_GET_BUFFER_INFO,
- gralloc_info->handle, &info);
- if (ret)
- return VK_ERROR_INVALID_EXTERNAL_HANDLE;
-
- *dma_buf = info.fds[0];
- *modifier = info.modifier;
-
- return VK_SUCCESS;
-}
-
-VkResult
-tu_gralloc_info(struct tu_device *device,
- const VkNativeBufferANDROID *gralloc_info,
- int *dma_buf,
- uint64_t *modifier)
-
-{
- if (!device->gralloc) {
- /* get gralloc module for gralloc buffer info query */
- int ret = hw_get_module(GRALLOC_HARDWARE_MODULE_ID,
- (const hw_module_t **)&device->gralloc);
-
- if (ret) {
- /* This is *slightly* awkward, but if we are asked to import
- * a gralloc handle, and there is no gralloc, it is some sort
- * of invalid handle.
- */
- return vk_startup_errorf(device->instance,
- VK_ERROR_INVALID_EXTERNAL_HANDLE,
- "Could not open gralloc\n");
- }
-
- const gralloc_module_t *gralloc = device->gralloc;
-
- mesa_logi("opened gralloc module name: %s", gralloc->common.name);
-
- /* TODO not sure qcom gralloc module name, but we should check
- * for it here and move the special gmsm handling out of
- * tu_gralloc_info_other()
- */
- if (!strcmp(gralloc->common.name, cros_gralloc_module_name) && gralloc->perform) {
- device->gralloc_type = TU_GRALLOC_CROS;
- } else {
- device->gralloc_type = TU_GRALLOC_OTHER;
- }
- }
-
- if (device->gralloc_type == TU_GRALLOC_CROS) {
- return tu_gralloc_info_cros(device, gralloc_info, dma_buf, modifier);
- } else {
- return tu_gralloc_info_other(device, gralloc_info, dma_buf, modifier);
- }
-}
-
-/**
- * Creates the VkImage using the gralloc handle in *gralloc_info.
- *
- * We support two different grallocs here, gbm_gralloc, and the qcom gralloc
- * used on Android phones.
- */
-VkResult
-tu_import_memory_from_gralloc_handle(VkDevice device_h,
- int dma_buf,
- const VkAllocationCallbacks *alloc,
- VkImage image_h)
-
-{
- struct tu_image *image = NULL;
- VkResult result;
+ /* Do not close the gralloc handle's dma_buf. The lifetime of the dma_buf
+ * must exceed that of the gralloc handle, and we do not own the gralloc
+ * handle.
+ */
+ int dma_buf = gralloc_info->handle->data[0];
image = tu_image_from_handle(image_h);
@@ -264,52 +153,70 @@ tu_import_memory_from_gralloc_handle(VkDevice device_h,
.image = image_h
};
- const VkImportMemoryFdInfoKHR import_info = {
- .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR,
+ const VkImportMemoryFdInfo import_info = {
+ .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO,
.pNext = &ded_alloc,
.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT,
- .fd = os_dupfd_cloexec(dma_buf),
+ .fd = dup(dma_buf),
};
+ /* Find the first VRAM memory type, or GART for PRIME images. */
+ int memory_type_index = -1;
+ for (int i = 0;
+ i < device->physical_device->memory_properties.memoryTypeCount; ++i) {
+ bool is_local =
+ !!(device->physical_device->memory_properties.memoryTypes[i]
+ .propertyFlags &
+ VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
+ if (is_local) {
+ memory_type_index = i;
+ break;
+ }
+ }
+
+ /* fallback */
+ if (memory_type_index == -1)
+ memory_type_index = 0;
result =
tu_AllocateMemory(device_h,
&(VkMemoryAllocateInfo) {
.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
.pNext = &import_info,
- .allocationSize = image->total_size,
- .memoryTypeIndex = 0,
+ .allocationSize = image->size,
+ .memoryTypeIndex = memory_type_index,
},
alloc, &memory_h);
if (result != VK_SUCCESS)
goto fail_create_image;
- VkBindImageMemoryInfo bind_info = {
- .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO,
- .image = image_h,
- .memory = memory_h,
- .memoryOffset = 0,
- };
- tu_BindImageMemory2(device_h, 1, &bind_info);
+ tu_BindImageMemory(device_h, image_h, memory_h, 0);
image->owned_memory = memory_h;
+ /* Don't clobber the out-parameter until success is certain. */
+ *out_image_h = image_h;
return VK_SUCCESS;
fail_create_image:
+fail_size:
tu_DestroyImage(device_h, image_h, alloc);
return result;
}
-static VkResult
-format_supported_with_usage(VkDevice device_h, VkFormat format,
- VkImageUsageFlags imageUsage)
+VkResult
+tu_GetSwapchainGrallocUsageANDROID(VkDevice device_h,
+ VkFormat format,
+ VkImageUsageFlags imageUsage,
+ int *grallocUsage)
{
TU_FROM_HANDLE(tu_device, device, device_h);
struct tu_physical_device *phys_dev = device->physical_device;
VkPhysicalDevice phys_dev_h = tu_physical_device_to_handle(phys_dev);
VkResult result;
+ *grallocUsage = 0;
+
/* WARNING: Android Nougat's libvulkan.so hardcodes the VkImageUsageFlags
* returned to applications via
* VkSurfaceCapabilitiesKHR::supportedUsageFlags.
@@ -340,19 +247,12 @@ format_supported_with_usage(VkDevice device_h, VkFormat format,
result = tu_GetPhysicalDeviceImageFormatProperties2(
phys_dev_h, &image_format_info, &image_format_props);
if (result != VK_SUCCESS) {
- return vk_errorf(device, result,
+ return vk_errorf(device->instance, result,
"tu_GetPhysicalDeviceImageFormatProperties2 failed "
"inside %s",
__func__);
}
- return VK_SUCCESS;
-}
-
-static VkResult
-setup_gralloc0_usage(struct tu_device *device, VkFormat format,
- VkImageUsageFlags imageUsage, int *grallocUsage)
-{
if (unmask32(&imageUsage, VK_IMAGE_USAGE_TRANSFER_DST_BIT |
VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT))
*grallocUsage |= GRALLOC_USAGE_HW_RENDER;
@@ -367,7 +267,7 @@ setup_gralloc0_usage(struct tu_device *device, VkFormat format,
* gralloc swapchains.
*/
if (imageUsage != 0) {
- return vk_errorf(device, VK_ERROR_FORMAT_NOT_SUPPORTED,
+ return vk_errorf(device->instance, VK_ERROR_FORMAT_NOT_SUPPORTED,
"unsupported VkImageUsageFlags(0x%x) for gralloc "
"swapchain",
imageUsage);
@@ -390,66 +290,93 @@ setup_gralloc0_usage(struct tu_device *device, VkFormat format,
return VK_SUCCESS;
}
-VKAPI_ATTR VkResult VKAPI_CALL
-tu_GetSwapchainGrallocUsageANDROID(VkDevice device_h,
- VkFormat format,
- VkImageUsageFlags imageUsage,
- int *grallocUsage)
+VkResult
+tu_AcquireImageANDROID(VkDevice device,
+ VkImage image_h,
+ int nativeFenceFd,
+ VkSemaphore semaphore,
+ VkFence fence)
{
- TU_FROM_HANDLE(tu_device, device, device_h);
- VkResult result;
+ VkResult semaphore_result = VK_SUCCESS, fence_result = VK_SUCCESS;
+
+ if (semaphore != VK_NULL_HANDLE) {
+ int semaphore_fd =
+ nativeFenceFd >= 0 ? dup(nativeFenceFd) : nativeFenceFd;
+ semaphore_result = tu_ImportSemaphoreFdKHR(
+ device, &(VkImportSemaphoreFdInfoKHR) {
+ .sType = VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_FD_INFO_KHR,
+ .flags = VK_SEMAPHORE_IMPORT_TEMPORARY_BIT,
+ .fd = semaphore_fd,
+ .semaphore = semaphore,
+ });
+ }
- result = format_supported_with_usage(device_h, format, imageUsage);
- if (result != VK_SUCCESS)
- return result;
+ if (fence != VK_NULL_HANDLE) {
+ int fence_fd = nativeFenceFd >= 0 ? dup(nativeFenceFd) : nativeFenceFd;
+ fence_result = tu_ImportFenceFdKHR(
+ device, &(VkImportFenceFdInfoKHR) {
+ .sType = VK_STRUCTURE_TYPE_IMPORT_FENCE_FD_INFO_KHR,
+ .flags = VK_FENCE_IMPORT_TEMPORARY_BIT,
+ .fd = fence_fd,
+ .fence = fence,
+ });
+ }
- *grallocUsage = 0;
- return setup_gralloc0_usage(device, format, imageUsage, grallocUsage);
+ close(nativeFenceFd);
+
+ if (semaphore_result != VK_SUCCESS)
+ return semaphore_result;
+ return fence_result;
}
-#if ANDROID_API_LEVEL >= 26
-VKAPI_ATTR VkResult VKAPI_CALL
-tu_GetSwapchainGrallocUsage2ANDROID(VkDevice device_h,
- VkFormat format,
- VkImageUsageFlags imageUsage,
- VkSwapchainImageUsageFlagsANDROID swapchainImageUsage,
- uint64_t *grallocConsumerUsage,
- uint64_t *grallocProducerUsage)
+VkResult
+tu_QueueSignalReleaseImageANDROID(VkQueue _queue,
+ uint32_t waitSemaphoreCount,
+ const VkSemaphore *pWaitSemaphores,
+ VkImage image,
+ int *pNativeFenceFd)
{
- TU_FROM_HANDLE(tu_device, device, device_h);
- VkResult result;
-
- *grallocConsumerUsage = 0;
- *grallocProducerUsage = 0;
- mesa_logd("%s: format=%d, usage=0x%x", __func__, format, imageUsage);
+ TU_FROM_HANDLE(tu_queue, queue, _queue);
+ VkResult result = VK_SUCCESS;
- result = format_supported_with_usage(device_h, format, imageUsage);
- if (result != VK_SUCCESS)
- return result;
-
- int32_t grallocUsage = 0;
- result = setup_gralloc0_usage(device, format, imageUsage, &grallocUsage);
- if (result != VK_SUCCESS)
- return result;
+ if (waitSemaphoreCount == 0) {
+ if (pNativeFenceFd)
+ *pNativeFenceFd = -1;
+ return VK_SUCCESS;
+ }
- /* Setup gralloc1 usage flags from gralloc0 flags. */
+ int fd = -1;
- if (grallocUsage & GRALLOC_USAGE_HW_RENDER) {
- *grallocProducerUsage |= GRALLOC1_PRODUCER_USAGE_GPU_RENDER_TARGET;
- *grallocConsumerUsage |= GRALLOC1_CONSUMER_USAGE_CLIENT_TARGET;
- }
+ for (uint32_t i = 0; i < waitSemaphoreCount; ++i) {
+ int tmp_fd;
+ result = tu_GetSemaphoreFdKHR(
+ tu_device_to_handle(queue->device),
+ &(VkSemaphoreGetFdInfoKHR) {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR,
+ .handleType = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT,
+ .semaphore = pWaitSemaphores[i],
+ },
+ &tmp_fd);
+ if (result != VK_SUCCESS) {
+ if (fd >= 0)
+ close(fd);
+ return result;
+ }
- if (grallocUsage & GRALLOC_USAGE_HW_TEXTURE) {
- *grallocConsumerUsage |= GRALLOC1_CONSUMER_USAGE_GPU_TEXTURE;
+ if (fd < 0)
+ fd = tmp_fd;
+ else if (tmp_fd >= 0) {
+ sync_accumulate("tu", &fd, tmp_fd);
+ close(tmp_fd);
+ }
}
- if (grallocUsage & (GRALLOC_USAGE_HW_FB |
- GRALLOC_USAGE_HW_COMPOSER |
- GRALLOC_USAGE_EXTERNAL_DISP)) {
- *grallocProducerUsage |= GRALLOC1_PRODUCER_USAGE_GPU_RENDER_TARGET;
- *grallocConsumerUsage |= GRALLOC1_CONSUMER_USAGE_HWCOMPOSER;
+ if (pNativeFenceFd) {
+ *pNativeFenceFd = fd;
+ } else if (fd >= 0) {
+ close(fd);
+ /* We still need to do the exports, to reset the semaphores, but
+ * otherwise we don't wait on them. */
}
-
return VK_SUCCESS;
}
-#endif
diff --git a/lib/mesa/src/freedreno/vulkan/tu_cmd_buffer.c b/lib/mesa/src/freedreno/vulkan/tu_cmd_buffer.c
index 0acb45d71..fe436e595 100644
--- a/lib/mesa/src/freedreno/vulkan/tu_cmd_buffer.c
+++ b/lib/mesa/src/freedreno/vulkan/tu_cmd_buffer.c
@@ -1,879 +1,807 @@
/*
* Copyright © 2016 Red Hat.
* Copyright © 2016 Bas Nieuwenhuizen
- * SPDX-License-Identifier: MIT
*
* based in part on anv driver which is:
* Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
*/
-#include "tu_cmd_buffer.h"
+#include "tu_private.h"
+
+#include "registers/adreno_pm4.xml.h"
+#include "registers/adreno_common.xml.h"
+#include "registers/a6xx.xml.h"
-#include "vk_render_pass.h"
-#include "vk_util.h"
-#include "vk_common_entrypoints.h"
+#include "vk_format.h"
-#include "tu_clear_blit.h"
#include "tu_cs.h"
-#include "tu_image.h"
-#include "tu_tracepoints.h"
-static void
-tu_clone_trace_range(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
- struct u_trace_iterator begin, struct u_trace_iterator end)
+void
+tu_bo_list_init(struct tu_bo_list *list)
{
- if (u_trace_iterator_equal(begin, end))
- return;
-
- tu_cs_emit_wfi(cs);
- tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
- u_trace_clone_append(begin, end, &cmd->trace, cs,
- tu_copy_timestamp_buffer);
+ list->count = list->capacity = 0;
+ list->bo_infos = NULL;
}
-static void
-tu_clone_trace(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
- struct u_trace *trace)
+void
+tu_bo_list_destroy(struct tu_bo_list *list)
{
- tu_clone_trace_range(cmd, cs, u_trace_begin_iterator(trace),
- u_trace_end_iterator(trace));
+ free(list->bo_infos);
}
void
-tu6_emit_event_write(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- enum vgt_event_type event)
+tu_bo_list_reset(struct tu_bo_list *list)
{
- bool need_seqno = false;
- switch (event) {
- case CACHE_FLUSH_TS:
- case WT_DONE_TS:
- case RB_DONE_TS:
- case PC_CCU_FLUSH_DEPTH_TS:
- case PC_CCU_FLUSH_COLOR_TS:
- case PC_CCU_RESOLVE_TS:
- need_seqno = true;
- break;
- default:
- break;
+ list->count = 0;
+}
+
+/**
+ * \a flags consists of MSM_SUBMIT_BO_FLAGS.
+ */
+static uint32_t
+tu_bo_list_add_info(struct tu_bo_list *list,
+ const struct drm_msm_gem_submit_bo *bo_info)
+{
+ for (uint32_t i = 0; i < list->count; ++i) {
+ if (list->bo_infos[i].handle == bo_info->handle) {
+ assert(list->bo_infos[i].presumed == bo_info->presumed);
+ list->bo_infos[i].flags |= bo_info->flags;
+ return i;
+ }
}
- tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, need_seqno ? 4 : 1);
- tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(event));
- if (need_seqno) {
- tu_cs_emit_qw(cs, global_iova(cmd, seqno_dummy));
- tu_cs_emit(cs, 0);
+ /* grow list->bo_infos if needed */
+ if (list->count == list->capacity) {
+ uint32_t new_capacity = MAX2(2 * list->count, 16);
+ struct drm_msm_gem_submit_bo *new_bo_infos = realloc(
+ list->bo_infos, new_capacity * sizeof(struct drm_msm_gem_submit_bo));
+ if (!new_bo_infos)
+ return TU_BO_LIST_FAILED;
+ list->bo_infos = new_bo_infos;
+ list->capacity = new_capacity;
}
+
+ list->bo_infos[list->count] = *bo_info;
+ return list->count++;
}
-/* Emits the tessfactor address to the top-level CS if it hasn't been already.
- * Updating this register requires a WFI if outstanding drawing is using it, but
- * tu6_init_hardware() will have WFIed before we started and no other draws
- * could be using the tessfactor address yet since we only emit one per cmdbuf.
- */
-static void
-tu6_lazy_emit_tessfactor_addr(struct tu_cmd_buffer *cmd)
+uint32_t
+tu_bo_list_add(struct tu_bo_list *list,
+ const struct tu_bo *bo,
+ uint32_t flags)
{
- if (cmd->state.tessfactor_addr_set)
- return;
-
- tu_cs_emit_regs(&cmd->cs, A6XX_PC_TESSFACTOR_ADDR(.qword = cmd->device->tess_bo->iova));
- /* Updating PC_TESSFACTOR_ADDR could race with the next draw which uses it. */
- cmd->state.cache.flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE;
- cmd->state.tessfactor_addr_set = true;
+ return tu_bo_list_add_info(list, &(struct drm_msm_gem_submit_bo) {
+ .flags = flags,
+ .handle = bo->gem_handle,
+ .presumed = bo->iova,
+ });
}
-static void
-tu6_lazy_emit_vsc(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
+VkResult
+tu_bo_list_merge(struct tu_bo_list *list, const struct tu_bo_list *other)
{
- struct tu_device *dev = cmd->device;
-
- /* VSC buffers:
- * use vsc pitches from the largest values used so far with this device
- * if there hasn't been overflow, there will already be a scratch bo
- * allocated for these sizes
- *
- * if overflow is detected, the stream size is increased by 2x
- */
- mtx_lock(&dev->mutex);
+ for (uint32_t i = 0; i < other->count; i++) {
+ if (tu_bo_list_add_info(list, other->bo_infos + i) == TU_BO_LIST_FAILED)
+ return VK_ERROR_OUT_OF_HOST_MEMORY;
+ }
- struct tu6_global *global = dev->global_bo->map;
+ return VK_SUCCESS;
+}
- uint32_t vsc_draw_overflow = global->vsc_draw_overflow;
- uint32_t vsc_prim_overflow = global->vsc_prim_overflow;
+static VkResult
+tu_tiling_config_update_gmem_layout(struct tu_tiling_config *tiling,
+ const struct tu_device *dev)
+{
+ const uint32_t gmem_size = dev->physical_device->gmem_size;
+ uint32_t offset = 0;
- if (vsc_draw_overflow >= dev->vsc_draw_strm_pitch)
- dev->vsc_draw_strm_pitch = (dev->vsc_draw_strm_pitch - VSC_PAD) * 2 + VSC_PAD;
+ for (uint32_t i = 0; i < tiling->buffer_count; i++) {
+ /* 16KB-aligned */
+ offset = align(offset, 0x4000);
- if (vsc_prim_overflow >= dev->vsc_prim_strm_pitch)
- dev->vsc_prim_strm_pitch = (dev->vsc_prim_strm_pitch - VSC_PAD) * 2 + VSC_PAD;
+ tiling->gmem_offsets[i] = offset;
+ offset += tiling->tile0.extent.width * tiling->tile0.extent.height *
+ tiling->buffer_cpp[i];
+ }
- cmd->vsc_prim_strm_pitch = dev->vsc_prim_strm_pitch;
- cmd->vsc_draw_strm_pitch = dev->vsc_draw_strm_pitch;
+ return offset <= gmem_size ? VK_SUCCESS : VK_ERROR_OUT_OF_DEVICE_MEMORY;
+}
- mtx_unlock(&dev->mutex);
+static void
+tu_tiling_config_update_tile_layout(struct tu_tiling_config *tiling,
+ const struct tu_device *dev)
+{
+ const uint32_t tile_align_w = dev->physical_device->tile_align_w;
+ const uint32_t tile_align_h = dev->physical_device->tile_align_h;
+ const uint32_t max_tile_width = 1024; /* A6xx */
+
+ tiling->tile0.offset = (VkOffset2D) {
+ .x = tiling->render_area.offset.x & ~(tile_align_w - 1),
+ .y = tiling->render_area.offset.y & ~(tile_align_h - 1),
+ };
+
+ const uint32_t ra_width =
+ tiling->render_area.extent.width +
+ (tiling->render_area.offset.x - tiling->tile0.offset.x);
+ const uint32_t ra_height =
+ tiling->render_area.extent.height +
+ (tiling->render_area.offset.y - tiling->tile0.offset.y);
+
+ /* start from 1 tile */
+ tiling->tile_count = (VkExtent2D) {
+ .width = 1,
+ .height = 1,
+ };
+ tiling->tile0.extent = (VkExtent2D) {
+ .width = align(ra_width, tile_align_w),
+ .height = align(ra_height, tile_align_h),
+ };
+
+ /* do not exceed max tile width */
+ while (tiling->tile0.extent.width > max_tile_width) {
+ tiling->tile_count.width++;
+ tiling->tile0.extent.width =
+ align(ra_width / tiling->tile_count.width, tile_align_w);
+ }
+
+ /* do not exceed gmem size */
+ while (tu_tiling_config_update_gmem_layout(tiling, dev) != VK_SUCCESS) {
+ if (tiling->tile0.extent.width > tiling->tile0.extent.height) {
+ tiling->tile_count.width++;
+ tiling->tile0.extent.width =
+ align(ra_width / tiling->tile_count.width, tile_align_w);
+ } else {
+ tiling->tile_count.height++;
+ tiling->tile0.extent.height =
+ align(ra_height / tiling->tile_count.height, tile_align_h);
+ }
+ }
+}
- struct tu_bo *vsc_bo;
- uint32_t size0 = cmd->vsc_prim_strm_pitch * MAX_VSC_PIPES +
- cmd->vsc_draw_strm_pitch * MAX_VSC_PIPES;
+static void
+tu_tiling_config_update_pipe_layout(struct tu_tiling_config *tiling,
+ const struct tu_device *dev)
+{
+ const uint32_t max_pipe_count = 32; /* A6xx */
- tu_get_scratch_bo(dev, size0 + MAX_VSC_PIPES * 4, &vsc_bo);
+ /* start from 1 tile per pipe */
+ tiling->pipe0 = (VkExtent2D) {
+ .width = 1,
+ .height = 1,
+ };
+ tiling->pipe_count = tiling->tile_count;
- tu_cs_emit_regs(cs,
- A6XX_VSC_DRAW_STRM_SIZE_ADDRESS(.bo = vsc_bo, .bo_offset = size0));
- tu_cs_emit_regs(cs,
- A6XX_VSC_PRIM_STRM_ADDRESS(.bo = vsc_bo));
- tu_cs_emit_regs(cs,
- A6XX_VSC_DRAW_STRM_ADDRESS(.bo = vsc_bo,
- .bo_offset = cmd->vsc_prim_strm_pitch * MAX_VSC_PIPES));
+ /* do not exceed max pipe count vertically */
+ while (tiling->pipe_count.height > max_pipe_count) {
+ tiling->pipe0.height += 2;
+ tiling->pipe_count.height =
+ (tiling->tile_count.height + tiling->pipe0.height - 1) /
+ tiling->pipe0.height;
+ }
- cmd->vsc_initialized = true;
+ /* do not exceed max pipe count */
+ while (tiling->pipe_count.width * tiling->pipe_count.height >
+ max_pipe_count) {
+ tiling->pipe0.width += 1;
+ tiling->pipe_count.width =
+ (tiling->tile_count.width + tiling->pipe0.width - 1) /
+ tiling->pipe0.width;
+ }
}
static void
-tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer,
- struct tu_cs *cs,
- enum tu_cmd_flush_bits flushes)
+tu_tiling_config_update_pipes(struct tu_tiling_config *tiling,
+ const struct tu_device *dev)
{
- if (unlikely(cmd_buffer->device->physical_device->instance->debug_flags & TU_DEBUG_FLUSHALL))
- flushes |= TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_ALL_INVALIDATE;
-
- if (unlikely(cmd_buffer->device->physical_device->instance->debug_flags & TU_DEBUG_SYNCDRAW))
- flushes |= TU_CMD_FLAG_WAIT_MEM_WRITES |
- TU_CMD_FLAG_WAIT_FOR_IDLE |
- TU_CMD_FLAG_WAIT_FOR_ME;
-
- /* Experiments show that invalidating CCU while it still has data in it
- * doesn't work, so make sure to always flush before invalidating in case
- * any data remains that hasn't yet been made available through a barrier.
- * However it does seem to work for UCHE.
- */
- if (flushes & (TU_CMD_FLAG_CCU_FLUSH_COLOR |
- TU_CMD_FLAG_CCU_INVALIDATE_COLOR))
- tu6_emit_event_write(cmd_buffer, cs, PC_CCU_FLUSH_COLOR_TS);
- if (flushes & (TU_CMD_FLAG_CCU_FLUSH_DEPTH |
- TU_CMD_FLAG_CCU_INVALIDATE_DEPTH))
- tu6_emit_event_write(cmd_buffer, cs, PC_CCU_FLUSH_DEPTH_TS);
- if (flushes & TU_CMD_FLAG_CCU_INVALIDATE_COLOR)
- tu6_emit_event_write(cmd_buffer, cs, PC_CCU_INVALIDATE_COLOR);
- if (flushes & TU_CMD_FLAG_CCU_INVALIDATE_DEPTH)
- tu6_emit_event_write(cmd_buffer, cs, PC_CCU_INVALIDATE_DEPTH);
- if (flushes & TU_CMD_FLAG_CACHE_FLUSH)
- tu6_emit_event_write(cmd_buffer, cs, CACHE_FLUSH_TS);
- if (flushes & TU_CMD_FLAG_CACHE_INVALIDATE)
- tu6_emit_event_write(cmd_buffer, cs, CACHE_INVALIDATE);
- if (flushes & TU_CMD_FLAG_WAIT_MEM_WRITES)
- tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
- if ((flushes & TU_CMD_FLAG_WAIT_FOR_IDLE) ||
- (cmd_buffer->device->physical_device->info->a6xx.has_ccu_flush_bug &&
- (flushes & (TU_CMD_FLAG_CCU_FLUSH_COLOR | TU_CMD_FLAG_CCU_FLUSH_DEPTH))))
- tu_cs_emit_wfi(cs);
- if (flushes & TU_CMD_FLAG_WAIT_FOR_ME)
- tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
-}
+ const uint32_t max_pipe_count = 32; /* A6xx */
+ const uint32_t used_pipe_count =
+ tiling->pipe_count.width * tiling->pipe_count.height;
+ const VkExtent2D last_pipe = {
+ .width = tiling->tile_count.width % tiling->pipe0.width,
+ .height = tiling->tile_count.height % tiling->pipe0.height,
+ };
+
+ assert(used_pipe_count <= max_pipe_count);
+ assert(max_pipe_count <= ARRAY_SIZE(tiling->pipe_config));
+
+ for (uint32_t y = 0; y < tiling->pipe_count.height; y++) {
+ for (uint32_t x = 0; x < tiling->pipe_count.width; x++) {
+ const uint32_t pipe_x = tiling->pipe0.width * x;
+ const uint32_t pipe_y = tiling->pipe0.height * y;
+ const uint32_t pipe_w = (x == tiling->pipe_count.width - 1)
+ ? last_pipe.width
+ : tiling->pipe0.width;
+ const uint32_t pipe_h = (y == tiling->pipe_count.height - 1)
+ ? last_pipe.height
+ : tiling->pipe0.height;
+ const uint32_t n = tiling->pipe_count.width * y + x;
+
+ tiling->pipe_config[n] = A6XX_VSC_PIPE_CONFIG_REG_X(pipe_x) |
+ A6XX_VSC_PIPE_CONFIG_REG_Y(pipe_y) |
+ A6XX_VSC_PIPE_CONFIG_REG_W(pipe_w) |
+ A6XX_VSC_PIPE_CONFIG_REG_H(pipe_h);
+ tiling->pipe_sizes[n] = CP_SET_BIN_DATA5_0_VSC_SIZE(pipe_w * pipe_h);
+ }
+ }
-/* "Normal" cache flushes, that don't require any special handling */
+ memset(tiling->pipe_config + used_pipe_count, 0,
+ sizeof(uint32_t) * (max_pipe_count - used_pipe_count));
+}
static void
-tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer,
- struct tu_cs *cs)
-{
- tu6_emit_flushes(cmd_buffer, cs, cmd_buffer->state.cache.flush_bits);
- cmd_buffer->state.cache.flush_bits = 0;
+tu_tiling_config_update(struct tu_tiling_config *tiling,
+ const struct tu_device *dev,
+ const uint32_t *buffer_cpp,
+ uint32_t buffer_count,
+ const VkRect2D *render_area)
+{
+ /* see if there is any real change */
+ const bool ra_changed =
+ render_area &&
+ memcmp(&tiling->render_area, render_area, sizeof(*render_area));
+ const bool buf_changed = tiling->buffer_count != buffer_count ||
+ memcmp(tiling->buffer_cpp, buffer_cpp,
+ sizeof(*buffer_cpp) * buffer_count);
+ if (!ra_changed && !buf_changed)
+ return;
+
+ if (ra_changed)
+ tiling->render_area = *render_area;
+
+ if (buf_changed) {
+ memcpy(tiling->buffer_cpp, buffer_cpp,
+ sizeof(*buffer_cpp) * buffer_count);
+ tiling->buffer_count = buffer_count;
+ }
+
+ tu_tiling_config_update_tile_layout(tiling, dev);
+ tu_tiling_config_update_pipe_layout(tiling, dev);
+ tu_tiling_config_update_pipes(tiling, dev);
}
-/* Renderpass cache flushes */
+static void
+tu_tiling_config_get_tile(const struct tu_tiling_config *tiling,
+ const struct tu_device *dev,
+ uint32_t tx,
+ uint32_t ty,
+ struct tu_tile *tile)
+{
+ /* find the pipe and the slot for tile (tx, ty) */
+ const uint32_t px = tx / tiling->pipe0.width;
+ const uint32_t py = ty / tiling->pipe0.height;
+ const uint32_t sx = tx - tiling->pipe0.width * px;
+ const uint32_t sy = ty - tiling->pipe0.height * py;
+
+ assert(tx < tiling->tile_count.width && ty < tiling->tile_count.height);
+ assert(px < tiling->pipe_count.width && py < tiling->pipe_count.height);
+ assert(sx < tiling->pipe0.width && sy < tiling->pipe0.height);
+
+ /* convert to 1D indices */
+ tile->pipe = tiling->pipe_count.width * py + px;
+ tile->slot = tiling->pipe0.width * sy + sx;
+
+ /* get the blit area for the tile */
+ tile->begin = (VkOffset2D) {
+ .x = tiling->tile0.offset.x + tiling->tile0.extent.width * tx,
+ .y = tiling->tile0.offset.y + tiling->tile0.extent.height * ty,
+ };
+ tile->end.x =
+ (tx == tiling->tile_count.width - 1)
+ ? tiling->render_area.offset.x + tiling->render_area.extent.width
+ : tile->begin.x + tiling->tile0.extent.width;
+ tile->end.y =
+ (ty == tiling->tile_count.height - 1)
+ ? tiling->render_area.offset.y + tiling->render_area.extent.height
+ : tile->begin.y + tiling->tile0.extent.height;
+}
+
+static enum a3xx_msaa_samples
+tu6_msaa_samples(uint32_t samples)
+{
+ switch (samples) {
+ case 1:
+ return MSAA_ONE;
+ case 2:
+ return MSAA_TWO;
+ case 4:
+ return MSAA_FOUR;
+ case 8:
+ return MSAA_EIGHT;
+ default:
+ assert(!"invalid sample count");
+ return MSAA_ONE;
+ }
+}
-void
-tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer,
- struct tu_cs *cs)
+static enum a4xx_index_size
+tu6_index_size(VkIndexType type)
{
- if (!cmd_buffer->state.renderpass_cache.flush_bits &&
- likely(!cmd_buffer->device->physical_device->instance->debug_flags))
- return;
- tu6_emit_flushes(cmd_buffer, cs, cmd_buffer->state.renderpass_cache.flush_bits);
- cmd_buffer->state.renderpass_cache.flush_bits = 0;
+ switch (type) {
+ case VK_INDEX_TYPE_UINT16:
+ return INDEX4_SIZE_16_BIT;
+ case VK_INDEX_TYPE_UINT32:
+ return INDEX4_SIZE_32_BIT;
+ default:
+ unreachable("invalid VkIndexType");
+ return INDEX4_SIZE_8_BIT;
+ }
}
-/* Cache flushes for things that use the color/depth read/write path (i.e.
- * blits and draws). This deals with changing CCU state as well as the usual
- * cache flushing.
- */
+static void
+tu6_emit_marker(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
+{
+ tu_cs_emit_write_reg(cs, cmd->marker_reg, ++cmd->marker_seqno);
+}
void
-tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
- struct tu_cs *cs,
- enum tu_cmd_ccu_state ccu_state)
+tu6_emit_event_write(struct tu_cmd_buffer *cmd,
+ struct tu_cs *cs,
+ enum vgt_event_type event,
+ bool need_seqno)
{
- enum tu_cmd_flush_bits flushes = cmd_buffer->state.cache.flush_bits;
+ tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, need_seqno ? 4 : 1);
+ tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(event));
+ if (need_seqno) {
+ tu_cs_emit_qw(cs, cmd->scratch_bo.iova);
+ tu_cs_emit(cs, ++cmd->scratch_seqno);
+ }
+}
- assert(ccu_state != TU_CMD_CCU_UNKNOWN);
- /* It's unsafe to flush inside condition because we clear flush_bits */
- assert(!cs->cond_stack_depth);
+static void
+tu6_emit_cache_flush(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
+{
+ tu6_emit_event_write(cmd, cs, 0x31, false);
+}
- /* Changing CCU state must involve invalidating the CCU. In sysmem mode,
- * the CCU may also contain data that we haven't flushed out yet, so we
- * also need to flush. Also, in order to program RB_CCU_CNTL, we need to
- * emit a WFI as it isn't pipelined.
- */
- if (ccu_state != cmd_buffer->state.ccu_state) {
- if (cmd_buffer->state.ccu_state != TU_CMD_CCU_GMEM) {
- flushes |=
- TU_CMD_FLAG_CCU_FLUSH_COLOR |
- TU_CMD_FLAG_CCU_FLUSH_DEPTH;
- cmd_buffer->state.cache.pending_flush_bits &= ~(
- TU_CMD_FLAG_CCU_FLUSH_COLOR |
- TU_CMD_FLAG_CCU_FLUSH_DEPTH);
- }
- flushes |=
- TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
- TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
- TU_CMD_FLAG_WAIT_FOR_IDLE;
- cmd_buffer->state.cache.pending_flush_bits &= ~(
- TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
- TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
- TU_CMD_FLAG_WAIT_FOR_IDLE);
- }
+static void
+tu6_emit_lrz_flush(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
+{
+ tu6_emit_event_write(cmd, cs, LRZ_FLUSH, false);
+}
- tu6_emit_flushes(cmd_buffer, cs, flushes);
- cmd_buffer->state.cache.flush_bits = 0;
-
- if (ccu_state != cmd_buffer->state.ccu_state) {
- struct tu_physical_device *phys_dev = cmd_buffer->device->physical_device;
- tu_cs_emit_regs(cs,
- A6XX_RB_CCU_CNTL(.color_offset =
- ccu_state == TU_CMD_CCU_GMEM ?
- phys_dev->ccu_offset_gmem :
- phys_dev->ccu_offset_bypass,
- .gmem = ccu_state == TU_CMD_CCU_GMEM));
- cmd_buffer->state.ccu_state = ccu_state;
+static void
+tu6_emit_wfi(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
+{
+ if (cmd->wait_for_idle) {
+ tu_cs_emit_wfi(cs);
+ cmd->wait_for_idle = false;
}
}
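
For readers following this hunk: the removed tu_emit_cache_flush_ccu() above encodes a small rule for CCU state changes (flush when the previous state was not GMEM, always invalidate plus wait-for-idle, then reprogram RB_CCU_CNTL). A minimal standalone sketch of that rule, with made-up flag names and values rather than the driver's:

/* Standalone sketch of the CCU-state transition rule in the removed
 * tu_emit_cache_flush_ccu() above; flag names and values are illustrative,
 * not the driver's. */
#include <stdint.h>
#include <stdio.h>

enum ccu_state { CCU_UNKNOWN, CCU_SYSMEM, CCU_GMEM };

#define FLUSH_COLOR      (1u << 0)
#define FLUSH_DEPTH      (1u << 1)
#define INVALIDATE_COLOR (1u << 2)
#define INVALIDATE_DEPTH (1u << 3)
#define WAIT_FOR_IDLE    (1u << 4)

static uint32_t
ccu_transition_flushes(enum ccu_state cur, enum ccu_state next)
{
   if (next == cur)
      return 0;
   uint32_t flushes = 0;
   /* Leaving sysmem/bypass (or unknown) mode: dirty CCU data must be
    * written back before the cache is repurposed. */
   if (cur != CCU_GMEM)
      flushes |= FLUSH_COLOR | FLUSH_DEPTH;
   /* Any state change invalidates the CCU and needs a WFI before
    * RB_CCU_CNTL can be reprogrammed. */
   flushes |= INVALIDATE_COLOR | INVALIDATE_DEPTH | WAIT_FOR_IDLE;
   return flushes;
}

int main(void)
{
   printf("sysmem->gmem: 0x%x\n", ccu_transition_flushes(CCU_SYSMEM, CCU_GMEM));
   printf("gmem->sysmem: 0x%x\n", ccu_transition_flushes(CCU_GMEM, CCU_SYSMEM));
   return 0;
}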
static void
-tu6_emit_zs(struct tu_cmd_buffer *cmd,
- const struct tu_subpass *subpass,
- struct tu_cs *cs)
+tu6_emit_zs(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
+ const struct tu_subpass *subpass = cmd->state.subpass;
+
const uint32_t a = subpass->depth_stencil_attachment.attachment;
if (a == VK_ATTACHMENT_UNUSED) {
- tu_cs_emit_regs(cs,
- A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE),
- A6XX_RB_DEPTH_BUFFER_PITCH(0),
- A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(0),
- A6XX_RB_DEPTH_BUFFER_BASE(0),
- A6XX_RB_DEPTH_BUFFER_BASE_GMEM(0));
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_BUFFER_INFO, 6);
+ tu_cs_emit(cs, A6XX_RB_DEPTH_BUFFER_INFO_DEPTH_FORMAT(DEPTH6_NONE));
+ tu_cs_emit(cs, 0x00000000); /* RB_DEPTH_BUFFER_PITCH */
+ tu_cs_emit(cs, 0x00000000); /* RB_DEPTH_BUFFER_ARRAY_PITCH */
+ tu_cs_emit(cs, 0x00000000); /* RB_DEPTH_BUFFER_BASE_LO */
+ tu_cs_emit(cs, 0x00000000); /* RB_DEPTH_BUFFER_BASE_HI */
+ tu_cs_emit(cs, 0x00000000); /* RB_DEPTH_BUFFER_BASE_GMEM */
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_DEPTH_BUFFER_INFO, 1);
+ tu_cs_emit(cs,
+ A6XX_GRAS_SU_DEPTH_BUFFER_INFO_DEPTH_FORMAT(DEPTH6_NONE));
- tu_cs_emit_regs(cs,
- A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE));
+ tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_LRZ_BUFFER_BASE_LO, 5);
+ tu_cs_emit(cs, 0x00000000); /* RB_DEPTH_FLAG_BUFFER_BASE_LO */
+ tu_cs_emit(cs, 0x00000000); /* RB_DEPTH_FLAG_BUFFER_BASE_HI */
+ tu_cs_emit(cs, 0x00000000); /* GRAS_LRZ_BUFFER_PITCH */
+ tu_cs_emit(cs, 0x00000000); /* GRAS_LRZ_FAST_CLEAR_BUFFER_BASE_LO */
+ tu_cs_emit(cs, 0x00000000); /* GRAS_LRZ_FAST_CLEAR_BUFFER_BASE_HI */
- tu_cs_emit_regs(cs, A6XX_RB_STENCIL_INFO(0));
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_STENCIL_INFO, 1);
+ tu_cs_emit(cs, 0x00000000); /* RB_STENCIL_INFO */
return;
}
- const struct tu_image_view *iview = cmd->state.attachments[a];
- const struct tu_render_pass_attachment *attachment =
- &cmd->state.pass->attachments[a];
- enum a6xx_depth_format fmt = tu6_pipe2depth(attachment->format);
-
- tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_BUFFER_INFO, 6);
- tu_cs_emit(cs, A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = fmt).value);
- if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT)
- tu_cs_image_depth_ref(cs, iview, 0);
- else
- tu_cs_image_ref(cs, &iview->view, 0);
- tu_cs_emit(cs, tu_attachment_gmem_offset(cmd, attachment));
-
- tu_cs_emit_regs(cs,
- A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt));
-
- tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_FLAG_BUFFER_BASE, 3);
- tu_cs_image_flag_ref(cs, &iview->view, 0);
-
- if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
- attachment->format == VK_FORMAT_S8_UINT) {
-
- tu_cs_emit_pkt4(cs, REG_A6XX_RB_STENCIL_INFO, 6);
- tu_cs_emit(cs, A6XX_RB_STENCIL_INFO(.separate_stencil = true).value);
- if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
- tu_cs_image_stencil_ref(cs, iview, 0);
- tu_cs_emit(cs, tu_attachment_gmem_offset_stencil(cmd, attachment));
- } else {
- tu_cs_image_ref(cs, &iview->view, 0);
- tu_cs_emit(cs, tu_attachment_gmem_offset(cmd, attachment));
- }
- } else {
- tu_cs_emit_regs(cs,
- A6XX_RB_STENCIL_INFO(0));
- }
+ /* enable zs? */
}
static void
-tu6_emit_mrt(struct tu_cmd_buffer *cmd,
- const struct tu_subpass *subpass,
- struct tu_cs *cs)
+tu6_emit_mrt(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
const struct tu_framebuffer *fb = cmd->state.framebuffer;
+ const struct tu_subpass *subpass = cmd->state.subpass;
+ const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
+ unsigned char mrt_comp[MAX_RTS] = { 0 };
+ unsigned srgb_cntl = 0;
- enum a6xx_format mrt0_format = 0;
-
+ uint32_t gmem_index = 0;
for (uint32_t i = 0; i < subpass->color_count; ++i) {
uint32_t a = subpass->color_attachments[i].attachment;
- if (a == VK_ATTACHMENT_UNUSED) {
- /* From the VkPipelineRenderingCreateInfo definition:
- *
- * Valid formats indicate that an attachment can be used - but it
- * is still valid to set the attachment to NULL when beginning
- * rendering.
- *
- * This means that with dynamic rendering, pipelines may write to
- * some attachments that are UNUSED here. Setting the format to 0
- * here should prevent them from writing to anything. This also seems
- * to be required for alpha-to-coverage, which can use the alpha
- * value for an otherwise-unused attachment.
- */
- tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(i), 6);
- for (unsigned i = 0; i < 6; i++)
- tu_cs_emit(cs, 0);
-
- tu_cs_emit_regs(cs,
- A6XX_SP_FS_MRT_REG(i, .dword = 0));
+ if (a == VK_ATTACHMENT_UNUSED)
continue;
- }
- const struct tu_image_view *iview = cmd->state.attachments[a];
+ const struct tu_image_view *iview = fb->attachments[a].attachment;
+ const struct tu_image_level *slice =
+ &iview->image->levels[iview->base_mip];
+ const enum a6xx_tile_mode tile_mode = TILE6_LINEAR;
+ uint32_t stride = 0;
+ uint32_t offset = 0;
- tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(i), 6);
- tu_cs_emit(cs, iview->view.RB_MRT_BUF_INFO);
- tu_cs_image_ref(cs, &iview->view, 0);
- tu_cs_emit(cs, tu_attachment_gmem_offset(cmd, &cmd->state.pass->attachments[a]));
+ mrt_comp[i] = 0xf;
- tu_cs_emit_regs(cs,
- A6XX_SP_FS_MRT_REG(i, .dword = iview->view.SP_FS_MRT_REG));
+ if (vk_format_is_srgb(iview->vk_format))
+ srgb_cntl |= (1 << i);
- tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER_ADDR(i), 3);
- tu_cs_image_flag_ref(cs, &iview->view, 0);
+ const struct tu_native_format *format =
+ tu6_get_native_format(iview->vk_format);
+ assert(format && format->rb >= 0);
- if (i == 0)
- mrt0_format = iview->view.SP_FS_MRT_REG & 0xff;
- }
+ offset = slice->offset + slice->size * iview->base_layer;
+ stride = slice->pitch * vk_format_get_blocksize(iview->vk_format);
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(i), 6);
+ tu_cs_emit(cs, A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT(format->rb) |
+ A6XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(tile_mode) |
+ A6XX_RB_MRT_BUF_INFO_COLOR_SWAP(format->swap));
+ tu_cs_emit(cs, A6XX_RB_MRT_PITCH(stride));
+ tu_cs_emit(cs, A6XX_RB_MRT_ARRAY_PITCH(slice->size));
+ tu_cs_emit_qw(cs, iview->image->bo->iova + iview->image->bo_offset +
+ offset); /* BASE_LO/HI */
+ tu_cs_emit(
+ cs, tiling->gmem_offsets[gmem_index++]); /* RB_MRT[i].BASE_GMEM */
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_MRT_REG(i), 1);
+ tu_cs_emit(cs, A6XX_SP_FS_MRT_REG_COLOR_FORMAT(format->rb));
+
+#if 0
+ /* when we support UBWC, these would be the system memory
+ * addr/pitch/etc:
+ */
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(i), 4);
+ tu_cs_emit(cs, 0x00000000); /* RB_MRT_FLAG_BUFFER[i].ADDR_LO */
+ tu_cs_emit(cs, 0x00000000); /* RB_MRT_FLAG_BUFFER[i].ADDR_HI */
+ tu_cs_emit(cs, A6XX_RB_MRT_FLAG_BUFFER_PITCH(0));
+ tu_cs_emit(cs, A6XX_RB_MRT_FLAG_BUFFER_ARRAY_PITCH(0));
+#endif
+ }
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_SRGB_CNTL, 1);
+ tu_cs_emit(cs, srgb_cntl);
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_SRGB_CNTL, 1);
+ tu_cs_emit(cs, srgb_cntl);
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_COMPONENTS, 1);
+ tu_cs_emit(cs, A6XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) |
+ A6XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) |
+ A6XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) |
+ A6XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) |
+ A6XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) |
+ A6XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) |
+ A6XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) |
+ A6XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7]));
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_RENDER_COMPONENTS, 1);
+ tu_cs_emit(cs, A6XX_SP_FS_RENDER_COMPONENTS_RT0(mrt_comp[0]) |
+ A6XX_SP_FS_RENDER_COMPONENTS_RT1(mrt_comp[1]) |
+ A6XX_SP_FS_RENDER_COMPONENTS_RT2(mrt_comp[2]) |
+ A6XX_SP_FS_RENDER_COMPONENTS_RT3(mrt_comp[3]) |
+ A6XX_SP_FS_RENDER_COMPONENTS_RT4(mrt_comp[4]) |
+ A6XX_SP_FS_RENDER_COMPONENTS_RT5(mrt_comp[5]) |
+ A6XX_SP_FS_RENDER_COMPONENTS_RT6(mrt_comp[6]) |
+ A6XX_SP_FS_RENDER_COMPONENTS_RT7(mrt_comp[7]));
+}
- tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_MRT_BUF_INFO_0(.color_format = mrt0_format));
+static void
+tu6_emit_msaa(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
+{
+ const struct tu_subpass *subpass = cmd->state.subpass;
+ const enum a3xx_msaa_samples samples =
+ tu6_msaa_samples(subpass->max_sample_count);
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_RAS_MSAA_CNTL, 2);
+ tu_cs_emit(cs, A6XX_SP_TP_RAS_MSAA_CNTL_SAMPLES(samples));
+ tu_cs_emit(
+ cs, A6XX_SP_TP_DEST_MSAA_CNTL_SAMPLES(samples) |
+ ((samples == MSAA_ONE) ? A6XX_SP_TP_DEST_MSAA_CNTL_MSAA_DISABLE
+ : 0));
- tu_cs_emit_regs(cs,
- A6XX_RB_SRGB_CNTL(.dword = subpass->srgb_cntl));
- tu_cs_emit_regs(cs,
- A6XX_SP_SRGB_CNTL(.dword = subpass->srgb_cntl));
+ tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_RAS_MSAA_CNTL, 2);
+ tu_cs_emit(cs, A6XX_GRAS_RAS_MSAA_CNTL_SAMPLES(samples));
+ tu_cs_emit(
+ cs,
+ A6XX_GRAS_DEST_MSAA_CNTL_SAMPLES(samples) |
+ ((samples == MSAA_ONE) ? A6XX_GRAS_DEST_MSAA_CNTL_MSAA_DISABLE : 0));
- unsigned layers = MAX2(fb->layers, util_logbase2(subpass->multiview_mask) + 1);
- tu_cs_emit_regs(cs, A6XX_GRAS_MAX_LAYER_INDEX(layers - 1));
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_RAS_MSAA_CNTL, 2);
+ tu_cs_emit(cs, A6XX_RB_RAS_MSAA_CNTL_SAMPLES(samples));
+ tu_cs_emit(
+ cs,
+ A6XX_RB_DEST_MSAA_CNTL_SAMPLES(samples) |
+ ((samples == MSAA_ONE) ? A6XX_RB_DEST_MSAA_CNTL_MSAA_DISABLE : 0));
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_MSAA_CNTL, 1);
+ tu_cs_emit(cs, A6XX_RB_MSAA_CNTL_SAMPLES(samples));
}
static void
-tu6_emit_bin_size(struct tu_cs *cs,
- uint32_t bin_w, uint32_t bin_h, uint32_t flags)
+tu6_emit_bin_size(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t flags)
{
- tu_cs_emit_regs(cs,
- A6XX_GRAS_BIN_CONTROL(.binw = bin_w,
- .binh = bin_h,
- .dword = flags));
+ const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
+ const uint32_t bin_w = tiling->tile0.extent.width;
+ const uint32_t bin_h = tiling->tile0.extent.height;
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_BIN_CONTROL, 1);
+ tu_cs_emit(cs, A6XX_GRAS_BIN_CONTROL_BINW(bin_w) |
+ A6XX_GRAS_BIN_CONTROL_BINH(bin_h) | flags);
- tu_cs_emit_regs(cs,
- A6XX_RB_BIN_CONTROL(.binw = bin_w,
- .binh = bin_h,
- .dword = flags));
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_BIN_CONTROL, 1);
+ tu_cs_emit(cs, A6XX_RB_BIN_CONTROL_BINW(bin_w) |
+ A6XX_RB_BIN_CONTROL_BINH(bin_h) | flags);
/* no flag for RB_BIN_CONTROL2... */
- tu_cs_emit_regs(cs,
- A6XX_RB_BIN_CONTROL2(.binw = bin_w,
- .binh = bin_h));
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_BIN_CONTROL2, 1);
+ tu_cs_emit(cs, A6XX_RB_BIN_CONTROL2_BINW(bin_w) |
+ A6XX_RB_BIN_CONTROL2_BINH(bin_h));
}
static void
tu6_emit_render_cntl(struct tu_cmd_buffer *cmd,
- const struct tu_subpass *subpass,
struct tu_cs *cs,
bool binning)
{
- /* note: RB_RENDER_CNTL isn't set differently for the binning pass: */
- bool no_track = !cmd->device->physical_device->info->a6xx.has_cp_reg_write;
uint32_t cntl = 0;
- cntl |= A6XX_RB_RENDER_CNTL_CCUSINGLECACHELINESIZE(2);
- if (binning) {
- if (no_track)
- return;
+ cntl |= A6XX_RB_RENDER_CNTL_UNK4;
+ if (binning)
cntl |= A6XX_RB_RENDER_CNTL_BINNING;
- } else {
- uint32_t mrts_ubwc_enable = 0;
- for (uint32_t i = 0; i < subpass->color_count; ++i) {
- uint32_t a = subpass->color_attachments[i].attachment;
- if (a == VK_ATTACHMENT_UNUSED)
- continue;
-
- const struct tu_image_view *iview = cmd->state.attachments[a];
- if (iview->view.ubwc_enabled)
- mrts_ubwc_enable |= 1 << i;
- }
-
- cntl |= A6XX_RB_RENDER_CNTL_FLAG_MRTS(mrts_ubwc_enable);
-
- const uint32_t a = subpass->depth_stencil_attachment.attachment;
- if (a != VK_ATTACHMENT_UNUSED) {
- const struct tu_image_view *iview = cmd->state.attachments[a];
- if (iview->view.ubwc_enabled)
- cntl |= A6XX_RB_RENDER_CNTL_FLAG_DEPTH;
- }
-
- if (no_track) {
- tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_CNTL, 1);
- tu_cs_emit(cs, cntl);
- return;
- }
-
- /* In the !binning case, we need to set RB_RENDER_CNTL in the draw_cs
- * in order to set it correctly for the different subpasses. However,
- * that means the packets we're emitting also happen during binning. So
- * we need to guard the write on !BINNING at CP execution time.
- */
- tu_cs_reserve(cs, 3 + 4);
- tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
- tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
- CP_COND_REG_EXEC_0_GMEM | CP_COND_REG_EXEC_0_SYSMEM);
- tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(4));
- }
tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
- tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_RENDER_CNTL));
+ tu_cs_emit(cs, 0x2);
tu_cs_emit(cs, REG_A6XX_RB_RENDER_CNTL);
tu_cs_emit(cs, cntl);
}
static void
-tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align)
+tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
- struct tu_physical_device *phys_dev = cmd->device->physical_device;
- const VkRect2D *render_area = &cmd->state.render_area;
-
- /* Avoid assertion fails with an empty render area at (0, 0) where the
- * subtraction below wraps around. Empty render areas should be forced to
- * the sysmem path by use_sysmem_rendering(). It's not even clear whether
- * an empty scissor here works, and the blob seems to force sysmem too as
- * it sets something wrong (non-empty) for the scissor.
- */
- if (render_area->extent.width == 0 ||
- render_area->extent.height == 0)
- return;
-
- uint32_t x1 = render_area->offset.x;
- uint32_t y1 = render_area->offset.y;
- uint32_t x2 = x1 + render_area->extent.width - 1;
- uint32_t y2 = y1 + render_area->extent.height - 1;
-
- if (align) {
- x1 = x1 & ~(phys_dev->info->gmem_align_w - 1);
- y1 = y1 & ~(phys_dev->info->gmem_align_h - 1);
- x2 = ALIGN_POT(x2 + 1, phys_dev->info->gmem_align_w) - 1;
- y2 = ALIGN_POT(y2 + 1, phys_dev->info->gmem_align_h) - 1;
- }
+ const VkRect2D *render_area = &cmd->state.tiling_config.render_area;
+ const uint32_t x1 = render_area->offset.x;
+ const uint32_t y1 = render_area->offset.y;
+ const uint32_t x2 = x1 + render_area->extent.width - 1;
+ const uint32_t y2 = y1 + render_area->extent.height - 1;
- tu_cs_emit_regs(cs,
- A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
- A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
+ tu_cs_emit(cs,
+ A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
+ tu_cs_emit(cs,
+ A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
}
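
The removed tu6_emit_blit_scissor() above works in inclusive coordinates and, when align is requested, widens the scissor to the GMEM alignment granule. A standalone sketch of that arithmetic, with assumed gmem_align_w/h and render-area values:

/* Standalone illustration of the inclusive blit-scissor computation and the
 * GMEM alignment applied by the removed tu6_emit_blit_scissor() above.
 * The alignment and render-area values here are made up for the example. */
#include <stdint.h>
#include <stdio.h>

#define ALIGN_POT(v, a) (((v) + (a) - 1) & ~((a) - 1))  /* a must be a power of two */

int main(void)
{
   const uint32_t gmem_align_w = 64, gmem_align_h = 16;  /* assumed values */
   uint32_t x1 = 10, y1 = 20, w = 100, h = 50;           /* render area */

   uint32_t x2 = x1 + w - 1, y2 = y1 + h - 1;            /* inclusive corners */

   /* align the start down and the end up, keeping coordinates inclusive */
   x1 &= ~(gmem_align_w - 1);
   y1 &= ~(gmem_align_h - 1);
   x2 = ALIGN_POT(x2 + 1, gmem_align_w) - 1;
   y2 = ALIGN_POT(y2 + 1, gmem_align_h) - 1;

   printf("(%u,%u)-(%u,%u)\n", x1, y1, x2, y2);          /* (0,16)-(127,79) */
   return 0;
}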
-void
-tu6_emit_window_scissor(struct tu_cs *cs,
- uint32_t x1,
- uint32_t y1,
- uint32_t x2,
- uint32_t y2)
-{
- tu_cs_emit_regs(cs,
- A6XX_GRAS_SC_WINDOW_SCISSOR_TL(.x = x1, .y = y1),
- A6XX_GRAS_SC_WINDOW_SCISSOR_BR(.x = x2, .y = y2));
-
- tu_cs_emit_regs(cs,
- A6XX_GRAS_2D_RESOLVE_CNTL_1(.x = x1, .y = y1),
- A6XX_GRAS_2D_RESOLVE_CNTL_2(.x = x2, .y = y2));
+static void
+tu6_emit_blit_info(struct tu_cmd_buffer *cmd,
+ struct tu_cs *cs,
+ const struct tu_image_view *iview,
+ uint32_t gmem_offset,
+ uint32_t blit_info)
+{
+ const struct tu_image_level *slice =
+ &iview->image->levels[iview->base_mip];
+ const uint32_t offset = slice->offset + slice->size * iview->base_layer;
+ const uint32_t stride =
+ slice->pitch * vk_format_get_blocksize(iview->vk_format);
+ const enum a6xx_tile_mode tile_mode = TILE6_LINEAR;
+ const enum a3xx_msaa_samples samples = tu6_msaa_samples(1);
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_INFO, 1);
+ tu_cs_emit(cs, blit_info);
+
+ /* tile mode? */
+ const struct tu_native_format *format =
+ tu6_get_native_format(iview->vk_format);
+ assert(format && format->rb >= 0);
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 5);
+ tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_TILE_MODE(tile_mode) |
+ A6XX_RB_BLIT_DST_INFO_SAMPLES(samples) |
+ A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(format->rb) |
+ A6XX_RB_BLIT_DST_INFO_COLOR_SWAP(format->swap));
+ tu_cs_emit_qw(cs,
+ iview->image->bo->iova + iview->image->bo_offset + offset);
+ tu_cs_emit(cs, A6XX_RB_BLIT_DST_PITCH(stride));
+ tu_cs_emit(cs, A6XX_RB_BLIT_DST_ARRAY_PITCH(slice->size));
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
+ tu_cs_emit(cs, gmem_offset);
}
-void
-tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1)
-{
- tu_cs_emit_regs(cs,
- A6XX_RB_WINDOW_OFFSET(.x = x1, .y = y1));
-
- tu_cs_emit_regs(cs,
- A6XX_RB_WINDOW_OFFSET2(.x = x1, .y = y1));
+static void
+tu6_emit_blit_clear(struct tu_cmd_buffer *cmd,
+ struct tu_cs *cs,
+ const struct tu_image_view *iview,
+ uint32_t gmem_offset,
+ const VkClearValue *clear_value)
+{
+ const enum a6xx_tile_mode tile_mode = TILE6_LINEAR;
+ const enum a3xx_msaa_samples samples = tu6_msaa_samples(1);
+
+ const struct tu_native_format *format =
+ tu6_get_native_format(iview->vk_format);
+ assert(format && format->rb >= 0);
+ /* must be WZYX; other values are ignored */
+ const enum a3xx_color_swap swap = WZYX;
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
+ tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_TILE_MODE(tile_mode) |
+ A6XX_RB_BLIT_DST_INFO_SAMPLES(samples) |
+ A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(format->rb) |
+ A6XX_RB_BLIT_DST_INFO_COLOR_SWAP(swap));
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_INFO, 1);
+ tu_cs_emit(cs, A6XX_RB_BLIT_INFO_GMEM | A6XX_RB_BLIT_INFO_CLEAR_MASK(0xf));
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
+ tu_cs_emit(cs, gmem_offset);
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
+ tu_cs_emit(cs, 0);
- tu_cs_emit_regs(cs,
- A6XX_SP_WINDOW_OFFSET(.x = x1, .y = y1));
+ /* pack clear_value into WZYX order */
+ uint32_t clear_vals[4] = { 0 };
+ tu_pack_clear_value(clear_value, iview->vk_format, clear_vals);
- tu_cs_emit_regs(cs,
- A6XX_SP_TP_WINDOW_OFFSET(.x = x1, .y = y1));
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
+ tu_cs_emit(cs, clear_vals[0]);
+ tu_cs_emit(cs, clear_vals[1]);
+ tu_cs_emit(cs, clear_vals[2]);
+ tu_cs_emit(cs, clear_vals[3]);
}
-void
-tu6_apply_depth_bounds_workaround(struct tu_device *device,
- uint32_t *rb_depth_cntl)
+static void
+tu6_emit_blit(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
- if (!device->physical_device->info->a6xx.depth_bounds_require_depth_test_quirk)
- return;
-
- /* On some GPUs it is necessary to enable z test for depth bounds test when
- * UBWC is enabled. Otherwise, the GPU would hang. FUNC_ALWAYS is required to
- * pass z test. Relevant tests:
- * dEQP-VK.pipeline.extended_dynamic_state.two_draws_dynamic.depth_bounds_test_disable
- * dEQP-VK.dynamic_state.ds_state.depth_bounds_1
- */
- *rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE |
- A6XX_RB_DEPTH_CNTL_ZFUNC(FUNC_ALWAYS);
+ tu6_emit_marker(cmd, cs);
+ tu6_emit_event_write(cmd, cs, BLIT, false);
+ tu6_emit_marker(cmd, cs);
}
static void
-tu_cs_emit_draw_state(struct tu_cs *cs, uint32_t id, struct tu_draw_state state)
+tu6_emit_window_scissor(struct tu_cmd_buffer *cmd,
+ struct tu_cs *cs,
+ uint32_t x1,
+ uint32_t y1,
+ uint32_t x2,
+ uint32_t y2)
{
- uint32_t enable_mask;
- switch (id) {
- case TU_DRAW_STATE_PROGRAM:
- /* The blob seems to not enable this (DESC_SETS_LOAD) for binning, even
- * when resources would actually be used in the binning shader.
- * Presumably the overhead of prefetching the resources isn't
- * worth it.
- */
- case TU_DRAW_STATE_DESC_SETS_LOAD:
- enable_mask = CP_SET_DRAW_STATE__0_GMEM |
- CP_SET_DRAW_STATE__0_SYSMEM;
- break;
- case TU_DRAW_STATE_PROGRAM_BINNING:
- enable_mask = CP_SET_DRAW_STATE__0_BINNING;
- break;
- case TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM:
- case TU_DRAW_STATE_PRIM_MODE_GMEM:
- enable_mask = CP_SET_DRAW_STATE__0_GMEM;
- break;
- case TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM:
- case TU_DRAW_STATE_PRIM_MODE_SYSMEM:
- enable_mask = CP_SET_DRAW_STATE__0_SYSMEM;
- break;
- default:
- enable_mask = CP_SET_DRAW_STATE__0_GMEM |
- CP_SET_DRAW_STATE__0_SYSMEM |
- CP_SET_DRAW_STATE__0_BINNING;
- break;
- }
+ tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_WINDOW_SCISSOR_TL, 2);
+ tu_cs_emit(cs, A6XX_GRAS_SC_WINDOW_SCISSOR_TL_X(x1) |
+ A6XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(y1));
+ tu_cs_emit(cs, A6XX_GRAS_SC_WINDOW_SCISSOR_BR_X(x2) |
+ A6XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(y2));
- STATIC_ASSERT(TU_DRAW_STATE_COUNT <= 32);
-
- /* We need to reload the descriptors every time the descriptor sets
- * change. However, the commands we send only depend on the pipeline
- * because the whole point is to cache descriptors which are used by the
- * pipeline. There's a problem here, in that the firmware has an
- * "optimization" which skips executing groups that are set to the same
- * value as the last draw. This means that if the descriptor sets change
- * but not the pipeline, we'd try to re-execute the same buffer which
- * the firmware would ignore and we wouldn't pre-load the new
- * descriptors. Set the DIRTY bit to avoid this optimization
- */
- if (id == TU_DRAW_STATE_DESC_SETS_LOAD)
- enable_mask |= CP_SET_DRAW_STATE__0_DIRTY;
-
- tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(state.size) |
- enable_mask |
- CP_SET_DRAW_STATE__0_GROUP_ID(id) |
- COND(!state.size || !state.iova, CP_SET_DRAW_STATE__0_DISABLE));
- tu_cs_emit_qw(cs, state.iova);
-}
-
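
The removed tu_cs_emit_draw_state() above picks which passes (GMEM, sysmem, binning) execute each draw-state group, and marks the descriptor-load group dirty to defeat the firmware's same-group skipping. A standalone restatement of that selection, with illustrative flag and enum names standing in for the CP_SET_DRAW_STATE__0_* values:

/* Standalone restatement of the pass-mask selection in the removed
 * tu_cs_emit_draw_state() above; names and bit values are illustrative. */
#include <stdint.h>

#define EXEC_GMEM    (1u << 0)
#define EXEC_SYSMEM  (1u << 1)
#define EXEC_BINNING (1u << 2)
#define EXEC_DIRTY   (1u << 3)   /* defeat firmware "same group" skipping */

enum draw_state_id {
   DS_PROGRAM,            /* not prefetched for the binning pass */
   DS_PROGRAM_BINNING,
   DS_DESC_SETS_LOAD,
   DS_PRIM_MODE_GMEM,
   DS_PRIM_MODE_SYSMEM,
   DS_OTHER,
};

uint32_t draw_state_enable_mask(enum draw_state_id id)
{
   switch (id) {
   case DS_PROGRAM:
   case DS_DESC_SETS_LOAD:
      return EXEC_GMEM | EXEC_SYSMEM |
             (id == DS_DESC_SETS_LOAD ? EXEC_DIRTY : 0);
   case DS_PROGRAM_BINNING:
      return EXEC_BINNING;
   case DS_PRIM_MODE_GMEM:
      return EXEC_GMEM;
   case DS_PRIM_MODE_SYSMEM:
      return EXEC_SYSMEM;
   default:
      return EXEC_GMEM | EXEC_SYSMEM | EXEC_BINNING;
   }
}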
-void
-tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits vk_samples,
- bool msaa_disable)
-{
- const enum a3xx_msaa_samples samples = tu_msaa_samples(vk_samples);
- msaa_disable |= (samples == MSAA_ONE);
- tu_cs_emit_regs(cs,
- A6XX_SP_TP_RAS_MSAA_CNTL(samples),
- A6XX_SP_TP_DEST_MSAA_CNTL(.samples = samples,
- .msaa_disable = msaa_disable));
-
- tu_cs_emit_regs(cs,
- A6XX_GRAS_RAS_MSAA_CNTL(samples),
- A6XX_GRAS_DEST_MSAA_CNTL(.samples = samples,
- .msaa_disable = msaa_disable));
-
- tu_cs_emit_regs(cs,
- A6XX_RB_RAS_MSAA_CNTL(samples),
- A6XX_RB_DEST_MSAA_CNTL(.samples = samples,
- .msaa_disable = msaa_disable));
+ tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_RESOLVE_CNTL_1, 2);
+ tu_cs_emit(
+ cs, A6XX_GRAS_RESOLVE_CNTL_1_X(x1) | A6XX_GRAS_RESOLVE_CNTL_1_Y(y1));
+ tu_cs_emit(
+ cs, A6XX_GRAS_RESOLVE_CNTL_2_X(x2) | A6XX_GRAS_RESOLVE_CNTL_2_Y(y2));
}
static void
-tu6_update_msaa(struct tu_cmd_buffer *cmd, VkSampleCountFlagBits samples)
-{
- bool is_line =
- tu6_primtype_line(cmd->state.primtype) ||
- (tu6_primtype_patches(cmd->state.primtype) &&
- cmd->state.pipeline &&
- cmd->state.pipeline->tess.patch_type == IR3_TESS_ISOLINES);
- bool msaa_disable = is_line && cmd->state.line_mode == BRESENHAM;
-
- if (cmd->state.msaa_disable != msaa_disable ||
- cmd->state.samples != samples) {
- struct tu_cs cs;
- cmd->state.msaa = tu_cs_draw_state(&cmd->sub_cs, &cs, 9);
- tu6_emit_msaa(&cs, samples, msaa_disable);
- if (!(cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) {
- tu_cs_emit_pkt7(&cmd->draw_cs, CP_SET_DRAW_STATE, 3);
- tu_cs_emit_draw_state(&cmd->draw_cs, TU_DRAW_STATE_MSAA, cmd->state.msaa);
- }
- cmd->state.msaa_disable = msaa_disable;
- cmd->state.samples = samples;
- }
-}
-
-static bool
-use_hw_binning(struct tu_cmd_buffer *cmd)
+tu6_emit_window_offset(struct tu_cmd_buffer *cmd,
+ struct tu_cs *cs,
+ uint32_t x1,
+ uint32_t y1)
{
- const struct tu_framebuffer *fb = cmd->state.framebuffer;
- const struct tu_tiling_config *tiling = &fb->tiling[cmd->state.gmem_layout];
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_WINDOW_OFFSET, 1);
+ tu_cs_emit(cs, A6XX_RB_WINDOW_OFFSET_X(x1) | A6XX_RB_WINDOW_OFFSET_Y(y1));
- /* XFB commands are emitted for BINNING || SYSMEM, which makes it
- * incompatible with non-hw-binning GMEM rendering. This is required because
- * some of the XFB commands must only be executed once.
- * use_sysmem_rendering() should have made sure we only ended up here if no
- * XFB was used.
- */
- if (cmd->state.rp.xfb_used) {
- assert(tiling->binning_possible);
- return true;
- }
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_WINDOW_OFFSET2, 1);
+ tu_cs_emit(cs,
+ A6XX_RB_WINDOW_OFFSET2_X(x1) | A6XX_RB_WINDOW_OFFSET2_Y(y1));
- /* VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT emulates GL_PRIMITIVES_GENERATED,
- * which wasn't designed to care about tilers and expects the result not to
- * be multiplied by tile count.
- * See https://gitlab.khronos.org/vulkan/vulkan/-/issues/3131
- */
- if (cmd->state.rp.has_prim_generated_query_in_rp ||
- cmd->state.prim_generated_query_running_before_rp) {
- assert(tiling->binning_possible);
- return true;
- }
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_WINDOW_OFFSET, 1);
+ tu_cs_emit(cs, A6XX_SP_WINDOW_OFFSET_X(x1) | A6XX_SP_WINDOW_OFFSET_Y(y1));
- return tiling->binning;
-}
-
-static bool
-use_sysmem_rendering(struct tu_cmd_buffer *cmd,
- struct tu_renderpass_result **autotune_result)
-{
- if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_SYSMEM))
- return true;
-
- /* can't fit attachments into gmem */
- if (!cmd->state.pass->gmem_pixels[cmd->state.gmem_layout])
- return true;
-
- if (cmd->state.framebuffer->layers > 1)
- return true;
-
- /* Use sysmem for empty render areas */
- if (cmd->state.render_area.extent.width == 0 ||
- cmd->state.render_area.extent.height == 0)
- return true;
-
- if (cmd->state.rp.has_tess)
- return true;
-
- if (cmd->state.rp.disable_gmem)
- return true;
-
- /* XFB is incompatible with non-hw binning GMEM rendering, see use_hw_binning */
- if (cmd->state.rp.xfb_used && !cmd->state.tiling->binning_possible)
- return true;
-
- /* QUERY_TYPE_PRIMITIVES_GENERATED is incompatible with non-hw binning
- * GMEM rendering, see use_hw_binning.
- */
- if ((cmd->state.rp.has_prim_generated_query_in_rp ||
- cmd->state.prim_generated_query_running_before_rp) &&
- !cmd->state.tiling->binning_possible)
- return true;
-
- if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_GMEM))
- return false;
-
- bool use_sysmem = tu_autotune_use_bypass(&cmd->device->autotune,
- cmd, autotune_result);
- if (*autotune_result) {
- list_addtail(&(*autotune_result)->node, &cmd->renderpass_autotune_results);
- }
-
- return use_sysmem;
-}
-
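
The removed use_sysmem_rendering() and use_hw_binning() above decide between sysmem and GMEM rendering and whether hardware binning is required. A condensed, standalone restatement of those conditions, using a simplified stand-in struct rather than the command-buffer state:

/* Condensed restatement of the render-path decision made by the removed
 * use_sysmem_rendering()/use_hw_binning() above; this struct is a simplified
 * stand-in, not driver code, and the autotune step is only noted in a comment. */
#include <stdbool.h>

struct rp_state {
   bool debug_force_sysmem;
   bool attachments_fit_gmem;
   bool multi_layer_fb;
   bool empty_render_area;
   bool has_tess;
   bool disable_gmem;
   bool xfb_used;
   bool prim_generated_query;
   bool binning_possible;
   bool binning;
};

bool use_sysmem(const struct rp_state *s)
{
   if (s->debug_force_sysmem || !s->attachments_fit_gmem || s->multi_layer_fb ||
       s->empty_render_area || s->has_tess || s->disable_gmem)
      return true;
   /* XFB and GL_PRIMITIVES_GENERATED-style queries only work with GMEM when
    * hardware binning is available, since their commands must run once. */
   if ((s->xfb_used || s->prim_generated_query) && !s->binning_possible)
      return true;
   return false;   /* otherwise GMEM, possibly still bypassed by autotune */
}

bool want_hw_binning(const struct rp_state *s)
{
   /* XFB or primitives-generated queries force binning in the GMEM path. */
   if (s->xfb_used || s->prim_generated_query)
      return true;
   return s->binning;
}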
-/* Optimization: there is no reason to load gmem if there is no
- * geometry to process. COND_REG_EXEC predicate is set here,
- * but the actual skip happens in tu6_emit_tile_load() and tile_store_cs,
- * for each blit separately.
- */
-static void
-tu6_emit_cond_for_load_stores(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
- uint32_t pipe, uint32_t slot, bool wfm)
-{
- if (cmd->state.tiling->binning_possible) {
- tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
- tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(REG_A6XX_VSC_STATE_REG(pipe)) |
- A6XX_CP_REG_TEST_0_BIT(slot) |
- COND(wfm, A6XX_CP_REG_TEST_0_WAIT_FOR_ME));
- } else {
- /* COND_REG_EXECs are not emitted in non-binning case */
- }
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_WINDOW_OFFSET, 1);
+ tu_cs_emit(
+ cs, A6XX_SP_TP_WINDOW_OFFSET_X(x1) | A6XX_SP_TP_WINDOW_OFFSET_Y(y1));
}
static void
tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
- uint32_t tx, uint32_t ty, uint32_t pipe, uint32_t slot)
+ const struct tu_tile *tile)
{
- const struct tu_tiling_config *tiling = cmd->state.tiling;
+ tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
+ tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(0x7));
+ tu6_emit_marker(cmd, cs);
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
- tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_GMEM));
+ tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_GMEM) | 0x10);
+ tu6_emit_marker(cmd, cs);
- const uint32_t x1 = tiling->tile0.width * tx;
- const uint32_t y1 = tiling->tile0.height * ty;
- const uint32_t x2 = MIN2(x1 + tiling->tile0.width - 1, MAX_VIEWPORT_SIZE - 1);
- const uint32_t y2 = MIN2(y1 + tiling->tile0.height - 1, MAX_VIEWPORT_SIZE - 1);
- tu6_emit_window_scissor(cs, x1, y1, x2, y2);
- tu6_emit_window_offset(cs, x1, y1);
+ const uint32_t x1 = tile->begin.x;
+ const uint32_t y1 = tile->begin.y;
+ const uint32_t x2 = tile->end.x - 1;
+ const uint32_t y2 = tile->end.y - 1;
+ tu6_emit_window_scissor(cmd, cs, x1, y1, x2, y2);
+ tu6_emit_window_offset(cmd, cs, x1, y1);
- bool hw_binning = use_hw_binning(cmd);
+ tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_OVERRIDE, 1);
+ tu_cs_emit(cs, A6XX_VPC_SO_OVERRIDE_SO_DISABLE);
- if (hw_binning) {
- tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
+ if (false) {
+ /* hw binning? */
+ } else {
+ tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
+ tu_cs_emit(cs, 0x1);
tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
tu_cs_emit(cs, 0x0);
-
- tu_cs_emit_pkt7(cs, CP_SET_BIN_DATA5_OFFSET, 4);
- tu_cs_emit(cs, tiling->pipe_sizes[pipe] |
- CP_SET_BIN_DATA5_0_VSC_N(slot));
- tu_cs_emit(cs, pipe * cmd->vsc_draw_strm_pitch);
- tu_cs_emit(cs, pipe * 4);
- tu_cs_emit(cs, pipe * cmd->vsc_prim_strm_pitch);
}
-
- tu6_emit_cond_for_load_stores(cmd, cs, pipe, slot, hw_binning);
-
- tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
- tu_cs_emit(cs, !hw_binning);
-
- tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
- tu_cs_emit(cs, 0x0);
}
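
tu6_emit_tile_select() above derives the per-tile window scissor from the tile index and tile size (inclusive coordinates, clamped to the maximum viewport). A small worked example with assumed 32x32 tiles:

/* Standalone arithmetic of the per-tile window scissor emitted by
 * tu6_emit_tile_select() above; the tile size, clamp and tile index are
 * example values. */
#include <stdint.h>
#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
   const uint32_t tile_w = 32, tile_h = 32;     /* tiling->tile0 size */
   const uint32_t max_vp = 4096;                /* viewport clamp */
   uint32_t tx = 2, ty = 1;                     /* tile index */

   uint32_t x1 = tile_w * tx, y1 = tile_h * ty;
   uint32_t x2 = MIN2(x1 + tile_w - 1, max_vp - 1);
   uint32_t y2 = MIN2(y1 + tile_h - 1, max_vp - 1);

   printf("tile (%u,%u): scissor (%u,%u)-(%u,%u)\n", tx, ty, x1, y1, x2, y2);
   /* prints: tile (2,1): scissor (64,32)-(95,63) */
   return 0;
}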
static void
-tu6_emit_sysmem_resolve(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- uint32_t layer_mask,
- uint32_t a,
- uint32_t gmem_a)
+tu6_emit_tile_load(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
const struct tu_framebuffer *fb = cmd->state.framebuffer;
- const struct tu_image_view *dst = cmd->state.attachments[a];
- const struct tu_image_view *src = cmd->state.attachments[gmem_a];
-
- tu_resolve_sysmem(cmd, cs, src, dst, layer_mask, fb->layers, &cmd->state.render_area);
-}
-
-static void
-tu6_emit_sysmem_resolves(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- const struct tu_subpass *subpass)
-{
- if (subpass->resolve_attachments) {
- /* From the documentation for vkCmdNextSubpass, section 7.4 "Render Pass
- * Commands":
- *
- * End-of-subpass multisample resolves are treated as color
- * attachment writes for the purposes of synchronization.
- * This applies to resolve operations for both color and
- * depth/stencil attachments. That is, they are considered to
- * execute in the VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT
- * pipeline stage and their writes are synchronized with
- * VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT. Synchronization between
- * rendering within a subpass and any resolve operations at the end
- * of the subpass occurs automatically, without need for explicit
- * dependencies or pipeline barriers. However, if the resolve
- * attachment is also used in a different subpass, an explicit
- * dependency is needed.
- *
- * We use the CP_BLIT path for sysmem resolves, which is really a
- * transfer command, so we have to manually flush similar to the gmem
- * resolve case. However, a flush afterwards isn't needed because of the
- * last sentence and the fact that we're in sysmem mode.
- */
- tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
- if (subpass->resolve_depth_stencil)
- tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS);
+ const struct tu_subpass *subpass = cmd->state.subpass;
+ const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
+ const struct tu_attachment_state *attachments = cmd->state.attachments;
- tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
+ tu6_emit_blit_scissor(cmd, cs);
- /* Wait for the flushes to land before using the 2D engine */
- tu_cs_emit_wfi(cs);
-
- for (unsigned i = 0; i < subpass->resolve_count; i++) {
- uint32_t a = subpass->resolve_attachments[i].attachment;
- if (a == VK_ATTACHMENT_UNUSED)
- continue;
-
- uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i);
+ uint32_t gmem_index = 0;
+ for (uint32_t i = 0; i < subpass->color_count; ++i) {
+ const uint32_t a = subpass->color_attachments[i].attachment;
+ if (a == VK_ATTACHMENT_UNUSED)
+ continue;
- tu6_emit_sysmem_resolve(cmd, cs, subpass->multiview_mask, a, gmem_a);
+ const struct tu_image_view *iview = fb->attachments[a].attachment;
+ const struct tu_attachment_state *att = attachments + a;
+ if (att->pending_clear_aspects) {
+ assert(att->pending_clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT);
+ tu6_emit_blit_clear(cmd, cs, iview,
+ tiling->gmem_offsets[gmem_index++],
+ &att->clear_value);
+ } else {
+ tu6_emit_blit_info(cmd, cs, iview,
+ tiling->gmem_offsets[gmem_index++],
+ A6XX_RB_BLIT_INFO_UNK0 | A6XX_RB_BLIT_INFO_GMEM);
}
- }
-}
-static void
-tu6_emit_tile_load(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
-{
- tu6_emit_blit_scissor(cmd, cs, true);
+ tu6_emit_blit(cmd, cs);
+ }
- for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
- tu_load_gmem_attachment(cmd, cs, i, cmd->state.tiling->binning, false);
+ /* load/clear zs? */
}
static void
tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
- const struct tu_render_pass *pass = cmd->state.pass;
- const struct tu_subpass *subpass = &pass->subpasses[pass->subpass_count-1];
-
- tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
- tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_RESOLVE));
-
- tu6_emit_blit_scissor(cmd, cs, true);
+ const struct tu_framebuffer *fb = cmd->state.framebuffer;
+ const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
- for (uint32_t a = 0; a < pass->attachment_count; ++a) {
- if (pass->attachments[a].gmem)
- tu_store_gmem_attachment(cmd, cs, a, a, cmd->state.tiling->binning_possible);
+ if (false) {
+ /* hw binning? */
}
- if (subpass->resolve_attachments) {
- for (unsigned i = 0; i < subpass->resolve_count; i++) {
- uint32_t a = subpass->resolve_attachments[i].attachment;
- if (a != VK_ATTACHMENT_UNUSED) {
- uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i);
- tu_store_gmem_attachment(cmd, cs, a, gmem_a, false);
- }
- }
- }
-}
-
-void
-tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
-{
tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
@@ -881,80 +809,84 @@ tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
- cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
+ tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
+ tu_cs_emit(cs, 0x0);
+
+ tu6_emit_marker(cmd, cs);
+ tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
+ tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_RESOLVE) | 0x10);
+ tu6_emit_marker(cmd, cs);
+
+ tu6_emit_blit_scissor(cmd, cs);
+
+ uint32_t gmem_index = 0;
+ for (uint32_t i = 0; i < cmd->state.subpass->color_count; ++i) {
+ uint32_t a = cmd->state.subpass->color_attachments[i].attachment;
+ if (a == VK_ATTACHMENT_UNUSED)
+ continue;
+
+ const struct tu_image_view *iview = fb->attachments[a].attachment;
+ tu6_emit_blit_info(cmd, cs, iview, tiling->gmem_offsets[gmem_index++],
+ 0);
+ tu6_emit_blit(cmd, cs);
+ }
+}
+
+static void
+tu6_emit_restart_index(struct tu_cs *cs, uint32_t restart_index)
+{
+ tu_cs_emit_pkt4(cs, REG_A6XX_PC_RESTART_INDEX, 1);
+ tu_cs_emit(cs, restart_index);
}
static void
tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
- struct tu_device *dev = cmd->device;
- const struct tu_physical_device *phys_dev = dev->physical_device;
-
- tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
-
- tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
- .vs_state = true,
- .hs_state = true,
- .ds_state = true,
- .gs_state = true,
- .fs_state = true,
- .cs_state = true,
- .gfx_ibo = true,
- .cs_ibo = true,
- .gfx_shared_const = true,
- .cs_shared_const = true,
- .gfx_bindless = 0x1f,
- .cs_bindless = 0x1f));
-
- tu_cs_emit_wfi(cs);
-
- cmd->state.cache.pending_flush_bits &=
- ~(TU_CMD_FLAG_WAIT_FOR_IDLE | TU_CMD_FLAG_CACHE_INVALIDATE);
-
- tu_cs_emit_regs(cs,
- A6XX_RB_CCU_CNTL(.color_offset = phys_dev->ccu_offset_bypass));
- cmd->state.ccu_state = TU_CMD_CCU_SYSMEM;
- tu_cs_emit_write_reg(cs, REG_A6XX_RB_DBG_ECO_CNTL, 0x00100000);
- tu_cs_emit_write_reg(cs, REG_A6XX_SP_FLOAT_CNTL, 0);
- tu_cs_emit_write_reg(cs, REG_A6XX_SP_DBG_ECO_CNTL,
- phys_dev->info->a6xx.magic.SP_DBG_ECO_CNTL);
- tu_cs_emit_write_reg(cs, REG_A6XX_SP_PERFCTR_ENABLE, 0x3f);
- tu_cs_emit_write_reg(cs, REG_A6XX_TPL1_UNKNOWN_B605, 0x44);
- tu_cs_emit_write_reg(cs, REG_A6XX_TPL1_DBG_ECO_CNTL,
- phys_dev->info->a6xx.magic.TPL1_DBG_ECO_CNTL);
+ VkResult result = tu_cs_reserve_space(cmd->device, cs, 256);
+ if (result != VK_SUCCESS) {
+ cmd->record_result = result;
+ return;
+ }
+
+ tu6_emit_cache_flush(cmd, cs);
+
+ tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UPDATE_CNTL, 0xfffff);
+
+ tu_cs_emit_write_reg(cs, REG_A6XX_RB_CCU_CNTL, 0x7c400004);
+ tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E04, 0x00100000);
+ tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE04, 0x8);
+ tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE00, 0);
+ tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE0F, 0x3f);
+ tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B605, 0x44);
+ tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B600, 0x100000);
tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80);
tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE01, 0);
- tu_cs_emit_write_reg(cs, REG_A6XX_VPC_DBG_ECO_CNTL,
- phys_dev->info->a6xx.magic.VPC_DBG_ECO_CNTL);
- tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_DBG_ECO_CNTL,
- phys_dev->info->a6xx.magic.GRAS_DBG_ECO_CNTL);
- tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_DBG_ECO_CNTL,
- phys_dev->info->a6xx.magic.HLSQ_DBG_ECO_CNTL);
- tu_cs_emit_write_reg(cs, REG_A6XX_SP_CHICKEN_BITS,
- phys_dev->info->a6xx.magic.SP_CHICKEN_BITS);
+ tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9600, 0);
+ tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8600, 0x880);
+ tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE04, 0);
+ tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE03, 0x00000410);
tu_cs_emit_write_reg(cs, REG_A6XX_SP_IBO_COUNT, 0);
tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B182, 0);
- tu_cs_emit_regs(cs, A6XX_HLSQ_SHARED_CONSTS(.enable = false));
- tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_UNKNOWN_0E12,
- phys_dev->info->a6xx.magic.UCHE_UNKNOWN_0E12);
- tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_CLIENT_PF,
- phys_dev->info->a6xx.magic.UCHE_CLIENT_PF);
- tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E01,
- phys_dev->info->a6xx.magic.RB_UNKNOWN_8E01);
- tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A9A8, 0);
- tu_cs_emit_regs(cs, A6XX_SP_MODE_CONTROL(.constant_demotion_enable = true,
- .isammode = ISAMMODE_GL,
- .shared_consts_enable = false));
-
- /* TODO: set A6XX_VFD_ADD_OFFSET_INSTANCE and fix ir3 to avoid adding base instance */
- tu_cs_emit_write_reg(cs, REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX);
+ tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BB11, 0);
+ tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_UNKNOWN_0E12, 0x3200000);
+ tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_CLIENT_PF, 4);
+ tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E01, 0x0);
+ tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AB00, 0x5);
+ tu_cs_emit_write_reg(cs, REG_A6XX_VFD_UNKNOWN_A009, 0x00000001);
tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8811, 0x00000010);
- tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL,
- phys_dev->info->a6xx.magic.PC_MODE_CNTL);
+ tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x1f);
+
+ tu_cs_emit_write_reg(cs, REG_A6XX_RB_SRGB_CNTL, 0);
+ tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8101, 0);
+ tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SAMPLE_CNTL, 0);
tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8110, 0);
+ tu_cs_emit_write_reg(cs, REG_A6XX_RB_RENDER_CONTROL0, 0x401);
+ tu_cs_emit_write_reg(cs, REG_A6XX_RB_RENDER_CONTROL1, 0);
+ tu_cs_emit_write_reg(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 0);
+ tu_cs_emit_write_reg(cs, REG_A6XX_RB_SAMPLE_CNTL, 0);
tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8818, 0);
tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8819, 0);
tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881A, 0);
@@ -964,834 +896,705 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881E, 0);
tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_88F0, 0);
- tu_cs_emit_regs(cs, A6XX_VPC_POINT_COORD_INVERT(false));
+ tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9101, 0xffff00);
+ tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9107, 0);
+
+ tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9236, 1);
tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9300, 0);
- tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(true));
+ tu_cs_emit_write_reg(cs, REG_A6XX_VPC_SO_OVERRIDE,
+ A6XX_VPC_SO_OVERRIDE_SO_DISABLE);
+
+ tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9801, 0);
+ tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9806, 0);
+ tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9980, 0);
+
+ tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9B06, 0);
+ tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9B06, 0);
+
+ tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A81B, 0);
tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B183, 0);
- tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SU_CONSERVATIVE_RAS_CNTL, 0);
+ tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8099, 0);
+ tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_809B, 0);
+ tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A0, 2);
tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80AF, 0);
tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9210, 0);
tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9211, 0);
tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9602, 0);
+ tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9981, 0x3);
tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9E72, 0);
- tu_cs_emit_write_reg(cs, REG_A6XX_SP_TP_MODE_CNTL,
- 0x000000a0 |
- A6XX_SP_TP_MODE_CNTL_ISAMMODE(ISAMMODE_GL));
+ tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9108, 0x3);
+ tu_cs_emit_write_reg(cs, REG_A6XX_SP_TP_UNKNOWN_B304, 0);
+ tu_cs_emit_write_reg(cs, REG_A6XX_SP_TP_UNKNOWN_B309, 0x000000a2);
+ tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8804, 0);
+ tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A4, 0);
+ tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A5, 0);
+ tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80A6, 0);
+ tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8805, 0);
+ tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8806, 0);
+ tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8878, 0);
+ tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8879, 0);
tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_CONTROL_5_REG, 0xfc);
+ tu6_emit_marker(cmd, cs);
+
tu_cs_emit_write_reg(cs, REG_A6XX_VFD_MODE_CNTL, 0x00000000);
+ tu_cs_emit_write_reg(cs, REG_A6XX_VFD_UNKNOWN_A008, 0);
+
tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x0000001f);
- tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL()); /* always disable alpha test */
- tu_cs_emit_regs(cs, A6XX_RB_DITHER_CNTL()); /* always disable dithering */
+ /* we don't use this yet.. probably best to disable.. */
+ tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
+ tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
+ CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
+ CP_SET_DRAW_STATE__0_GROUP_ID(0));
+ tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
+ tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
- tu_disable_draw_states(cmd, cs);
+ tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUFFER_BASE_LO(0), 3);
+ tu_cs_emit(cs, 0x00000000); /* VPC_SO_BUFFER_BASE_LO_0 */
+ tu_cs_emit(cs, 0x00000000); /* VPC_SO_BUFFER_BASE_HI_0 */
+ tu_cs_emit(cs, 0x00000000); /* VPC_SO_BUFFER_SIZE_0 */
- tu_cs_emit_regs(cs,
- A6XX_SP_TP_BORDER_COLOR_BASE_ADDR(.bo = dev->global_bo,
- .bo_offset = gb_offset(bcolor_builtin)));
- tu_cs_emit_regs(cs,
- A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR(.bo = dev->global_bo,
- .bo_offset = gb_offset(bcolor_builtin)));
+ tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_FLUSH_BASE_LO(0), 2);
+ tu_cs_emit(cs, 0x00000000); /* VPC_SO_FLUSH_BASE_LO_0 */
+ tu_cs_emit(cs, 0x00000000); /* VPC_SO_FLUSH_BASE_HI_0 */
- tu_cs_sanity_check(cs);
-}
+ tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUF_CNTL, 1);
+ tu_cs_emit(cs, 0x00000000); /* VPC_SO_BUF_CNTL */
-static void
-update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
-{
- const struct tu_tiling_config *tiling = cmd->state.tiling;
+ tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUFFER_OFFSET(0), 1);
+ tu_cs_emit(cs, 0x00000000); /* UNKNOWN_E2AB */
- tu_cs_emit_regs(cs,
- A6XX_VSC_BIN_SIZE(.width = tiling->tile0.width,
- .height = tiling->tile0.height));
+ tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUFFER_BASE_LO(1), 3);
+ tu_cs_emit(cs, 0x00000000);
+ tu_cs_emit(cs, 0x00000000);
+ tu_cs_emit(cs, 0x00000000);
- tu_cs_emit_regs(cs,
- A6XX_VSC_BIN_COUNT(.nx = tiling->tile_count.width,
- .ny = tiling->tile_count.height));
+ tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUFFER_OFFSET(1), 6);
+ tu_cs_emit(cs, 0x00000000);
+ tu_cs_emit(cs, 0x00000000);
+ tu_cs_emit(cs, 0x00000000);
+ tu_cs_emit(cs, 0x00000000);
+ tu_cs_emit(cs, 0x00000000);
+ tu_cs_emit(cs, 0x00000000);
- tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_CONFIG_REG(0), 32);
- tu_cs_emit_array(cs, tiling->pipe_config, 32);
+ tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUFFER_OFFSET(2), 6);
+ tu_cs_emit(cs, 0x00000000);
+ tu_cs_emit(cs, 0x00000000);
+ tu_cs_emit(cs, 0x00000000);
+ tu_cs_emit(cs, 0x00000000);
+ tu_cs_emit(cs, 0x00000000);
+ tu_cs_emit(cs, 0x00000000);
- tu_cs_emit_regs(cs,
- A6XX_VSC_PRIM_STRM_PITCH(cmd->vsc_prim_strm_pitch),
- A6XX_VSC_PRIM_STRM_LIMIT(cmd->vsc_prim_strm_pitch - VSC_PAD));
+ tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUFFER_OFFSET(3), 3);
+ tu_cs_emit(cs, 0x00000000);
+ tu_cs_emit(cs, 0x00000000);
+ tu_cs_emit(cs, 0x00000000);
- tu_cs_emit_regs(cs,
- A6XX_VSC_DRAW_STRM_PITCH(cmd->vsc_draw_strm_pitch),
- A6XX_VSC_DRAW_STRM_LIMIT(cmd->vsc_draw_strm_pitch - VSC_PAD));
-}
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_CTRL_REG0, 1);
+ tu_cs_emit(cs, 0x00000000);
-static void
-emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
-{
- const struct tu_tiling_config *tiling = cmd->state.tiling;
- const uint32_t used_pipe_count =
- tiling->pipe_count.width * tiling->pipe_count.height;
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_CTRL_REG0, 1);
+ tu_cs_emit(cs, 0x00000000);
- for (int i = 0; i < used_pipe_count; i++) {
- tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
- tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
- CP_COND_WRITE5_0_WRITE_MEMORY);
- tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_DRAW_STRM_SIZE_REG(i)));
- tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
- tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_draw_strm_pitch - VSC_PAD));
- tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
- tu_cs_emit_qw(cs, global_iova(cmd, vsc_draw_overflow));
- tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(cmd->vsc_draw_strm_pitch));
-
- tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
- tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
- CP_COND_WRITE5_0_WRITE_MEMORY);
- tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_PRIM_STRM_SIZE_REG(i)));
- tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
- tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_prim_strm_pitch - VSC_PAD));
- tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
- tu_cs_emit_qw(cs, global_iova(cmd, vsc_prim_overflow));
- tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(cmd->vsc_prim_strm_pitch));
- }
+ tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_LRZ_CNTL, 1);
+ tu_cs_emit(cs, 0x00000000);
- tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_LRZ_CNTL, 1);
+ tu_cs_emit(cs, 0x00000000);
+
+ tu_cs_sanity_check(cs);
}
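
The removed emit_vsc_overflow_test() above has the CP compare each pipe's visibility-stream size against the stream pitch minus a pad and, on overflow, write the pitch to a global slot so the driver can grow the buffer. A CPU-side analogue of that check, with an assumed pad value:

/* CPU-side analogue of the removed emit_vsc_overflow_test() above; the pad
 * and sizes are illustrative, not the driver's. */
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#define VSC_PAD 0x40   /* assumed pad, for illustration only */

static bool
check_vsc_overflow(const uint32_t *stream_size, unsigned pipes,
                   uint32_t pitch, uint32_t *overflow_slot)
{
   for (unsigned i = 0; i < pipes; i++) {
      if (stream_size[i] >= pitch - VSC_PAD) {  /* the WRITE_GE condition */
         *overflow_slot = pitch;                /* tells the driver to grow */
         return true;
      }
   }
   return false;
}

int main(void)
{
   uint32_t sizes[4] = { 0x100, 0x200, 0x7fd0, 0x80 };
   uint32_t overflow = 0;
   if (check_vsc_overflow(sizes, 4, 0x8000, &overflow))
      printf("overflow, grow past pitch 0x%x\n", overflow);
   return 0;
}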
static void
-tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
+tu6_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
- struct tu_physical_device *phys_dev = cmd->device->physical_device;
- const struct tu_framebuffer *fb = cmd->state.framebuffer;
-
- tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1);
-
- tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
- tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BINNING));
-
- tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
- tu_cs_emit(cs, 0x1);
-
- tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
- tu_cs_emit(cs, 0x1);
-
- tu_cs_emit_wfi(cs);
-
- tu_cs_emit_regs(cs,
- A6XX_VFD_MODE_CNTL(.render_mode = BINNING_PASS));
-
- update_vsc_pipe(cmd, cs);
-
- tu_cs_emit_regs(cs,
- A6XX_PC_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL));
-
- tu_cs_emit_regs(cs,
- A6XX_VFD_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL));
-
- tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
- tu_cs_emit(cs, UNK_2C);
-
- tu_cs_emit_regs(cs,
- A6XX_RB_WINDOW_OFFSET(.x = 0, .y = 0));
-
- tu_cs_emit_regs(cs,
- A6XX_SP_TP_WINDOW_OFFSET(.x = 0, .y = 0));
-
- trace_start_binning_ib(&cmd->trace, cs);
-
- /* emit IB to binning drawcmds: */
- tu_cs_emit_call(cs, &cmd->draw_cs);
+ VkResult result = tu_cs_reserve_space(cmd->device, cs, 256);
+ if (result != VK_SUCCESS) {
+ cmd->record_result = result;
+ return;
+ }
- trace_end_binning_ib(&cmd->trace, cs);
+ tu6_emit_lrz_flush(cmd, cs);
- /* switching from binning pass to GMEM pass will cause a switch from
- * PROGRAM_BINNING to PROGRAM, which invalidates const state (XS_CONST states)
- * so make sure these states are re-emitted
- * (eventually these states shouldn't exist at all with shader prologue)
- * only VS and GS are invalidated, as FS isn't emitted in binning pass,
- * and we don't use HW binning when tesselation is used
- */
- tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
- tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
- CP_SET_DRAW_STATE__0_DISABLE |
- CP_SET_DRAW_STATE__0_GROUP_ID(TU_DRAW_STATE_CONST));
- tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
- tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
+ /* lrz clear? */
- tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
- tu_cs_emit(cs, UNK_2D);
+ tu6_emit_cache_flush(cmd, cs);
- /* This flush is probably required because the VSC, which produces the
- * visibility stream, is a client of UCHE, whereas the CP needs to read the
- * visibility stream (without caching) to do draw skipping. The
- * WFI+WAIT_FOR_ME combination guarantees that the binning commands
- * submitted are finished before reading the VSC regs (in
- * emit_vsc_overflow_test) or the VSC_DATA buffer directly (implicitly as
- * part of draws).
- */
- tu6_emit_event_write(cmd, cs, CACHE_FLUSH_TS);
+ tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
+ tu_cs_emit(cs, 0x0);
- tu_cs_emit_wfi(cs);
+ /* 0x10000000 for BYPASS.. 0x7c13c080 for GMEM: */
+ tu6_emit_wfi(cmd, cs);
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_CCU_CNTL, 1);
+ tu_cs_emit(cs, 0x7c400004); /* RB_CCU_CNTL */
- tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
+ tu6_emit_zs(cmd, cs);
+ tu6_emit_mrt(cmd, cs);
+ tu6_emit_msaa(cmd, cs);
- emit_vsc_overflow_test(cmd, cs);
+ if (false) {
+ /* hw binning? */
+ } else {
+ tu6_emit_bin_size(cmd, cs, 0x6000000);
+ /* no draws */
+ }
- tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
- tu_cs_emit(cs, 0x0);
+ tu6_emit_render_cntl(cmd, cs, false);
- tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
- tu_cs_emit(cs, 0x0);
+ tu_cs_sanity_check(cs);
}
-static struct tu_draw_state
-tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
- const struct tu_subpass *subpass,
- bool gmem)
+static void
+tu6_render_tile(struct tu_cmd_buffer *cmd,
+ struct tu_cs *cs,
+ const struct tu_tile *tile)
{
- const struct tu_tiling_config *tiling = cmd->state.tiling;
-
- /* note: we can probably emit input attachments just once for the whole
- * renderpass, this would avoid emitting both sysmem/gmem versions
- *
- * emit two texture descriptors for each input, as a workaround for
- * d24s8/d32s8, which can be sampled as both float (depth) and integer (stencil)
- * tu_shader lowers uint input attachment loads to use the 2nd descriptor
- * in the pair
- * TODO: a smarter workaround
- */
-
- if (!subpass->input_count)
- return (struct tu_draw_state) {};
-
- struct tu_cs_memory texture;
- VkResult result = tu_cs_alloc(&cmd->sub_cs, subpass->input_count * 2,
- A6XX_TEX_CONST_DWORDS, &texture);
+ const uint32_t render_tile_space = 64 + tu_cs_get_call_size(&cmd->draw_cs);
+ VkResult result = tu_cs_reserve_space(cmd->device, cs, render_tile_space);
if (result != VK_SUCCESS) {
- vk_command_buffer_set_error(&cmd->vk, result);
- return (struct tu_draw_state) {};
+ cmd->record_result = result;
+ return;
}
- for (unsigned i = 0; i < subpass->input_count * 2; i++) {
- uint32_t a = subpass->input_attachments[i / 2].attachment;
- if (a == VK_ATTACHMENT_UNUSED)
- continue;
-
- const struct tu_image_view *iview = cmd->state.attachments[a];
- const struct tu_render_pass_attachment *att =
- &cmd->state.pass->attachments[a];
- uint32_t *dst = &texture.map[A6XX_TEX_CONST_DWORDS * i];
- uint32_t gmem_offset = tu_attachment_gmem_offset(cmd, att);
- uint32_t cpp = att->cpp;
-
- memcpy(dst, iview->view.descriptor, A6XX_TEX_CONST_DWORDS * 4);
-
- /* Cube descriptors require a different sampling instruction in shader,
- * however we don't know whether image is a cube or not until the start
- * of a renderpass. We have to patch the descriptor to make it compatible
- * with how it is sampled in shader.
- */
- enum a6xx_tex_type tex_type = (dst[2] & A6XX_TEX_CONST_2_TYPE__MASK) >>
- A6XX_TEX_CONST_2_TYPE__SHIFT;
- if (tex_type == A6XX_TEX_CUBE) {
- dst[2] &= ~A6XX_TEX_CONST_2_TYPE__MASK;
- dst[2] |= A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
-
- uint32_t depth = (dst[5] & A6XX_TEX_CONST_5_DEPTH__MASK) >>
- A6XX_TEX_CONST_5_DEPTH__SHIFT;
- dst[5] &= ~A6XX_TEX_CONST_5_DEPTH__MASK;
- dst[5] |= A6XX_TEX_CONST_5_DEPTH(depth * 6);
- }
+ tu6_emit_tile_select(cmd, cs, tile);
+ tu_cs_emit_ib(cs, &cmd->state.tile_load_ib);
- if (i % 2 == 1 && att->format == VK_FORMAT_D24_UNORM_S8_UINT) {
- /* note this works because spec says fb and input attachments
- * must use identity swizzle
- *
- * Also we clear swap to WZYX. This is because the view might have
- * picked XYZW to work better with border colors.
- */
- dst[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
- A6XX_TEX_CONST_0_SWAP__MASK |
- A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
- A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK);
- if (!cmd->device->physical_device->info->a6xx.has_z24uint_s8uint) {
- dst[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_8_8_8_UINT) |
- A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_W) |
- A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_ZERO) |
- A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_ZERO) |
- A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_ONE);
- } else {
- dst[0] |= A6XX_TEX_CONST_0_FMT(FMT6_Z24_UINT_S8_UINT) |
- A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_Y) |
- A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_ZERO) |
- A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_ZERO) |
- A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_ONE);
- }
- }
+ tu_cs_emit_call(cs, &cmd->draw_cs);
+ cmd->wait_for_idle = true;
- if (i % 2 == 1 && att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
- dst[0] &= ~A6XX_TEX_CONST_0_FMT__MASK;
- dst[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_UINT);
- dst[2] &= ~(A6XX_TEX_CONST_2_PITCHALIGN__MASK | A6XX_TEX_CONST_2_PITCH__MASK);
- dst[2] |= A6XX_TEX_CONST_2_PITCH(iview->stencil_PITCH << 6);
- dst[3] = 0;
- dst[4] = iview->stencil_base_addr;
- dst[5] = (dst[5] & 0xffff) | iview->stencil_base_addr >> 32;
-
- cpp = att->samples;
- gmem_offset = att->gmem_offset_stencil[cmd->state.gmem_layout];
- }
+ tu_cs_emit_ib(cs, &cmd->state.tile_store_ib);
- if (!gmem || !subpass->input_attachments[i / 2].patch_input_gmem)
- continue;
+ tu_cs_sanity_check(cs);
+}
- /* patched for gmem */
- dst[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
- dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
- dst[2] =
- A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
- A6XX_TEX_CONST_2_PITCH(tiling->tile0.width * cpp);
- dst[3] = 0;
- dst[4] = cmd->device->physical_device->gmem_base + gmem_offset;
- dst[5] = A6XX_TEX_CONST_5_DEPTH(1);
- for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
- dst[i] = 0;
+static void
+tu6_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
+{
+ VkResult result = tu_cs_reserve_space(cmd->device, cs, 16);
+ if (result != VK_SUCCESS) {
+ cmd->record_result = result;
+ return;
}
- struct tu_cs cs;
- struct tu_draw_state ds = tu_cs_draw_state(&cmd->sub_cs, &cs, 9);
+ tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_LRZ_CNTL, 1);
+ tu_cs_emit(cs, A6XX_GRAS_LRZ_CNTL_ENABLE | A6XX_GRAS_LRZ_CNTL_UNK3);
- tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_FRAG, 3);
- tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
- CP_LOAD_STATE6_0_NUM_UNIT(subpass->input_count * 2));
- tu_cs_emit_qw(&cs, texture.iova);
+ tu6_emit_lrz_flush(cmd, cs);
- tu_cs_emit_regs(&cs, A6XX_SP_FS_TEX_CONST(.qword = texture.iova));
+ tu6_emit_event_write(cmd, cs, CACHE_FLUSH_TS, true);
- tu_cs_emit_regs(&cs, A6XX_SP_FS_TEX_COUNT(subpass->input_count * 2));
-
- assert(cs.cur == cs.end); /* validate draw state size */
-
- return ds;
+ tu_cs_sanity_check(cs);
}
static void
-tu_set_input_attachments(struct tu_cmd_buffer *cmd, const struct tu_subpass *subpass)
+tu_cmd_render_tiles(struct tu_cmd_buffer *cmd)
{
- struct tu_cs *cs = &cmd->draw_cs;
+ const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
- tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 6);
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
- tu_emit_input_attachments(cmd, subpass, true));
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
- tu_emit_input_attachments(cmd, subpass, false));
-}
+ tu6_render_begin(cmd, &cmd->cs);
+ for (uint32_t y = 0; y < tiling->tile_count.height; y++) {
+ for (uint32_t x = 0; x < tiling->tile_count.width; x++) {
+ struct tu_tile tile;
+ tu_tiling_config_get_tile(tiling, cmd->device, x, y, &tile);
+ tu6_render_tile(cmd, &cmd->cs, &tile);
+ }
+ }
+
+ tu6_render_end(cmd, &cmd->cs);
+}
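
The added tu_cmd_render_tiles() path above splits a render pass into begin / per-tile (select, load, draw, store) / end stages that are spread across several functions in this hunk. A standalone outline of that ordering, with placeholder callbacks rather than driver functions:

/* Standalone outline of the command-stream order built by the added
 * tu_cmd_render_tiles() path above; the callbacks are placeholders. */
#include <stdint.h>

struct tile { uint32_t x, y; };

typedef void (*stage_fn)(void);
typedef void (*tile_fn)(struct tile);

void render_tiles(uint32_t tiles_x, uint32_t tiles_y,
                  stage_fn begin, tile_fn select, tile_fn load,
                  tile_fn draw, tile_fn store, stage_fn end)
{
   begin();                /* tu6_render_begin: flushes, ZS/MRT/MSAA setup */
   for (uint32_t y = 0; y < tiles_y; y++) {
      for (uint32_t x = 0; x < tiles_x; x++) {
         struct tile t = { x, y };
         select(t);        /* per-tile window scissor and offset */
         load(t);          /* tile_load_ib: clears or sysmem -> GMEM blits */
         draw(t);          /* replay the recorded draw_cs */
         store(t);         /* tile_store_ib: resolve GMEM -> sysmem */
      }
   }
   end();                  /* tu6_render_end: LRZ flush, CACHE_FLUSH_TS */
}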
static void
-tu_emit_renderpass_begin(struct tu_cmd_buffer *cmd,
- const VkClearValue *clear_values)
+tu_cmd_prepare_tile_load_ib(struct tu_cmd_buffer *cmd)
{
- struct tu_cs *cs = &cmd->draw_cs;
-
- tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
-
- tu6_emit_tile_load(cmd, cs);
+ const uint32_t tile_load_space = 16 + 32 * MAX_RTS;
+ const struct tu_subpass *subpass = cmd->state.subpass;
+ struct tu_attachment_state *attachments = cmd->state.attachments;
+ struct tu_cs sub_cs;
- tu6_emit_blit_scissor(cmd, cs, false);
-
- for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
- tu_clear_gmem_attachment(cmd, cs, i, &clear_values[i]);
-
- tu_cond_exec_end(cs);
+ VkResult result = tu_cs_begin_sub_stream(cmd->device, &cmd->tile_cs,
+ tile_load_space, &sub_cs);
+ if (result != VK_SUCCESS) {
+ cmd->record_result = result;
+ return;
+ }
- tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
+ /* emit to tile-load sub_cs */
+ tu6_emit_tile_load(cmd, &sub_cs);
- for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
- tu_clear_sysmem_attachment(cmd, cs, i, &clear_values[i]);
+ cmd->state.tile_load_ib = tu_cs_end_sub_stream(&cmd->tile_cs, &sub_cs);
- tu_cond_exec_end(cs);
+ for (uint32_t i = 0; i < subpass->color_count; ++i) {
+ const uint32_t a = subpass->color_attachments[i].attachment;
+ if (a != VK_ATTACHMENT_UNUSED)
+ attachments[a].pending_clear_aspects = 0;
+ }
}
static void
-tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
- struct tu_renderpass_result *autotune_result)
+tu_cmd_prepare_tile_store_ib(struct tu_cmd_buffer *cmd)
{
- const struct tu_framebuffer *fb = cmd->state.framebuffer;
-
- tu_lrz_sysmem_begin(cmd, cs);
-
- assert(fb->width > 0 && fb->height > 0);
- tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1);
- tu6_emit_window_offset(cs, 0, 0);
-
- tu6_emit_bin_size(cs, 0, 0,
- A6XX_RB_BIN_CONTROL_BUFFERS_LOCATION(BUFFERS_IN_SYSMEM) |
- A6XX_RB_BIN_CONTROL_FORCE_LRZ_WRITE_DIS);
-
- tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
- tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BYPASS));
-
- tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
- tu_cs_emit(cs, 0x0);
-
- tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
-
- tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
- tu_cs_emit(cs, 0x1);
+ const uint32_t tile_store_space = 32 + 32 * MAX_RTS;
+ struct tu_cs sub_cs;
- tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
- tu_cs_emit(cs, 0x0);
+ VkResult result = tu_cs_begin_sub_stream(cmd->device, &cmd->tile_cs,
+ tile_store_space, &sub_cs);
+ if (result != VK_SUCCESS) {
+ cmd->record_result = result;
+ return;
+ }
- tu_autotune_begin_renderpass(cmd, cs, autotune_result);
+ /* emit to tile-store sub_cs */
+ tu6_emit_tile_store(cmd, &sub_cs);
- tu_cs_sanity_check(cs);
+ cmd->state.tile_store_ib = tu_cs_end_sub_stream(&cmd->tile_cs, &sub_cs);
}
static void
-tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
- struct tu_renderpass_result *autotune_result)
+tu_cmd_update_tiling_config(struct tu_cmd_buffer *cmd,
+ const VkRect2D *render_area)
{
- tu_autotune_end_renderpass(cmd, cs, autotune_result);
-
- /* Do any resolves of the last subpass. These are handled in the
- * tile_store_cs in the gmem path.
- */
- tu6_emit_sysmem_resolves(cmd, cs, cmd->state.subpass);
+ const struct tu_device *dev = cmd->device;
+ const struct tu_render_pass *pass = cmd->state.pass;
+ const struct tu_subpass *subpass = cmd->state.subpass;
+ struct tu_tiling_config *tiling = &cmd->state.tiling_config;
- tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
+ uint32_t buffer_cpp[MAX_RTS + 2];
+ uint32_t buffer_count = 0;
- tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
- tu_cs_emit(cs, 0x0);
+ for (uint32_t i = 0; i < subpass->color_count; ++i) {
+ const uint32_t a = subpass->color_attachments[i].attachment;
+ if (a == VK_ATTACHMENT_UNUSED)
+ continue;
- tu_lrz_sysmem_end(cmd, cs);
+ const struct tu_render_pass_attachment *att = &pass->attachments[a];
+ buffer_cpp[buffer_count++] =
+ vk_format_get_blocksize(att->format) * att->samples;
+ }
- tu_cs_sanity_check(cs);
-}
+ if (subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) {
+ const uint32_t a = subpass->depth_stencil_attachment.attachment;
+ const struct tu_render_pass_attachment *att = &pass->attachments[a];
+
+ /* TODO */
+ assert(att->format != VK_FORMAT_D32_SFLOAT_S8_UINT);
+
+ buffer_cpp[buffer_count++] =
+ vk_format_get_blocksize(att->format) * att->samples;
+ }
+
+ tu_tiling_config_update(tiling, dev, buffer_cpp, buffer_count,
+ render_area);
+}
+
+const struct tu_dynamic_state default_dynamic_state = {
+ .viewport =
+ {
+ .count = 0,
+ },
+ .scissor =
+ {
+ .count = 0,
+ },
+ .line_width = 1.0f,
+ .depth_bias =
+ {
+ .bias = 0.0f,
+ .clamp = 0.0f,
+ .slope = 0.0f,
+ },
+ .blend_constants = { 0.0f, 0.0f, 0.0f, 0.0f },
+ .depth_bounds =
+ {
+ .min = 0.0f,
+ .max = 1.0f,
+ },
+ .stencil_compare_mask =
+ {
+ .front = ~0u,
+ .back = ~0u,
+ },
+ .stencil_write_mask =
+ {
+ .front = ~0u,
+ .back = ~0u,
+ },
+ .stencil_reference =
+ {
+ .front = 0u,
+ .back = 0u,
+ },
+};
-static void
-tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
- struct tu_renderpass_result *autotune_result)
+static void UNUSED /* FINISHME */
+tu_bind_dynamic_state(struct tu_cmd_buffer *cmd_buffer,
+ const struct tu_dynamic_state *src)
{
- struct tu_physical_device *phys_dev = cmd->device->physical_device;
- const struct tu_tiling_config *tiling = cmd->state.tiling;
- tu_lrz_tiling_begin(cmd, cs);
+ struct tu_dynamic_state *dest = &cmd_buffer->state.dynamic;
+ uint32_t copy_mask = src->mask;
+ uint32_t dest_mask = 0;
- tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
- tu_cs_emit(cs, 0x0);
-
- tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_GMEM);
+ tu_use_args(cmd_buffer); /* FINISHME */
- if (use_hw_binning(cmd)) {
- if (!cmd->vsc_initialized) {
- tu6_lazy_emit_vsc(cmd, cs);
+ /* Make sure to copy the number of viewports/scissors because they can
+ * only be specified at pipeline creation time.
+ */
+ dest->viewport.count = src->viewport.count;
+ dest->scissor.count = src->scissor.count;
+ dest->discard_rectangle.count = src->discard_rectangle.count;
+
+ if (copy_mask & TU_DYNAMIC_VIEWPORT) {
+ if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
+ src->viewport.count * sizeof(VkViewport))) {
+ typed_memcpy(dest->viewport.viewports, src->viewport.viewports,
+ src->viewport.count);
+ dest_mask |= TU_DYNAMIC_VIEWPORT;
}
+ }
- tu6_emit_bin_size(cs, tiling->tile0.width, tiling->tile0.height,
- A6XX_RB_BIN_CONTROL_RENDER_MODE(BINNING_PASS) |
- A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6));
-
- tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, true);
+ if (copy_mask & TU_DYNAMIC_SCISSOR) {
+ if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
+ src->scissor.count * sizeof(VkRect2D))) {
+ typed_memcpy(dest->scissor.scissors, src->scissor.scissors,
+ src->scissor.count);
+ dest_mask |= TU_DYNAMIC_SCISSOR;
+ }
+ }
- tu6_emit_binning_pass(cmd, cs);
+ if (copy_mask & TU_DYNAMIC_LINE_WIDTH) {
+ if (dest->line_width != src->line_width) {
+ dest->line_width = src->line_width;
+ dest_mask |= TU_DYNAMIC_LINE_WIDTH;
+ }
+ }
- tu6_emit_bin_size(cs, tiling->tile0.width, tiling->tile0.height,
- A6XX_RB_BIN_CONTROL_FORCE_LRZ_WRITE_DIS |
- A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6));
+ if (copy_mask & TU_DYNAMIC_DEPTH_BIAS) {
+ if (memcmp(&dest->depth_bias, &src->depth_bias,
+ sizeof(src->depth_bias))) {
+ dest->depth_bias = src->depth_bias;
+ dest_mask |= TU_DYNAMIC_DEPTH_BIAS;
+ }
+ }
- tu_cs_emit_regs(cs,
- A6XX_VFD_MODE_CNTL(0));
+ if (copy_mask & TU_DYNAMIC_BLEND_CONSTANTS) {
+ if (memcmp(&dest->blend_constants, &src->blend_constants,
+ sizeof(src->blend_constants))) {
+ typed_memcpy(dest->blend_constants, src->blend_constants, 4);
+ dest_mask |= TU_DYNAMIC_BLEND_CONSTANTS;
+ }
+ }
- tu_cs_emit_regs(cs,
- A6XX_PC_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL));
+ if (copy_mask & TU_DYNAMIC_DEPTH_BOUNDS) {
+ if (memcmp(&dest->depth_bounds, &src->depth_bounds,
+ sizeof(src->depth_bounds))) {
+ dest->depth_bounds = src->depth_bounds;
+ dest_mask |= TU_DYNAMIC_DEPTH_BOUNDS;
+ }
+ }
- tu_cs_emit_regs(cs,
- A6XX_VFD_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL));
+ if (copy_mask & TU_DYNAMIC_STENCIL_COMPARE_MASK) {
+ if (memcmp(&dest->stencil_compare_mask, &src->stencil_compare_mask,
+ sizeof(src->stencil_compare_mask))) {
+ dest->stencil_compare_mask = src->stencil_compare_mask;
+ dest_mask |= TU_DYNAMIC_STENCIL_COMPARE_MASK;
+ }
+ }
- tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
- tu_cs_emit(cs, 0x1);
- tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_LOCAL, 1);
- tu_cs_emit(cs, 0x1);
- } else {
- tu6_emit_bin_size(cs, tiling->tile0.width, tiling->tile0.height,
- A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6));
-
- if (tiling->binning_possible) {
- /* Mark all tiles as visible for tu6_emit_cond_for_load_stores(), since
- * the actual binner didn't run.
- */
- int pipe_count = tiling->pipe_count.width * tiling->pipe_count.height;
- tu_cs_emit_pkt4(cs, REG_A6XX_VSC_STATE_REG(0), pipe_count);
- for (int i = 0; i < pipe_count; i++)
- tu_cs_emit(cs, ~0);
+ if (copy_mask & TU_DYNAMIC_STENCIL_WRITE_MASK) {
+ if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
+ sizeof(src->stencil_write_mask))) {
+ dest->stencil_write_mask = src->stencil_write_mask;
+ dest_mask |= TU_DYNAMIC_STENCIL_WRITE_MASK;
}
}
- tu_autotune_begin_renderpass(cmd, cs, autotune_result);
+ if (copy_mask & TU_DYNAMIC_STENCIL_REFERENCE) {
+ if (memcmp(&dest->stencil_reference, &src->stencil_reference,
+ sizeof(src->stencil_reference))) {
+ dest->stencil_reference = src->stencil_reference;
+ dest_mask |= TU_DYNAMIC_STENCIL_REFERENCE;
+ }
+ }
- tu_cs_sanity_check(cs);
+ if (copy_mask & TU_DYNAMIC_DISCARD_RECTANGLE) {
+ if (memcmp(&dest->discard_rectangle.rectangles,
+ &src->discard_rectangle.rectangles,
+ src->discard_rectangle.count * sizeof(VkRect2D))) {
+ typed_memcpy(dest->discard_rectangle.rectangles,
+ src->discard_rectangle.rectangles,
+ src->discard_rectangle.count);
+ dest_mask |= TU_DYNAMIC_DISCARD_RECTANGLE;
+ }
+ }
}
-static void
-tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
- uint32_t tx, uint32_t ty, uint32_t pipe, uint32_t slot)
+static VkResult
+tu_create_cmd_buffer(struct tu_device *device,
+ struct tu_cmd_pool *pool,
+ VkCommandBufferLevel level,
+ VkCommandBuffer *pCommandBuffer)
{
- tu6_emit_tile_select(cmd, &cmd->cs, tx, ty, pipe, slot);
-
- trace_start_draw_ib_gmem(&cmd->trace, &cmd->cs);
-
- /* Primitives that passed all tests are still counted in each
- * tile even with HW binning beforehand. Do not permit it.
- */
- if (cmd->state.prim_generated_query_running_before_rp)
- tu6_emit_event_write(cmd, cs, STOP_PRIMITIVE_CTRS);
+ struct tu_cmd_buffer *cmd_buffer;
+ cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+ if (cmd_buffer == NULL)
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
- tu_cs_emit_call(cs, &cmd->draw_cs);
+ cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
+ cmd_buffer->device = device;
+ cmd_buffer->pool = pool;
+ cmd_buffer->level = level;
- if (cmd->state.prim_generated_query_running_before_rp)
- tu6_emit_event_write(cmd, cs, START_PRIMITIVE_CTRS);
+ if (pool) {
+ list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
+ cmd_buffer->queue_family_index = pool->queue_family_index;
- if (use_hw_binning(cmd)) {
- tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
- tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_ENDVIS));
+ } else {
+ /* Init the pool_link so we can safely call list_del when we destroy
+ * the command buffer
+ */
+ list_inithead(&cmd_buffer->pool_link);
+ cmd_buffer->queue_family_index = TU_QUEUE_GENERAL;
}
- /* Predicate is changed in draw_cs so we have to re-emit it */
- if (cmd->state.rp.draw_cs_writes_to_cond_pred)
- tu6_emit_cond_for_load_stores(cmd, cs, pipe, slot, false);
+ tu_bo_list_init(&cmd_buffer->bo_list);
+ tu_cs_init(&cmd_buffer->cs, TU_CS_MODE_GROW, 4096);
+ tu_cs_init(&cmd_buffer->draw_cs, TU_CS_MODE_GROW, 4096);
+ tu_cs_init(&cmd_buffer->tile_cs, TU_CS_MODE_SUB_STREAM, 1024);
- tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
- tu_cs_emit(cs, 0x0);
+ *pCommandBuffer = tu_cmd_buffer_to_handle(cmd_buffer);
- tu_cs_emit_call(cs, &cmd->tile_store_cs);
+ list_inithead(&cmd_buffer->upload.list);
- tu_clone_trace_range(cmd, cs, cmd->trace_renderpass_start,
- cmd->trace_renderpass_end);
+ cmd_buffer->marker_reg = REG_A6XX_CP_SCRATCH_REG(
+ cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY ? 7 : 6);
- tu_cs_sanity_check(cs);
+ VkResult result = tu_bo_init_new(device, &cmd_buffer->scratch_bo, 0x1000);
+ if (result != VK_SUCCESS)
+ return result;
- trace_end_draw_ib_gmem(&cmd->trace, &cmd->cs);
+ return VK_SUCCESS;
}
static void
-tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
- struct tu_renderpass_result *autotune_result)
+tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)
{
- tu_autotune_end_renderpass(cmd, cs, autotune_result);
+ tu_bo_finish(cmd_buffer->device, &cmd_buffer->scratch_bo);
- tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
+ list_del(&cmd_buffer->pool_link);
- tu_lrz_tiling_end(cmd, cs);
+ for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++)
+ free(cmd_buffer->descriptors[i].push_set.set.mapped_ptr);
- tu6_emit_event_write(cmd, cs, PC_CCU_RESOLVE_TS);
+ tu_cs_finish(cmd_buffer->device, &cmd_buffer->cs);
+ tu_cs_finish(cmd_buffer->device, &cmd_buffer->draw_cs);
+ tu_cs_finish(cmd_buffer->device, &cmd_buffer->tile_cs);
- tu_cs_sanity_check(cs);
+ tu_bo_list_destroy(&cmd_buffer->bo_list);
+ vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
}
-static void
-tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
- struct tu_renderpass_result *autotune_result)
+static VkResult
+tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer)
{
- const struct tu_framebuffer *fb = cmd->state.framebuffer;
- const struct tu_tiling_config *tiling = cmd->state.tiling;
+ cmd_buffer->wait_for_idle = true;
- /* Create gmem stores now (at EndRenderPass time) because they needed to
- * know whether to allow their conditional execution, which was tied to a
- * state that was known only at the end of the renderpass. They will be
- * called from tu6_render_tile().
- */
- tu_cs_begin(&cmd->tile_store_cs);
- tu6_emit_tile_store(cmd, &cmd->tile_store_cs);
- tu_cs_end(&cmd->tile_store_cs);
-
- cmd->trace_renderpass_end = u_trace_end_iterator(&cmd->trace);
+ cmd_buffer->record_result = VK_SUCCESS;
- tu6_tile_render_begin(cmd, &cmd->cs, autotune_result);
+ tu_bo_list_reset(&cmd_buffer->bo_list);
+ tu_cs_reset(cmd_buffer->device, &cmd_buffer->cs);
+ tu_cs_reset(cmd_buffer->device, &cmd_buffer->draw_cs);
+ tu_cs_reset(cmd_buffer->device, &cmd_buffer->tile_cs);
- /* Note: we reverse the order of walking the pipes and tiles on every
- * other row, to improve texture cache locality compared to raster order.
- */
- for (uint32_t py = 0; py < tiling->pipe_count.height; py++) {
- uint32_t pipe_row = py * tiling->pipe_count.width;
- for (uint32_t pipe_row_i = 0; pipe_row_i < tiling->pipe_count.width; pipe_row_i++) {
- uint32_t px;
- if (py & 1)
- px = tiling->pipe_count.width - 1 - pipe_row_i;
- else
- px = pipe_row_i;
- uint32_t pipe = pipe_row + px;
- uint32_t tx1 = px * tiling->pipe0.width;
- uint32_t ty1 = py * tiling->pipe0.height;
- uint32_t tx2 = MIN2(tx1 + tiling->pipe0.width, tiling->tile_count.width);
- uint32_t ty2 = MIN2(ty1 + tiling->pipe0.height, tiling->tile_count.height);
- uint32_t tile_row_stride = tx2 - tx1;
- uint32_t slot_row = 0;
- for (uint32_t ty = ty1; ty < ty2; ty++) {
- for (uint32_t tile_row_i = 0; tile_row_i < tile_row_stride; tile_row_i++) {
- uint32_t tx;
- if (ty & 1)
- tx = tile_row_stride - 1 - tile_row_i;
- else
- tx = tile_row_i;
- uint32_t slot = slot_row + tx;
- tu6_render_tile(cmd, &cmd->cs, tx1 + tx, ty, pipe, slot);
- }
- slot_row += tile_row_stride;
- }
- }
+ for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++) {
+ cmd_buffer->descriptors[i].dirty = 0;
+ cmd_buffer->descriptors[i].valid = 0;
+ cmd_buffer->descriptors[i].push_dirty = false;
}
- tu6_tile_render_end(cmd, &cmd->cs, autotune_result);
-
- trace_end_render_pass(&cmd->trace, &cmd->cs, fb, tiling);
-
- /* tu6_render_tile has cloned these tracepoints for each tile */
- if (!u_trace_iterator_equal(cmd->trace_renderpass_start, cmd->trace_renderpass_end))
- u_trace_disable_event_range(cmd->trace_renderpass_start,
- cmd->trace_renderpass_end);
+ cmd_buffer->status = TU_CMD_BUFFER_STATUS_INITIAL;
- /* Reset the gmem store CS entry lists so that the next render pass
- * does its own stores.
- */
- tu_cs_discard_entries(&cmd->tile_store_cs);
+ return cmd_buffer->record_result;
}
-static void
-tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,
- struct tu_renderpass_result *autotune_result)
+static VkResult
+tu_cmd_state_setup_attachments(struct tu_cmd_buffer *cmd_buffer,
+ const VkRenderPassBeginInfo *info)
{
- cmd->trace_renderpass_end = u_trace_end_iterator(&cmd->trace);
-
- tu6_sysmem_render_begin(cmd, &cmd->cs, autotune_result);
+ struct tu_cmd_state *state = &cmd_buffer->state;
+ const struct tu_framebuffer *fb = state->framebuffer;
+ const struct tu_render_pass *pass = state->pass;
- trace_start_draw_ib_sysmem(&cmd->trace, &cmd->cs);
-
- tu_cs_emit_call(&cmd->cs, &cmd->draw_cs);
+ for (uint32_t i = 0; i < fb->attachment_count; ++i) {
+ const struct tu_image_view *iview = fb->attachments[i].attachment;
+ tu_bo_list_add(&cmd_buffer->bo_list, iview->image->bo,
+ MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
+ }
- trace_end_draw_ib_sysmem(&cmd->trace, &cmd->cs);
+ if (pass->attachment_count == 0) {
+ state->attachments = NULL;
+ return VK_SUCCESS;
+ }
- tu6_sysmem_render_end(cmd, &cmd->cs, autotune_result);
+ state->attachments =
+ vk_alloc(&cmd_buffer->pool->alloc,
+ pass->attachment_count * sizeof(state->attachments[0]), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+ if (state->attachments == NULL) {
+ cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
+ return cmd_buffer->record_result;
+ }
- trace_end_render_pass(&cmd->trace, &cmd->cs, cmd->state.framebuffer, cmd->state.tiling);
-}
+ for (uint32_t i = 0; i < pass->attachment_count; ++i) {
+ const struct tu_render_pass_attachment *att = &pass->attachments[i];
+ VkImageAspectFlags att_aspects = vk_format_aspects(att->format);
+ VkImageAspectFlags clear_aspects = 0;
-void
-tu_cmd_render(struct tu_cmd_buffer *cmd_buffer)
-{
- if (cmd_buffer->state.rp.has_tess)
- tu6_lazy_emit_tessfactor_addr(cmd_buffer);
+ if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
+ /* color attachment */
+ if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
+ clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
+ }
+ } else {
+ /* depthstencil attachment */
+ if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
+ att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
+ clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
+ if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
+ att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_DONT_CARE)
+ clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
+ }
+ if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
+ att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
+ clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
+ }
+ }
- struct tu_renderpass_result *autotune_result = NULL;
- if (use_sysmem_rendering(cmd_buffer, &autotune_result))
- tu_cmd_render_sysmem(cmd_buffer, autotune_result);
- else
- tu_cmd_render_tiles(cmd_buffer, autotune_result);
+ state->attachments[i].pending_clear_aspects = clear_aspects;
+ state->attachments[i].cleared_views = 0;
+ if (clear_aspects && info) {
+ assert(info->clearValueCount > i);
+ state->attachments[i].clear_value = info->pClearValues[i];
+ }
- /* Outside of renderpasses we assume all draw states are disabled. We do
- * this outside the draw CS for the normal case where 3d gmem stores aren't
- * used.
- */
- tu_disable_draw_states(cmd_buffer, &cmd_buffer->cs);
+ state->attachments[i].current_layout = att->initial_layout;
+ }
+ return VK_SUCCESS;
}
-static void tu_reset_render_pass(struct tu_cmd_buffer *cmd_buffer)
+VkResult
+tu_AllocateCommandBuffers(VkDevice _device,
+ const VkCommandBufferAllocateInfo *pAllocateInfo,
+ VkCommandBuffer *pCommandBuffers)
{
- /* discard draw_cs and draw_epilogue_cs entries now that the tiles are
- rendered */
- tu_cs_discard_entries(&cmd_buffer->draw_cs);
- tu_cs_begin(&cmd_buffer->draw_cs);
- tu_cs_discard_entries(&cmd_buffer->draw_epilogue_cs);
- tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
+ TU_FROM_HANDLE(tu_device, device, _device);
+ TU_FROM_HANDLE(tu_cmd_pool, pool, pAllocateInfo->commandPool);
- cmd_buffer->state.pass = NULL;
- cmd_buffer->state.subpass = NULL;
- cmd_buffer->state.framebuffer = NULL;
- cmd_buffer->state.attachments = NULL;
- cmd_buffer->state.gmem_layout = TU_GMEM_LAYOUT_COUNT; /* invalid value to prevent looking up gmem offsets */
- memset(&cmd_buffer->state.rp, 0, sizeof(cmd_buffer->state.rp));
+ VkResult result = VK_SUCCESS;
+ uint32_t i;
- /* LRZ is not valid next time we use it */
- cmd_buffer->state.lrz.valid = false;
- cmd_buffer->state.dirty |= TU_CMD_DIRTY_LRZ;
-}
+ for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
-static VkResult
-tu_create_cmd_buffer(struct vk_command_pool *pool,
- struct vk_command_buffer **cmd_buffer_out)
-{
- struct tu_device *device =
- container_of(pool->base.device, struct tu_device, vk);
- struct tu_cmd_buffer *cmd_buffer;
+ if (!list_empty(&pool->free_cmd_buffers)) {
+ struct tu_cmd_buffer *cmd_buffer = list_first_entry(
+ &pool->free_cmd_buffers, struct tu_cmd_buffer, pool_link);
- cmd_buffer = vk_zalloc2(&device->vk.alloc, NULL, sizeof(*cmd_buffer), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+ list_del(&cmd_buffer->pool_link);
+ list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
- if (cmd_buffer == NULL)
- return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ result = tu_reset_cmd_buffer(cmd_buffer);
+ cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
+ cmd_buffer->level = pAllocateInfo->level;
- VkResult result = vk_command_buffer_init(pool, &cmd_buffer->vk,
- &tu_cmd_buffer_ops, 0);
- if (result != VK_SUCCESS) {
- vk_free2(&device->vk.alloc, NULL, cmd_buffer);
- return result;
+ pCommandBuffers[i] = tu_cmd_buffer_to_handle(cmd_buffer);
+ } else {
+ result = tu_create_cmd_buffer(device, pool, pAllocateInfo->level,
+ &pCommandBuffers[i]);
+ }
+ if (result != VK_SUCCESS)
+ break;
}
- cmd_buffer->device = device;
-
- u_trace_init(&cmd_buffer->trace, &device->trace_context);
- list_inithead(&cmd_buffer->renderpass_autotune_results);
-
- tu_cs_init(&cmd_buffer->cs, device, TU_CS_MODE_GROW, 4096, "cmd cs");
- tu_cs_init(&cmd_buffer->draw_cs, device, TU_CS_MODE_GROW, 4096, "draw cs");
- tu_cs_init(&cmd_buffer->tile_store_cs, device, TU_CS_MODE_GROW, 2048, "tile store cs");
- tu_cs_init(&cmd_buffer->draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096, "draw epilogue cs");
- tu_cs_init(&cmd_buffer->sub_cs, device, TU_CS_MODE_SUB_STREAM, 2048, "draw sub cs");
- tu_cs_init(&cmd_buffer->pre_chain.draw_cs, device, TU_CS_MODE_GROW, 4096, "prechain draw cs");
- tu_cs_init(&cmd_buffer->pre_chain.draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096, "prechain draw epilogue cs");
-
- *cmd_buffer_out = &cmd_buffer->vk;
-
- return VK_SUCCESS;
-}
+ if (result != VK_SUCCESS) {
+ tu_FreeCommandBuffers(_device, pAllocateInfo->commandPool, i,
+ pCommandBuffers);
-static void
-tu_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
-{
- struct tu_cmd_buffer *cmd_buffer =
- container_of(vk_cmd_buffer, struct tu_cmd_buffer, vk);
-
- tu_cs_finish(&cmd_buffer->cs);
- tu_cs_finish(&cmd_buffer->draw_cs);
- tu_cs_finish(&cmd_buffer->tile_store_cs);
- tu_cs_finish(&cmd_buffer->draw_epilogue_cs);
- tu_cs_finish(&cmd_buffer->sub_cs);
- tu_cs_finish(&cmd_buffer->pre_chain.draw_cs);
- tu_cs_finish(&cmd_buffer->pre_chain.draw_epilogue_cs);
-
- u_trace_fini(&cmd_buffer->trace);
-
- tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
-
- for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
- if (cmd_buffer->descriptors[i].push_set.layout)
- vk_descriptor_set_layout_unref(&cmd_buffer->device->vk,
- &cmd_buffer->descriptors[i].push_set.layout->vk);
- vk_free(&cmd_buffer->device->vk.alloc,
- cmd_buffer->descriptors[i].push_set.mapped_ptr);
+ /* From the Vulkan 1.0.66 spec:
+ *
+ * "vkAllocateCommandBuffers can be used to create multiple
+ * command buffers. If the creation of any of those command
+ * buffers fails, the implementation must destroy all
+ * successfully created command buffer objects from this
+ * command, set all entries of the pCommandBuffers array to
+ * NULL and return the error."
+ */
+ memset(pCommandBuffers, 0,
+ sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
}
- vk_command_buffer_finish(&cmd_buffer->vk);
- vk_free2(&cmd_buffer->device->vk.alloc, &cmd_buffer->vk.pool->alloc,
- cmd_buffer);
+ return result;
}
-static void
-tu_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer,
- UNUSED VkCommandBufferResetFlags flags)
+void
+tu_FreeCommandBuffers(VkDevice device,
+ VkCommandPool commandPool,
+ uint32_t commandBufferCount,
+ const VkCommandBuffer *pCommandBuffers)
{
- struct tu_cmd_buffer *cmd_buffer =
- container_of(vk_cmd_buffer, struct tu_cmd_buffer, vk);
-
- vk_command_buffer_reset(&cmd_buffer->vk);
-
- tu_cs_reset(&cmd_buffer->cs);
- tu_cs_reset(&cmd_buffer->draw_cs);
- tu_cs_reset(&cmd_buffer->tile_store_cs);
- tu_cs_reset(&cmd_buffer->draw_epilogue_cs);
- tu_cs_reset(&cmd_buffer->sub_cs);
- tu_cs_reset(&cmd_buffer->pre_chain.draw_cs);
- tu_cs_reset(&cmd_buffer->pre_chain.draw_epilogue_cs);
-
- tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
-
- for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
- memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets));
- if (cmd_buffer->descriptors[i].push_set.layout) {
- vk_descriptor_set_layout_unref(&cmd_buffer->device->vk,
- &cmd_buffer->descriptors[i].push_set.layout->vk);
+ for (uint32_t i = 0; i < commandBufferCount; i++) {
+ TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, pCommandBuffers[i]);
+
+ if (cmd_buffer) {
+ if (cmd_buffer->pool) {
+ list_del(&cmd_buffer->pool_link);
+ list_addtail(&cmd_buffer->pool_link,
+ &cmd_buffer->pool->free_cmd_buffers);
+ } else
+ tu_cmd_buffer_destroy(cmd_buffer);
}
- memset(&cmd_buffer->descriptors[i].push_set, 0, sizeof(cmd_buffer->descriptors[i].push_set));
- cmd_buffer->descriptors[i].push_set.base.type = VK_OBJECT_TYPE_DESCRIPTOR_SET;
- cmd_buffer->descriptors[i].max_sets_bound = 0;
- cmd_buffer->descriptors[i].dynamic_bound = 0;
}
-
- u_trace_fini(&cmd_buffer->trace);
- u_trace_init(&cmd_buffer->trace, &cmd_buffer->device->trace_context);
-
- cmd_buffer->state.max_vbs_bound = 0;
- cmd_buffer->state.last_prim_params.valid = false;
-
- cmd_buffer->vsc_initialized = false;
-
- cmd_buffer->status = TU_CMD_BUFFER_STATUS_INITIAL;
}
-const struct vk_command_buffer_ops tu_cmd_buffer_ops = {
- .create = tu_create_cmd_buffer,
- .reset = tu_reset_cmd_buffer,
- .destroy = tu_cmd_buffer_destroy,
-};
-
-/* Initialize the cache, assuming all necessary flushes have happened but *not*
- * invalidations.
- */
-static void
-tu_cache_init(struct tu_cache_state *cache)
+VkResult
+tu_ResetCommandBuffer(VkCommandBuffer commandBuffer,
+ VkCommandBufferResetFlags flags)
{
- cache->flush_bits = 0;
- cache->pending_flush_bits = TU_CMD_FLAG_ALL_INVALIDATE;
+ TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
+ return tu_reset_cmd_buffer(cmd_buffer);
}
-/* Unlike the public entrypoint, this doesn't handle cache tracking or
- * CCU state tracking. It's used for the driver to insert its own command
- * buffer in the middle of a submit.
- */
VkResult
-tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer,
- VkCommandBufferUsageFlags usage_flags)
+tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
+ const VkCommandBufferBeginInfo *pBeginInfo)
{
+ TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
+ VkResult result = VK_SUCCESS;
+
if (cmd_buffer->status != TU_CMD_BUFFER_STATUS_INITIAL) {
   /* If the command buffer has already been reset with
* vkResetCommandBuffer, no need to do it again.
*/
- tu_reset_cmd_buffer(&cmd_buffer->vk, 0);
+ result = tu_reset_cmd_buffer(cmd_buffer);
+ if (result != VK_SUCCESS)
+ return result;
}
memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
- cmd_buffer->state.index_size = 0xff; /* dirty restart index */
- cmd_buffer->state.line_mode = RECTANGULAR;
- cmd_buffer->state.gmem_layout = TU_GMEM_LAYOUT_COUNT; /* dirty value */
-
- tu_cache_init(&cmd_buffer->state.cache);
- tu_cache_init(&cmd_buffer->state.renderpass_cache);
- cmd_buffer->usage_flags = usage_flags;
+ cmd_buffer->usage_flags = pBeginInfo->flags;
tu_cs_begin(&cmd_buffer->cs);
- tu_cs_begin(&cmd_buffer->draw_cs);
- tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
- cmd_buffer->status = TU_CMD_BUFFER_STATUS_RECORDING;
- return VK_SUCCESS;
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
- const VkCommandBufferBeginInfo *pBeginInfo)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
- VkResult result = tu_cmd_buffer_begin(cmd_buffer, pBeginInfo->flags);
- if (result != VK_SUCCESS)
- return result;
+ cmd_buffer->marker_seqno = 0;
+ cmd_buffer->scratch_seqno = 0;
/* setup initial configuration into command buffer */
- if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
- trace_start_cmd_buffer(&cmd_buffer->trace, &cmd_buffer->cs);
-
+ if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
switch (cmd_buffer->queue_family_index) {
case TU_QUEUE_GENERAL:
tu6_init_hw(cmd_buffer, &cmd_buffer->cs);
@@ -1799,212 +1602,35 @@ tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
default:
break;
}
- } else if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
- const bool pass_continue =
- pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
-
- trace_start_cmd_buffer(&cmd_buffer->trace,
- pass_continue ? &cmd_buffer->draw_cs : &cmd_buffer->cs);
-
- assert(pBeginInfo->pInheritanceInfo);
-
- cmd_buffer->inherited_pipeline_statistics =
- pBeginInfo->pInheritanceInfo->pipelineStatistics;
-
- vk_foreach_struct_const(ext, pBeginInfo->pInheritanceInfo) {
- switch (ext->sType) {
- case VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT: {
- const VkCommandBufferInheritanceConditionalRenderingInfoEXT *cond_rend = (void *) ext;
- cmd_buffer->state.predication_active = cond_rend->conditionalRenderingEnable;
- break;
- default:
- break;
- }
- }
- }
-
- if (pass_continue) {
- const VkCommandBufferInheritanceRenderingInfo *rendering_info =
- vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext,
- COMMAND_BUFFER_INHERITANCE_RENDERING_INFO);
-
- if (unlikely(cmd_buffer->device->instance->debug_flags & TU_DEBUG_DYNAMIC)) {
- rendering_info =
- vk_get_command_buffer_inheritance_rendering_info(cmd_buffer->vk.level,
- pBeginInfo);
- }
-
- if (rendering_info) {
- tu_setup_dynamic_inheritance(cmd_buffer, rendering_info);
- cmd_buffer->state.pass = &cmd_buffer->dynamic_pass;
- cmd_buffer->state.subpass = &cmd_buffer->dynamic_subpass;
- } else {
- cmd_buffer->state.pass = tu_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
- cmd_buffer->state.subpass =
- &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
- }
-
- /* We can't set the gmem layout here, because the state.pass only has
- * to be compatible (same formats/sample counts) with the primary's
- * renderpass, rather than exactly equal.
- */
-
- tu_lrz_begin_secondary_cmdbuf(cmd_buffer);
- } else {
- /* When executing in the middle of another command buffer, the CCU
- * state is unknown.
- */
- cmd_buffer->state.ccu_state = TU_CMD_CCU_UNKNOWN;
- }
}
- return VK_SUCCESS;
-}
-
-static void
-tu6_emit_vertex_strides(struct tu_cmd_buffer *cmd, unsigned num_vbs)
-{
- struct tu_cs cs;
- cmd->state.dynamic_state[TU_DYNAMIC_STATE_VB_STRIDE].iova =
- tu_cs_draw_state(&cmd->sub_cs, &cs, 2 * num_vbs).iova;
-
- for (uint32_t i = 0; i < num_vbs; i++)
- tu_cs_emit_regs(&cs, A6XX_VFD_FETCH_STRIDE(i, cmd->state.vb[i].stride));
-
- cmd->state.dirty |= TU_CMD_DIRTY_VB_STRIDE;
-}
-
-static struct tu_cs
-tu_cmd_dynamic_state(struct tu_cmd_buffer *cmd, uint32_t id, uint32_t size)
-{
- struct tu_cs cs;
-
- assert(id < ARRAY_SIZE(cmd->state.dynamic_state));
- cmd->state.dynamic_state[id] = tu_cs_draw_state(&cmd->sub_cs, &cs, size);
-
- /* note: this also avoids emitting draw states before renderpass clears,
- * which may use the 3D clear path (for MSAA cases)
- */
- if (cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)
- return cs;
-
- tu_cs_emit_pkt7(&cmd->draw_cs, CP_SET_DRAW_STATE, 3);
- tu_cs_emit_draw_state(&cmd->draw_cs, TU_DRAW_STATE_DYNAMIC + id, cmd->state.dynamic_state[id]);
-
- return cs;
-}
-
-static void
-tu_cmd_end_dynamic_state(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
- uint32_t id)
-{
- assert(id < ARRAY_SIZE(cmd->state.dynamic_state));
- cmd->state.dynamic_state[id] = tu_cs_end_draw_state(&cmd->sub_cs, cs);
-
- /* note: this also avoids emitting draw states before renderpass clears,
- * which may use the 3D clear path (for MSAA cases)
- */
- if (cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)
- return;
-
- tu_cs_emit_pkt7(&cmd->draw_cs, CP_SET_DRAW_STATE, 3);
- tu_cs_emit_draw_state(&cmd->draw_cs, TU_DRAW_STATE_DYNAMIC + id, cmd->state.dynamic_state[id]);
-}
-
-static void
-tu_update_num_vbs(struct tu_cmd_buffer *cmd, unsigned num_vbs)
-{
- /* the vertex_buffers draw state always contains all the currently
- * bound vertex buffers. update its size to only emit the vbs which
- * are actually used by the pipeline
- * note there is a HW optimization which makes it so the draw state
- * is not re-executed completely when only the size changes
- */
- if (cmd->state.vertex_buffers.size != num_vbs * 4) {
- cmd->state.vertex_buffers.size = num_vbs * 4;
- cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;
- }
-
- if (cmd->state.dynamic_state[TU_DYNAMIC_STATE_VB_STRIDE].size != num_vbs * 2) {
- cmd->state.dynamic_state[TU_DYNAMIC_STATE_VB_STRIDE].size = num_vbs * 2;
- cmd->state.dirty |= TU_CMD_DIRTY_VB_STRIDE;
- }
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdSetVertexInputEXT(VkCommandBuffer commandBuffer,
- uint32_t vertexBindingDescriptionCount,
- const VkVertexInputBindingDescription2EXT *pVertexBindingDescriptions,
- uint32_t vertexAttributeDescriptionCount,
- const VkVertexInputAttributeDescription2EXT *pVertexAttributeDescriptions)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- struct tu_cs cs;
-
- unsigned num_vbs = 0;
- for (unsigned i = 0; i < vertexBindingDescriptionCount; i++) {
- const VkVertexInputBindingDescription2EXT *binding =
- &pVertexBindingDescriptions[i];
- num_vbs = MAX2(num_vbs, binding->binding + 1);
- cmd->state.vb[binding->binding].stride = binding->stride;
- }
-
- tu6_emit_vertex_strides(cmd, num_vbs);
- tu_update_num_vbs(cmd, num_vbs);
+ cmd_buffer->status = TU_CMD_BUFFER_STATUS_RECORDING;
- tu_cs_begin_sub_stream(&cmd->sub_cs, TU6_EMIT_VERTEX_INPUT_MAX_DWORDS, &cs);
- tu6_emit_vertex_input(&cs, vertexBindingDescriptionCount,
- pVertexBindingDescriptions,
- vertexAttributeDescriptionCount,
- pVertexAttributeDescriptions);
- tu_cmd_end_dynamic_state(cmd, &cs, TU_DYNAMIC_STATE_VERTEX_INPUT);
+ return VK_SUCCESS;
}
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdBindVertexBuffers2EXT(VkCommandBuffer commandBuffer,
- uint32_t firstBinding,
- uint32_t bindingCount,
- const VkBuffer* pBuffers,
- const VkDeviceSize* pOffsets,
- const VkDeviceSize* pSizes,
- const VkDeviceSize* pStrides)
+void
+tu_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
+ uint32_t firstBinding,
+ uint32_t bindingCount,
+ const VkBuffer *pBuffers,
+ const VkDeviceSize *pOffsets)
{
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- struct tu_cs cs;
- cmd->state.max_vbs_bound = MAX2(
- cmd->state.max_vbs_bound, firstBinding + bindingCount);
-
- cmd->state.vertex_buffers.iova =
- tu_cs_draw_state(&cmd->sub_cs, &cs, 4 * cmd->state.max_vbs_bound).iova;
+ assert(firstBinding + bindingCount <= MAX_VBS);
for (uint32_t i = 0; i < bindingCount; i++) {
- if (pBuffers[i] == VK_NULL_HANDLE) {
- cmd->state.vb[firstBinding + i].base = 0;
- cmd->state.vb[firstBinding + i].size = 0;
- } else {
- struct tu_buffer *buf = tu_buffer_from_handle(pBuffers[i]);
- cmd->state.vb[firstBinding + i].base = buf->iova + pOffsets[i];
- cmd->state.vb[firstBinding + i].size = pSizes ? pSizes[i] : (buf->vk.size - pOffsets[i]);
- }
-
- if (pStrides)
- cmd->state.vb[firstBinding + i].stride = pStrides[i];
- }
-
- for (uint32_t i = 0; i < cmd->state.max_vbs_bound; i++) {
- tu_cs_emit_regs(&cs,
- A6XX_VFD_FETCH_BASE(i, .qword = cmd->state.vb[i].base),
- A6XX_VFD_FETCH_SIZE(i, cmd->state.vb[i].size));
+ cmd->state.vb.buffers[firstBinding + i] =
+ tu_buffer_from_handle(pBuffers[i]);
+ cmd->state.vb.offsets[firstBinding + i] = pOffsets[i];
}
+ /* VB states depend on VkPipelineVertexInputStateCreateInfo */
cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;
-
- if (pStrides)
- tu6_emit_vertex_strides(cmd, cmd->state.max_vbs_bound);
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
VkBuffer buffer,
VkDeviceSize offset,
@@ -2013,42 +1639,31 @@ tu_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
TU_FROM_HANDLE(tu_buffer, buf, buffer);
+ /* initialize/update the restart index */
+ if (!cmd->state.index_buffer || cmd->state.index_type != indexType) {
+ struct tu_cs *draw_cs = &cmd->draw_cs;
+ VkResult result = tu_cs_reserve_space(cmd->device, draw_cs, 2);
+ if (result != VK_SUCCESS) {
+ cmd->record_result = result;
+ return;
+ }
+ tu6_emit_restart_index(
+ draw_cs, indexType == VK_INDEX_TYPE_UINT32 ? 0xffffffff : 0xffff);
- uint32_t index_size, index_shift, restart_index;
-
- switch (indexType) {
- case VK_INDEX_TYPE_UINT16:
- index_size = INDEX4_SIZE_16_BIT;
- index_shift = 1;
- restart_index = 0xffff;
- break;
- case VK_INDEX_TYPE_UINT32:
- index_size = INDEX4_SIZE_32_BIT;
- index_shift = 2;
- restart_index = 0xffffffff;
- break;
- case VK_INDEX_TYPE_UINT8_EXT:
- index_size = INDEX4_SIZE_8_BIT;
- index_shift = 0;
- restart_index = 0xff;
- break;
- default:
- unreachable("invalid VkIndexType");
+ tu_cs_sanity_check(draw_cs);
}
- /* initialize/update the restart index */
- if (cmd->state.index_size != index_size)
- tu_cs_emit_regs(&cmd->draw_cs, A6XX_PC_RESTART_INDEX(restart_index));
-
- assert(buf->vk.size >= offset);
+ /* track the BO */
+ if (cmd->state.index_buffer != buf)
+ tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
- cmd->state.index_va = buf->iova + offset;
- cmd->state.max_index_count = (buf->vk.size - offset) >> index_shift;
- cmd->state.index_size = index_size;
+ cmd->state.index_buffer = buf;
+ cmd->state.index_offset = offset;
+ cmd->state.index_type = indexType;
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
VkPipelineBindPoint pipelineBindPoint,
VkPipelineLayout _layout,
@@ -2058,404 +1673,9 @@ tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
uint32_t dynamicOffsetCount,
const uint32_t *pDynamicOffsets)
{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- TU_FROM_HANDLE(tu_pipeline_layout, layout, _layout);
- unsigned dyn_idx = 0;
-
- struct tu_descriptor_state *descriptors_state =
- tu_get_descriptors_state(cmd, pipelineBindPoint);
-
- descriptors_state->max_sets_bound =
- MAX2(descriptors_state->max_sets_bound, firstSet + descriptorSetCount);
-
- for (unsigned i = 0; i < descriptorSetCount; ++i) {
- unsigned idx = i + firstSet;
- TU_FROM_HANDLE(tu_descriptor_set, set, pDescriptorSets[i]);
-
- descriptors_state->sets[idx] = set;
-
- if (!set)
- continue;
-
- if (!set->layout->dynamic_offset_size)
- continue;
-
- uint32_t *src = set->dynamic_descriptors;
- uint32_t *dst = descriptors_state->dynamic_descriptors +
- layout->set[idx].dynamic_offset_start / 4;
- for (unsigned j = 0; j < set->layout->binding_count; j++) {
- struct tu_descriptor_set_binding_layout *binding =
- &set->layout->binding[j];
- if (binding->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
- binding->type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
- for (unsigned k = 0; k < binding->array_size; k++, dyn_idx++) {
- assert(dyn_idx < dynamicOffsetCount);
- uint32_t offset = pDynamicOffsets[dyn_idx];
- memcpy(dst, src, binding->size);
-
- if (binding->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) {
- /* Note: we can assume here that the addition won't roll
- * over and change the SIZE field.
- */
- uint64_t va = src[0] | ((uint64_t)src[1] << 32);
- va += offset;
- dst[0] = va;
- dst[1] = va >> 32;
- } else {
- uint32_t *dst_desc = dst;
- for (unsigned i = 0;
- i < binding->size / (4 * A6XX_TEX_CONST_DWORDS);
- i++, dst_desc += A6XX_TEX_CONST_DWORDS) {
- /* Note: A6XX_TEX_CONST_5_DEPTH is always 0 */
- uint64_t va = dst_desc[4] | ((uint64_t)dst_desc[5] << 32);
- va += offset;
- dst_desc[4] = va;
- dst_desc[5] = va >> 32;
- }
- }
-
- dst += binding->size / 4;
- src += binding->size / 4;
- }
- }
- }
- }
- assert(dyn_idx == dynamicOffsetCount);
-
- uint32_t sp_bindless_base_reg, hlsq_bindless_base_reg, hlsq_invalidate_value;
- uint64_t addr[MAX_SETS] = {};
- uint64_t dynamic_addr = 0;
- struct tu_cs *cs, state_cs;
-
- for (uint32_t i = 0; i < descriptors_state->max_sets_bound; i++) {
- struct tu_descriptor_set *set = descriptors_state->sets[i];
- if (set)
- addr[i] = set->va | 3;
- }
-
- if (layout->dynamic_offset_size) {
- /* allocate and fill out dynamic descriptor set */
- struct tu_cs_memory dynamic_desc_set;
- VkResult result = tu_cs_alloc(&cmd->sub_cs,
- layout->dynamic_offset_size / (4 * A6XX_TEX_CONST_DWORDS),
- A6XX_TEX_CONST_DWORDS, &dynamic_desc_set);
- if (result != VK_SUCCESS) {
- vk_command_buffer_set_error(&cmd->vk, result);
- return;
- }
-
- memcpy(dynamic_desc_set.map, descriptors_state->dynamic_descriptors,
- layout->dynamic_offset_size);
- dynamic_addr = dynamic_desc_set.iova | 3;
- descriptors_state->dynamic_bound = true;
- }
-
- if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
- sp_bindless_base_reg = REG_A6XX_SP_BINDLESS_BASE(0);
- hlsq_bindless_base_reg = REG_A6XX_HLSQ_BINDLESS_BASE(0);
- hlsq_invalidate_value = A6XX_HLSQ_INVALIDATE_CMD_GFX_BINDLESS(0x1f);
-
- cmd->state.desc_sets =
- tu_cs_draw_state(&cmd->sub_cs, &state_cs,
- 4 + 4 * descriptors_state->max_sets_bound +
- (descriptors_state->dynamic_bound ? 6 : 0));
- cmd->state.dirty |= TU_CMD_DIRTY_DESC_SETS_LOAD;
- cs = &state_cs;
- } else {
- assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE);
-
- sp_bindless_base_reg = REG_A6XX_SP_CS_BINDLESS_BASE(0);
- hlsq_bindless_base_reg = REG_A6XX_HLSQ_CS_BINDLESS_BASE(0);
- hlsq_invalidate_value = A6XX_HLSQ_INVALIDATE_CMD_CS_BINDLESS(0x1f);
-
- cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD;
- cs = &cmd->cs;
- }
-
- tu_cs_emit_pkt4(cs, sp_bindless_base_reg, 2 * descriptors_state->max_sets_bound);
- tu_cs_emit_array(cs, (const uint32_t*) addr, 2 * descriptors_state->max_sets_bound);
- tu_cs_emit_pkt4(cs, hlsq_bindless_base_reg, 2 * descriptors_state->max_sets_bound);
- tu_cs_emit_array(cs, (const uint32_t*) addr, 2 * descriptors_state->max_sets_bound);
-
- /* Dynamic descriptors get the last descriptor set. */
- if (descriptors_state->dynamic_bound) {
- tu_cs_emit_pkt4(cs, sp_bindless_base_reg + 4 * 2, 2);
- tu_cs_emit_qw(cs, dynamic_addr);
- tu_cs_emit_pkt4(cs, hlsq_bindless_base_reg + 4 * 2, 2);
- tu_cs_emit_qw(cs, dynamic_addr);
- }
-
- tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(.dword = hlsq_invalidate_value));
-
- if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
- assert(cs->cur == cs->end); /* validate draw state size */
- /* note: this also avoids emitting draw states before renderpass clears,
- * which may use the 3D clear path (for MSAA cases)
- */
- if (!(cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) {
- tu_cs_emit_pkt7(&cmd->draw_cs, CP_SET_DRAW_STATE, 3);
- tu_cs_emit_draw_state(&cmd->draw_cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets);
- }
- }
}
-static enum VkResult
-tu_push_descriptor_set_update_layout(struct tu_device *device,
- struct tu_descriptor_set *set,
- struct tu_descriptor_set_layout *layout)
-{
- if (set->layout == layout)
- return VK_SUCCESS;
-
- if (set->layout)
- vk_descriptor_set_layout_unref(&device->vk, &set->layout->vk);
- vk_descriptor_set_layout_ref(&layout->vk);
- set->layout = layout;
-
- if (set->host_size < layout->size) {
- void *new_buf =
- vk_realloc(&device->vk.alloc, set->mapped_ptr, layout->size, 8,
- VK_QUERY_SCOPE_COMMAND_BUFFER_KHR);
- if (!new_buf)
- return VK_ERROR_OUT_OF_HOST_MEMORY;
- set->mapped_ptr = new_buf;
- set->host_size = layout->size;
- }
- return VK_SUCCESS;
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdPushDescriptorSetKHR(VkCommandBuffer commandBuffer,
- VkPipelineBindPoint pipelineBindPoint,
- VkPipelineLayout _layout,
- uint32_t _set,
- uint32_t descriptorWriteCount,
- const VkWriteDescriptorSet *pDescriptorWrites)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- TU_FROM_HANDLE(tu_pipeline_layout, pipe_layout, _layout);
- struct tu_descriptor_set_layout *layout = pipe_layout->set[_set].layout;
- struct tu_descriptor_set *set =
- &tu_get_descriptors_state(cmd, pipelineBindPoint)->push_set;
-
- struct tu_cs_memory set_mem;
- VkResult result = tu_cs_alloc(&cmd->sub_cs,
- DIV_ROUND_UP(layout->size, A6XX_TEX_CONST_DWORDS * 4),
- A6XX_TEX_CONST_DWORDS, &set_mem);
- if (result != VK_SUCCESS) {
- vk_command_buffer_set_error(&cmd->vk, result);
- return;
- }
-
- result = tu_push_descriptor_set_update_layout(cmd->device, set, layout);
- if (result != VK_SUCCESS) {
- vk_command_buffer_set_error(&cmd->vk, result);
- return;
- }
-
- tu_update_descriptor_sets(cmd->device, tu_descriptor_set_to_handle(set),
- descriptorWriteCount, pDescriptorWrites, 0, NULL);
-
- memcpy(set_mem.map, set->mapped_ptr, layout->size);
- set->va = set_mem.iova;
-
- tu_CmdBindDescriptorSets(commandBuffer, pipelineBindPoint, _layout, _set,
- 1, (VkDescriptorSet[]) { tu_descriptor_set_to_handle(set) },
- 0, NULL);
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdPushDescriptorSetWithTemplateKHR(VkCommandBuffer commandBuffer,
- VkDescriptorUpdateTemplate descriptorUpdateTemplate,
- VkPipelineLayout _layout,
- uint32_t _set,
- const void* pData)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- TU_FROM_HANDLE(tu_pipeline_layout, pipe_layout, _layout);
- TU_FROM_HANDLE(tu_descriptor_update_template, templ, descriptorUpdateTemplate);
- struct tu_descriptor_set_layout *layout = pipe_layout->set[_set].layout;
- struct tu_descriptor_set *set =
- &tu_get_descriptors_state(cmd, templ->bind_point)->push_set;
-
- struct tu_cs_memory set_mem;
- VkResult result = tu_cs_alloc(&cmd->sub_cs,
- DIV_ROUND_UP(layout->size, A6XX_TEX_CONST_DWORDS * 4),
- A6XX_TEX_CONST_DWORDS, &set_mem);
- if (result != VK_SUCCESS) {
- vk_command_buffer_set_error(&cmd->vk, result);
- return;
- }
-
- result = tu_push_descriptor_set_update_layout(cmd->device, set, layout);
- if (result != VK_SUCCESS) {
- vk_command_buffer_set_error(&cmd->vk, result);
- return;
- }
-
- tu_update_descriptor_set_with_template(cmd->device, set, descriptorUpdateTemplate, pData);
-
- memcpy(set_mem.map, set->mapped_ptr, layout->size);
- set->va = set_mem.iova;
-
- tu_CmdBindDescriptorSets(commandBuffer, templ->bind_point, _layout, _set,
- 1, (VkDescriptorSet[]) { tu_descriptor_set_to_handle(set) },
- 0, NULL);
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
- uint32_t firstBinding,
- uint32_t bindingCount,
- const VkBuffer *pBuffers,
- const VkDeviceSize *pOffsets,
- const VkDeviceSize *pSizes)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- struct tu_cs *cs = &cmd->draw_cs;
-
- /* using COND_REG_EXEC for xfb commands matches the blob behavior
- * presumably there isn't any benefit using a draw state when the
- * condition is (SYSMEM | BINNING)
- */
- tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
- CP_COND_REG_EXEC_0_SYSMEM |
- CP_COND_REG_EXEC_0_BINNING);
-
- for (uint32_t i = 0; i < bindingCount; i++) {
- TU_FROM_HANDLE(tu_buffer, buf, pBuffers[i]);
- uint64_t iova = buf->iova + pOffsets[i];
- uint32_t size = buf->bo->size - (iova - buf->bo->iova);
- uint32_t idx = i + firstBinding;
-
- if (pSizes && pSizes[i] != VK_WHOLE_SIZE)
- size = pSizes[i];
-
- /* BUFFER_BASE is 32-byte aligned, add remaining offset to BUFFER_OFFSET */
- uint32_t offset = iova & 0x1f;
- iova &= ~(uint64_t) 0x1f;
-
- tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUFFER_BASE(idx), 3);
- tu_cs_emit_qw(cs, iova);
- tu_cs_emit(cs, size + offset);
-
- cmd->state.streamout_offset[idx] = offset;
- }
-
- tu_cond_exec_end(cs);
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,
- uint32_t firstCounterBuffer,
- uint32_t counterBufferCount,
- const VkBuffer *pCounterBuffers,
- const VkDeviceSize *pCounterBufferOffsets)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- struct tu_cs *cs = &cmd->draw_cs;
-
- tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
- CP_COND_REG_EXEC_0_SYSMEM |
- CP_COND_REG_EXEC_0_BINNING);
-
- tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(false));
-
- /* TODO: only update offset for active buffers */
- for (uint32_t i = 0; i < IR3_MAX_SO_BUFFERS; i++)
- tu_cs_emit_regs(cs, A6XX_VPC_SO_BUFFER_OFFSET(i, cmd->state.streamout_offset[i]));
-
- for (uint32_t i = 0; i < (pCounterBuffers ? counterBufferCount : 0); i++) {
- uint32_t idx = firstCounterBuffer + i;
- uint32_t offset = cmd->state.streamout_offset[idx];
- uint64_t counter_buffer_offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0u;
-
- if (!pCounterBuffers[i])
- continue;
-
- TU_FROM_HANDLE(tu_buffer, buf, pCounterBuffers[i]);
-
- tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
- tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(idx)) |
- CP_MEM_TO_REG_0_UNK31 |
- CP_MEM_TO_REG_0_CNT(1));
- tu_cs_emit_qw(cs, buf->iova + counter_buffer_offset);
-
- if (offset) {
- tu_cs_emit_pkt7(cs, CP_REG_RMW, 3);
- tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(idx)) |
- CP_REG_RMW_0_SRC1_ADD);
- tu_cs_emit(cs, 0xffffffff);
- tu_cs_emit(cs, offset);
- }
- }
-
- tu_cond_exec_end(cs);
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
- uint32_t firstCounterBuffer,
- uint32_t counterBufferCount,
- const VkBuffer *pCounterBuffers,
- const VkDeviceSize *pCounterBufferOffsets)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- struct tu_cs *cs = &cmd->draw_cs;
-
- tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
- CP_COND_REG_EXEC_0_SYSMEM |
- CP_COND_REG_EXEC_0_BINNING);
-
- tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(true));
-
- /* TODO: only flush buffers that need to be flushed */
- for (uint32_t i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
- /* note: FLUSH_BASE is always the same, so it could go in init_hw()? */
- tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_FLUSH_BASE(i), 2);
- tu_cs_emit_qw(cs, global_iova(cmd, flush_base[i]));
- tu6_emit_event_write(cmd, cs, FLUSH_SO_0 + i);
- }
-
- for (uint32_t i = 0; i < (pCounterBuffers ? counterBufferCount : 0); i++) {
- uint32_t idx = firstCounterBuffer + i;
- uint32_t offset = cmd->state.streamout_offset[idx];
- uint64_t counter_buffer_offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0u;
-
- if (!pCounterBuffers[i])
- continue;
-
- TU_FROM_HANDLE(tu_buffer, buf, pCounterBuffers[i]);
-
- /* VPC_SO_FLUSH_BASE holds a dword count, but the counter should be in bytes */
- tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
- tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_CP_SCRATCH_REG(0)) |
- CP_MEM_TO_REG_0_SHIFT_BY_2 |
- 0x40000 | /* ??? */
- CP_MEM_TO_REG_0_UNK31 |
- CP_MEM_TO_REG_0_CNT(1));
- tu_cs_emit_qw(cs, global_iova(cmd, flush_base[idx]));
-
- if (offset) {
- tu_cs_emit_pkt7(cs, CP_REG_RMW, 3);
- tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(REG_A6XX_CP_SCRATCH_REG(0)) |
- CP_REG_RMW_0_SRC1_ADD);
- tu_cs_emit(cs, 0xffffffff);
- tu_cs_emit(cs, -offset);
- }
-
- tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
- tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_SCRATCH_REG(0)) |
- CP_REG_TO_MEM_0_CNT(1));
- tu_cs_emit_qw(cs, buf->iova + counter_buffer_offset);
- }
-
- tu_cond_exec_end(cs);
-
- cmd->state.rp.xfb_used = true;
-}
-
-VKAPI_ATTR void VKAPI_CALL
+void
tu_CmdPushConstants(VkCommandBuffer commandBuffer,
VkPipelineLayout layout,
VkShaderStageFlags stageFlags,
@@ -2463,63 +1683,38 @@ tu_CmdPushConstants(VkCommandBuffer commandBuffer,
uint32_t size,
const void *pValues)
{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- memcpy((void*) cmd->push_constants + offset, pValues, size);
- cmd->state.dirty |= TU_CMD_DIRTY_SHADER_CONSTS;
}
-/* Flush everything which has been made available but we haven't actually
- * flushed yet.
- */
-static void
-tu_flush_all_pending(struct tu_cache_state *cache)
-{
- cache->flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH;
- cache->pending_flush_bits &= ~TU_CMD_FLAG_ALL_FLUSH;
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_EndCommandBuffer(VkCommandBuffer commandBuffer)
{
TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
- /* We currently flush CCU at the end of the command buffer, like
- * what the blob does. There's implicit synchronization around every
- * vkQueueSubmit, but the kernel only flushes the UCHE, and we don't
- * know yet if this command buffer will be the last in the submit so we
- * have to defensively flush everything else.
- *
- * TODO: We could definitely do better than this, since these flushes
- * aren't required by Vulkan, but we'd need kernel support to do that.
- * Ideally, we'd like the kernel to flush everything afterwards, so that we
- * wouldn't have to do any flushes here, and when submitting multiple
- * command buffers there wouldn't be any unnecessary flushes in between.
- */
- if (cmd_buffer->state.pass) {
- tu_flush_all_pending(&cmd_buffer->state.renderpass_cache);
- tu_emit_cache_flush_renderpass(cmd_buffer, &cmd_buffer->draw_cs);
+ if (cmd_buffer->scratch_seqno) {
+ tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->scratch_bo,
+ MSM_SUBMIT_BO_WRITE);
+ }
- trace_end_cmd_buffer(&cmd_buffer->trace, &cmd_buffer->draw_cs, cmd_buffer);
- } else {
- tu_flush_all_pending(&cmd_buffer->state.cache);
- cmd_buffer->state.cache.flush_bits |=
- TU_CMD_FLAG_CCU_FLUSH_COLOR |
- TU_CMD_FLAG_CCU_FLUSH_DEPTH;
- tu_emit_cache_flush(cmd_buffer, &cmd_buffer->cs);
+ for (uint32_t i = 0; i < cmd_buffer->draw_cs.bo_count; i++) {
+ tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->draw_cs.bos[i],
+ MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
+ }
- trace_end_cmd_buffer(&cmd_buffer->trace, &cmd_buffer->cs, cmd_buffer);
+ for (uint32_t i = 0; i < cmd_buffer->tile_cs.bo_count; i++) {
+ tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->tile_cs.bos[i],
+ MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
}
tu_cs_end(&cmd_buffer->cs);
- tu_cs_end(&cmd_buffer->draw_cs);
- tu_cs_end(&cmd_buffer->draw_epilogue_cs);
+
+ assert(!cmd_buffer->state.attachments);
cmd_buffer->status = TU_CMD_BUFFER_STATUS_EXECUTABLE;
- return vk_command_buffer_get_record_result(&cmd_buffer->vk);
+ return cmd_buffer->record_result;
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
VkPipelineBindPoint pipelineBindPoint,
VkPipeline _pipeline)
@@ -2527,2300 +1722,654 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
TU_FROM_HANDLE(tu_pipeline, pipeline, _pipeline);
- if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE) {
- cmd->state.compute_pipeline = pipeline;
- tu_cs_emit_state_ib(&cmd->cs, pipeline->program.state);
- return;
- }
-
- assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS);
-
- cmd->state.pipeline = pipeline;
- cmd->state.dirty |= TU_CMD_DIRTY_DESC_SETS_LOAD | TU_CMD_DIRTY_SHADER_CONSTS |
- TU_CMD_DIRTY_LRZ | TU_CMD_DIRTY_VS_PARAMS;
-
- if (pipeline->output.feedback_loop_may_involve_textures &&
- !cmd->state.rp.disable_gmem) {
- /* VK_EXT_attachment_feedback_loop_layout allows feedback loop to involve
- * not only input attachments but also sampled images or image resources.
- * But we cannot just patch gmem for image in the descriptors.
- *
- * At the moment, in context of DXVK, it is expected that only a few
- * drawcalls in a frame would use feedback loop and they would be wrapped
- * in their own renderpasses, so it should be ok to force sysmem.
- *
- * However, there are two further possible optimizations if need would
- * arise for other translation layer:
- * - Tiling could be enabled if we ensure that there is no barrier in
- * the renderpass;
- * - Check that both pipeline and attachments agree that feedback loop
- * is needed.
- */
- perf_debug(
- cmd->device,
- "Disabling gmem due to VK_EXT_attachment_feedback_loop_layout");
- cmd->state.rp.disable_gmem = true;
- }
-
- if (pipeline->prim_order.sysmem_single_prim_mode &&
- !cmd->state.rp.sysmem_single_prim_mode) {
- if (pipeline->output.subpass_feedback_loop_color ||
- pipeline->output.subpass_feedback_loop_ds) {
- perf_debug(cmd->device, "single_prim_mode due to feedback loop");
- } else {
- perf_debug(cmd->device, "single_prim_mode due to rast order access");
- }
- cmd->state.rp.sysmem_single_prim_mode = true;
- }
-
- struct tu_cs *cs = &cmd->draw_cs;
-
- /* note: this also avoids emitting draw states before renderpass clears,
- * which may use the 3D clear path (for MSAA cases)
- */
- if (!(cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) {
- uint32_t mask = ~pipeline->dynamic_state_mask & BITFIELD_MASK(TU_DYNAMIC_STATE_COUNT);
-
- tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (6 + util_bitcount(mask)));
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_CONFIG, pipeline->program.config_state);
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state);
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state);
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_RAST, pipeline->rast.state);
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_SYSMEM, pipeline->prim_order.state_sysmem);
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_GMEM, pipeline->prim_order.state_gmem);
-
- u_foreach_bit(i, mask)
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + i, pipeline->dynamic_state[i]);
- }
-
- if (pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) {
- cmd->state.rp.has_tess = true;
-
- if (!(pipeline->dynamic_state_mask &
- BIT(TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS))) {
- cmd->state.patch_control_points = pipeline->tess.patch_control_points;
- cmd->state.dirty &= ~TU_CMD_DIRTY_PATCH_CONTROL_POINTS;
- } else {
- cmd->state.dirty |= TU_CMD_DIRTY_PATCH_CONTROL_POINTS;
- }
- }
-
- cmd->state.line_mode = pipeline->rast.line_mode;
- if (!(pipeline->dynamic_state_mask &
- BIT(TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY)))
- cmd->state.primtype = pipeline->ia.primtype;
-
- tu6_update_msaa(cmd, pipeline->output.samples);
-
- if ((pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_VIEWPORT)) &&
- (pipeline->viewport.z_negative_one_to_one != cmd->state.z_negative_one_to_one)) {
- cmd->state.z_negative_one_to_one = pipeline->viewport.z_negative_one_to_one;
- cmd->state.dirty |= TU_CMD_DIRTY_VIEWPORTS;
- }
-
- if (!(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_VERTEX_INPUT)))
- tu_update_num_vbs(cmd, pipeline->vi.num_vbs);
-
-#define UPDATE_REG(group, X, Y) { \
- /* note: would be better to have pipeline bits already masked */ \
- uint32_t pipeline_bits = pipeline->group.X & pipeline->group.X##_mask; \
- if ((cmd->state.X & pipeline->group.X##_mask) != pipeline_bits) { \
- cmd->state.X &= ~pipeline->group.X##_mask; \
- cmd->state.X |= pipeline_bits; \
- cmd->state.dirty |= TU_CMD_DIRTY_##Y; \
- } \
- if (!(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_##Y))) \
- cmd->state.dirty &= ~TU_CMD_DIRTY_##Y; \
-}
-
- /* These registers can have bits set from both the pipeline and dynamic
- * state; this updates the bits set by the pipeline. If the pipeline
- * doesn't use a dynamic state for the register, the relevant dirty bit is
- * cleared to avoid overriding the non-dynamic state with a dynamic state
- * on the next draw.
- */
- UPDATE_REG(rast, gras_su_cntl, GRAS_SU_CNTL);
- UPDATE_REG(rast_ds, rb_depth_cntl, RB_DEPTH_CNTL);
- UPDATE_REG(ds, rb_stencil_cntl, RB_STENCIL_CNTL);
- UPDATE_REG(rast, pc_raster_cntl, RASTERIZER_DISCARD);
- UPDATE_REG(rast, vpc_unknown_9107, RASTERIZER_DISCARD);
- UPDATE_REG(blend, sp_blend_cntl, BLEND);
- UPDATE_REG(blend, rb_blend_cntl, BLEND);
-
- for (unsigned i = 0; i < pipeline->blend.num_rts; i++) {
- if ((cmd->state.rb_mrt_control[i] & pipeline->blend.rb_mrt_control_mask) !=
- pipeline->blend.rb_mrt_control[i]) {
- cmd->state.rb_mrt_control[i] &= ~pipeline->blend.rb_mrt_control_mask;
- cmd->state.rb_mrt_control[i] |= pipeline->blend.rb_mrt_control[i];
- cmd->state.dirty |= TU_CMD_DIRTY_BLEND;
- }
-
- if (cmd->state.rb_mrt_blend_control[i] != pipeline->blend.rb_mrt_blend_control[i]) {
- cmd->state.rb_mrt_blend_control[i] = pipeline->blend.rb_mrt_blend_control[i];
- cmd->state.dirty |= TU_CMD_DIRTY_BLEND;
- }
- }
-#undef UPDATE_REG
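/* For reference, a sketch of what one invocation of the UPDATE_REG() macro
 * above expands to, e.g. UPDATE_REG(rast, gras_su_cntl, GRAS_SU_CNTL):
 *
 *   uint32_t pipeline_bits = pipeline->rast.gras_su_cntl &
 *                            pipeline->rast.gras_su_cntl_mask;
 *   if ((cmd->state.gras_su_cntl & pipeline->rast.gras_su_cntl_mask) !=
 *       pipeline_bits) {
 *      cmd->state.gras_su_cntl &= ~pipeline->rast.gras_su_cntl_mask;
 *      cmd->state.gras_su_cntl |= pipeline_bits;
 *      cmd->state.dirty |= TU_CMD_DIRTY_GRAS_SU_CNTL;
 *   }
 *   if (!(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL)))
 *      cmd->state.dirty &= ~TU_CMD_DIRTY_GRAS_SU_CNTL;
 */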
-
- if (cmd->state.pipeline_color_write_enable != pipeline->blend.color_write_enable) {
- cmd->state.pipeline_color_write_enable = pipeline->blend.color_write_enable;
- cmd->state.dirty |= TU_CMD_DIRTY_BLEND;
- }
- if (cmd->state.pipeline_blend_enable != pipeline->blend.blend_enable) {
- cmd->state.pipeline_blend_enable = pipeline->blend.blend_enable;
- cmd->state.dirty |= TU_CMD_DIRTY_BLEND;
- }
- if (cmd->state.logic_op_enabled != pipeline->blend.logic_op_enabled) {
- cmd->state.logic_op_enabled = pipeline->blend.logic_op_enabled;
- cmd->state.dirty |= TU_CMD_DIRTY_BLEND;
- }
- if (!(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_LOGIC_OP)) &&
- cmd->state.rop_reads_dst != pipeline->blend.rop_reads_dst) {
- cmd->state.rop_reads_dst = pipeline->blend.rop_reads_dst;
- cmd->state.dirty |= TU_CMD_DIRTY_BLEND;
- }
- if (cmd->state.dynamic_state[TU_DYNAMIC_STATE_BLEND].size != pipeline->blend.num_rts * 3 + 4) {
- cmd->state.dirty |= TU_CMD_DIRTY_BLEND;
- }
- if (!(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_BLEND))) {
- cmd->state.dirty &= ~TU_CMD_DIRTY_BLEND;
+ switch (pipelineBindPoint) {
+ case VK_PIPELINE_BIND_POINT_GRAPHICS:
+ cmd->state.pipeline = pipeline;
+ cmd->state.dirty |= TU_CMD_DIRTY_PIPELINE;
+ break;
+ case VK_PIPELINE_BIND_POINT_COMPUTE:
+ tu_finishme("binding compute pipeline");
+ break;
+ default:
+ unreachable("unrecognized pipeline bind point");
+ break;
}
-
- if (pipeline->output.rb_depth_cntl_disable)
- cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL;
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_CmdSetViewport(VkCommandBuffer commandBuffer,
uint32_t firstViewport,
uint32_t viewportCount,
const VkViewport *pViewports)
{
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
+ struct tu_cs *draw_cs = &cmd->draw_cs;
- memcpy(&cmd->state.viewport[firstViewport], pViewports, viewportCount * sizeof(*pViewports));
- cmd->state.max_viewport = MAX2(cmd->state.max_viewport, firstViewport + viewportCount);
+ VkResult result = tu_cs_reserve_space(cmd->device, draw_cs, 12);
+ if (result != VK_SUCCESS) {
+ cmd->record_result = result;
+ return;
+ }
- /* With VK_EXT_depth_clip_control we have to take into account
- * negativeOneToOne property of the pipeline, so the viewport calculations
- * are deferred until it is known.
- */
- cmd->state.dirty |= TU_CMD_DIRTY_VIEWPORTS;
+ assert(firstViewport == 0 && viewportCount == 1);
+ tu6_emit_viewport(draw_cs, pViewports);
+
+ tu_cs_sanity_check(draw_cs);
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_CmdSetScissor(VkCommandBuffer commandBuffer,
uint32_t firstScissor,
uint32_t scissorCount,
const VkRect2D *pScissors)
{
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- struct tu_cs cs;
+ struct tu_cs *draw_cs = &cmd->draw_cs;
+
+ VkResult result = tu_cs_reserve_space(cmd->device, draw_cs, 3);
+ if (result != VK_SUCCESS) {
+ cmd->record_result = result;
+ return;
+ }
- memcpy(&cmd->state.scissor[firstScissor], pScissors, scissorCount * sizeof(*pScissors));
- cmd->state.max_scissor = MAX2(cmd->state.max_scissor, firstScissor + scissorCount);
+ assert(firstScissor == 0 && scissorCount == 1);
+ tu6_emit_scissor(draw_cs, pScissors);
- cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_SCISSOR, 1 + 2 * cmd->state.max_scissor);
- tu6_emit_scissor(&cs, cmd->state.scissor, cmd->state.max_scissor);
+ tu_cs_sanity_check(draw_cs);
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
{
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- cmd->state.gras_su_cntl &= ~A6XX_GRAS_SU_CNTL_LINEHALFWIDTH__MASK;
- cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_LINEHALFWIDTH(lineWidth / 2.0f);
+ cmd->state.dynamic.line_width = lineWidth;
- cmd->state.dirty |= TU_CMD_DIRTY_GRAS_SU_CNTL;
+ /* line width depends on VkPipelineRasterizationStateCreateInfo */
+ cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_CmdSetDepthBias(VkCommandBuffer commandBuffer,
float depthBiasConstantFactor,
float depthBiasClamp,
float depthBiasSlopeFactor)
{
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_DEPTH_BIAS, 4);
+ struct tu_cs *draw_cs = &cmd->draw_cs;
+
+ VkResult result = tu_cs_reserve_space(cmd->device, draw_cs, 4);
+ if (result != VK_SUCCESS) {
+ cmd->record_result = result;
+ return;
+ }
+
+ tu6_emit_depth_bias(draw_cs, depthBiasConstantFactor, depthBiasClamp,
+ depthBiasSlopeFactor);
- tu6_emit_depth_bias(&cs, depthBiasConstantFactor, depthBiasClamp, depthBiasSlopeFactor);
+ tu_cs_sanity_check(draw_cs);
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_CmdSetBlendConstants(VkCommandBuffer commandBuffer,
const float blendConstants[4])
{
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_BLEND_CONSTANTS, 5);
+ struct tu_cs *draw_cs = &cmd->draw_cs;
- tu_cs_emit_pkt4(&cs, REG_A6XX_RB_BLEND_RED_F32, 4);
- tu_cs_emit_array(&cs, (const uint32_t *) blendConstants, 4);
+ VkResult result = tu_cs_reserve_space(cmd->device, draw_cs, 5);
+ if (result != VK_SUCCESS) {
+ cmd->record_result = result;
+ return;
+ }
+
+ tu6_emit_blend_constants(draw_cs, blendConstants);
+
+ tu_cs_sanity_check(draw_cs);
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
float minDepthBounds,
float maxDepthBounds)
{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_DEPTH_BOUNDS, 3);
-
- tu_cs_emit_regs(&cs,
- A6XX_RB_Z_BOUNDS_MIN(minDepthBounds),
- A6XX_RB_Z_BOUNDS_MAX(maxDepthBounds));
}
void
-update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask)
-{
- if (face & VK_STENCIL_FACE_FRONT_BIT)
- *value = (*value & 0xff00) | (mask & 0xff);
- if (face & VK_STENCIL_FACE_BACK_BIT)
- *value = (*value & 0xff) | (mask & 0xff) << 8;
-}
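/* A minimal usage sketch for update_stencil_mask() above; the helper name
 * and values are illustrative only:
 */
static void
example_stencil_mask_packing(void)
{
   uint32_t packed = 0;
   update_stencil_mask(&packed, VK_STENCIL_FACE_FRONT_BIT, 0x3c);
   /* packed == 0x003c: the front-face mask lives in bits 0-7 */
   update_stencil_mask(&packed, VK_STENCIL_FACE_BACK_BIT, 0xff);
   /* packed == 0xff3c: the back-face mask lives in bits 8-15 */
}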
-
-VKAPI_ATTR void VKAPI_CALL
tu_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer,
VkStencilFaceFlags faceMask,
uint32_t compareMask)
{
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, 2);
- update_stencil_mask(&cmd->state.dynamic_stencil_mask, faceMask, compareMask);
+ if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
+ cmd->state.dynamic.stencil_compare_mask.front = compareMask;
+ if (faceMask & VK_STENCIL_FACE_BACK_BIT)
+ cmd->state.dynamic.stencil_compare_mask.back = compareMask;
- tu_cs_emit_regs(&cs, A6XX_RB_STENCILMASK(.dword = cmd->state.dynamic_stencil_mask));
+ /* the front/back compare masks must be updated together */
+ cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer,
VkStencilFaceFlags faceMask,
uint32_t writeMask)
{
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, 2);
-
- update_stencil_mask(&cmd->state.dynamic_stencil_wrmask, faceMask, writeMask);
- tu_cs_emit_regs(&cs, A6XX_RB_STENCILWRMASK(.dword = cmd->state.dynamic_stencil_wrmask));
+ if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
+ cmd->state.dynamic.stencil_write_mask.front = writeMask;
+ if (faceMask & VK_STENCIL_FACE_BACK_BIT)
+ cmd->state.dynamic.stencil_write_mask.back = writeMask;
- cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
+ /* the front/back write masks must be updated together */
+ cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_CmdSetStencilReference(VkCommandBuffer commandBuffer,
VkStencilFaceFlags faceMask,
uint32_t reference)
{
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_STENCIL_REFERENCE, 2);
-
- update_stencil_mask(&cmd->state.dynamic_stencil_ref, faceMask, reference);
-
- tu_cs_emit_regs(&cs, A6XX_RB_STENCILREF(.dword = cmd->state.dynamic_stencil_ref));
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdSetSampleLocationsEXT(VkCommandBuffer commandBuffer,
- const VkSampleLocationsInfoEXT* pSampleLocationsInfo)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_SAMPLE_LOCATIONS, 9);
- assert(pSampleLocationsInfo);
+ if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
+ cmd->state.dynamic.stencil_reference.front = reference;
+ if (faceMask & VK_STENCIL_FACE_BACK_BIT)
+ cmd->state.dynamic.stencil_reference.back = reference;
- tu6_emit_sample_locations(&cs, pSampleLocationsInfo);
+ /* the front/back references must be updated together */
+ cmd->state.dirty |= TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
}
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdSetCullModeEXT(VkCommandBuffer commandBuffer, VkCullModeFlags cullMode)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
-
- cmd->state.gras_su_cntl &=
- ~(A6XX_GRAS_SU_CNTL_CULL_FRONT | A6XX_GRAS_SU_CNTL_CULL_BACK);
-
- if (cullMode & VK_CULL_MODE_FRONT_BIT)
- cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_FRONT;
- if (cullMode & VK_CULL_MODE_BACK_BIT)
- cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_BACK;
-
- cmd->state.dirty |= TU_CMD_DIRTY_GRAS_SU_CNTL;
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdSetFrontFaceEXT(VkCommandBuffer commandBuffer, VkFrontFace frontFace)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
-
- cmd->state.gras_su_cntl &= ~A6XX_GRAS_SU_CNTL_FRONT_CW;
-
- if (frontFace == VK_FRONT_FACE_CLOCKWISE)
- cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_FRONT_CW;
-
- cmd->state.dirty |= TU_CMD_DIRTY_GRAS_SU_CNTL;
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdSetPrimitiveTopologyEXT(VkCommandBuffer commandBuffer,
- VkPrimitiveTopology primitiveTopology)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
-
- cmd->state.primtype = tu6_primtype(primitiveTopology);
- tu6_update_msaa(cmd, cmd->state.samples);
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdSetViewportWithCountEXT(VkCommandBuffer commandBuffer,
- uint32_t viewportCount,
- const VkViewport* pViewports)
-{
- tu_CmdSetViewport(commandBuffer, 0, viewportCount, pViewports);
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdSetScissorWithCountEXT(VkCommandBuffer commandBuffer,
- uint32_t scissorCount,
- const VkRect2D* pScissors)
-{
- tu_CmdSetScissor(commandBuffer, 0, scissorCount, pScissors);
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdSetDepthTestEnableEXT(VkCommandBuffer commandBuffer,
- VkBool32 depthTestEnable)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
-
- cmd->state.rb_depth_cntl &= ~A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE;
-
- if (depthTestEnable)
- cmd->state.rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE;
-
- cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL;
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdSetDepthWriteEnableEXT(VkCommandBuffer commandBuffer,
- VkBool32 depthWriteEnable)
+void
+tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
+ uint32_t commandBufferCount,
+ const VkCommandBuffer *pCmdBuffers)
{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
-
- cmd->state.rb_depth_cntl &= ~A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
-
- if (depthWriteEnable)
- cmd->state.rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
-
- cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL;
}
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdSetDepthCompareOpEXT(VkCommandBuffer commandBuffer,
- VkCompareOp depthCompareOp)
+VkResult
+tu_CreateCommandPool(VkDevice _device,
+ const VkCommandPoolCreateInfo *pCreateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ VkCommandPool *pCmdPool)
{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
-
- cmd->state.rb_depth_cntl &= ~A6XX_RB_DEPTH_CNTL_ZFUNC__MASK;
+ TU_FROM_HANDLE(tu_device, device, _device);
+ struct tu_cmd_pool *pool;
- cmd->state.rb_depth_cntl |=
- A6XX_RB_DEPTH_CNTL_ZFUNC(tu6_compare_func(depthCompareOp));
+ pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+ if (pool == NULL)
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
- cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL;
-}
+ if (pAllocator)
+ pool->alloc = *pAllocator;
+ else
+ pool->alloc = device->alloc;
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdSetDepthBoundsTestEnableEXT(VkCommandBuffer commandBuffer,
- VkBool32 depthBoundsTestEnable)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
+ list_inithead(&pool->cmd_buffers);
+ list_inithead(&pool->free_cmd_buffers);
- cmd->state.rb_depth_cntl &= ~A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE;
+ pool->queue_family_index = pCreateInfo->queueFamilyIndex;
- if (depthBoundsTestEnable)
- cmd->state.rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE;
+ *pCmdPool = tu_cmd_pool_to_handle(pool);
- cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL;
+ return VK_SUCCESS;
}
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdSetStencilTestEnableEXT(VkCommandBuffer commandBuffer,
- VkBool32 stencilTestEnable)
+void
+tu_DestroyCommandPool(VkDevice _device,
+ VkCommandPool commandPool,
+ const VkAllocationCallbacks *pAllocator)
{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
-
- cmd->state.rb_stencil_cntl &= ~(
- A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE |
- A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
- A6XX_RB_STENCIL_CONTROL_STENCIL_READ);
-
- if (stencilTestEnable) {
- cmd->state.rb_stencil_cntl |=
- A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE |
- A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
- A6XX_RB_STENCIL_CONTROL_STENCIL_READ;
- }
+ TU_FROM_HANDLE(tu_device, device, _device);
+ TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
- cmd->state.dirty |= TU_CMD_DIRTY_RB_STENCIL_CNTL;
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdSetStencilOpEXT(VkCommandBuffer commandBuffer,
- VkStencilFaceFlags faceMask,
- VkStencilOp failOp,
- VkStencilOp passOp,
- VkStencilOp depthFailOp,
- VkCompareOp compareOp)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
+ if (!pool)
+ return;
- if (faceMask & VK_STENCIL_FACE_FRONT_BIT) {
- cmd->state.rb_stencil_cntl &= ~(
- A6XX_RB_STENCIL_CONTROL_FUNC__MASK |
- A6XX_RB_STENCIL_CONTROL_FAIL__MASK |
- A6XX_RB_STENCIL_CONTROL_ZPASS__MASK |
- A6XX_RB_STENCIL_CONTROL_ZFAIL__MASK);
-
- cmd->state.rb_stencil_cntl |=
- A6XX_RB_STENCIL_CONTROL_FUNC(tu6_compare_func(compareOp)) |
- A6XX_RB_STENCIL_CONTROL_FAIL(tu6_stencil_op(failOp)) |
- A6XX_RB_STENCIL_CONTROL_ZPASS(tu6_stencil_op(passOp)) |
- A6XX_RB_STENCIL_CONTROL_ZFAIL(tu6_stencil_op(depthFailOp));
+ list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
+ &pool->cmd_buffers, pool_link)
+ {
+ tu_cmd_buffer_destroy(cmd_buffer);
}
- if (faceMask & VK_STENCIL_FACE_BACK_BIT) {
- cmd->state.rb_stencil_cntl &= ~(
- A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK |
- A6XX_RB_STENCIL_CONTROL_FAIL_BF__MASK |
- A6XX_RB_STENCIL_CONTROL_ZPASS_BF__MASK |
- A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__MASK);
-
- cmd->state.rb_stencil_cntl |=
- A6XX_RB_STENCIL_CONTROL_FUNC_BF(tu6_compare_func(compareOp)) |
- A6XX_RB_STENCIL_CONTROL_FAIL_BF(tu6_stencil_op(failOp)) |
- A6XX_RB_STENCIL_CONTROL_ZPASS_BF(tu6_stencil_op(passOp)) |
- A6XX_RB_STENCIL_CONTROL_ZFAIL_BF(tu6_stencil_op(depthFailOp));
+ list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
+ &pool->free_cmd_buffers, pool_link)
+ {
+ tu_cmd_buffer_destroy(cmd_buffer);
}
- cmd->state.dirty |= TU_CMD_DIRTY_RB_STENCIL_CNTL;
+ vk_free2(&device->alloc, pAllocator, pool);
}
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdSetDepthBiasEnableEXT(VkCommandBuffer commandBuffer,
- VkBool32 depthBiasEnable)
+VkResult
+tu_ResetCommandPool(VkDevice device,
+ VkCommandPool commandPool,
+ VkCommandPoolResetFlags flags)
{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
+ TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
+ VkResult result;
- cmd->state.gras_su_cntl &= ~A6XX_GRAS_SU_CNTL_POLY_OFFSET;
- if (depthBiasEnable)
- cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_POLY_OFFSET;
+ list_for_each_entry(struct tu_cmd_buffer, cmd_buffer, &pool->cmd_buffers,
+ pool_link)
+ {
+ result = tu_reset_cmd_buffer(cmd_buffer);
+ if (result != VK_SUCCESS)
+ return result;
+ }
- cmd->state.dirty |= TU_CMD_DIRTY_GRAS_SU_CNTL;
+ return VK_SUCCESS;
}
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdSetPrimitiveRestartEnableEXT(VkCommandBuffer commandBuffer,
- VkBool32 primitiveRestartEnable)
+void
+tu_TrimCommandPool(VkDevice device,
+ VkCommandPool commandPool,
+ VkCommandPoolTrimFlags flags)
{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
+ TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
- cmd->state.primitive_restart_enable = primitiveRestartEnable;
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdSetRasterizerDiscardEnableEXT(VkCommandBuffer commandBuffer,
- VkBool32 rasterizerDiscardEnable)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
+ if (!pool)
+ return;
- cmd->state.pc_raster_cntl &= ~A6XX_PC_RASTER_CNTL_DISCARD;
- cmd->state.vpc_unknown_9107 &= ~A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD;
- if (rasterizerDiscardEnable) {
- cmd->state.pc_raster_cntl |= A6XX_PC_RASTER_CNTL_DISCARD;
- cmd->state.vpc_unknown_9107 |= A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD;
+ list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
+ &pool->free_cmd_buffers, pool_link)
+ {
+ tu_cmd_buffer_destroy(cmd_buffer);
}
-
- cmd->state.dirty |= TU_CMD_DIRTY_RASTERIZER_DISCARD;
}
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdSetLogicOpEXT(VkCommandBuffer commandBuffer,
- VkLogicOp logicOp)
+void
+tu_CmdBeginRenderPass(VkCommandBuffer commandBuffer,
+ const VkRenderPassBeginInfo *pRenderPassBegin,
+ VkSubpassContents contents)
{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
-
- cmd->state.rb_mrt_control_rop =
- tu6_rb_mrt_control_rop(logicOp, &cmd->state.rop_reads_dst);
+ TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
+ TU_FROM_HANDLE(tu_render_pass, pass, pRenderPassBegin->renderPass);
+ TU_FROM_HANDLE(tu_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
+ VkResult result;
- cmd->state.dirty |= TU_CMD_DIRTY_BLEND;
-}
+ cmd_buffer->state.pass = pass;
+ cmd_buffer->state.subpass = pass->subpasses;
+ cmd_buffer->state.framebuffer = framebuffer;
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdSetPatchControlPointsEXT(VkCommandBuffer commandBuffer,
- uint32_t patchControlPoints)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
+ result = tu_cmd_state_setup_attachments(cmd_buffer, pRenderPassBegin);
+ if (result != VK_SUCCESS)
+ return;
- cmd->state.patch_control_points = patchControlPoints;
+ tu_cmd_update_tiling_config(cmd_buffer, &pRenderPassBegin->renderArea);
+ tu_cmd_prepare_tile_load_ib(cmd_buffer);
+ tu_cmd_prepare_tile_store_ib(cmd_buffer);
- cmd->state.dirty |= TU_CMD_DIRTY_PATCH_CONTROL_POINTS;
+ /* draw_cs should contain entries only for this render pass */
+ assert(!cmd_buffer->draw_cs.entry_count);
+ tu_cs_begin(&cmd_buffer->draw_cs);
}
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdSetLineStippleEXT(VkCommandBuffer commandBuffer,
- uint32_t lineStippleFactor,
- uint16_t lineStipplePattern)
+void
+tu_CmdBeginRenderPass2KHR(VkCommandBuffer commandBuffer,
+ const VkRenderPassBeginInfo *pRenderPassBeginInfo,
+ const VkSubpassBeginInfoKHR *pSubpassBeginInfo)
{
- tu_stub();
+ tu_CmdBeginRenderPass(commandBuffer, pRenderPassBeginInfo,
+ pSubpassBeginInfo->contents);
}
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer, uint32_t attachmentCount,
- const VkBool32 *pColorWriteEnables)
+void
+tu_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents)
{
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- uint32_t color_write_enable = 0;
-
- for (unsigned i = 0; i < attachmentCount; i++) {
- if (pColorWriteEnables[i])
- color_write_enable |= BIT(i);
- }
-
- cmd->state.color_write_enable = color_write_enable;
- cmd->state.dirty |= TU_CMD_DIRTY_BLEND;
-}
-
-static void
-tu_flush_for_access(struct tu_cache_state *cache,
- enum tu_cmd_access_mask src_mask,
- enum tu_cmd_access_mask dst_mask)
-{
- enum tu_cmd_flush_bits flush_bits = 0;
-
- if (src_mask & TU_ACCESS_SYSMEM_WRITE) {
- cache->pending_flush_bits |= TU_CMD_FLAG_ALL_INVALIDATE;
- }
-
- if (src_mask & TU_ACCESS_CP_WRITE) {
- /* Flush the CP write queue.
- */
- cache->pending_flush_bits |=
- TU_CMD_FLAG_WAIT_MEM_WRITES |
- TU_CMD_FLAG_ALL_INVALIDATE;
- }
-
-#define SRC_FLUSH(domain, flush, invalidate) \
- if (src_mask & TU_ACCESS_##domain##_WRITE) { \
- cache->pending_flush_bits |= TU_CMD_FLAG_##flush | \
- (TU_CMD_FLAG_ALL_INVALIDATE & ~TU_CMD_FLAG_##invalidate); \
- }
- SRC_FLUSH(UCHE, CACHE_FLUSH, CACHE_INVALIDATE)
- SRC_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
- SRC_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)
+ tu_cmd_render_tiles(cmd);
-#undef SRC_FLUSH
+ cmd->state.subpass++;
-#define SRC_INCOHERENT_FLUSH(domain, flush, invalidate) \
- if (src_mask & TU_ACCESS_##domain##_INCOHERENT_WRITE) { \
- flush_bits |= TU_CMD_FLAG_##flush; \
- cache->pending_flush_bits |= \
- (TU_CMD_FLAG_ALL_INVALIDATE & ~TU_CMD_FLAG_##invalidate); \
- }
-
- SRC_INCOHERENT_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
- SRC_INCOHERENT_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)
-
-#undef SRC_INCOHERENT_FLUSH
-
- /* Treat host & sysmem write accesses the same, since the kernel implicitly
- * drains the queue before signalling completion to the host.
- */
- if (dst_mask & (TU_ACCESS_SYSMEM_READ | TU_ACCESS_SYSMEM_WRITE)) {
- flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH;
- }
-
-#define DST_FLUSH(domain, flush, invalidate) \
- if (dst_mask & (TU_ACCESS_##domain##_READ | \
- TU_ACCESS_##domain##_WRITE)) { \
- flush_bits |= cache->pending_flush_bits & \
- (TU_CMD_FLAG_##invalidate | \
- (TU_CMD_FLAG_ALL_FLUSH & ~TU_CMD_FLAG_##flush)); \
- }
-
- DST_FLUSH(UCHE, CACHE_FLUSH, CACHE_INVALIDATE)
- DST_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
- DST_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)
-
-#undef DST_FLUSH
-
-#define DST_INCOHERENT_FLUSH(domain, flush, invalidate) \
- if (dst_mask & (TU_ACCESS_##domain##_INCOHERENT_READ | \
- TU_ACCESS_##domain##_INCOHERENT_WRITE)) { \
- flush_bits |= TU_CMD_FLAG_##invalidate | \
- (cache->pending_flush_bits & \
- (TU_CMD_FLAG_ALL_FLUSH & ~TU_CMD_FLAG_##flush)); \
- }
-
- DST_INCOHERENT_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
- DST_INCOHERENT_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)
-
-#undef DST_INCOHERENT_FLUSH
-
- cache->flush_bits |= flush_bits;
- cache->pending_flush_bits &= ~flush_bits;
-}
-
-/* When translating Vulkan access flags to which cache is accessed
- * (CCU/UCHE/sysmem), we should take into account both the access flags and
- * the stage so that accesses with MEMORY_READ_BIT/MEMORY_WRITE_BIT + a
- * specific stage return something sensible. The specification for
- * VK_KHR_synchronization2 says that we should do this:
- *
- * Additionally, scoping the pipeline stages into the barrier structs
- * allows the use of the MEMORY_READ and MEMORY_WRITE flags without
- * sacrificing precision. The per-stage access flags should be used to
- * disambiguate specific accesses in a given stage or set of stages - for
- * instance, between uniform reads and sampling operations.
- *
- * Note that while in all known cases the stage is actually enough, we should
- * still narrow things down based on the access flags to handle "old-style"
- * barriers that may specify a wider range of stages but more precise access
- * flags. These helpers allow us to do both.
- */
-
-static bool
-filter_read_access(VkAccessFlags2 flags, VkPipelineStageFlags2 stages,
- VkAccessFlags2 tu_flags, VkPipelineStageFlags2 tu_stages)
-{
- return (flags & (tu_flags | VK_ACCESS_2_MEMORY_READ_BIT)) &&
- (stages & (tu_stages | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT));
-}
-
-static bool
-filter_write_access(VkAccessFlags2 flags, VkPipelineStageFlags2 stages,
- VkAccessFlags2 tu_flags, VkPipelineStageFlags2 tu_stages)
-{
- return (flags & (tu_flags | VK_ACCESS_2_MEMORY_WRITE_BIT)) &&
- (stages & (tu_stages | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT));
+ tu_cmd_update_tiling_config(cmd, NULL);
+ tu_cmd_prepare_tile_load_ib(cmd);
+ tu_cmd_prepare_tile_store_ib(cmd);
}
-static bool
-gfx_read_access(VkAccessFlags2 flags, VkPipelineStageFlags2 stages,
- VkAccessFlags2 tu_flags, VkPipelineStageFlags2 tu_stages)
+void
+tu_CmdNextSubpass2KHR(VkCommandBuffer commandBuffer,
+ const VkSubpassBeginInfoKHR *pSubpassBeginInfo,
+ const VkSubpassEndInfoKHR *pSubpassEndInfo)
{
- return filter_read_access(flags, stages, tu_flags,
- tu_stages | VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT);
+ tu_CmdNextSubpass(commandBuffer, pSubpassBeginInfo->contents);
}
-static bool
-gfx_write_access(VkAccessFlags2 flags, VkPipelineStageFlags2 stages,
- VkAccessFlags2 tu_flags, VkPipelineStageFlags2 tu_stages)
-{
- return filter_write_access(flags, stages, tu_flags,
- tu_stages | VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT);
-}
-static enum tu_cmd_access_mask
-vk2tu_access(VkAccessFlags2 flags, VkPipelineStageFlags2 stages, bool image_only, bool gmem)
+struct tu_draw_info
{
- enum tu_cmd_access_mask mask = 0;
-
- if (gfx_read_access(flags, stages,
- VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT |
- VK_ACCESS_2_CONDITIONAL_RENDERING_READ_BIT_EXT |
- VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT |
- VK_ACCESS_2_HOST_READ_BIT,
- VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
- VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT |
- VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT |
- VK_PIPELINE_STAGE_2_HOST_BIT))
- mask |= TU_ACCESS_SYSMEM_READ;
-
- if (gfx_write_access(flags, stages,
- VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT,
- VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT))
- mask |= TU_ACCESS_CP_WRITE;
-
- if (gfx_write_access(flags, stages,
- VK_ACCESS_2_HOST_WRITE_BIT,
- VK_PIPELINE_STAGE_2_HOST_BIT))
- mask |= TU_ACCESS_SYSMEM_WRITE;
-
-#define SHADER_STAGES \
- (VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT | \
- VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT | \
- VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT | \
- VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT | \
- VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT | \
- VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT | \
- VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT)
-
-
- if (gfx_read_access(flags, stages,
- VK_ACCESS_2_INDEX_READ_BIT |
- VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT |
- VK_ACCESS_2_UNIFORM_READ_BIT |
- VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT |
- VK_ACCESS_2_SHADER_READ_BIT,
- VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT |
- VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
- VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT |
- SHADER_STAGES))
- mask |= TU_ACCESS_UCHE_READ;
-
- if (gfx_write_access(flags, stages,
- VK_ACCESS_2_SHADER_WRITE_BIT |
- VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT,
- VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT |
- SHADER_STAGES))
- mask |= TU_ACCESS_UCHE_WRITE;
-
- /* When using GMEM, the CCU is always flushed automatically to GMEM, and
- * then GMEM is flushed to sysmem. Furthermore, we already had to flush any
- * previous writes in sysmem mode when transitioning to GMEM. Therefore we
- * can ignore CCU and pretend that color attachments and transfers use
- * sysmem directly.
+ /**
+ * Number of vertices.
*/
+ uint32_t count;
- if (gfx_read_access(flags, stages,
- VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT |
- VK_ACCESS_2_COLOR_ATTACHMENT_READ_NONCOHERENT_BIT_EXT,
- VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT)) {
- if (gmem)
- mask |= TU_ACCESS_SYSMEM_READ;
- else
- mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_READ;
- }
-
- if (gfx_read_access(flags, stages,
- VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT,
- VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
- VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT)) {
- if (gmem)
- mask |= TU_ACCESS_SYSMEM_READ;
- else
- mask |= TU_ACCESS_CCU_DEPTH_INCOHERENT_READ;
- }
-
- if (gfx_write_access(flags, stages,
- VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT,
- VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT)) {
- if (gmem) {
- mask |= TU_ACCESS_SYSMEM_WRITE;
- } else {
- mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
- }
- }
-
- if (gfx_write_access(flags, stages,
- VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
- VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
- VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT)) {
- if (gmem) {
- mask |= TU_ACCESS_SYSMEM_WRITE;
- } else {
- mask |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE;
- }
- }
-
- if (filter_write_access(flags, stages,
- VK_ACCESS_2_TRANSFER_WRITE_BIT,
- VK_PIPELINE_STAGE_2_COPY_BIT |
- VK_PIPELINE_STAGE_2_BLIT_BIT |
- VK_PIPELINE_STAGE_2_CLEAR_BIT |
- VK_PIPELINE_STAGE_2_RESOLVE_BIT |
- VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT)) {
- if (gmem) {
- mask |= TU_ACCESS_SYSMEM_WRITE;
- } else if (image_only) {
- /* Because we always split up blits/copies of images involving
- * multiple layers, we always access each layer in the same way, with
- * the same base address, same format, etc. This means we can avoid
- * flushing between multiple writes to the same image. This elides
- * flushes between e.g. multiple blits to the same image.
- */
- mask |= TU_ACCESS_CCU_COLOR_WRITE;
- } else {
- mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
- }
- }
-
- if (filter_read_access(flags, stages,
- VK_ACCESS_2_TRANSFER_READ_BIT,
- VK_PIPELINE_STAGE_2_COPY_BIT |
- VK_PIPELINE_STAGE_2_BLIT_BIT |
- VK_PIPELINE_STAGE_2_RESOLVE_BIT |
- VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT)) {
- mask |= TU_ACCESS_UCHE_READ;
- }
-
- return mask;
-}
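/* Worked example for vk2tu_access() above (a sketch, gmem == false, i.e.
 * sysmem rendering):
 *
 *   VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT at
 *   VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT
 *      -> TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE
 *
 *   VK_ACCESS_2_SHADER_READ_BIT at
 *   VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT
 *      -> TU_ACCESS_UCHE_READ
 *
 * With gmem == true the color-attachment write maps to TU_ACCESS_SYSMEM_WRITE
 * instead, since we can then pretend color attachments use sysmem directly,
 * as the comment above explains.
 */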
-
-/* These helpers deal with legacy BOTTOM_OF_PIPE/TOP_OF_PIPE stages.
- */
-
-static VkPipelineStageFlags2
-sanitize_src_stage(VkPipelineStageFlags2 stage_mask)
-{
- /* From the Vulkan spec:
- *
- * VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT is ... equivalent to
- * VK_PIPELINE_STAGE_2_NONE in the first scope.
- *
- * VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT is equivalent to
- * VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT with VkAccessFlags2 set to 0
- * when specified in the first synchronization scope, ...
+ /**
+ * Index of the first vertex.
*/
- if (stage_mask & VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT)
- return VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT;
+ int32_t vertex_offset;
- return stage_mask & ~VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT;
-}
-
-static VkPipelineStageFlags2
-sanitize_dst_stage(VkPipelineStageFlags2 stage_mask)
-{
- /* From the Vulkan spec:
- *
- * VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT is equivalent to
- * VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT with VkAccessFlags2 set to 0
- * when specified in the second synchronization scope, ...
- *
- * VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT is ... equivalent to
- * VK_PIPELINE_STAGE_2_NONE in the second scope.
- *
+ /**
+ * First instance id.
*/
- if (stage_mask & VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT)
- return VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT;
-
- return stage_mask & ~VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT;
-}
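/* Example of the two sanitizers above on a legacy barrier (a sketch):
 *
 *   sanitize_src_stage(VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT)
 *      == VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT   (wait on all prior work)
 *   sanitize_dst_stage(VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT)
 *      == VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT   (block all later work)
 *
 * whereas TOP_OF_PIPE in the first scope and BOTTOM_OF_PIPE in the second
 * are simply masked out, i.e. treated as VK_PIPELINE_STAGE_2_NONE.
 */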
-
-static enum tu_stage
-vk2tu_single_stage(VkPipelineStageFlags2 vk_stage, bool dst)
-{
- if (vk_stage == VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT ||
- vk_stage == VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT)
- return TU_STAGE_CP;
-
- if (vk_stage == VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT ||
- vk_stage == VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT ||
- vk_stage == VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT)
- return TU_STAGE_FE;
-
- if (vk_stage == VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT ||
- vk_stage == VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT ||
- vk_stage == VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT ||
- vk_stage == VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT ||
- vk_stage == VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT)
- return TU_STAGE_SP_VS;
-
- if (vk_stage == VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT ||
- vk_stage == VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT)
- return TU_STAGE_SP_PS;
-
- if (vk_stage == VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT || /* Yes, really */
- /* See comment in TU_STAGE_GRAS about early fragment tests */
- vk_stage == VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT ||
- vk_stage == VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT ||
- vk_stage == VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT)
-
- return TU_STAGE_PS;
-
- if (vk_stage == VK_PIPELINE_STAGE_2_COPY_BIT ||
- vk_stage == VK_PIPELINE_STAGE_2_BLIT_BIT ||
- vk_stage == VK_PIPELINE_STAGE_2_RESOLVE_BIT ||
- vk_stage == VK_PIPELINE_STAGE_2_CLEAR_BIT ||
- vk_stage == VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT)
- /* Blits read in SP_PS and write in PS, in both 2d and 3d cases */
- return dst ? TU_STAGE_SP_PS : TU_STAGE_PS;
-
- if (vk_stage == VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT ||
- vk_stage == VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)
- /* Be conservative */
- return dst ? TU_STAGE_CP : TU_STAGE_PS;
-
- if (vk_stage == VK_PIPELINE_STAGE_2_HOST_BIT)
- return dst ? TU_STAGE_PS : TU_STAGE_CP;
-
- unreachable("unknown pipeline stage");
-}
+ uint32_t first_instance;
-static enum tu_stage
-vk2tu_src_stage(VkPipelineStageFlags vk_stages)
-{
- enum tu_stage stage = TU_STAGE_CP;
- u_foreach_bit (bit, vk_stages) {
- enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, false);
- stage = MAX2(stage, new_stage);
- }
-
- return stage;
-}
-
-static enum tu_stage
-vk2tu_dst_stage(VkPipelineStageFlags vk_stages)
-{
- enum tu_stage stage = TU_STAGE_PS;
- u_foreach_bit (bit, vk_stages) {
- enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, true);
- stage = MIN2(stage, new_stage);
- }
-
- return stage;
-}
-
-static void
-tu_flush_for_stage(struct tu_cache_state *cache,
- enum tu_stage src_stage, enum tu_stage dst_stage)
-{
- /* As far as we know, flushes take place in the last stage so if there are
- * any pending flushes then we have to move down the source stage, because
- * the data only becomes available when the flush finishes. In particular
- * this can matter when the CP writes something and we need to invalidate
- * UCHE to read it.
- */
- if (cache->flush_bits & (TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_ALL_INVALIDATE))
- src_stage = TU_STAGE_PS;
-
- /* Note: if the destination stage is the CP, then the CP also has to wait
- * for any WFI's to finish. This is already done for draw calls, including
- * before indirect param reads, for the most part, so we just need to WFI.
- *
- * However, some indirect draw opcodes, depending on firmware, don't have
- * implicit CP_WAIT_FOR_ME so we have to handle it manually.
- *
- * Transform feedback counters are read via CP_MEM_TO_REG, which implicitly
- * does CP_WAIT_FOR_ME, but we still need a WFI if the GPU writes it.
- *
- * Currently we read the draw predicate using CP_MEM_TO_MEM, which
- * also implicitly does CP_WAIT_FOR_ME. However, CP_DRAW_PRED_SET does *not*
- * implicitly do CP_WAIT_FOR_ME; it seems to only wait for counters to
- * complete since it's written for DX11 where you can only predicate on the
- * result of a query object. So if we implement 64-bit comparisons in the
- * future, or if CP_DRAW_PRED_SET grows the capability to do 32-bit
- * comparisons, then this will have to be dealt with.
+ /**
+ * Number of instances.
*/
- if (src_stage > dst_stage) {
- cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE;
- if (dst_stage == TU_STAGE_CP)
- cache->pending_flush_bits |= TU_CMD_FLAG_WAIT_FOR_ME;
- }
-}
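/* Worked example for tu_flush_for_stage() above (a sketch): a color
 * attachment write (src_stage == TU_STAGE_PS) later consumed by an indirect
 * draw parameter read (dst_stage == TU_STAGE_CP):
 *
 *   src_stage (PS) > dst_stage (CP)
 *      -> cache->flush_bits         |= TU_CMD_FLAG_WAIT_FOR_IDLE;
 *      -> cache->pending_flush_bits |= TU_CMD_FLAG_WAIT_FOR_ME;
 *
 * The reverse direction (a CP write consumed by the PS) needs no wait from
 * this function at all.
 */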
+ uint32_t instance_count;
-void
-tu_render_pass_state_merge(struct tu_render_pass_state *dst,
- const struct tu_render_pass_state *src)
-{
- dst->xfb_used |= src->xfb_used;
- dst->has_tess |= src->has_tess;
- dst->has_prim_generated_query_in_rp |= src->has_prim_generated_query_in_rp;
- dst->disable_gmem |= src->disable_gmem;
- dst->sysmem_single_prim_mode |= src->sysmem_single_prim_mode;
- dst->draw_cs_writes_to_cond_pred |= src->draw_cs_writes_to_cond_pred;
-
- dst->drawcall_count += src->drawcall_count;
- dst->drawcall_bandwidth_per_sample_sum +=
- src->drawcall_bandwidth_per_sample_sum;
-}
-
-void
-tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
- struct tu_cmd_buffer *suspended)
-{
- cmd->state.pass = suspended->state.suspended_pass.pass;
- cmd->state.subpass = suspended->state.suspended_pass.subpass;
- cmd->state.framebuffer = suspended->state.suspended_pass.framebuffer;
- cmd->state.attachments = suspended->state.suspended_pass.attachments;
- cmd->state.render_area = suspended->state.suspended_pass.render_area;
- cmd->state.gmem_layout = suspended->state.suspended_pass.gmem_layout;
- cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout];
- cmd->state.lrz = suspended->state.suspended_pass.lrz;
-}
-
-/* Take the saved pre-chain in "secondary" and copy its commands to "cmd",
- * appending it after any saved-up commands in "cmd".
- */
-void
-tu_append_pre_chain(struct tu_cmd_buffer *cmd,
- struct tu_cmd_buffer *secondary)
-{
- tu_cs_add_entries(&cmd->draw_cs, &secondary->pre_chain.draw_cs);
- tu_cs_add_entries(&cmd->draw_epilogue_cs,
- &secondary->pre_chain.draw_epilogue_cs);
-
- tu_render_pass_state_merge(&cmd->state.rp,
- &secondary->pre_chain.state);
- tu_clone_trace_range(cmd, &cmd->draw_cs, secondary->pre_chain.trace_renderpass_start,
- secondary->pre_chain.trace_renderpass_end);
-}
-
-/* Take the saved post-chain in "secondary" and copy it to "cmd".
- */
-void
-tu_append_post_chain(struct tu_cmd_buffer *cmd,
- struct tu_cmd_buffer *secondary)
-{
- tu_cs_add_entries(&cmd->draw_cs, &secondary->draw_cs);
- tu_cs_add_entries(&cmd->draw_epilogue_cs, &secondary->draw_epilogue_cs);
-
- tu_clone_trace_range(cmd, &cmd->draw_cs, secondary->trace_renderpass_start,
- secondary->trace_renderpass_end);
- cmd->state.rp = secondary->state.rp;
-}
-
-/* Assuming "secondary" is just a sequence of suspended and resuming passes,
- * copy its state to "cmd". This also works instead of tu_append_post_chain(),
- * but it's a bit slower because we don't assume that the chain begins in
- * "secondary" and therefore have to care about the command buffer's
- * renderpass state.
- */
-void
-tu_append_pre_post_chain(struct tu_cmd_buffer *cmd,
- struct tu_cmd_buffer *secondary)
-{
- tu_cs_add_entries(&cmd->draw_cs, &secondary->draw_cs);
- tu_cs_add_entries(&cmd->draw_epilogue_cs, &secondary->draw_epilogue_cs);
-
- tu_clone_trace_range(cmd, &cmd->draw_cs, secondary->trace_renderpass_start,
- secondary->trace_renderpass_end);
- tu_render_pass_state_merge(&cmd->state.rp,
- &secondary->state.rp);
-}
-
-/* Take the current render pass state and save it to "pre_chain" to be
- * combined later.
- */
-static void
-tu_save_pre_chain(struct tu_cmd_buffer *cmd)
-{
- tu_cs_add_entries(&cmd->pre_chain.draw_cs,
- &cmd->draw_cs);
- tu_cs_add_entries(&cmd->pre_chain.draw_epilogue_cs,
- &cmd->draw_epilogue_cs);
- cmd->pre_chain.trace_renderpass_start =
- cmd->trace_renderpass_start;
- cmd->pre_chain.trace_renderpass_end =
- cmd->trace_renderpass_end;
- cmd->pre_chain.state = cmd->state.rp;
-}
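/* Rough shape of the suspend/resume chains the helpers above deal with
 * (a sketch; S = SUSPENDING, R = RESUMING dynamic-rendering flags):
 *
 *   cmdbuf A:  ... Begin(S)   draws End          chain starts here
 *   cmdbuf B:      Begin(R|S) draws End          chain passes through
 *   cmdbuf C:      Begin(R)   draws End ...      chain ends here
 *
 * The part of a command buffer that finishes a render pass begun earlier is
 * its "pre-chain"; the suspended part at its end, resumed later, is its
 * "post-chain".  tu_CmdExecuteCommands() below stitches these together when
 * such command buffers are executed as secondaries.
 */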
-
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
- uint32_t commandBufferCount,
- const VkCommandBuffer *pCmdBuffers)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- VkResult result;
-
- assert(commandBufferCount > 0);
-
- /* Emit any pending flushes. */
- if (cmd->state.pass) {
- tu_flush_all_pending(&cmd->state.renderpass_cache);
- tu_emit_cache_flush_renderpass(cmd, &cmd->draw_cs);
- } else {
- tu_flush_all_pending(&cmd->state.cache);
- tu_emit_cache_flush(cmd, &cmd->cs);
- }
-
- for (uint32_t i = 0; i < commandBufferCount; i++) {
- TU_FROM_HANDLE(tu_cmd_buffer, secondary, pCmdBuffers[i]);
-
- if (secondary->usage_flags &
- VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
- assert(tu_cs_is_empty(&secondary->cs));
-
- result = tu_cs_add_entries(&cmd->draw_cs, &secondary->draw_cs);
- if (result != VK_SUCCESS) {
- vk_command_buffer_set_error(&cmd->vk, result);
- break;
- }
-
- result = tu_cs_add_entries(&cmd->draw_epilogue_cs,
- &secondary->draw_epilogue_cs);
- if (result != VK_SUCCESS) {
- vk_command_buffer_set_error(&cmd->vk, result);
- break;
- }
-
- /* If LRZ was made invalid in secondary - we should disable
- * LRZ retroactively for the whole renderpass.
- */
- if (!secondary->state.lrz.valid)
- cmd->state.lrz.valid = false;
-
- tu_clone_trace(cmd, &cmd->draw_cs, &secondary->trace);
- tu_render_pass_state_merge(&cmd->state.rp, &secondary->state.rp);
- } else {
- switch (secondary->state.suspend_resume) {
- case SR_NONE:
- assert(tu_cs_is_empty(&secondary->draw_cs));
- assert(tu_cs_is_empty(&secondary->draw_epilogue_cs));
- tu_cs_add_entries(&cmd->cs, &secondary->cs);
- tu_clone_trace(cmd, &cmd->cs, &secondary->trace);
- break;
-
- case SR_IN_PRE_CHAIN:
- /* cmd may be empty, which means that the chain begins before cmd,
- * in which case we have to update its state.
- */
- if (cmd->state.suspend_resume == SR_NONE) {
- cmd->state.suspend_resume = SR_IN_PRE_CHAIN;
- cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace);
- }
-
- /* The secondary is just a continuous suspend/resume chain, so we
- * simply have to append it to the command buffer.
- */
- assert(tu_cs_is_empty(&secondary->cs));
- tu_append_pre_post_chain(cmd, secondary);
- break;
-
- case SR_AFTER_PRE_CHAIN:
- case SR_IN_CHAIN:
- case SR_IN_CHAIN_AFTER_PRE_CHAIN:
- if (secondary->state.suspend_resume == SR_AFTER_PRE_CHAIN ||
- secondary->state.suspend_resume == SR_IN_CHAIN_AFTER_PRE_CHAIN) {
- /* In these cases the secondary contains a `pre_chain` that ends a
- * render pass, which we need to append to the primary.
- */
-
- if (cmd->state.suspend_resume == SR_NONE)
- cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace);
-
- tu_append_pre_chain(cmd, secondary);
-
- /* We're about to render, so we need to end the command stream
- * in case there were any extra commands generated by copying
- * the trace.
- */
- tu_cs_end(&cmd->draw_cs);
- tu_cs_end(&cmd->draw_epilogue_cs);
-
- switch (cmd->state.suspend_resume) {
- case SR_NONE:
- case SR_IN_PRE_CHAIN:
- /* The renderpass chain ends in the secondary but isn't
- * started in the primary, so we have to move the state to
- * `pre_chain`.
- */
- cmd->trace_renderpass_end = u_trace_end_iterator(&cmd->trace);
- tu_save_pre_chain(cmd);
- cmd->state.suspend_resume = SR_AFTER_PRE_CHAIN;
- break;
- case SR_IN_CHAIN:
- case SR_IN_CHAIN_AFTER_PRE_CHAIN:
- /* The renderpass ends in the secondary and starts somewhere
- * earlier in this primary. Since the last render pass in
- * the chain is in the secondary, we are technically outside
- * of a render pass. Fix that here by reusing the dynamic
- * render pass that was set up for the last suspended render
- * pass before the secondary.
- */
- tu_restore_suspended_pass(cmd, cmd);
-
- tu_cmd_render(cmd);
- if (cmd->state.suspend_resume == SR_IN_CHAIN)
- cmd->state.suspend_resume = SR_NONE;
- else
- cmd->state.suspend_resume = SR_AFTER_PRE_CHAIN;
- break;
- case SR_AFTER_PRE_CHAIN:
- unreachable("resuming render pass is not preceded by suspending one");
- }
-
- tu_reset_render_pass(cmd);
- }
-
- tu_cs_add_entries(&cmd->cs, &secondary->cs);
-
- if (secondary->state.suspend_resume == SR_IN_CHAIN_AFTER_PRE_CHAIN ||
- secondary->state.suspend_resume == SR_IN_CHAIN) {
- /* The secondary ends in a "post-chain" (the opposite of a
- * pre-chain) that we need to copy into the current command
- * buffer.
- */
- cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace);
- tu_append_post_chain(cmd, secondary);
- cmd->trace_renderpass_end = u_trace_end_iterator(&cmd->trace);
- cmd->state.suspended_pass = secondary->state.suspended_pass;
-
- switch (cmd->state.suspend_resume) {
- case SR_NONE:
- cmd->state.suspend_resume = SR_IN_CHAIN;
- break;
- case SR_AFTER_PRE_CHAIN:
- cmd->state.suspend_resume = SR_IN_CHAIN_AFTER_PRE_CHAIN;
- break;
- default:
- unreachable("suspending render pass is followed by a not resuming one");
- }
- }
- }
- }
-
- cmd->state.index_size = secondary->state.index_size; /* for restart index update */
- }
- cmd->state.dirty = ~0u; /* TODO: set dirty only what needs to be */
-
- if (!cmd->state.lrz.gpu_dir_tracking && cmd->state.pass) {
- /* After a secondary command buffer is executed, LRZ is not valid
- * until it is cleared again.
- */
- cmd->state.lrz.valid = false;
- }
-
- /* After executing secondary command buffers, arbitrary flushes may have
- * been executed, so when we encounter a pipeline barrier with a
- * srcMask, we have to assume that we need to invalidate. Therefore we need
- * to re-initialize the cache with all pending invalidate bits set.
+ /**
+ * First index (indexed draws only).
*/
- if (cmd->state.pass) {
- tu_cache_init(&cmd->state.renderpass_cache);
- } else {
- tu_cache_init(&cmd->state.cache);
- }
-}
+ uint32_t first_index;
-static void
-tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer,
- const struct tu_subpass_barrier *barrier,
- bool external)
-{
- /* Note: we don't know until the end of the subpass whether we'll use
- * sysmem, so assume sysmem here to be safe.
+ /**
+ * Whether it's an indexed draw.
*/
- struct tu_cache_state *cache =
- external ? &cmd_buffer->state.cache : &cmd_buffer->state.renderpass_cache;
- VkPipelineStageFlags2 src_stage_vk =
- sanitize_src_stage(barrier->src_stage_mask);
- VkPipelineStageFlags2 dst_stage_vk =
- sanitize_dst_stage(barrier->dst_stage_mask);
- enum tu_cmd_access_mask src_flags =
- vk2tu_access(barrier->src_access_mask, src_stage_vk, false, false);
- enum tu_cmd_access_mask dst_flags =
- vk2tu_access(barrier->dst_access_mask, dst_stage_vk, false, false);
-
- if (barrier->incoherent_ccu_color)
- src_flags |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
- if (barrier->incoherent_ccu_depth)
- src_flags |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE;
-
- tu_flush_for_access(cache, src_flags, dst_flags);
-
- enum tu_stage src_stage = vk2tu_src_stage(src_stage_vk);
- enum tu_stage dst_stage = vk2tu_dst_stage(dst_stage_vk);
- tu_flush_for_stage(cache, src_stage, dst_stage);
-}
-
-/* emit mrt/zs/msaa/ubwc state for the subpass that is starting (either at
- * vkCmdBeginRenderPass2() or vkCmdNextSubpass2())
- */
-static void
-tu_emit_subpass_begin(struct tu_cmd_buffer *cmd)
-{
- tu6_emit_zs(cmd, cmd->state.subpass, &cmd->draw_cs);
- tu6_emit_mrt(cmd, cmd->state.subpass, &cmd->draw_cs);
- if (cmd->state.subpass->samples != 0)
- tu6_update_msaa(cmd, cmd->state.subpass->samples);
- tu6_emit_render_cntl(cmd, cmd->state.subpass, &cmd->draw_cs, false);
+ bool indexed;
- tu_set_input_attachments(cmd, cmd->state.subpass);
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
- const VkRenderPassBeginInfo *pRenderPassBegin,
- const VkSubpassBeginInfo *pSubpassBeginInfo)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
-
- if (unlikely(cmd->device->instance->debug_flags & TU_DEBUG_DYNAMIC)) {
- vk_common_CmdBeginRenderPass2(commandBuffer, pRenderPassBegin,
- pSubpassBeginInfo);
- return;
- }
-
- TU_FROM_HANDLE(tu_render_pass, pass, pRenderPassBegin->renderPass);
- TU_FROM_HANDLE(tu_framebuffer, fb, pRenderPassBegin->framebuffer);
-
- const struct VkRenderPassAttachmentBeginInfo *pAttachmentInfo =
- vk_find_struct_const(pRenderPassBegin->pNext,
- RENDER_PASS_ATTACHMENT_BEGIN_INFO);
-
- cmd->state.pass = pass;
- cmd->state.subpass = pass->subpasses;
- cmd->state.framebuffer = fb;
- cmd->state.render_area = pRenderPassBegin->renderArea;
-
- cmd->state.attachments =
- vk_alloc(&cmd->vk.pool->alloc, pass->attachment_count *
- sizeof(cmd->state.attachments[0]), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
-
- if (!cmd->state.attachments) {
- vk_command_buffer_set_error(&cmd->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
- return;
- }
-
- for (unsigned i = 0; i < pass->attachment_count; i++) {
- cmd->state.attachments[i] = pAttachmentInfo ?
- tu_image_view_from_handle(pAttachmentInfo->pAttachments[i]) :
- cmd->state.framebuffer->attachments[i].attachment;
- }
- tu_choose_gmem_layout(cmd);
-
- trace_start_render_pass(&cmd->trace, &cmd->cs);
-
- /* Note: because this is external, any flushes will happen before draw_cs
- * gets called. However, deferred flushes may still have to happen later as
- * part of the subpass.
+ /**
+ * Indirect draw parameters resource.
*/
- tu_subpass_barrier(cmd, &pass->subpasses[0].start_barrier, true);
- cmd->state.renderpass_cache.pending_flush_bits =
- cmd->state.cache.pending_flush_bits;
- cmd->state.renderpass_cache.flush_bits = 0;
-
- if (pass->subpasses[0].feedback_invalidate)
- cmd->state.renderpass_cache.flush_bits |= TU_CMD_FLAG_CACHE_INVALIDATE;
-
- tu_lrz_begin_renderpass(cmd, pRenderPassBegin->pClearValues);
-
- cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace);
-
- tu_emit_renderpass_begin(cmd, pRenderPassBegin->pClearValues);
- tu_emit_subpass_begin(cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdBeginRendering(VkCommandBuffer commandBuffer,
- const VkRenderingInfo *pRenderingInfo)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- VkClearValue clear_values[2 * (MAX_RTS + 1)];
-
- tu_setup_dynamic_render_pass(cmd, pRenderingInfo);
- tu_setup_dynamic_framebuffer(cmd, pRenderingInfo);
-
- cmd->state.pass = &cmd->dynamic_pass;
- cmd->state.subpass = &cmd->dynamic_subpass;
- cmd->state.framebuffer = &cmd->dynamic_framebuffer;
- cmd->state.render_area = pRenderingInfo->renderArea;
-
- cmd->state.attachments = cmd->dynamic_attachments;
-
- for (unsigned i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
- uint32_t a = cmd->dynamic_subpass.color_attachments[i].attachment;
- if (!pRenderingInfo->pColorAttachments[i].imageView)
- continue;
-
- TU_FROM_HANDLE(tu_image_view, view,
- pRenderingInfo->pColorAttachments[i].imageView);
- cmd->state.attachments[a] = view;
- clear_values[a] = pRenderingInfo->pColorAttachments[i].clearValue;
-
- a = cmd->dynamic_subpass.resolve_attachments[i].attachment;
- if (a != VK_ATTACHMENT_UNUSED) {
- TU_FROM_HANDLE(tu_image_view, resolve_view,
- pRenderingInfo->pColorAttachments[i].resolveImageView);
- cmd->state.attachments[a] = resolve_view;
- }
- }
-
- uint32_t a = cmd->dynamic_subpass.depth_stencil_attachment.attachment;
- if (pRenderingInfo->pDepthAttachment || pRenderingInfo->pStencilAttachment) {
- const struct VkRenderingAttachmentInfo *common_info =
- (pRenderingInfo->pDepthAttachment &&
- pRenderingInfo->pDepthAttachment->imageView != VK_NULL_HANDLE) ?
- pRenderingInfo->pDepthAttachment :
- pRenderingInfo->pStencilAttachment;
- if (common_info && common_info->imageView != VK_NULL_HANDLE) {
- TU_FROM_HANDLE(tu_image_view, view, common_info->imageView);
- cmd->state.attachments[a] = view;
- if (pRenderingInfo->pDepthAttachment) {
- clear_values[a].depthStencil.depth =
- pRenderingInfo->pDepthAttachment->clearValue.depthStencil.depth;
- }
-
- if (pRenderingInfo->pStencilAttachment) {
- clear_values[a].depthStencil.stencil =
- pRenderingInfo->pStencilAttachment->clearValue.depthStencil.stencil;
- }
-
- if (cmd->dynamic_subpass.resolve_count >
- cmd->dynamic_subpass.color_count) {
- TU_FROM_HANDLE(tu_image_view, resolve_view,
- common_info->resolveImageView);
- a = cmd->dynamic_subpass.resolve_attachments[cmd->dynamic_subpass.color_count].attachment;
- cmd->state.attachments[a] = resolve_view;
- }
- }
- }
-
- if (unlikely(cmd->device->instance->debug_flags & TU_DEBUG_DYNAMIC)) {
- const VkRenderingSelfDependencyInfoMESA *self_dependency =
- vk_find_struct_const(pRenderingInfo->pNext, RENDERING_SELF_DEPENDENCY_INFO_MESA);
- if (self_dependency &&
- (self_dependency->colorSelfDependencies ||
- self_dependency->depthSelfDependency ||
- self_dependency->stencilSelfDependency)) {
- /* Mesa's renderpass emulation requires us to use normal attachments
- * for input attachments, and currently doesn't try to keep track of
- * which color/depth attachment an input attachment corresponds to.
- * So when there's a self-dependency, we have to use sysmem.
- */
- cmd->state.rp.disable_gmem = true;
- }
- }
-
- tu_choose_gmem_layout(cmd);
-
- cmd->state.renderpass_cache.pending_flush_bits =
- cmd->state.cache.pending_flush_bits;
- cmd->state.renderpass_cache.flush_bits = 0;
-
- bool resuming = pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT;
- bool suspending = pRenderingInfo->flags & VK_RENDERING_SUSPENDING_BIT;
- cmd->state.suspending = suspending;
- cmd->state.resuming = resuming;
+ struct tu_buffer *indirect;
+ uint64_t indirect_offset;
+ uint32_t stride;
- /* We can't track LRZ across command buffer boundaries, so we have to
- * disable LRZ when resuming/suspending unless we can track it on the GPU.
+ /**
+ * Draw count parameters resource.
*/
- if ((resuming || suspending) &&
- !cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking) {
- cmd->state.lrz.valid = false;
- } else {
- if (resuming)
- tu_lrz_begin_resumed_renderpass(cmd, clear_values);
- else
- tu_lrz_begin_renderpass(cmd, clear_values);
- }
-
-
- if (suspending) {
- cmd->state.suspended_pass.pass = cmd->state.pass;
- cmd->state.suspended_pass.subpass = cmd->state.subpass;
- cmd->state.suspended_pass.framebuffer = cmd->state.framebuffer;
- cmd->state.suspended_pass.render_area = cmd->state.render_area;
- cmd->state.suspended_pass.attachments = cmd->state.attachments;
- cmd->state.suspended_pass.gmem_layout = cmd->state.gmem_layout;
- }
-
- if (!resuming) {
- trace_start_render_pass(&cmd->trace, &cmd->cs);
- }
-
- if (!resuming || cmd->state.suspend_resume == SR_NONE) {
- cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace);
- }
+ struct tu_buffer *count_buffer;
+ uint64_t count_buffer_offset;
+};
- if (!resuming) {
- tu_emit_renderpass_begin(cmd, clear_values);
- tu_emit_subpass_begin(cmd);
- }
+enum tu_draw_state_group_id
+{
+ TU_DRAW_STATE_PROGRAM,
+ TU_DRAW_STATE_PROGRAM_BINNING,
+ TU_DRAW_STATE_VI,
+ TU_DRAW_STATE_VI_BINNING,
+ TU_DRAW_STATE_VP,
+ TU_DRAW_STATE_RAST,
+ TU_DRAW_STATE_DS,
+ TU_DRAW_STATE_BLEND,
- if (suspending && !resuming) {
- /* entering a chain */
- switch (cmd->state.suspend_resume) {
- case SR_NONE:
- cmd->state.suspend_resume = SR_IN_CHAIN;
- break;
- case SR_AFTER_PRE_CHAIN:
- cmd->state.suspend_resume = SR_IN_CHAIN_AFTER_PRE_CHAIN;
- break;
- case SR_IN_PRE_CHAIN:
- case SR_IN_CHAIN:
- case SR_IN_CHAIN_AFTER_PRE_CHAIN:
- unreachable("suspending render pass not followed by resuming pass");
- break;
- }
- }
+ TU_DRAW_STATE_COUNT,
+};
- if (resuming && cmd->state.suspend_resume == SR_NONE)
- cmd->state.suspend_resume = SR_IN_PRE_CHAIN;
-}
+struct tu_draw_state_group
+{
+ enum tu_draw_state_group_id id;
+ uint32_t enable_mask;
+ const struct tu_cs_entry *ib;
+};
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
- const VkSubpassBeginInfo *pSubpassBeginInfo,
- const VkSubpassEndInfo *pSubpassEndInfo)
+static void
+tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
+ struct tu_cs *cs,
+ const struct tu_draw_info *draw)
{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
+ const struct tu_pipeline *pipeline = cmd->state.pipeline;
+ const struct tu_dynamic_state *dynamic = &cmd->state.dynamic;
+ struct tu_draw_state_group draw_state_groups[TU_DRAW_STATE_COUNT];
+ uint32_t draw_state_group_count = 0;
- if (unlikely(cmd->device->instance->debug_flags & TU_DEBUG_DYNAMIC)) {
- vk_common_CmdNextSubpass2(commandBuffer, pSubpassBeginInfo,
- pSubpassEndInfo);
+ VkResult result = tu_cs_reserve_space(cmd->device, cs, 256);
+ if (result != VK_SUCCESS) {
+ cmd->record_result = result;
return;
}
- const struct tu_render_pass *pass = cmd->state.pass;
- struct tu_cs *cs = &cmd->draw_cs;
- const struct tu_subpass *last_subpass = cmd->state.subpass;
-
- const struct tu_subpass *subpass = cmd->state.subpass++;
-
- /* Track LRZ valid state
- *
- * TODO: Improve this tracking to keep the state of past depth/stencil images,
- * so that if they become active again, we can reuse their old state.
- */
- if (last_subpass->depth_stencil_attachment.attachment != subpass->depth_stencil_attachment.attachment) {
- cmd->state.lrz.valid = false;
- cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
- }
-
- tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
+ /* TODO lrz */
- if (subpass->resolve_attachments) {
- tu6_emit_blit_scissor(cmd, cs, true);
+ uint32_t pc_primitive_cntl = 0;
+ if (pipeline->ia.primitive_restart && draw->indexed)
+ pc_primitive_cntl |= A6XX_PC_PRIMITIVE_CNTL_0_PRIMITIVE_RESTART;
- for (unsigned i = 0; i < subpass->resolve_count; i++) {
- uint32_t a = subpass->resolve_attachments[i].attachment;
- if (a == VK_ATTACHMENT_UNUSED)
- continue;
+ tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9806, 0);
+ tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9990, 0);
+ tu_cs_emit_write_reg(cs, REG_A6XX_VFD_UNKNOWN_A008, 0);
- uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i);
+ tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_0, 1);
+ tu_cs_emit(cs, pc_primitive_cntl);
- tu_store_gmem_attachment(cmd, cs, a, gmem_a, false);
-
- if (!pass->attachments[a].gmem)
- continue;
-
- /* check if the resolved attachment is needed by later subpasses;
- * if it is, we should do a GMEM->GMEM resolve instead of GMEM->MEM->GMEM.
- */
- perf_debug(cmd->device, "TODO: missing GMEM->GMEM resolve path\n");
- tu_load_gmem_attachment(cmd, cs, a, false, true);
- }
+ if (cmd->state.dirty &
+ (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH) &&
+ (pipeline->dynamic_state.mask & TU_DYNAMIC_LINE_WIDTH)) {
+ tu6_emit_gras_su_cntl(cs, pipeline->rast.gras_su_cntl,
+ dynamic->line_width);
}
- tu_cond_exec_end(cs);
-
- tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
-
- tu6_emit_sysmem_resolves(cmd, cs, subpass);
-
- tu_cond_exec_end(cs);
-
- /* Handle dependencies for the next subpass */
- tu_subpass_barrier(cmd, &cmd->state.subpass->start_barrier, false);
-
- if (cmd->state.subpass->feedback_invalidate)
- cmd->state.renderpass_cache.flush_bits |= TU_CMD_FLAG_CACHE_INVALIDATE;
-
- tu_emit_subpass_begin(cmd);
-}
-
-static uint32_t
-tu6_user_consts_size(const struct tu_pipeline *pipeline,
- gl_shader_stage type)
-{
- const struct tu_program_descriptor_linkage *link =
- &pipeline->program.link[type];
- uint32_t dwords = 0;
-
- if (link->tu_const_state.push_consts.dwords > 0) {
- unsigned num_units = link->tu_const_state.push_consts.dwords;
- dwords += 4 + num_units;
+ if ((cmd->state.dirty & TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK) &&
+ (pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_COMPARE_MASK)) {
+ tu6_emit_stencil_compare_mask(cs, dynamic->stencil_compare_mask.front,
+ dynamic->stencil_compare_mask.back);
}
- return dwords;
-}
-
-static void
-tu6_emit_user_consts(struct tu_cs *cs,
- const struct tu_pipeline *pipeline,
- gl_shader_stage type,
- uint32_t *push_constants)
-{
- const struct tu_program_descriptor_linkage *link =
- &pipeline->program.link[type];
-
- if (link->tu_const_state.push_consts.dwords > 0) {
- unsigned num_units = link->tu_const_state.push_consts.dwords;
- unsigned offset = link->tu_const_state.push_consts.lo;
-
- /* DST_OFF and NUM_UNIT require vec4 units */
- tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_units);
- tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset / 4) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
- CP_LOAD_STATE6_0_NUM_UNIT(num_units / 4));
- tu_cs_emit(cs, 0);
- tu_cs_emit(cs, 0);
- for (unsigned i = 0; i < num_units; i++)
- tu_cs_emit(cs, push_constants[i + offset]);
+ if ((cmd->state.dirty & TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK) &&
+ (pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_WRITE_MASK)) {
+ tu6_emit_stencil_write_mask(cs, dynamic->stencil_write_mask.front,
+ dynamic->stencil_write_mask.back);
}
-}
-static void
-tu6_emit_shared_consts(struct tu_cs *cs,
- const struct tu_pipeline *pipeline,
- uint32_t *push_constants,
- bool compute)
-{
- if (pipeline->shared_consts.dwords > 0) {
- /* Offset and num_units for shared consts are in units of dwords. */
- unsigned num_units = pipeline->shared_consts.dwords;
- unsigned offset = pipeline->shared_consts.lo;
-
- enum a6xx_state_type st = compute ? ST6_UBO : ST6_CONSTANTS;
- uint32_t cp_load_state = compute ? CP_LOAD_STATE6_FRAG : CP_LOAD_STATE6;
-
- tu_cs_emit_pkt7(cs, cp_load_state, 3 + num_units);
- tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
- CP_LOAD_STATE6_0_STATE_TYPE(st) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(SB6_IBO) |
- CP_LOAD_STATE6_0_NUM_UNIT(num_units));
- tu_cs_emit(cs, 0);
- tu_cs_emit(cs, 0);
-
- for (unsigned i = 0; i < num_units; i++)
- tu_cs_emit(cs, push_constants[i + offset]);
+ if ((cmd->state.dirty & TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE) &&
+ (pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_REFERENCE)) {
+ tu6_emit_stencil_reference(cs, dynamic->stencil_reference.front,
+ dynamic->stencil_reference.back);
}
-}
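/* A minimal worked example (not driver code) of the unit difference between
 * the two helpers above: the user-const path encodes NUM_UNIT in vec4 units,
 * while the shared-const path encodes it in dwords. The 16-dword range below
 * is a hypothetical value chosen only for illustration.
 */
#include <stdint.h>

static void
num_unit_example(void)
{
   uint32_t push_const_dwords = 16;                   /* hypothetical range */
   uint32_t user_num_unit   = push_const_dwords / 4;  /* 4 vec4 units */
   uint32_t shared_num_unit = push_const_dwords;      /* 16 dword units */
   (void) user_num_unit;
   (void) shared_num_unit;
}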
-static uint32_t
-tu6_const_size(struct tu_cmd_buffer *cmd,
- const struct tu_pipeline *pipeline,
- bool compute)
-{
- uint32_t dwords = 0;
+ if (cmd->state.dirty &
+ (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_VERTEX_BUFFERS)) {
+ for (uint32_t i = 0; i < pipeline->vi.count; i++) {
+ const uint32_t binding = pipeline->vi.bindings[i];
+ const uint32_t stride = pipeline->vi.strides[i];
+ const struct tu_buffer *buf = cmd->state.vb.buffers[binding];
+ const VkDeviceSize offset = buf->bo_offset +
+ cmd->state.vb.offsets[binding] +
+ pipeline->vi.offsets[i];
+ const VkDeviceSize size =
+ offset < buf->bo->size ? buf->bo->size - offset : 0;
- if (pipeline->shared_consts.dwords > 0) {
- dwords = pipeline->shared_consts.dwords + 4;
- } else {
- if (compute) {
- dwords = tu6_user_consts_size(pipeline, MESA_SHADER_COMPUTE);
- } else {
- for (uint32_t type = MESA_SHADER_VERTEX; type <= MESA_SHADER_FRAGMENT; type++)
- dwords += tu6_user_consts_size(pipeline, type);
+ tu_cs_emit_pkt4(cs, REG_A6XX_VFD_FETCH(i), 4);
+ tu_cs_emit_qw(cs, buf->bo->iova + offset);
+ tu_cs_emit(cs, size);
+ tu_cs_emit(cs, stride);
}
}
- return dwords;
-}
-
-static struct tu_draw_state
-tu6_emit_consts(struct tu_cmd_buffer *cmd,
- const struct tu_pipeline *pipeline,
- bool compute)
-{
- uint32_t dwords = 0;
-
- dwords = tu6_const_size(cmd, pipeline, compute);
-
- if (dwords == 0)
- return (struct tu_draw_state) {};
-
- struct tu_cs cs;
- tu_cs_begin_sub_stream(&cmd->sub_cs, dwords, &cs);
-
- if (pipeline->shared_consts.dwords > 0) {
- tu6_emit_shared_consts(&cs, pipeline, cmd->push_constants, compute);
-
- for (uint32_t i = 0; i < ARRAY_SIZE(pipeline->program.link); i++) {
- const struct tu_program_descriptor_linkage *link =
- &pipeline->program.link[i];
- assert(!link->tu_const_state.push_consts.dwords);
- }
- } else {
- if (compute) {
- tu6_emit_user_consts(&cs, pipeline, MESA_SHADER_COMPUTE, cmd->push_constants);
+ /* TODO shader consts */
+
+ if (cmd->state.dirty & TU_CMD_DIRTY_PIPELINE) {
+ draw_state_groups[draw_state_group_count++] =
+ (struct tu_draw_state_group) {
+ .id = TU_DRAW_STATE_PROGRAM,
+ .enable_mask = 0x6,
+ .ib = &pipeline->program.state_ib,
+ };
+ draw_state_groups[draw_state_group_count++] =
+ (struct tu_draw_state_group) {
+ .id = TU_DRAW_STATE_PROGRAM_BINNING,
+ .enable_mask = 0x1,
+ .ib = &pipeline->program.binning_state_ib,
+ };
+ draw_state_groups[draw_state_group_count++] =
+ (struct tu_draw_state_group) {
+ .id = TU_DRAW_STATE_VI,
+ .enable_mask = 0x6,
+ .ib = &pipeline->vi.state_ib,
+ };
+ draw_state_groups[draw_state_group_count++] =
+ (struct tu_draw_state_group) {
+ .id = TU_DRAW_STATE_VI_BINNING,
+ .enable_mask = 0x1,
+ .ib = &pipeline->vi.binning_state_ib,
+ };
+ draw_state_groups[draw_state_group_count++] =
+ (struct tu_draw_state_group) {
+ .id = TU_DRAW_STATE_VP,
+ .enable_mask = 0x7,
+ .ib = &pipeline->vp.state_ib,
+ };
+ draw_state_groups[draw_state_group_count++] =
+ (struct tu_draw_state_group) {
+ .id = TU_DRAW_STATE_RAST,
+ .enable_mask = 0x7,
+ .ib = &pipeline->rast.state_ib,
+ };
+ draw_state_groups[draw_state_group_count++] =
+ (struct tu_draw_state_group) {
+ .id = TU_DRAW_STATE_DS,
+ .enable_mask = 0x7,
+ .ib = &pipeline->ds.state_ib,
+ };
+ draw_state_groups[draw_state_group_count++] =
+ (struct tu_draw_state_group) {
+ .id = TU_DRAW_STATE_BLEND,
+ .enable_mask = 0x7,
+ .ib = &pipeline->blend.state_ib,
+ };
+ }
+
+ tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * draw_state_group_count);
+ for (uint32_t i = 0; i < draw_state_group_count; i++) {
+ const struct tu_draw_state_group *group = &draw_state_groups[i];
+
+ uint32_t cp_set_draw_state =
+ CP_SET_DRAW_STATE__0_COUNT(group->ib->size / 4) |
+ CP_SET_DRAW_STATE__0_ENABLE_MASK(group->enable_mask) |
+ CP_SET_DRAW_STATE__0_GROUP_ID(group->id);
+ uint64_t iova;
+ if (group->ib->size) {
+ iova = group->ib->bo->iova + group->ib->offset;
} else {
- for (uint32_t type = MESA_SHADER_VERTEX; type <= MESA_SHADER_FRAGMENT; type++)
- tu6_emit_user_consts(&cs, pipeline, type, cmd->push_constants);
+ cp_set_draw_state |= CP_SET_DRAW_STATE__0_DISABLE;
+ iova = 0;
}
- }
- return tu_cs_end_draw_state(&cmd->sub_cs, &cs);
-}
-
-/* Various frontends (ANGLE, zink at least) will enable stencil testing with
- * what works out to be no-op writes. Simplify what they give us into flags
- * that LRZ can use.
- */
-static void
-tu6_update_simplified_stencil_state(struct tu_cmd_buffer *cmd)
-{
- bool stencil_test_enable =
- cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE;
-
- if (!stencil_test_enable) {
- cmd->state.stencil_front_write = false;
- cmd->state.stencil_back_write = false;
- return;
- }
-
- bool stencil_front_writemask =
- (cmd->state.pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
- (cmd->state.dynamic_stencil_wrmask & 0xff) :
- (cmd->state.pipeline->ds.stencil_wrmask & 0xff);
-
- bool stencil_back_writemask =
- (cmd->state.pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
- ((cmd->state.dynamic_stencil_wrmask & 0xff00) >> 8) :
- (cmd->state.pipeline->ds.stencil_wrmask & 0xff00) >> 8;
-
- VkStencilOp front_fail_op =
- (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FAIL__MASK) >> A6XX_RB_STENCIL_CONTROL_FAIL__SHIFT;
- VkStencilOp front_pass_op =
- (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_ZPASS__MASK) >> A6XX_RB_STENCIL_CONTROL_ZPASS__SHIFT;
- VkStencilOp front_depth_fail_op =
- (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_ZFAIL__MASK) >> A6XX_RB_STENCIL_CONTROL_ZFAIL__SHIFT;
- VkStencilOp back_fail_op =
- (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FAIL_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_FAIL_BF__SHIFT;
- VkStencilOp back_pass_op =
- (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_ZPASS_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_ZPASS_BF__SHIFT;
- VkStencilOp back_depth_fail_op =
- (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__SHIFT;
-
- bool stencil_front_op_writes =
- front_pass_op != VK_STENCIL_OP_KEEP ||
- front_fail_op != VK_STENCIL_OP_KEEP ||
- front_depth_fail_op != VK_STENCIL_OP_KEEP;
-
- bool stencil_back_op_writes =
- back_pass_op != VK_STENCIL_OP_KEEP ||
- back_fail_op != VK_STENCIL_OP_KEEP ||
- back_depth_fail_op != VK_STENCIL_OP_KEEP;
-
- cmd->state.stencil_front_write =
- stencil_front_op_writes && stencil_front_writemask;
- cmd->state.stencil_back_write =
- stencil_back_op_writes && stencil_back_writemask;
-}
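/* A condensed, illustrative restatement (not driver code) of the rule the
 * function above implements: a stencil face only writes when the test is
 * enabled, its write mask is non-zero, and at least one op is not KEEP.
 * All parameters are assumed inputs for the sketch.
 */
#include <stdbool.h>
#include <stdint.h>
#include <vulkan/vulkan_core.h>

static bool
stencil_face_writes(bool test_enable, uint8_t write_mask,
                    VkStencilOp fail_op, VkStencilOp pass_op,
                    VkStencilOp depth_fail_op)
{
   if (!test_enable || write_mask == 0)
      return false;
   return fail_op != VK_STENCIL_OP_KEEP ||
          pass_op != VK_STENCIL_OP_KEEP ||
          depth_fail_op != VK_STENCIL_OP_KEEP;
}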
-
-static bool
-tu6_writes_depth(struct tu_cmd_buffer *cmd, bool depth_test_enable)
-{
- bool depth_write_enable =
- cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
-
- VkCompareOp depth_compare_op =
- (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_ZFUNC__MASK) >> A6XX_RB_DEPTH_CNTL_ZFUNC__SHIFT;
-
- bool depth_compare_op_writes = depth_compare_op != VK_COMPARE_OP_NEVER;
-
- return depth_test_enable && depth_write_enable && depth_compare_op_writes;
-}
-
-static bool
-tu6_writes_stencil(struct tu_cmd_buffer *cmd)
-{
- return cmd->state.stencil_front_write || cmd->state.stencil_back_write;
-}
-
-static void
-tu6_build_depth_plane_z_mode(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
-{
- enum a6xx_ztest_mode zmode = A6XX_EARLY_Z;
- bool depth_test_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE;
- bool depth_write = tu6_writes_depth(cmd, depth_test_enable);
- bool stencil_write = tu6_writes_stencil(cmd);
-
- if ((cmd->state.pipeline->lrz.fs.has_kill ||
- cmd->state.pipeline->output.subpass_feedback_loop_ds) &&
- (depth_write || stencil_write)) {
- zmode = (cmd->state.lrz.valid && cmd->state.lrz.enabled)
- ? A6XX_EARLY_LRZ_LATE_Z
- : A6XX_LATE_Z;
+ tu_cs_emit(cs, cp_set_draw_state);
+ tu_cs_emit_qw(cs, iova);
}
- if ((cmd->state.pipeline->lrz.force_late_z &&
- !cmd->state.pipeline->lrz.fs.force_early_z) || !depth_test_enable)
- zmode = A6XX_LATE_Z;
-
- /* User-defined early tests take precedence over all else */
- if (cmd->state.pipeline->lrz.fs.early_fragment_tests)
- zmode = A6XX_EARLY_Z;
-
- tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_DEPTH_PLANE_CNTL, 1);
- tu_cs_emit(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL_Z_MODE(zmode));
-
- tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_PLANE_CNTL, 1);
- tu_cs_emit(cs, A6XX_RB_DEPTH_PLANE_CNTL_Z_MODE(zmode));
-}
+ tu_cs_sanity_check(cs);
-static void
-tu6_emit_blend(struct tu_cs *cs, struct tu_cmd_buffer *cmd)
-{
- struct tu_pipeline *pipeline = cmd->state.pipeline;
- uint32_t color_write_enable = cmd->state.pipeline_color_write_enable;
-
- if (pipeline->dynamic_state_mask &
- BIT(TU_DYNAMIC_STATE_COLOR_WRITE_ENABLE))
- color_write_enable &= cmd->state.color_write_enable;
-
- for (unsigned i = 0; i < pipeline->blend.num_rts; i++) {
- tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_CONTROL(i), 2);
- if (color_write_enable & BIT(i)) {
- tu_cs_emit(cs, cmd->state.rb_mrt_control[i] |
- ((cmd->state.logic_op_enabled ?
- cmd->state.rb_mrt_control_rop : 0) &
- ~pipeline->blend.rb_mrt_control_mask));
- tu_cs_emit(cs, cmd->state.rb_mrt_blend_control[i]);
- } else {
- tu_cs_emit(cs, 0);
- tu_cs_emit(cs, 0);
+ /* track BOs */
+ if (cmd->state.dirty & TU_CMD_DIRTY_PIPELINE) {
+ tu_bo_list_add(&cmd->bo_list, &pipeline->program.binary_bo,
+ MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
+ for (uint32_t i = 0; i < pipeline->cs.bo_count; i++) {
+ tu_bo_list_add(&cmd->bo_list, pipeline->cs.bos[i],
+ MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
}
}
-
- uint32_t blend_enable_mask = color_write_enable;
- if (!(cmd->state.logic_op_enabled && cmd->state.rop_reads_dst))
- blend_enable_mask &= cmd->state.pipeline_blend_enable;
-
- tu_cs_emit_regs(cs, A6XX_SP_FS_OUTPUT_CNTL1(.mrt = pipeline->blend.num_rts));
- tu_cs_emit_regs(cs, A6XX_RB_FS_OUTPUT_CNTL1(.mrt = pipeline->blend.num_rts));
- tu_cs_emit_pkt4(cs, REG_A6XX_SP_BLEND_CNTL, 1);
- tu_cs_emit(cs, cmd->state.sp_blend_cntl |
- (A6XX_SP_BLEND_CNTL_ENABLE_BLEND(blend_enable_mask) &
- ~pipeline->blend.sp_blend_cntl_mask));
-
- tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLEND_CNTL, 1);
- tu_cs_emit(cs, cmd->state.rb_blend_cntl |
- (A6XX_RB_BLEND_CNTL_ENABLE_BLEND(blend_enable_mask) &
- ~pipeline->blend.rb_blend_cntl_mask));
-}
-
-static VkResult
-tu6_draw_common(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs,
- bool indexed,
- /* note: draw_count is 0 for indirect */
- uint32_t draw_count)
-{
- const struct tu_pipeline *pipeline = cmd->state.pipeline;
- struct tu_render_pass_state *rp = &cmd->state.rp;
-
- /* Fill draw stats for autotuner */
- rp->drawcall_count++;
-
- rp->drawcall_bandwidth_per_sample_sum +=
- pipeline->output.color_bandwidth_per_sample;
-
- /* add depth memory bandwidth cost */
- const uint32_t depth_bandwidth = pipeline->output.depth_cpp_per_sample;
- if (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE)
- rp->drawcall_bandwidth_per_sample_sum += depth_bandwidth;
- if (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE)
- rp->drawcall_bandwidth_per_sample_sum += depth_bandwidth;
-
- /* add stencil memory bandwidth cost */
- const uint32_t stencil_bandwidth = pipeline->output.stencil_cpp_per_sample;
- if (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE)
- rp->drawcall_bandwidth_per_sample_sum += stencil_bandwidth * 2;
-
- tu_emit_cache_flush_renderpass(cmd, cs);
-
- bool primitive_restart_enabled = pipeline->ia.primitive_restart;
- if (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE))
- primitive_restart_enabled = cmd->state.primitive_restart_enable;
-
- bool primitive_restart = primitive_restart_enabled && indexed;
- bool provoking_vtx_last = pipeline->rast.provoking_vertex_last;
- bool tess_upper_left_domain_origin =
- pipeline->tess.upper_left_domain_origin;
-
- struct tu_primitive_params* prim_params = &cmd->state.last_prim_params;
-
- if (!prim_params->valid ||
- prim_params->primitive_restart != primitive_restart ||
- prim_params->provoking_vtx_last != provoking_vtx_last ||
- prim_params->tess_upper_left_domain_origin !=
- tess_upper_left_domain_origin) {
- tu_cs_emit_regs(
- cs,
- A6XX_PC_PRIMITIVE_CNTL_0(.primitive_restart = primitive_restart,
- .provoking_vtx_last = provoking_vtx_last,
- .tess_upper_left_domain_origin =
- tess_upper_left_domain_origin));
- prim_params->valid = true;
- prim_params->primitive_restart = primitive_restart;
- prim_params->provoking_vtx_last = provoking_vtx_last;
- prim_params->tess_upper_left_domain_origin = tess_upper_left_domain_origin;
- }
-
- /* Early exit if there is nothing to emit, saves CPU cycles */
- uint32_t dirty = cmd->state.dirty;
- if (!(dirty & ~TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD))
- return VK_SUCCESS;
-
- bool dirty_lrz =
- dirty & (TU_CMD_DIRTY_LRZ | TU_CMD_DIRTY_RB_DEPTH_CNTL |
- TU_CMD_DIRTY_RB_STENCIL_CNTL | TU_CMD_DIRTY_BLEND);
-
- if (dirty_lrz) {
- struct tu_cs cs;
- uint32_t size = cmd->device->physical_device->info->a6xx.lrz_track_quirk ? 10 : 8;
-
- cmd->state.lrz_and_depth_plane_state =
- tu_cs_draw_state(&cmd->sub_cs, &cs, size);
- tu6_update_simplified_stencil_state(cmd);
- tu6_emit_lrz(cmd, &cs);
- tu6_build_depth_plane_z_mode(cmd, &cs);
- }
-
- if (dirty & TU_CMD_DIRTY_RASTERIZER_DISCARD) {
- struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_RASTERIZER_DISCARD, 4);
- tu_cs_emit_regs(&cs, A6XX_PC_RASTER_CNTL(.dword = cmd->state.pc_raster_cntl));
- tu_cs_emit_regs(&cs, A6XX_VPC_UNKNOWN_9107(.dword = cmd->state.vpc_unknown_9107));
- }
-
- if (dirty & TU_CMD_DIRTY_GRAS_SU_CNTL) {
- struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_GRAS_SU_CNTL, 2);
- tu_cs_emit_regs(&cs, A6XX_GRAS_SU_CNTL(.dword = cmd->state.gras_su_cntl));
- }
-
- if (dirty & TU_CMD_DIRTY_RB_DEPTH_CNTL) {
- struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_RB_DEPTH_CNTL, 2);
- uint32_t rb_depth_cntl = cmd->state.rb_depth_cntl;
-
- if ((rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE) ||
- (rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE))
- rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE;
-
- if ((rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE) &&
- !(rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE))
- tu6_apply_depth_bounds_workaround(cmd->device, &rb_depth_cntl);
-
- if (pipeline->output.rb_depth_cntl_disable)
- rb_depth_cntl = 0;
-
- tu_cs_emit_regs(&cs, A6XX_RB_DEPTH_CNTL(.dword = rb_depth_cntl));
- }
-
- if (dirty & TU_CMD_DIRTY_RB_STENCIL_CNTL) {
- struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_RB_STENCIL_CNTL, 2);
- tu_cs_emit_regs(&cs, A6XX_RB_STENCIL_CONTROL(.dword = cmd->state.rb_stencil_cntl));
- }
-
- if (dirty & TU_CMD_DIRTY_SHADER_CONSTS)
- cmd->state.shader_const = tu6_emit_consts(cmd, pipeline, false);
-
- if (dirty & TU_CMD_DIRTY_VIEWPORTS) {
- struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_VIEWPORT, 8 + 10 * cmd->state.max_viewport);
- tu6_emit_viewport(&cs, cmd->state.viewport, cmd->state.max_viewport,
- pipeline->viewport.z_negative_one_to_one);
- }
-
- if (dirty & TU_CMD_DIRTY_BLEND) {
- struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_BLEND,
- 8 + 3 * cmd->state.pipeline->blend.num_rts);
- tu6_emit_blend(&cs, cmd);
- }
-
- if (dirty & TU_CMD_DIRTY_PATCH_CONTROL_POINTS) {
- bool tess = cmd->state.pipeline->active_stages &
- VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
- uint32_t state_size = TU6_EMIT_PATCH_CONTROL_POINTS_DWORDS(
- pipeline->program.hs_param_dwords);
- struct tu_cs cs = tu_cmd_dynamic_state(
- cmd, TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS, tess ? state_size : 0);
- tu6_emit_patch_control_points(&cs, cmd->state.pipeline,
- cmd->state.patch_control_points);
- }
-
- /* for the first draw in a renderpass, re-emit all the draw states
- *
- * and if a draw-state disabling path (CmdClearAttachments 3D fallback) was
- * used, then draw states must be re-emitted. note however this only happens
- * in the sysmem path, so this can be skipped for the gmem path (TODO)
- *
- * the two input attachment states are excluded because secondary command
- * buffers don't have a state ib to restore them, and not re-emitting them
- * is OK since CmdClearAttachments won't disable/overwrite them
- */
- if (dirty & TU_CMD_DIRTY_DRAW_STATE) {
- tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
-
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_CONFIG, pipeline->program.config_state);
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state);
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state);
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_RAST, pipeline->rast.state);
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_SYSMEM, pipeline->prim_order.state_sysmem);
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_GMEM, pipeline->prim_order.state_gmem);
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_CONST, cmd->state.shader_const);
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets);
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS_LOAD, pipeline->load_state);
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VB, cmd->state.vertex_buffers);
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params);
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE, cmd->state.lrz_and_depth_plane_state);
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_MSAA, cmd->state.msaa);
-
- for (uint32_t i = 0; i < ARRAY_SIZE(cmd->state.dynamic_state); i++) {
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + i,
- ((pipeline->dynamic_state_mask & BIT(i)) ?
- cmd->state.dynamic_state[i] :
- pipeline->dynamic_state[i]));
- }
- } else {
- /* emit draw states that were just updated
- * note we eventually don't want to have to emit anything here
- */
- bool emit_binding_stride = false, emit_blend = false,
- emit_patch_control_points = false;
- uint32_t draw_state_count =
- ((dirty & TU_CMD_DIRTY_SHADER_CONSTS) ? 1 : 0) +
- ((dirty & TU_CMD_DIRTY_DESC_SETS_LOAD) ? 1 : 0) +
- ((dirty & TU_CMD_DIRTY_VERTEX_BUFFERS) ? 1 : 0) +
- ((dirty & TU_CMD_DIRTY_VS_PARAMS) ? 1 : 0) +
- (dirty_lrz ? 1 : 0);
-
- if ((dirty & TU_CMD_DIRTY_VB_STRIDE) &&
- (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_VB_STRIDE))) {
- emit_binding_stride = true;
- draw_state_count += 1;
- }
-
- if ((dirty & TU_CMD_DIRTY_BLEND) &&
- (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_BLEND))) {
- emit_blend = true;
- draw_state_count += 1;
- }
-
- if ((dirty & TU_CMD_DIRTY_PATCH_CONTROL_POINTS) &&
- (pipeline->dynamic_state_mask &
- BIT(TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS))) {
- emit_patch_control_points = true;
- draw_state_count += 1;
- }
-
- if (draw_state_count > 0)
- tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * draw_state_count);
-
- if (dirty & TU_CMD_DIRTY_SHADER_CONSTS)
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_CONST, cmd->state.shader_const);
- if (dirty & TU_CMD_DIRTY_DESC_SETS_LOAD)
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS_LOAD, pipeline->load_state);
- if (dirty & TU_CMD_DIRTY_VERTEX_BUFFERS)
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VB, cmd->state.vertex_buffers);
- if (emit_binding_stride) {
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_VB_STRIDE,
- cmd->state.dynamic_state[TU_DYNAMIC_STATE_VB_STRIDE]);
- }
- if (emit_blend) {
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_BLEND,
- cmd->state.dynamic_state[TU_DYNAMIC_STATE_BLEND]);
- }
- if (emit_patch_control_points) {
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS,
- cmd->state.dynamic_state[TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS]);
- }
- if (dirty & TU_CMD_DIRTY_VS_PARAMS)
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params);
-
- if (dirty_lrz) {
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE, cmd->state.lrz_and_depth_plane_state);
+ if (cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS) {
+ for (uint32_t i = 0; i < MAX_VBS; i++) {
+ const struct tu_buffer *buf = cmd->state.vb.buffers[i];
+ if (buf)
+ tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
}
}
- tu_cs_sanity_check(cs);
-
- /* There are too many graphics dirty bits to list here, so just list the
- * bits to preserve instead. The only state not emitted here is
- * compute-related state.
- */
- cmd->state.dirty &= TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD;
- return VK_SUCCESS;
+ cmd->state.dirty = 0;
}
-static uint32_t
-tu_draw_initiator(struct tu_cmd_buffer *cmd, enum pc_di_src_sel src_sel)
+static void
+tu6_emit_draw_direct(struct tu_cmd_buffer *cmd,
+ struct tu_cs *cs,
+ const struct tu_draw_info *draw)
{
- const struct tu_pipeline *pipeline = cmd->state.pipeline;
- enum pc_di_primtype primtype = cmd->state.primtype;
- if (primtype == DI_PT_PATCHES0)
- primtype += cmd->state.patch_control_points;
+ const enum pc_di_primtype primtype = cmd->state.pipeline->ia.primtype;
- uint32_t initiator =
- CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) |
- CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(src_sel) |
- CP_DRAW_INDX_OFFSET_0_INDEX_SIZE(cmd->state.index_size) |
- CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY);
+ tu_cs_emit_pkt4(cs, REG_A6XX_VFD_INDEX_OFFSET, 2);
+ tu_cs_emit(cs, draw->vertex_offset);
+ tu_cs_emit(cs, draw->first_instance);
- if (pipeline->active_stages & VK_SHADER_STAGE_GEOMETRY_BIT)
- initiator |= CP_DRAW_INDX_OFFSET_0_GS_ENABLE;
-
- switch (pipeline->tess.patch_type) {
- case IR3_TESS_TRIANGLES:
- initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_TRIANGLES) |
- CP_DRAW_INDX_OFFSET_0_TESS_ENABLE;
- break;
- case IR3_TESS_ISOLINES:
- initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_ISOLINES) |
- CP_DRAW_INDX_OFFSET_0_TESS_ENABLE;
- break;
- case IR3_TESS_NONE:
- initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_QUADS);
- break;
- case IR3_TESS_QUADS:
- initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_QUADS) |
- CP_DRAW_INDX_OFFSET_0_TESS_ENABLE;
- break;
- }
- return initiator;
-}
+ /* TODO hw binning */
+ if (draw->indexed) {
+ const enum a4xx_index_size index_size =
+ tu6_index_size(cmd->state.index_type);
+ const uint32_t index_bytes =
+ (cmd->state.index_type == VK_INDEX_TYPE_UINT32) ? 4 : 2;
+ const struct tu_buffer *buf = cmd->state.index_buffer;
+ const VkDeviceSize offset = buf->bo_offset + cmd->state.index_offset +
+ index_bytes * draw->first_index;
+ const uint32_t size = index_bytes * draw->count;
+ const uint32_t cp_draw_indx =
+ CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) |
+ CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_DMA) |
+ CP_DRAW_INDX_OFFSET_0_INDEX_SIZE(index_size) |
+ CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY) | 0x2000;
-static uint32_t
-vs_params_offset(struct tu_cmd_buffer *cmd)
-{
- const struct tu_program_descriptor_linkage *link =
- &cmd->state.pipeline->program.link[MESA_SHADER_VERTEX];
- const struct ir3_const_state *const_state = &link->const_state;
-
- if (const_state->offsets.driver_param >= link->constlen)
- return 0;
-
- /* this layout is required by CP_DRAW_INDIRECT_MULTI */
- STATIC_ASSERT(IR3_DP_DRAWID == 0);
- STATIC_ASSERT(IR3_DP_VTXID_BASE == 1);
- STATIC_ASSERT(IR3_DP_INSTID_BASE == 2);
-
- /* 0 means disabled for CP_DRAW_INDIRECT_MULTI */
- assert(const_state->offsets.driver_param != 0);
-
- return const_state->offsets.driver_param;
-}
+ tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 7);
+ tu_cs_emit(cs, cp_draw_indx);
+ tu_cs_emit(cs, draw->instance_count);
+ tu_cs_emit(cs, draw->count);
+ tu_cs_emit(cs, 0x0); /* XXX */
+ tu_cs_emit_qw(cs, buf->bo->iova + offset);
+ tu_cs_emit(cs, size);
+ } else {
+ const uint32_t cp_draw_indx =
+ CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) |
+ CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
+ CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY) | 0x2000;
-static void
-tu6_emit_empty_vs_params(struct tu_cmd_buffer *cmd)
-{
- if (cmd->state.vs_params.iova) {
- cmd->state.vs_params = (struct tu_draw_state) {};
- cmd->state.dirty |= TU_CMD_DIRTY_VS_PARAMS;
+ tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
+ tu_cs_emit(cs, cp_draw_indx);
+ tu_cs_emit(cs, draw->instance_count);
+ tu_cs_emit(cs, draw->count);
}
}
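/* A worked example (illustrative values, not driver code) for the indexed
 * path of the function above: with VK_INDEX_TYPE_UINT16, firstIndex = 10 and
 * indexCount = 100, index_bytes is 2, so the DMA source starts 20 bytes past
 * the bound index-buffer offset and spans 200 bytes.
 */
#include <stdint.h>

static void
indexed_draw_example(void)
{
   uint32_t index_bytes = 2;                       /* VK_INDEX_TYPE_UINT16 */
   uint32_t first_index = 10, index_count = 100;   /* hypothetical draw */
   uint32_t src_skip  = index_bytes * first_index; /* 20 bytes  */
   uint32_t src_bytes = index_bytes * index_count; /* 200 bytes */
   (void) src_skip;
   (void) src_bytes;
}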
static void
-tu6_emit_vs_params(struct tu_cmd_buffer *cmd,
- uint32_t draw_id,
- uint32_t vertex_offset,
- uint32_t first_instance)
+tu_draw(struct tu_cmd_buffer *cmd, const struct tu_draw_info *draw)
{
- uint32_t offset = vs_params_offset(cmd);
+ struct tu_cs *cs = &cmd->draw_cs;
- /* Besides re-emitting params when they are changed, we should re-emit
- * them after constants are invalidated via HLSQ_INVALIDATE_CMD.
- */
- if (!(cmd->state.dirty & (TU_CMD_DIRTY_DRAW_STATE | TU_CMD_DIRTY_VS_PARAMS)) &&
- (offset == 0 || draw_id == cmd->state.last_vs_params.draw_id) &&
- vertex_offset == cmd->state.last_vs_params.vertex_offset &&
- first_instance == cmd->state.last_vs_params.first_instance) {
- return;
- }
+ tu6_bind_draw_states(cmd, cs, draw);
- struct tu_cs cs;
- VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 3 + (offset ? 8 : 0), &cs);
+ VkResult result = tu_cs_reserve_space(cmd->device, cs, 32);
if (result != VK_SUCCESS) {
- vk_command_buffer_set_error(&cmd->vk, result);
+ cmd->record_result = result;
return;
}
- tu_cs_emit_regs(&cs,
- A6XX_VFD_INDEX_OFFSET(vertex_offset),
- A6XX_VFD_INSTANCE_START_OFFSET(first_instance));
-
- if (offset) {
- tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
- tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
- CP_LOAD_STATE6_0_NUM_UNIT(1));
- tu_cs_emit(&cs, 0);
- tu_cs_emit(&cs, 0);
-
- tu_cs_emit(&cs, draw_id);
- tu_cs_emit(&cs, vertex_offset);
- tu_cs_emit(&cs, first_instance);
- tu_cs_emit(&cs, 0);
+ if (draw->indirect) {
+ tu_finishme("indirect draw");
+ return;
}
- cmd->state.last_vs_params.vertex_offset = vertex_offset;
- cmd->state.last_vs_params.first_instance = first_instance;
- cmd->state.last_vs_params.draw_id = draw_id;
+ /* TODO tu6_emit_marker should pick different regs depending on cs */
+ tu6_emit_marker(cmd, cs);
+ tu6_emit_draw_direct(cmd, cs, draw);
+ tu6_emit_marker(cmd, cs);
- struct tu_cs_entry entry = tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
- cmd->state.vs_params = (struct tu_draw_state) {entry.bo->iova + entry.offset, entry.size / 4};
+ cmd->wait_for_idle = true;
- cmd->state.dirty |= TU_CMD_DIRTY_VS_PARAMS;
+ tu_cs_sanity_check(cs);
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_CmdDraw(VkCommandBuffer commandBuffer,
uint32_t vertexCount,
uint32_t instanceCount,
uint32_t firstVertex,
uint32_t firstInstance)
{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- struct tu_cs *cs = &cmd->draw_cs;
-
- tu6_emit_vs_params(cmd, 0, firstVertex, firstInstance);
-
- tu6_draw_common(cmd, cs, false, vertexCount);
-
- tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
- tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));
- tu_cs_emit(cs, instanceCount);
- tu_cs_emit(cs, vertexCount);
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdDrawMultiEXT(VkCommandBuffer commandBuffer,
- uint32_t drawCount,
- const VkMultiDrawInfoEXT *pVertexInfo,
- uint32_t instanceCount,
- uint32_t firstInstance,
- uint32_t stride)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- struct tu_cs *cs = &cmd->draw_cs;
-
- if (!drawCount)
- return;
-
- bool has_tess =
- cmd->state.pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
-
- uint32_t max_vertex_count = 0;
- if (has_tess) {
- uint32_t i = 0;
- vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
- max_vertex_count = MAX2(max_vertex_count, draw->vertexCount);
- }
- }
-
- uint32_t i = 0;
- vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
- tu6_emit_vs_params(cmd, i, draw->firstVertex, firstInstance);
-
- if (i == 0)
- tu6_draw_common(cmd, cs, false, max_vertex_count);
+ TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
+ struct tu_draw_info info = {};
- if (cmd->state.dirty & TU_CMD_DIRTY_VS_PARAMS) {
- tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params);
- cmd->state.dirty &= ~TU_CMD_DIRTY_VS_PARAMS;
- }
+ info.count = vertexCount;
+ info.instance_count = instanceCount;
+ info.first_instance = firstInstance;
+ info.vertex_offset = firstVertex;
- tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
- tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));
- tu_cs_emit(cs, instanceCount);
- tu_cs_emit(cs, draw->vertexCount);
- }
+ tu_draw(cmd_buffer, &info);
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_CmdDrawIndexed(VkCommandBuffer commandBuffer,
uint32_t indexCount,
uint32_t instanceCount,
@@ -4828,239 +2377,56 @@ tu_CmdDrawIndexed(VkCommandBuffer commandBuffer,
int32_t vertexOffset,
uint32_t firstInstance)
{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- struct tu_cs *cs = &cmd->draw_cs;
-
- tu6_emit_vs_params(cmd, 0, vertexOffset, firstInstance);
-
- tu6_draw_common(cmd, cs, true, indexCount);
-
- tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 7);
- tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));
- tu_cs_emit(cs, instanceCount);
- tu_cs_emit(cs, indexCount);
- tu_cs_emit(cs, firstIndex);
- tu_cs_emit_qw(cs, cmd->state.index_va);
- tu_cs_emit(cs, cmd->state.max_index_count);
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer,
- uint32_t drawCount,
- const VkMultiDrawIndexedInfoEXT *pIndexInfo,
- uint32_t instanceCount,
- uint32_t firstInstance,
- uint32_t stride,
- const int32_t *pVertexOffset)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- struct tu_cs *cs = &cmd->draw_cs;
-
- if (!drawCount)
- return;
-
- bool has_tess =
- cmd->state.pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
-
- uint32_t max_index_count = 0;
- if (has_tess) {
- uint32_t i = 0;
- vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
- max_index_count = MAX2(max_index_count, draw->indexCount);
- }
- }
-
- uint32_t i = 0;
- vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
- int32_t vertexOffset = pVertexOffset ? *pVertexOffset : draw->vertexOffset;
- tu6_emit_vs_params(cmd, i, vertexOffset, firstInstance);
-
- if (i == 0)
- tu6_draw_common(cmd, cs, true, max_index_count);
-
- if (cmd->state.dirty & TU_CMD_DIRTY_VS_PARAMS) {
- tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
- tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params);
- cmd->state.dirty &= ~TU_CMD_DIRTY_VS_PARAMS;
- }
+ TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
+ struct tu_draw_info info = {};
- tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 7);
- tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));
- tu_cs_emit(cs, instanceCount);
- tu_cs_emit(cs, draw->indexCount);
- tu_cs_emit(cs, draw->firstIndex);
- tu_cs_emit_qw(cs, cmd->state.index_va);
- tu_cs_emit(cs, cmd->state.max_index_count);
- }
-}
+ info.indexed = true;
+ info.count = indexCount;
+ info.instance_count = instanceCount;
+ info.first_index = firstIndex;
+ info.vertex_offset = vertexOffset;
+ info.first_instance = firstInstance;
-/* Various firmware bugs/inconsistencies mean that some indirect draw opcodes
- * do not wait for WFI's to complete before executing. Add a WAIT_FOR_ME if
- * pending for these opcodes. This may result in a few extra WAIT_FOR_ME's
- * with these opcodes, but the alternative would add unnecessary WAIT_FOR_ME's
- * before draw opcodes that don't need it.
- */
-static void
-draw_wfm(struct tu_cmd_buffer *cmd)
-{
- cmd->state.renderpass_cache.flush_bits |=
- cmd->state.renderpass_cache.pending_flush_bits & TU_CMD_FLAG_WAIT_FOR_ME;
- cmd->state.renderpass_cache.pending_flush_bits &= ~TU_CMD_FLAG_WAIT_FOR_ME;
+ tu_draw(cmd_buffer, &info);
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_CmdDrawIndirect(VkCommandBuffer commandBuffer,
VkBuffer _buffer,
VkDeviceSize offset,
uint32_t drawCount,
uint32_t stride)
{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- TU_FROM_HANDLE(tu_buffer, buf, _buffer);
- struct tu_cs *cs = &cmd->draw_cs;
-
- tu6_emit_empty_vs_params(cmd);
-
- if (cmd->device->physical_device->info->a6xx.indirect_draw_wfm_quirk)
- draw_wfm(cmd);
+ TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
+ TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
+ struct tu_draw_info info = {};
- tu6_draw_common(cmd, cs, false, 0);
+ info.count = drawCount;
+ info.indirect = buffer;
+ info.indirect_offset = offset;
+ info.stride = stride;
- tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 6);
- tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));
- tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_NORMAL) |
- A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));
- tu_cs_emit(cs, drawCount);
- tu_cs_emit_qw(cs, buf->iova + offset);
- tu_cs_emit(cs, stride);
+ tu_draw(cmd_buffer, &info);
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
VkBuffer _buffer,
VkDeviceSize offset,
uint32_t drawCount,
uint32_t stride)
{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- TU_FROM_HANDLE(tu_buffer, buf, _buffer);
- struct tu_cs *cs = &cmd->draw_cs;
-
- tu6_emit_empty_vs_params(cmd);
-
- if (cmd->device->physical_device->info->a6xx.indirect_draw_wfm_quirk)
- draw_wfm(cmd);
-
- tu6_draw_common(cmd, cs, true, 0);
-
- tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 9);
- tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));
- tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDEXED) |
- A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));
- tu_cs_emit(cs, drawCount);
- tu_cs_emit_qw(cs, cmd->state.index_va);
- tu_cs_emit(cs, cmd->state.max_index_count);
- tu_cs_emit_qw(cs, buf->iova + offset);
- tu_cs_emit(cs, stride);
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdDrawIndirectCount(VkCommandBuffer commandBuffer,
- VkBuffer _buffer,
- VkDeviceSize offset,
- VkBuffer countBuffer,
- VkDeviceSize countBufferOffset,
- uint32_t drawCount,
- uint32_t stride)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- TU_FROM_HANDLE(tu_buffer, buf, _buffer);
- TU_FROM_HANDLE(tu_buffer, count_buf, countBuffer);
- struct tu_cs *cs = &cmd->draw_cs;
-
- tu6_emit_empty_vs_params(cmd);
-
- /* It turns out that the firmware we have for a650 only partially fixed the
- * problem with CP_DRAW_INDIRECT_MULTI not waiting for WFI's to complete
- * before reading indirect parameters. It waits for WFI's before reading
- * the draw parameters, but after reading the indirect count :(.
- */
- draw_wfm(cmd);
-
- tu6_draw_common(cmd, cs, false, 0);
-
- tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 8);
- tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));
- tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDIRECT_COUNT) |
- A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));
- tu_cs_emit(cs, drawCount);
- tu_cs_emit_qw(cs, buf->iova + offset);
- tu_cs_emit_qw(cs, count_buf->iova + countBufferOffset);
- tu_cs_emit(cs, stride);
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer,
- VkBuffer _buffer,
- VkDeviceSize offset,
- VkBuffer countBuffer,
- VkDeviceSize countBufferOffset,
- uint32_t drawCount,
- uint32_t stride)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- TU_FROM_HANDLE(tu_buffer, buf, _buffer);
- TU_FROM_HANDLE(tu_buffer, count_buf, countBuffer);
- struct tu_cs *cs = &cmd->draw_cs;
-
- tu6_emit_empty_vs_params(cmd);
-
- draw_wfm(cmd);
-
- tu6_draw_common(cmd, cs, true, 0);
-
- tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 11);
- tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));
- tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDIRECT_COUNT_INDEXED) |
- A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));
- tu_cs_emit(cs, drawCount);
- tu_cs_emit_qw(cs, cmd->state.index_va);
- tu_cs_emit(cs, cmd->state.max_index_count);
- tu_cs_emit_qw(cs, buf->iova + offset);
- tu_cs_emit_qw(cs, count_buf->iova + countBufferOffset);
- tu_cs_emit(cs, stride);
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,
- uint32_t instanceCount,
- uint32_t firstInstance,
- VkBuffer _counterBuffer,
- VkDeviceSize counterBufferOffset,
- uint32_t counterOffset,
- uint32_t vertexStride)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- TU_FROM_HANDLE(tu_buffer, buf, _counterBuffer);
- struct tu_cs *cs = &cmd->draw_cs;
-
- /* All known firmware versions do not wait for WFI's with CP_DRAW_AUTO.
- * Plus, for the common case where the counter buffer is written by
- * vkCmdEndTransformFeedback, we need to wait for the CP_WAIT_MEM_WRITES to
- * complete which means we need a WAIT_FOR_ME anyway.
- */
- draw_wfm(cmd);
-
- tu6_emit_vs_params(cmd, 0, 0, firstInstance);
+ TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
+ TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
+ struct tu_draw_info info = {};
- tu6_draw_common(cmd, cs, false, 0);
+ info.indexed = true;
+ info.count = drawCount;
+ info.indirect = buffer;
+ info.indirect_offset = offset;
+ info.stride = stride;
- tu_cs_emit_pkt7(cs, CP_DRAW_AUTO, 6);
- tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_XFB));
- tu_cs_emit(cs, instanceCount);
- tu_cs_emit_qw(cs, buf->iova + counterBufferOffset);
- tu_cs_emit(cs, counterOffset);
- tu_cs_emit(cs, vertexStride);
+ tu_draw(cmd_buffer, &info);
}
struct tu_dispatch_info
@@ -5088,221 +2454,12 @@ struct tu_dispatch_info
};
static void
-tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
- struct tu_cs *cs, struct tu_pipeline *pipeline,
- const struct tu_dispatch_info *info)
-{
- gl_shader_stage type = MESA_SHADER_COMPUTE;
- const struct tu_program_descriptor_linkage *link =
- &pipeline->program.link[type];
- const struct ir3_const_state *const_state = &link->const_state;
- uint32_t offset = const_state->offsets.driver_param;
- unsigned subgroup_size = pipeline->compute.subgroup_size;
- unsigned subgroup_shift = util_logbase2(subgroup_size);
-
- if (link->constlen <= offset)
- return;
-
- uint32_t num_consts = MIN2(const_state->num_driver_params,
- (link->constlen - offset) * 4);
-
- if (!info->indirect) {
- uint32_t driver_params[12] = {
- [IR3_DP_NUM_WORK_GROUPS_X] = info->blocks[0],
- [IR3_DP_NUM_WORK_GROUPS_Y] = info->blocks[1],
- [IR3_DP_NUM_WORK_GROUPS_Z] = info->blocks[2],
- [IR3_DP_BASE_GROUP_X] = info->offsets[0],
- [IR3_DP_BASE_GROUP_Y] = info->offsets[1],
- [IR3_DP_BASE_GROUP_Z] = info->offsets[2],
- [IR3_DP_CS_SUBGROUP_SIZE] = subgroup_size,
- [IR3_DP_SUBGROUP_ID_SHIFT] = subgroup_shift,
- };
-
- assert(num_consts <= ARRAY_SIZE(driver_params));
-
- /* push constants */
- tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_consts);
- tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
- CP_LOAD_STATE6_0_NUM_UNIT(num_consts / 4));
- tu_cs_emit(cs, 0);
- tu_cs_emit(cs, 0);
- uint32_t i;
- for (i = 0; i < num_consts; i++)
- tu_cs_emit(cs, driver_params[i]);
- } else if (!(info->indirect_offset & 0xf)) {
- tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
- tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
- CP_LOAD_STATE6_0_NUM_UNIT(1));
- tu_cs_emit_qw(cs, info->indirect->iova + info->indirect_offset);
- } else {
- /* Vulkan guarantees only 4 byte alignment for indirect_offset.
- * However, CP_LOAD_STATE.EXT_SRC_ADDR needs 16 byte alignment.
- */
-
- uint64_t indirect_iova = info->indirect->iova + info->indirect_offset;
-
- for (uint32_t i = 0; i < 3; i++) {
- tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
- tu_cs_emit(cs, 0);
- tu_cs_emit_qw(cs, global_iova(cmd, cs_indirect_xyz[i]));
- tu_cs_emit_qw(cs, indirect_iova + i * 4);
- }
-
- tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
- tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
-
- tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
- tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
- CP_LOAD_STATE6_0_NUM_UNIT(1));
- tu_cs_emit_qw(cs, global_iova(cmd, cs_indirect_xyz[0]));
- }
-
- /* Fill out IR3_DP_CS_SUBGROUP_SIZE and IR3_DP_SUBGROUP_ID_SHIFT for
- * indirect dispatch.
- */
- if (info->indirect && num_consts > IR3_DP_BASE_GROUP_X) {
- bool emit_local = num_consts > IR3_DP_LOCAL_GROUP_SIZE_X;
- tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 7 + (emit_local ? 4 : 0));
- tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset + (IR3_DP_BASE_GROUP_X / 4)) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
- CP_LOAD_STATE6_0_NUM_UNIT((num_consts - IR3_DP_BASE_GROUP_X) / 4));
- tu_cs_emit_qw(cs, 0);
- tu_cs_emit(cs, 0); /* BASE_GROUP_X */
- tu_cs_emit(cs, 0); /* BASE_GROUP_Y */
- tu_cs_emit(cs, 0); /* BASE_GROUP_Z */
- tu_cs_emit(cs, subgroup_size);
- if (emit_local) {
- assert(num_consts == align(IR3_DP_SUBGROUP_ID_SHIFT, 4));
- tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_X */
- tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Y */
- tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Z */
- tu_cs_emit(cs, subgroup_shift);
- }
- }
-}
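/* Illustrative only: which upload path the indirect-dispatch constants take,
 * per the alignment note in the function above. Vulkan guarantees just 4-byte
 * alignment for the indirect offset, while CP_LOAD_STATE's EXT_SRC_ADDR wants
 * 16 bytes, so unaligned offsets go through the CP_MEM_TO_MEM staging copy.
 */
#include <stdbool.h>
#include <stdint.h>

static bool
can_load_indirect_directly(uint64_t indirect_offset)
{
   /* e.g. 0x20 -> direct CP_LOAD_STATE, 0x24 -> staging copy first */
   return (indirect_offset & 0xf) == 0;
}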
-
-static void
-tu_dispatch(struct tu_cmd_buffer *cmd,
+tu_dispatch(struct tu_cmd_buffer *cmd_buffer,
const struct tu_dispatch_info *info)
{
- if (!info->indirect &&
- (info->blocks[0] == 0 || info->blocks[1] == 0 || info->blocks[2] == 0))
- return;
-
- struct tu_cs *cs = &cmd->cs;
- struct tu_pipeline *pipeline = cmd->state.compute_pipeline;
-
- bool emit_instrlen_workaround =
- pipeline->program.cs_instrlen >
- cmd->device->physical_device->info->a6xx.instr_cache_size;
-
- /* There appears to be a HW bug where, in some rare circumstances, the HW
- * accidentally uses the FS instrlen instead of the CS instrlen, which
- * affects all known gens. Based on various experiments it appears that the
- * issue is that when prefetching a branch destination and there is a cache
- * miss, when fetching from memory the HW bounds-checks the fetch against
- * SP_CS_INSTRLEN, except when one of the two register contexts is active
- * it accidentally fetches SP_FS_INSTRLEN from the other (inactive)
- * context. To work around it we set the FS instrlen here and do a dummy
- * event to roll the context (because it fetches SP_FS_INSTRLEN from the
- * "wrong" context). Because the bug seems to involve cache misses, we
- * don't emit this if the entire CS program fits in cache, which will
- * hopefully be the majority of cases.
- *
- * See https://gitlab.freedesktop.org/mesa/mesa/-/issues/5892
- */
- if (emit_instrlen_workaround) {
- tu_cs_emit_regs(cs, A6XX_SP_FS_INSTRLEN(pipeline->program.cs_instrlen));
- tu6_emit_event_write(cmd, cs, LABEL);
- }
-
- /* TODO: We could probably flush less if we add a compute_flush_bits
- * bitfield.
- */
- tu_emit_cache_flush(cmd, cs);
-
- /* note: no reason to have this in a separate IB */
- tu_cs_emit_state_ib(cs, tu6_emit_consts(cmd, pipeline, true));
-
- tu_emit_compute_driver_params(cmd, cs, pipeline, info);
-
- if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD)
- tu_cs_emit_state_ib(cs, pipeline->load_state);
-
- cmd->state.dirty &= ~TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD;
-
- tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
- tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE));
-
- const uint32_t *local_size = pipeline->compute.local_size;
- const uint32_t *num_groups = info->blocks;
- tu_cs_emit_regs(cs,
- A6XX_HLSQ_CS_NDRANGE_0(.kerneldim = 3,
- .localsizex = local_size[0] - 1,
- .localsizey = local_size[1] - 1,
- .localsizez = local_size[2] - 1),
- A6XX_HLSQ_CS_NDRANGE_1(.globalsize_x = local_size[0] * num_groups[0]),
- A6XX_HLSQ_CS_NDRANGE_2(.globaloff_x = 0),
- A6XX_HLSQ_CS_NDRANGE_3(.globalsize_y = local_size[1] * num_groups[1]),
- A6XX_HLSQ_CS_NDRANGE_4(.globaloff_y = 0),
- A6XX_HLSQ_CS_NDRANGE_5(.globalsize_z = local_size[2] * num_groups[2]),
- A6XX_HLSQ_CS_NDRANGE_6(.globaloff_z = 0));
-
- tu_cs_emit_regs(cs,
- A6XX_HLSQ_CS_KERNEL_GROUP_X(1),
- A6XX_HLSQ_CS_KERNEL_GROUP_Y(1),
- A6XX_HLSQ_CS_KERNEL_GROUP_Z(1));
-
- trace_start_compute(&cmd->trace, cs);
-
- if (info->indirect) {
- uint64_t iova = info->indirect->iova + info->indirect_offset;
-
- tu_cs_emit_pkt7(cs, CP_EXEC_CS_INDIRECT, 4);
- tu_cs_emit(cs, 0x00000000);
- tu_cs_emit_qw(cs, iova);
- tu_cs_emit(cs,
- A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) |
- A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) |
- A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1));
- } else {
- tu_cs_emit_pkt7(cs, CP_EXEC_CS, 4);
- tu_cs_emit(cs, 0x00000000);
- tu_cs_emit(cs, CP_EXEC_CS_1_NGROUPS_X(info->blocks[0]));
- tu_cs_emit(cs, CP_EXEC_CS_2_NGROUPS_Y(info->blocks[1]));
- tu_cs_emit(cs, CP_EXEC_CS_3_NGROUPS_Z(info->blocks[2]));
- }
-
- trace_end_compute(&cmd->trace, cs,
- info->indirect != NULL,
- local_size[0], local_size[1], local_size[2],
- info->blocks[0], info->blocks[1], info->blocks[2]);
-
- /* For the workaround above, because it's using the "wrong" context for
- * SP_FS_INSTRLEN we should emit another dummy event write to avoid a
- * potential race between writing the register and the CP_EXEC_CS we just
- * did. We don't need to reset the register because it will be re-emitted
- * anyway when the next renderpass starts.
- */
- if (emit_instrlen_workaround) {
- tu6_emit_event_write(cmd, cs, LABEL);
- }
-
- tu_cs_emit_wfi(cs);
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_CmdDispatchBase(VkCommandBuffer commandBuffer,
uint32_t base_x,
uint32_t base_y,
@@ -5324,7 +2481,7 @@ tu_CmdDispatchBase(VkCommandBuffer commandBuffer,
tu_dispatch(cmd_buffer, &info);
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_CmdDispatch(VkCommandBuffer commandBuffer,
uint32_t x,
uint32_t y,
@@ -5333,7 +2490,7 @@ tu_CmdDispatch(VkCommandBuffer commandBuffer,
tu_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z);
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
VkBuffer _buffer,
VkDeviceSize offset)
@@ -5348,410 +2505,133 @@ tu_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
tu_dispatch(cmd_buffer, &info);
}
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
- const VkSubpassEndInfo *pSubpassEndInfo)
+void
+tu_CmdEndRenderPass(VkCommandBuffer commandBuffer)
{
TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
- if (unlikely(cmd_buffer->device->instance->debug_flags & TU_DEBUG_DYNAMIC)) {
- vk_common_CmdEndRenderPass2(commandBuffer, pSubpassEndInfo);
- return;
- }
-
tu_cs_end(&cmd_buffer->draw_cs);
- tu_cs_end(&cmd_buffer->draw_epilogue_cs);
- tu_cmd_render(cmd_buffer);
-
- cmd_buffer->state.cache.pending_flush_bits |=
- cmd_buffer->state.renderpass_cache.pending_flush_bits;
- tu_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier, true);
-
- vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer->state.attachments);
-
- tu_reset_render_pass(cmd_buffer);
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdEndRendering(VkCommandBuffer commandBuffer)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
-
- if (cmd_buffer->state.suspending)
- cmd_buffer->state.suspended_pass.lrz = cmd_buffer->state.lrz;
- if (!cmd_buffer->state.suspending) {
- tu_cs_end(&cmd_buffer->draw_cs);
- tu_cs_end(&cmd_buffer->draw_epilogue_cs);
+ tu_cmd_render_tiles(cmd_buffer);
- if (cmd_buffer->state.suspend_resume == SR_IN_PRE_CHAIN) {
- cmd_buffer->trace_renderpass_end = u_trace_end_iterator(&cmd_buffer->trace);
- tu_save_pre_chain(cmd_buffer);
-
- /* Even though we don't call tu_cmd_render here, the renderpass is finished
- * and draw states should be disabled.
- */
- tu_disable_draw_states(cmd_buffer, &cmd_buffer->cs);
- } else {
- tu_cmd_render(cmd_buffer);
- }
+ /* discard draw_cs entries now that the tiles are rendered */
+ tu_cs_discard_entries(&cmd_buffer->draw_cs);
- tu_reset_render_pass(cmd_buffer);
- }
+ vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
+ cmd_buffer->state.attachments = NULL;
- if (cmd_buffer->state.resuming && !cmd_buffer->state.suspending) {
- /* exiting suspend/resume chain */
- switch (cmd_buffer->state.suspend_resume) {
- case SR_IN_CHAIN:
- cmd_buffer->state.suspend_resume = SR_NONE;
- break;
- case SR_IN_PRE_CHAIN:
- case SR_IN_CHAIN_AFTER_PRE_CHAIN:
- cmd_buffer->state.suspend_resume = SR_AFTER_PRE_CHAIN;
- break;
- default:
- unreachable("suspending render pass not followed by resuming pass");
- }
- }
+ cmd_buffer->state.pass = NULL;
+ cmd_buffer->state.subpass = NULL;
+ cmd_buffer->state.framebuffer = NULL;
}
-static void
-tu_barrier(struct tu_cmd_buffer *cmd,
- const VkDependencyInfo *dep_info)
+void
+tu_CmdEndRenderPass2KHR(VkCommandBuffer commandBuffer,
+ const VkSubpassEndInfoKHR *pSubpassEndInfo)
{
- VkPipelineStageFlags2 srcStage = 0;
- VkPipelineStageFlags2 dstStage = 0;
- enum tu_cmd_access_mask src_flags = 0;
- enum tu_cmd_access_mask dst_flags = 0;
-
- /* Inside a renderpass, we don't know yet whether we'll be using sysmem
- * so we have to use the sysmem flushes.
- */
- bool gmem = cmd->state.ccu_state == TU_CMD_CCU_GMEM &&
- !cmd->state.pass;
-
-
- for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
- VkPipelineStageFlags2 sanitized_src_stage =
- sanitize_src_stage(dep_info->pMemoryBarriers[i].srcStageMask);
- VkPipelineStageFlags2 sanitized_dst_stage =
- sanitize_dst_stage(dep_info->pMemoryBarriers[i].dstStageMask);
- src_flags |= vk2tu_access(dep_info->pMemoryBarriers[i].srcAccessMask,
- sanitized_src_stage, false, gmem);
- dst_flags |= vk2tu_access(dep_info->pMemoryBarriers[i].dstAccessMask,
- sanitized_dst_stage, false, gmem);
- srcStage |= sanitized_src_stage;
- dstStage |= sanitized_dst_stage;
- }
-
- for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
- VkPipelineStageFlags2 sanitized_src_stage =
- sanitize_src_stage(dep_info->pBufferMemoryBarriers[i].srcStageMask);
- VkPipelineStageFlags2 sanitized_dst_stage =
- sanitize_dst_stage(dep_info->pBufferMemoryBarriers[i].dstStageMask);
- src_flags |= vk2tu_access(dep_info->pBufferMemoryBarriers[i].srcAccessMask,
- sanitized_src_stage, false, gmem);
- dst_flags |= vk2tu_access(dep_info->pBufferMemoryBarriers[i].dstAccessMask,
- sanitized_dst_stage, false, gmem);
- srcStage |= sanitized_src_stage;
- dstStage |= sanitized_dst_stage;
- }
-
- for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
- VkImageLayout old_layout = dep_info->pImageMemoryBarriers[i].oldLayout;
- if (old_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
- /* The underlying memory for this image may have been used earlier
- * within the same queue submission for a different image, which
- * means that there may be old, stale cache entries which are in the
- * "wrong" location, which could cause problems later after writing
- * to the image. We don't want these entries being flushed later and
- * overwriting the actual image, so we need to flush the CCU.
- */
- TU_FROM_HANDLE(tu_image, image, dep_info->pImageMemoryBarriers[i].image);
-
- if (vk_format_is_depth_or_stencil(image->vk.format)) {
- src_flags |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE;
- } else {
- src_flags |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
- }
- }
- VkPipelineStageFlags2 sanitized_src_stage =
- sanitize_src_stage(dep_info->pImageMemoryBarriers[i].srcStageMask);
- VkPipelineStageFlags2 sanitized_dst_stage =
- sanitize_dst_stage(dep_info->pImageMemoryBarriers[i].dstStageMask);
- src_flags |= vk2tu_access(dep_info->pImageMemoryBarriers[i].srcAccessMask,
- sanitized_src_stage, true, gmem);
- dst_flags |= vk2tu_access(dep_info->pImageMemoryBarriers[i].dstAccessMask,
- sanitized_dst_stage, true, gmem);
- srcStage |= sanitized_src_stage;
- dstStage |= sanitized_dst_stage;
- }
-
- if (cmd->state.pass) {
- const VkPipelineStageFlags framebuffer_space_stages =
- VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
- VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
- VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT |
- VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
-
- /* We cannot have non-by-region "fb-space to fb-space" barriers.
- *
- * From the Vulkan 1.2.185 spec, section 7.6.1 "Subpass Self-dependency":
- *
- * If the source and destination stage masks both include
- * framebuffer-space stages, then dependencyFlags must include
- * VK_DEPENDENCY_BY_REGION_BIT.
- * [...]
- * Each of the synchronization scopes and access scopes of a
- * vkCmdPipelineBarrier2 or vkCmdPipelineBarrier command inside
- * a render pass instance must be a subset of the scopes of one of
- * the self-dependencies for the current subpass.
- *
- * If the self-dependency has VK_DEPENDENCY_BY_REGION_BIT or
- * VK_DEPENDENCY_VIEW_LOCAL_BIT set, then so must the pipeline barrier.
- *
- * By-region barriers are ok for gmem. All other barriers would involve
- * vtx stages which are NOT ok for gmem rendering.
- * See dep_invalid_for_gmem().
- */
- if ((srcStage & ~framebuffer_space_stages) ||
- (dstStage & ~framebuffer_space_stages)) {
- cmd->state.rp.disable_gmem = true;
- }
- }
-
- struct tu_cache_state *cache =
- cmd->state.pass ? &cmd->state.renderpass_cache : &cmd->state.cache;
- tu_flush_for_access(cache, src_flags, dst_flags);
-
- enum tu_stage src_stage = vk2tu_src_stage(srcStage);
- enum tu_stage dst_stage = vk2tu_dst_stage(dstStage);
- tu_flush_for_stage(cache, src_stage, dst_stage);
+ tu_CmdEndRenderPass(commandBuffer);
}
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdPipelineBarrier2(VkCommandBuffer commandBuffer,
- const VkDependencyInfo *pDependencyInfo)
+struct tu_barrier_info
{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
-
- tu_barrier(cmd_buffer, pDependencyInfo);
-}
+ uint32_t eventCount;
+ const VkEvent *pEvents;
+ VkPipelineStageFlags srcStageMask;
+};
static void
-write_event(struct tu_cmd_buffer *cmd, struct tu_event *event,
- VkPipelineStageFlags2 stageMask, unsigned value)
+tu_barrier(struct tu_cmd_buffer *cmd_buffer,
+ uint32_t memoryBarrierCount,
+ const VkMemoryBarrier *pMemoryBarriers,
+ uint32_t bufferMemoryBarrierCount,
+ const VkBufferMemoryBarrier *pBufferMemoryBarriers,
+ uint32_t imageMemoryBarrierCount,
+ const VkImageMemoryBarrier *pImageMemoryBarriers,
+ const struct tu_barrier_info *info)
{
- struct tu_cs *cs = &cmd->cs;
-
- /* vkCmdSetEvent/vkCmdResetEvent cannot be called inside a render pass */
- assert(!cmd->state.pass);
-
- tu_emit_cache_flush(cmd, cs);
-
- /* Flags that only require a top-of-pipe event. DrawIndirect parameters are
- * read by the CP, so the draw indirect stage counts as top-of-pipe too.
- */
- VkPipelineStageFlags2 top_of_pipe_flags =
- VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
- VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT;
-
- if (!(stageMask & ~top_of_pipe_flags)) {
- tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
- tu_cs_emit_qw(cs, event->bo->iova); /* ADDR_LO/HI */
- tu_cs_emit(cs, value);
- } else {
- /* Use a RB_DONE_TS event to wait for everything to complete. */
- tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 4);
- tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS));
- tu_cs_emit_qw(cs, event->bo->iova);
- tu_cs_emit(cs, value);
- }
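+   /* Currently a no-op: all of the barrier parameters are ignored. */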
}
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdSetEvent2(VkCommandBuffer commandBuffer,
- VkEvent _event,
- const VkDependencyInfo *pDependencyInfo)
+void
+tu_CmdPipelineBarrier(VkCommandBuffer commandBuffer,
+ VkPipelineStageFlags srcStageMask,
+ VkPipelineStageFlags destStageMask,
+ VkBool32 byRegion,
+ uint32_t memoryBarrierCount,
+ const VkMemoryBarrier *pMemoryBarriers,
+ uint32_t bufferMemoryBarrierCount,
+ const VkBufferMemoryBarrier *pBufferMemoryBarriers,
+ uint32_t imageMemoryBarrierCount,
+ const VkImageMemoryBarrier *pImageMemoryBarriers)
{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- TU_FROM_HANDLE(tu_event, event, _event);
- VkPipelineStageFlags2 src_stage_mask = 0;
+ TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
+ struct tu_barrier_info info;
- for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
- src_stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
- for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
- src_stage_mask |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
- for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
- src_stage_mask |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
+ info.eventCount = 0;
+ info.pEvents = NULL;
+ info.srcStageMask = srcStageMask;
- write_event(cmd, event, src_stage_mask, 1);
+ tu_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers,
+ bufferMemoryBarrierCount, pBufferMemoryBarriers,
+ imageMemoryBarrierCount, pImageMemoryBarriers, &info);
}
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdResetEvent2(VkCommandBuffer commandBuffer,
- VkEvent _event,
- VkPipelineStageFlags2 stageMask)
+static void
+write_event(struct tu_cmd_buffer *cmd_buffer,
+ struct tu_event *event,
+ VkPipelineStageFlags stageMask,
+ unsigned value)
{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- TU_FROM_HANDLE(tu_event, event, _event);
-
- write_event(cmd, event, stageMask, 0);
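+   /* Currently a no-op: nothing is written to the event. */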
}
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdWaitEvents2(VkCommandBuffer commandBuffer,
- uint32_t eventCount,
- const VkEvent *pEvents,
- const VkDependencyInfo* pDependencyInfos)
+void
+tu_CmdSetEvent(VkCommandBuffer commandBuffer,
+ VkEvent _event,
+ VkPipelineStageFlags stageMask)
{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
-
- for (uint32_t i = 0; i < eventCount; i++) {
- TU_FROM_HANDLE(tu_event, event, pEvents[i]);
-
- tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
- tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
- CP_WAIT_REG_MEM_0_POLL_MEMORY);
- tu_cs_emit_qw(cs, event->bo->iova); /* POLL_ADDR_LO/HI */
- tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(1));
- tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0u));
- tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(20));
- }
-
- tu_barrier(cmd, pDependencyInfos);
-}
+ TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
+ TU_FROM_HANDLE(tu_event, event, _event);
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)
-{
- /* No-op */
+ write_event(cmd_buffer, event, stageMask, 1);
}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,
- const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
+void
+tu_CmdResetEvent(VkCommandBuffer commandBuffer,
+ VkEvent _event,
+ VkPipelineStageFlags stageMask)
{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
-
- cmd->state.predication_active = true;
-
- struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
-
- tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_GLOBAL, 1);
- tu_cs_emit(cs, 1);
-
- /* Wait for any writes to the predicate to land */
- if (cmd->state.pass)
- tu_emit_cache_flush_renderpass(cmd, cs);
- else
- tu_emit_cache_flush(cmd, cs);
-
- TU_FROM_HANDLE(tu_buffer, buf, pConditionalRenderingBegin->buffer);
- uint64_t iova = buf->iova + pConditionalRenderingBegin->offset;
-
- /* qcom doesn't support 32-bit reference values, only 64-bit, but Vulkan
-    * mandates 32-bit comparisons. Our workaround is to copy the reference
- * value to the low 32-bits of a location where the high 32 bits are known
- * to be 0 and then compare that.
- */
- tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
- tu_cs_emit(cs, 0);
- tu_cs_emit_qw(cs, global_iova(cmd, predicate));
- tu_cs_emit_qw(cs, iova);
-
- tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
- tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
+ TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
+ TU_FROM_HANDLE(tu_event, event, _event);
- bool inv = pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
- tu_cs_emit_pkt7(cs, CP_DRAW_PRED_SET, 3);
- tu_cs_emit(cs, CP_DRAW_PRED_SET_0_SRC(PRED_SRC_MEM) |
- CP_DRAW_PRED_SET_0_TEST(inv ? EQ_0_PASS : NE_0_PASS));
- tu_cs_emit_qw(cs, global_iova(cmd, predicate));
+ write_event(cmd_buffer, event, stageMask, 0);
}
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
+void
+tu_CmdWaitEvents(VkCommandBuffer commandBuffer,
+ uint32_t eventCount,
+ const VkEvent *pEvents,
+ VkPipelineStageFlags srcStageMask,
+ VkPipelineStageFlags dstStageMask,
+ uint32_t memoryBarrierCount,
+ const VkMemoryBarrier *pMemoryBarriers,
+ uint32_t bufferMemoryBarrierCount,
+ const VkBufferMemoryBarrier *pBufferMemoryBarriers,
+ uint32_t imageMemoryBarrierCount,
+ const VkImageMemoryBarrier *pImageMemoryBarriers)
{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
-
- cmd->state.predication_active = false;
+ TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
+ struct tu_barrier_info info;
- struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
+ info.eventCount = eventCount;
+ info.pEvents = pEvents;
+ info.srcStageMask = 0;
- tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_GLOBAL, 1);
- tu_cs_emit(cs, 0);
+ tu_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers,
+ bufferMemoryBarrierCount, pBufferMemoryBarriers,
+ imageMemoryBarrierCount, pImageMemoryBarriers, &info);
}
void
-tu_CmdWriteBufferMarker2AMD(VkCommandBuffer commandBuffer,
- VkPipelineStageFlagBits2 pipelineStage,
- VkBuffer dstBuffer,
- VkDeviceSize dstOffset,
- uint32_t marker)
+tu_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)
{
- /* Almost the same as write_event, but also allowed in renderpass */
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
-
- uint64_t va = buffer->bo->iova + dstOffset;
-
- struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
- struct tu_cache_state *cache =
- cmd->state.pass ? &cmd->state.renderpass_cache : &cmd->state.cache;
-
- /* From the Vulkan 1.2.203 spec:
- *
- * The access scope for buffer marker writes falls under
- * the VK_ACCESS_TRANSFER_WRITE_BIT, and the pipeline stages for
- * identifying the synchronization scope must include both pipelineStage
- * and VK_PIPELINE_STAGE_TRANSFER_BIT.
- *
-    * Transfer operations use the CCU; here, however, we write via the CP.
-    * Flush the CCU so that the results of previous transfer operations are
-    * visible to the CP.
- */
- tu_flush_for_access(cache, 0, TU_ACCESS_SYSMEM_WRITE);
-
- /* Flags that only require a top-of-pipe event. DrawIndirect parameters are
- * read by the CP, so the draw indirect stage counts as top-of-pipe too.
- */
- VkPipelineStageFlags2 top_of_pipe_flags =
- VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
- VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT;
-
- bool is_top_of_pipe = !(pipelineStage & ~top_of_pipe_flags);
-
- /* We have to WFI only if we flushed CCU here and are using CP_MEM_WRITE.
- * Otherwise:
- * - We do CP_EVENT_WRITE(RB_DONE_TS) which should wait for flushes;
- * - There was a barrier to synchronize other writes with WriteBufferMarkerAMD
- * and they had to include our pipelineStage which forces the WFI.
- */
- if (cache->flush_bits != 0 && is_top_of_pipe) {
- cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE;
- }
-
- if (cmd->state.pass) {
- tu_emit_cache_flush_renderpass(cmd, cs);
- } else {
- tu_emit_cache_flush(cmd, cs);
- }
-
- if (is_top_of_pipe) {
- tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
- tu_cs_emit_qw(cs, va); /* ADDR_LO/HI */
- tu_cs_emit(cs, marker);
- } else {
- /* Use a RB_DONE_TS event to wait for everything to complete. */
- tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 4);
- tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS));
- tu_cs_emit_qw(cs, va);
- tu_cs_emit(cs, marker);
- }
-
- /* Make sure the result of this write is visible to others. */
- tu_flush_for_access(cache, TU_ACCESS_CP_WRITE, 0);
+ /* No-op */
}
diff --git a/lib/mesa/src/freedreno/vulkan/tu_cs.c b/lib/mesa/src/freedreno/vulkan/tu_cs.c
index 2e6f215f4..48242f813 100644
--- a/lib/mesa/src/freedreno/vulkan/tu_cs.c
+++ b/lib/mesa/src/freedreno/vulkan/tu_cs.c
@@ -1,101 +1,108 @@
/*
* Copyright © 2019 Google LLC
- * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
*/
#include "tu_cs.h"
-#include "tu_suballoc.h"
-
/**
* Initialize a command stream.
*/
void
-tu_cs_init(struct tu_cs *cs,
- struct tu_device *device,
- enum tu_cs_mode mode,
- uint32_t initial_size, const char *name)
+tu_cs_init(struct tu_cs *cs, enum tu_cs_mode mode, uint32_t initial_size)
{
assert(mode != TU_CS_MODE_EXTERNAL);
memset(cs, 0, sizeof(*cs));
- cs->device = device;
cs->mode = mode;
cs->next_bo_size = initial_size;
- cs->name = name;
}
/**
* Initialize a command stream as a wrapper to an external buffer.
*/
void
-tu_cs_init_external(struct tu_cs *cs, struct tu_device *device,
- uint32_t *start, uint32_t *end)
+tu_cs_init_external(struct tu_cs *cs, uint32_t *start, uint32_t *end)
{
memset(cs, 0, sizeof(*cs));
- cs->device = device;
cs->mode = TU_CS_MODE_EXTERNAL;
cs->start = cs->reserved_end = cs->cur = start;
cs->end = end;
}
/**
- * Initialize a sub-command stream as a wrapper to an externally sub-allocated
- * buffer.
+ * Finish and release all resources owned by a command stream.
*/
void
-tu_cs_init_suballoc(struct tu_cs *cs, struct tu_device *device,
- struct tu_suballoc_bo *suballoc_bo)
+tu_cs_finish(struct tu_device *dev, struct tu_cs *cs)
{
- uint32_t *start = tu_suballoc_bo_map(suballoc_bo);
- uint32_t *end = start + (suballoc_bo->size >> 2);
+ for (uint32_t i = 0; i < cs->bo_count; ++i) {
+ tu_bo_finish(dev, cs->bos[i]);
+ free(cs->bos[i]);
+ }
- memset(cs, 0, sizeof(*cs));
- cs->device = device;
- cs->mode = TU_CS_MODE_SUB_STREAM;
- cs->start = cs->reserved_end = cs->cur = start;
- cs->end = end;
- cs->refcount_bo = tu_bo_get_ref(suballoc_bo->bo);
+ free(cs->entries);
+ free(cs->bos);
}
/**
- * Finish and release all resources owned by a command stream.
+ * Get the offset of the command packets emitted since the last call to
+ * tu_cs_add_entry.
*/
-void
-tu_cs_finish(struct tu_cs *cs)
+static uint32_t
+tu_cs_get_offset(const struct tu_cs *cs)
{
- for (uint32_t i = 0; i < cs->bo_count; ++i) {
- tu_bo_finish(cs->device, cs->bos[i]);
- }
-
- if (cs->refcount_bo)
- tu_bo_finish(cs->device, cs->refcount_bo);
+ assert(cs->bo_count);
+ return cs->start - (uint32_t *) cs->bos[cs->bo_count - 1]->map;
+}
- free(cs->entries);
- free(cs->bos);
+/**
+ * Get the size of the command packets emitted since the last call to
+ * tu_cs_add_entry.
+ */
+static uint32_t
+tu_cs_get_size(const struct tu_cs *cs)
+{
+ return cs->cur - cs->start;
}
-static struct tu_bo *
-tu_cs_current_bo(const struct tu_cs *cs)
+/**
+ * Get the size of the remaining space in the current BO.
+ */
+static uint32_t
+tu_cs_get_space(const struct tu_cs *cs)
{
- if (cs->refcount_bo) {
- return cs->refcount_bo;
- } else {
- assert(cs->bo_count);
- return cs->bos[cs->bo_count - 1];
- }
+ return cs->end - cs->cur;
}
/**
- * Get the offset of the command packets emitted since the last call to
+ * Return true if there is no command packet emitted since the last call to
* tu_cs_add_entry.
*/
static uint32_t
-tu_cs_get_offset(const struct tu_cs *cs)
+tu_cs_is_empty(const struct tu_cs *cs)
{
- return cs->start - (uint32_t *) tu_cs_current_bo(cs)->map;
+ return tu_cs_get_size(cs) == 0;
}
/*
@@ -103,12 +110,10 @@ tu_cs_get_offset(const struct tu_cs *cs)
* be emitted to the new BO.
*/
static VkResult
-tu_cs_add_bo(struct tu_cs *cs, uint32_t size)
+tu_cs_add_bo(struct tu_device *dev, struct tu_cs *cs, uint32_t size)
{
/* no BO for TU_CS_MODE_EXTERNAL */
assert(cs->mode != TU_CS_MODE_EXTERNAL);
- /* No adding more BOs if suballocating from a suballoc_bo. */
- assert(!cs->refcount_bo);
/* no dangling command packet */
assert(tu_cs_is_empty(cs));
@@ -125,18 +130,20 @@ tu_cs_add_bo(struct tu_cs *cs, uint32_t size)
cs->bos = new_bos;
}
- struct tu_bo *new_bo;
+ struct tu_bo *new_bo = malloc(sizeof(struct tu_bo));
+ if (!new_bo)
+ return VK_ERROR_OUT_OF_HOST_MEMORY;
- VkResult result =
- tu_bo_init_new(cs->device, &new_bo, size * sizeof(uint32_t),
- TU_BO_ALLOC_GPU_READ_ONLY | TU_BO_ALLOC_ALLOW_DUMP, cs->name);
+ VkResult result = tu_bo_init_new(dev, new_bo, size * sizeof(uint32_t));
if (result != VK_SUCCESS) {
+ free(new_bo);
return result;
}
- result = tu_bo_map(cs->device, new_bo);
+ result = tu_bo_map(dev, new_bo);
if (result != VK_SUCCESS) {
- tu_bo_finish(cs->device, new_bo);
+ tu_bo_finish(dev, new_bo);
+ free(new_bo);
return result;
}
@@ -152,7 +159,7 @@ tu_cs_add_bo(struct tu_cs *cs, uint32_t size)
* Reserve an IB entry.
*/
static VkResult
-tu_cs_reserve_entry(struct tu_cs *cs)
+tu_cs_reserve_entry(struct tu_device *dev, struct tu_cs *cs)
{
/* entries are only for TU_CS_MODE_GROW */
assert(cs->mode == TU_CS_MODE_GROW);
@@ -194,7 +201,7 @@ tu_cs_add_entry(struct tu_cs *cs)
/* add an entry for [cs->start, cs->cur] */
cs->entries[cs->entry_count++] = (struct tu_cs_entry) {
- .bo = tu_cs_current_bo(cs),
+ .bo = cs->bos[cs->bo_count - 1],
.size = tu_cs_get_size(cs) * sizeof(uint32_t),
.offset = tu_cs_get_offset(cs) * sizeof(uint32_t),
};
@@ -203,30 +210,6 @@ tu_cs_add_entry(struct tu_cs *cs)
}
/**
- * same behavior as tu_cs_emit_call but without the indirect
- */
-VkResult
-tu_cs_add_entries(struct tu_cs *cs, struct tu_cs *target)
-{
- VkResult result;
-
- assert(cs->mode == TU_CS_MODE_GROW);
- assert(target->mode == TU_CS_MODE_GROW);
-
- if (!tu_cs_is_empty(cs))
- tu_cs_add_entry(cs);
-
- for (unsigned i = 0; i < target->entry_count; i++) {
- result = tu_cs_reserve_entry(cs);
- if (result != VK_SUCCESS)
- return result;
- cs->entries[cs->entry_count++] = target->entries[i];
- }
-
- return VK_SUCCESS;
-}
-
-/**
* Begin (or continue) command packet emission. This does nothing but sanity
* checks currently. \a cs must not be in TU_CS_MODE_SUB_STREAM mode.
*/
@@ -259,58 +242,27 @@ tu_cs_end(struct tu_cs *cs)
* emission.
*/
VkResult
-tu_cs_begin_sub_stream(struct tu_cs *cs, uint32_t size, struct tu_cs *sub_cs)
+tu_cs_begin_sub_stream(struct tu_device *dev,
+ struct tu_cs *cs,
+ uint32_t size,
+ struct tu_cs *sub_cs)
{
assert(cs->mode == TU_CS_MODE_SUB_STREAM);
assert(size);
- VkResult result = tu_cs_reserve_space(cs, size);
+ VkResult result = tu_cs_reserve_space(dev, cs, size);
if (result != VK_SUCCESS)
return result;
- tu_cs_init_external(sub_cs, cs->device, cs->cur, cs->reserved_end);
+ tu_cs_init_external(sub_cs, cs->cur, cs->reserved_end);
tu_cs_begin(sub_cs);
- result = tu_cs_reserve_space(sub_cs, size);
+ result = tu_cs_reserve_space(dev, sub_cs, size);
assert(result == VK_SUCCESS);
return VK_SUCCESS;
}
/**
- * Allocate count*size dwords, aligned to size dwords.
- * \a cs must be in TU_CS_MODE_SUB_STREAM mode.
- *
- */
-VkResult
-tu_cs_alloc(struct tu_cs *cs,
- uint32_t count,
- uint32_t size,
- struct tu_cs_memory *memory)
-{
- assert(cs->mode == TU_CS_MODE_SUB_STREAM);
- assert(size && size <= 1024);
-
- if (!count)
- return VK_SUCCESS;
-
- /* TODO: smarter way to deal with alignment? */
-
- VkResult result = tu_cs_reserve_space(cs, count * size + (size-1));
- if (result != VK_SUCCESS)
- return result;
-
- struct tu_bo *bo = tu_cs_current_bo(cs);
- size_t offset = align(tu_cs_get_offset(cs), size);
-
- memory->map = bo->map + offset * sizeof(uint32_t);
- memory->iova = bo->iova + offset * sizeof(uint32_t);
-
- cs->start = cs->cur = (uint32_t*) bo->map + offset + count * size;
-
- return VK_SUCCESS;
-}
-
-/**
* End command packet emission to a sub-stream. \a sub_cs becomes invalid
* after this call.
*
@@ -321,6 +273,7 @@ struct tu_cs_entry
tu_cs_end_sub_stream(struct tu_cs *cs, struct tu_cs *sub_cs)
{
assert(cs->mode == TU_CS_MODE_SUB_STREAM);
+ assert(cs->bo_count);
assert(sub_cs->start == cs->cur && sub_cs->end == cs->reserved_end);
tu_cs_sanity_check(sub_cs);
@@ -329,7 +282,7 @@ tu_cs_end_sub_stream(struct tu_cs *cs, struct tu_cs *sub_cs)
cs->cur = sub_cs->cur;
struct tu_cs_entry entry = {
- .bo = tu_cs_current_bo(cs),
+ .bo = cs->bos[cs->bo_count - 1],
.size = tu_cs_get_size(cs) * sizeof(uint32_t),
.offset = tu_cs_get_offset(cs) * sizeof(uint32_t),
};
@@ -344,7 +297,9 @@ tu_cs_end_sub_stream(struct tu_cs *cs, struct tu_cs *sub_cs)
* This never fails when \a cs has mode TU_CS_MODE_EXTERNAL.
*/
VkResult
-tu_cs_reserve_space(struct tu_cs *cs, uint32_t reserved_size)
+tu_cs_reserve_space(struct tu_device *dev,
+ struct tu_cs *cs,
+ uint32_t reserved_size)
{
if (tu_cs_get_space(cs) < reserved_size) {
if (cs->mode == TU_CS_MODE_EXTERNAL) {
@@ -360,39 +315,14 @@ tu_cs_reserve_space(struct tu_cs *cs, uint32_t reserved_size)
tu_cs_add_entry(cs);
}
- for (uint32_t i = 0; i < cs->cond_stack_depth; i++) {
- /* Subtract one here to account for the DWORD field itself. */
- *cs->cond_dwords[i] = cs->cur - cs->cond_dwords[i] - 1;
-
- /* space for CP_COND_REG_EXEC in next bo */
- reserved_size += 3;
- }
-
/* switch to a new BO */
uint32_t new_size = MAX2(cs->next_bo_size, reserved_size);
- VkResult result = tu_cs_add_bo(cs, new_size);
+ VkResult result = tu_cs_add_bo(dev, cs, new_size);
if (result != VK_SUCCESS)
return result;
- if (cs->cond_stack_depth) {
- cs->reserved_end = cs->cur + reserved_size;
- }
-
- /* Re-emit CP_COND_REG_EXECs */
- for (uint32_t i = 0; i < cs->cond_stack_depth; i++) {
- tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
- tu_cs_emit(cs, cs->cond_flags[i]);
-
- cs->cond_dwords[i] = cs->cur;
-
- /* Emit dummy DWORD field here */
- tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(0));
- }
-
- /* double the size for the next bo, also there is an upper
- * bound on IB size, which appears to be 0x0fffff
- */
- new_size = MIN2(new_size << 1, 0x0fffff);
+ /* double the size for the next bo */
+ new_size <<= 1;
if (cs->next_bo_size < new_size)
cs->next_bo_size = new_size;
}
@@ -402,7 +332,7 @@ tu_cs_reserve_space(struct tu_cs *cs, uint32_t reserved_size)
if (cs->mode == TU_CS_MODE_GROW) {
/* reserve an entry for the next call to this function or tu_cs_end */
- return tu_cs_reserve_entry(cs);
+ return tu_cs_reserve_entry(dev, cs);
}
return VK_SUCCESS;
@@ -413,16 +343,17 @@ tu_cs_reserve_space(struct tu_cs *cs, uint32_t reserved_size)
* packets in \a cs, but does not necessarily release all resources.
*/
void
-tu_cs_reset(struct tu_cs *cs)
+tu_cs_reset(struct tu_device *dev, struct tu_cs *cs)
{
if (cs->mode == TU_CS_MODE_EXTERNAL) {
- assert(!cs->bo_count && !cs->refcount_bo && !cs->entry_count);
+ assert(!cs->bo_count && !cs->entry_count);
cs->reserved_end = cs->cur = cs->start;
return;
}
for (uint32_t i = 0; i + 1 < cs->bo_count; ++i) {
- tu_bo_finish(cs->device, cs->bos[i]);
+ tu_bo_finish(dev, cs->bos[i]);
+ free(cs->bos[i]);
}
if (cs->bo_count) {
diff --git a/lib/mesa/src/freedreno/vulkan/tu_descriptor_set.c b/lib/mesa/src/freedreno/vulkan/tu_descriptor_set.c
index 14d8b4b07..0f49d26e2 100644
--- a/lib/mesa/src/freedreno/vulkan/tu_descriptor_set.c
+++ b/lib/mesa/src/freedreno/vulkan/tu_descriptor_set.c
@@ -1,100 +1,67 @@
/*
* Copyright © 2016 Red Hat.
* Copyright © 2016 Bas Nieuwenhuizen
- * SPDX-License-Identifier: MIT
- */
-
-/**
- * @file
*
- * We use the bindless descriptor model, which maps fairly closely to how
- * Vulkan descriptor sets work. The two exceptions are input attachments and
- * dynamic descriptors, which have to be patched when recording command
- * buffers. We reserve an extra descriptor set for these. This descriptor set
- * contains all the input attachments in the pipeline, in order, and then all
- * the dynamic descriptors. The dynamic descriptors are stored in the CPU-side
- * datastructure for each tu_descriptor_set, and then combined into one big
- * descriptor set at CmdBindDescriptors time/draw time.
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
*/
+#include "tu_private.h"
-#include "tu_descriptor_set.h"
-
+#include <assert.h>
#include <fcntl.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
#include "util/mesa-sha1.h"
-#include "vk_descriptors.h"
#include "vk_util.h"
-#include "tu_device.h"
-#include "tu_image.h"
-
-static inline uint8_t *
-pool_base(struct tu_descriptor_pool *pool)
+static int
+binding_compare(const void *av, const void *bv)
{
- return pool->host_bo ?: pool->bo->map;
-}
+ const VkDescriptorSetLayoutBinding *a =
+ (const VkDescriptorSetLayoutBinding *) av;
+ const VkDescriptorSetLayoutBinding *b =
+ (const VkDescriptorSetLayoutBinding *) bv;
-static uint32_t
-descriptor_size(struct tu_device *dev,
- const VkDescriptorSetLayoutBinding *binding,
- VkDescriptorType type)
-{
- switch (type) {
- case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
- if (unlikely(dev->instance->debug_flags & TU_DEBUG_DYNAMIC))
- return A6XX_TEX_CONST_DWORDS * 4;
-
- /* Input attachment doesn't use descriptor sets at all */
- return 0;
- case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
- /* We make offsets and sizes all 16 dwords, to match how the hardware
- * interprets indices passed to sample/load/store instructions in
- * multiples of 16 dwords. This means that "normal" descriptors are all
- * of size 16, with padding for smaller descriptors like uniform storage
- * descriptors which are less than 16 dwords. However combined images
- * and samplers are actually two descriptors, so they have size 2.
- */
- return A6XX_TEX_CONST_DWORDS * 4 * 2;
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
- /* When we support 16-bit storage, we need an extra descriptor setup as
- * a 32-bit array for isam to work.
- */
- if (dev->physical_device->info->a6xx.storage_16bit) {
- return A6XX_TEX_CONST_DWORDS * 4 * 2;
- } else {
- return A6XX_TEX_CONST_DWORDS * 4;
- }
- case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
- return A6XX_TEX_CONST_DWORDS * 4 +
- ALIGN(binding->descriptorCount, A6XX_TEX_CONST_DWORDS * 4);
- default:
- return A6XX_TEX_CONST_DWORDS * 4;
- }
+ return (a->binding < b->binding) ? -1 : (a->binding > b->binding) ? 1 : 0;
}
-static bool
-is_dynamic(VkDescriptorType type)
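+/* Return a copy of the bindings sorted by binding index; the caller is
+ * responsible for freeing the returned array.
+ */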
+static VkDescriptorSetLayoutBinding *
+create_sorted_bindings(const VkDescriptorSetLayoutBinding *bindings,
+ unsigned count)
{
- return type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC ||
- type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC;
-}
+ VkDescriptorSetLayoutBinding *sorted_bindings =
+ malloc(count * sizeof(VkDescriptorSetLayoutBinding));
+ if (!sorted_bindings)
+ return NULL;
-static uint32_t
-mutable_descriptor_size(struct tu_device *dev,
- const VkMutableDescriptorTypeListEXT *list)
-{
- uint32_t max_size = 0;
+ memcpy(sorted_bindings, bindings,
+ count * sizeof(VkDescriptorSetLayoutBinding));
- for (uint32_t i = 0; i < list->descriptorTypeCount; i++) {
- uint32_t size = descriptor_size(dev, NULL, list->pDescriptorTypes[i]);
- max_size = MAX2(max_size, size);
- }
+ qsort(sorted_bindings, count, sizeof(VkDescriptorSetLayoutBinding),
+ binding_compare);
- return max_size;
+ return sorted_bindings;
}
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_CreateDescriptorSetLayout(
VkDevice _device,
const VkDescriptorSetLayoutCreateInfo *pCreateInfo,
@@ -106,191 +73,178 @@ tu_CreateDescriptorSetLayout(
assert(pCreateInfo->sType ==
VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO);
- const VkDescriptorSetLayoutBindingFlagsCreateInfo *variable_flags =
- vk_find_struct_const(
- pCreateInfo->pNext,
- DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO);
- const VkMutableDescriptorTypeCreateInfoEXT *mutable_info =
+ const VkDescriptorSetLayoutBindingFlagsCreateInfoEXT *variable_flags =
vk_find_struct_const(
pCreateInfo->pNext,
- MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_EXT);
+ DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO_EXT);
- uint32_t num_bindings = 0;
+ uint32_t max_binding = 0;
uint32_t immutable_sampler_count = 0;
- uint32_t ycbcr_sampler_count = 0;
for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) {
- num_bindings = MAX2(num_bindings, pCreateInfo->pBindings[j].binding + 1);
- if ((pCreateInfo->pBindings[j].descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ||
- pCreateInfo->pBindings[j].descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER) &&
- pCreateInfo->pBindings[j].pImmutableSamplers) {
+ max_binding = MAX2(max_binding, pCreateInfo->pBindings[j].binding);
+ if (pCreateInfo->pBindings[j].pImmutableSamplers)
immutable_sampler_count += pCreateInfo->pBindings[j].descriptorCount;
-
- bool has_ycbcr_sampler = false;
- for (unsigned i = 0; i < pCreateInfo->pBindings[j].descriptorCount; ++i) {
- if (tu_sampler_from_handle(pCreateInfo->pBindings[j].pImmutableSamplers[i])->ycbcr_sampler)
- has_ycbcr_sampler = true;
- }
-
- if (has_ycbcr_sampler)
- ycbcr_sampler_count += pCreateInfo->pBindings[j].descriptorCount;
- }
}
uint32_t samplers_offset =
- offsetof(struct tu_descriptor_set_layout, binding[num_bindings]);
-
-   /* note: only need to store TEX_SAMP_DWORDS for immutable samplers,
- * but using struct tu_sampler makes things simpler */
- uint32_t size = samplers_offset +
- immutable_sampler_count * sizeof(struct tu_sampler) +
- ycbcr_sampler_count * sizeof(struct tu_sampler_ycbcr_conversion);
+ sizeof(struct tu_descriptor_set_layout) +
+ (max_binding + 1) * sizeof(set_layout->binding[0]);
+ size_t size =
+ samplers_offset + immutable_sampler_count * 4 * sizeof(uint32_t);
- set_layout = vk_descriptor_set_layout_zalloc(&device->vk, size);
+ set_layout = vk_alloc2(&device->alloc, pAllocator, size, 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!set_layout)
- return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
set_layout->flags = pCreateInfo->flags;
- /* We just allocate all the immutable samplers at the end of the struct */
- struct tu_sampler *samplers = (void*) &set_layout->binding[num_bindings];
- struct tu_sampler_ycbcr_conversion *ycbcr_samplers =
- (void*) &samplers[immutable_sampler_count];
-
- VkDescriptorSetLayoutBinding *bindings = NULL;
- VkResult result = vk_create_sorted_bindings(
- pCreateInfo->pBindings, pCreateInfo->bindingCount, &bindings);
- if (result != VK_SUCCESS) {
- vk_object_free(&device->vk, pAllocator, set_layout);
- return vk_error(device, result);
+ /* We just allocate all the samplers at the end of the struct */
+ uint32_t *samplers = (uint32_t *) &set_layout->binding[max_binding + 1];
+ (void) samplers; /* TODO: Use me */
+
+ VkDescriptorSetLayoutBinding *bindings = create_sorted_bindings(
+ pCreateInfo->pBindings, pCreateInfo->bindingCount);
+ if (!bindings) {
+ vk_free2(&device->alloc, pAllocator, set_layout);
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
}
- set_layout->binding_count = num_bindings;
+ set_layout->binding_count = max_binding + 1;
set_layout->shader_stages = 0;
+ set_layout->dynamic_shader_stages = 0;
set_layout->has_immutable_samplers = false;
- set_layout->has_inline_uniforms = false;
set_layout->size = 0;
- uint32_t dynamic_offset_size = 0;
+ memset(set_layout->binding, 0,
+ size - sizeof(struct tu_descriptor_set_layout));
+
+ uint32_t buffer_count = 0;
+ uint32_t dynamic_offset_count = 0;
for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) {
const VkDescriptorSetLayoutBinding *binding = bindings + j;
uint32_t b = binding->binding;
+ uint32_t alignment;
+ unsigned binding_buffer_count = 0;
- set_layout->binding[b].type = binding->descriptorType;
- set_layout->binding[b].array_size =
- binding->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK ?
- 1 : binding->descriptorCount;
- set_layout->binding[b].offset = set_layout->size;
- set_layout->binding[b].dynamic_offset_offset = dynamic_offset_size;
- set_layout->binding[b].shader_stages = binding->stageFlags;
-
- if (binding->descriptorType == VK_DESCRIPTOR_TYPE_MUTABLE_EXT) {
- /* For mutable descriptor types we must allocate a size that fits the
- * largest descriptor type that the binding can mutate to.
- */
- set_layout->binding[b].size =
- mutable_descriptor_size(device, &mutable_info->pMutableDescriptorTypeLists[j]);
- } else {
- set_layout->binding[b].size =
- descriptor_size(device, binding, binding->descriptorType);
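+      /* Per-type descriptor size and alignment, in bytes (the same table is
+       * used in tu_GetDescriptorSetLayoutSupport below). */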
+ switch (binding->descriptorType) {
+ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
+ case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
+ assert(!(pCreateInfo->flags &
+ VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
+ set_layout->binding[b].dynamic_offset_count = 1;
+ set_layout->dynamic_shader_stages |= binding->stageFlags;
+ set_layout->binding[b].size = 0;
+ binding_buffer_count = 1;
+ alignment = 1;
+ break;
+ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
+ case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
+ case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
+ case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
+ set_layout->binding[b].size = 16;
+ binding_buffer_count = 1;
+ alignment = 16;
+ break;
+ case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
+ case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
+ case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
+ /* main descriptor + fmask descriptor */
+ set_layout->binding[b].size = 64;
+ binding_buffer_count = 1;
+ alignment = 32;
+ break;
+ case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
+ /* main descriptor + fmask descriptor + sampler */
+ set_layout->binding[b].size = 96;
+ binding_buffer_count = 1;
+ alignment = 32;
+ break;
+ case VK_DESCRIPTOR_TYPE_SAMPLER:
+ set_layout->binding[b].size = 16;
+ alignment = 16;
+ break;
+ default:
+ unreachable("unknown descriptor type\n");
+ break;
}
- if (binding->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
- set_layout->has_inline_uniforms = true;
+ set_layout->size = align(set_layout->size, alignment);
+ set_layout->binding[b].type = binding->descriptorType;
+ set_layout->binding[b].array_size = binding->descriptorCount;
+ set_layout->binding[b].offset = set_layout->size;
+ set_layout->binding[b].buffer_offset = buffer_count;
+ set_layout->binding[b].dynamic_offset_offset = dynamic_offset_count;
if (variable_flags && binding->binding < variable_flags->bindingCount &&
(variable_flags->pBindingFlags[binding->binding] &
- VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT)) {
+ VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT_EXT)) {
assert(!binding->pImmutableSamplers); /* Terribly ill defined how
many samplers are valid */
- assert(binding->binding == num_bindings - 1);
+ assert(binding->binding == max_binding);
set_layout->has_variable_descriptors = true;
}
- if ((binding->descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ||
- binding->descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER) &&
- binding->pImmutableSamplers) {
+ if (binding->pImmutableSamplers) {
set_layout->binding[b].immutable_samplers_offset = samplers_offset;
set_layout->has_immutable_samplers = true;
-
- for (uint32_t i = 0; i < binding->descriptorCount; i++)
- samplers[i] = *tu_sampler_from_handle(binding->pImmutableSamplers[i]);
-
- samplers += binding->descriptorCount;
- samplers_offset += sizeof(struct tu_sampler) * binding->descriptorCount;
-
- bool has_ycbcr_sampler = false;
- for (unsigned i = 0; i < pCreateInfo->pBindings[j].descriptorCount; ++i) {
- if (tu_sampler_from_handle(binding->pImmutableSamplers[i])->ycbcr_sampler)
- has_ycbcr_sampler = true;
- }
-
- if (has_ycbcr_sampler) {
- set_layout->binding[b].ycbcr_samplers_offset =
- (const char*)ycbcr_samplers - (const char*)set_layout;
- for (uint32_t i = 0; i < binding->descriptorCount; i++) {
- struct tu_sampler *sampler = tu_sampler_from_handle(binding->pImmutableSamplers[i]);
- if (sampler->ycbcr_sampler)
- ycbcr_samplers[i] = *sampler->ycbcr_sampler;
- else
- ycbcr_samplers[i].ycbcr_model = VK_SAMPLER_YCBCR_MODEL_CONVERSION_RGB_IDENTITY;
- }
- ycbcr_samplers += binding->descriptorCount;
- } else {
- set_layout->binding[b].ycbcr_samplers_offset = 0;
- }
- }
-
- uint32_t size =
- ALIGN_POT(set_layout->binding[b].array_size * set_layout->binding[b].size, 4 * A6XX_TEX_CONST_DWORDS);
- if (is_dynamic(binding->descriptorType)) {
- dynamic_offset_size += size;
- } else {
- set_layout->size += size;
}
+ set_layout->size +=
+ binding->descriptorCount * set_layout->binding[b].size;
+ buffer_count += binding->descriptorCount * binding_buffer_count;
+ dynamic_offset_count += binding->descriptorCount *
+ set_layout->binding[b].dynamic_offset_count;
set_layout->shader_stages |= binding->stageFlags;
}
free(bindings);
- set_layout->dynamic_offset_size = dynamic_offset_size;
+ set_layout->buffer_count = buffer_count;
+ set_layout->dynamic_offset_count = dynamic_offset_count;
*pSetLayout = tu_descriptor_set_layout_to_handle(set_layout);
return VK_SUCCESS;
}
-VKAPI_ATTR void VKAPI_CALL
+void
+tu_DestroyDescriptorSetLayout(VkDevice _device,
+ VkDescriptorSetLayout _set_layout,
+ const VkAllocationCallbacks *pAllocator)
+{
+ TU_FROM_HANDLE(tu_device, device, _device);
+ TU_FROM_HANDLE(tu_descriptor_set_layout, set_layout, _set_layout);
+
+ if (!set_layout)
+ return;
+
+ vk_free2(&device->alloc, pAllocator, set_layout);
+}
+
+void
tu_GetDescriptorSetLayoutSupport(
- VkDevice _device,
+ VkDevice device,
const VkDescriptorSetLayoutCreateInfo *pCreateInfo,
VkDescriptorSetLayoutSupport *pSupport)
{
- TU_FROM_HANDLE(tu_device, device, _device);
-
- VkDescriptorSetLayoutBinding *bindings = NULL;
- VkResult result = vk_create_sorted_bindings(
- pCreateInfo->pBindings, pCreateInfo->bindingCount, &bindings);
- if (result != VK_SUCCESS) {
+ VkDescriptorSetLayoutBinding *bindings = create_sorted_bindings(
+ pCreateInfo->pBindings, pCreateInfo->bindingCount);
+ if (!bindings) {
pSupport->supported = false;
return;
}
- const VkDescriptorSetLayoutBindingFlagsCreateInfo *variable_flags =
+ const VkDescriptorSetLayoutBindingFlagsCreateInfoEXT *variable_flags =
vk_find_struct_const(
pCreateInfo->pNext,
- DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO);
- VkDescriptorSetVariableDescriptorCountLayoutSupport *variable_count =
+ DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO_EXT);
+ VkDescriptorSetVariableDescriptorCountLayoutSupportEXT *variable_count =
vk_find_struct(
(void *) pCreateInfo->pNext,
- DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_LAYOUT_SUPPORT);
- const VkMutableDescriptorTypeCreateInfoEXT *mutable_info =
- vk_find_struct_const(
- pCreateInfo->pNext,
- MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_EXT);
-
+ DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_LAYOUT_SUPPORT_EXT);
if (variable_count) {
variable_count->maxVariableDescriptorCount = 0;
}
@@ -300,157 +254,71 @@ tu_GetDescriptorSetLayoutSupport(
for (uint32_t i = 0; i < pCreateInfo->bindingCount; i++) {
const VkDescriptorSetLayoutBinding *binding = bindings + i;
- uint64_t descriptor_sz;
-
- if (is_dynamic(binding->descriptorType)) {
- descriptor_sz = 0;
- } else if (binding->descriptorType == VK_DESCRIPTOR_TYPE_MUTABLE_EXT) {
- const VkMutableDescriptorTypeListEXT *list =
- &mutable_info->pMutableDescriptorTypeLists[i];
-
- for (uint32_t j = 0; j < list->descriptorTypeCount; j++) {
-            /* Don't support the input attachment and combined image sampler types
- * for mutable descriptors */
- if (list->pDescriptorTypes[j] == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT ||
- list->pDescriptorTypes[j] == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ||
- list->pDescriptorTypes[j] == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
- supported = false;
- goto out;
- }
- }
-
- descriptor_sz =
- mutable_descriptor_size(device, &mutable_info->pMutableDescriptorTypeLists[i]);
- } else {
- descriptor_sz = descriptor_size(device, binding, binding->descriptorType);
+ uint64_t descriptor_size = 0;
+ uint64_t descriptor_alignment = 1;
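+      /* Same per-type sizes and alignments as in tu_CreateDescriptorSetLayout. */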
+ switch (binding->descriptorType) {
+ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
+ case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
+ break;
+ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
+ case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
+ case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
+ case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
+ descriptor_size = 16;
+ descriptor_alignment = 16;
+ break;
+ case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
+ case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
+ case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
+ descriptor_size = 64;
+ descriptor_alignment = 32;
+ break;
+ case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
+ descriptor_size = 96;
+ descriptor_alignment = 32;
+ break;
+ case VK_DESCRIPTOR_TYPE_SAMPLER:
+ descriptor_size = 16;
+ descriptor_alignment = 16;
+ break;
+ default:
+ unreachable("unknown descriptor type\n");
+ break;
}
- uint64_t descriptor_alignment = 4 * A6XX_TEX_CONST_DWORDS;
- if (size && !ALIGN_POT(size, descriptor_alignment)) {
+ if (size && !align_u64(size, descriptor_alignment)) {
supported = false;
}
- size = ALIGN_POT(size, descriptor_alignment);
-
- uint64_t max_count = MAX_SET_SIZE;
- unsigned descriptor_count = binding->descriptorCount;
- if (binding->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
- max_count = MAX_SET_SIZE - size;
- descriptor_count = descriptor_sz;
- descriptor_sz = 1;
- } else if (descriptor_sz) {
- max_count = (MAX_SET_SIZE - size) / descriptor_sz;
- }
+ size = align_u64(size, descriptor_alignment);
- if (max_count < descriptor_count) {
+ uint64_t max_count = UINT64_MAX;
+ if (descriptor_size)
+ max_count = (UINT64_MAX - size) / descriptor_size;
+
+ if (max_count < binding->descriptorCount) {
supported = false;
}
-
if (variable_flags && binding->binding < variable_flags->bindingCount &&
variable_count &&
(variable_flags->pBindingFlags[binding->binding] &
- VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT)) {
+ VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT_EXT)) {
variable_count->maxVariableDescriptorCount =
MIN2(UINT32_MAX, max_count);
}
- size += descriptor_count * descriptor_sz;
+ size += binding->descriptorCount * descriptor_size;
}
-out:
free(bindings);
pSupport->supported = supported;
}
-/* Note: we must hash any values used in tu_lower_io(). */
-
-#define SHA1_UPDATE_VALUE(ctx, x) _mesa_sha1_update(ctx, &(x), sizeof(x));
-
-static void
-sha1_update_ycbcr_sampler(struct mesa_sha1 *ctx,
- const struct tu_sampler_ycbcr_conversion *sampler)
-{
- SHA1_UPDATE_VALUE(ctx, sampler->ycbcr_model);
- SHA1_UPDATE_VALUE(ctx, sampler->ycbcr_range);
- SHA1_UPDATE_VALUE(ctx, sampler->format);
-}
-
-static void
-sha1_update_descriptor_set_binding_layout(struct mesa_sha1 *ctx,
- const struct tu_descriptor_set_binding_layout *layout,
- const struct tu_descriptor_set_layout *set_layout)
-{
- SHA1_UPDATE_VALUE(ctx, layout->type);
- SHA1_UPDATE_VALUE(ctx, layout->offset);
- SHA1_UPDATE_VALUE(ctx, layout->size);
- SHA1_UPDATE_VALUE(ctx, layout->array_size);
- SHA1_UPDATE_VALUE(ctx, layout->dynamic_offset_offset);
- SHA1_UPDATE_VALUE(ctx, layout->immutable_samplers_offset);
-
- const struct tu_sampler_ycbcr_conversion *ycbcr_samplers =
- tu_immutable_ycbcr_samplers(set_layout, layout);
-
- if (ycbcr_samplers) {
- for (unsigned i = 0; i < layout->array_size; i++)
- sha1_update_ycbcr_sampler(ctx, ycbcr_samplers + i);
- }
-}
-
-
-static void
-sha1_update_descriptor_set_layout(struct mesa_sha1 *ctx,
- const struct tu_descriptor_set_layout *layout)
-{
- for (uint16_t i = 0; i < layout->binding_count; i++)
- sha1_update_descriptor_set_binding_layout(ctx, &layout->binding[i],
- layout);
-}
-
/*
* Pipeline layouts. These have nothing to do with the pipeline. They are
* just multiple descriptor set layouts pasted together.
*/
-void
-tu_pipeline_layout_init(struct tu_pipeline_layout *layout)
-{
- unsigned dynamic_offset_size = 0;
-
- for (uint32_t set = 0; set < layout->num_sets; set++) {
- assert(set < MAX_SETS);
- layout->set[set].dynamic_offset_start = dynamic_offset_size;
-
- if (layout->set[set].layout)
- dynamic_offset_size += layout->set[set].layout->dynamic_offset_size;
- }
-
- layout->dynamic_offset_size = dynamic_offset_size;
-
- /* We only care about INDEPENDENT_SETS for dynamic-offset descriptors,
- * where all the descriptors from all the sets are combined into one set
- * and we have to provide the dynamic_offset_start dynamically with fast
- * linking.
- */
- if (dynamic_offset_size == 0) {
- layout->independent_sets = false;
- }
-
- struct mesa_sha1 ctx;
- _mesa_sha1_init(&ctx);
- for (unsigned s = 0; s < layout->num_sets; s++) {
- if (layout->set[s].layout)
- sha1_update_descriptor_set_layout(&ctx, layout->set[s].layout);
- _mesa_sha1_update(&ctx, &layout->set[s].dynamic_offset_start,
- sizeof(layout->set[s].dynamic_offset_start));
- }
- _mesa_sha1_update(&ctx, &layout->num_sets, sizeof(layout->num_sets));
- _mesa_sha1_update(&ctx, &layout->push_constant_size,
- sizeof(layout->push_constant_size));
- _mesa_sha1_update(&ctx, &layout->independent_sets,
- sizeof(layout->independent_sets));
- _mesa_sha1_final(&ctx, layout->sha1);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_CreatePipelineLayout(VkDevice _device,
const VkPipelineLayoutCreateInfo *pCreateInfo,
const VkAllocationCallbacks *pAllocator,
@@ -458,26 +326,42 @@ tu_CreatePipelineLayout(VkDevice _device,
{
TU_FROM_HANDLE(tu_device, device, _device);
struct tu_pipeline_layout *layout;
+ struct mesa_sha1 ctx;
assert(pCreateInfo->sType ==
VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO);
- layout = vk_object_alloc(&device->vk, pAllocator, sizeof(*layout),
- VK_OBJECT_TYPE_PIPELINE_LAYOUT);
+ layout = vk_alloc2(&device->alloc, pAllocator, sizeof(*layout), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (layout == NULL)
- return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
layout->num_sets = pCreateInfo->setLayoutCount;
+
+ unsigned dynamic_offset_count = 0;
+
+ _mesa_sha1_init(&ctx);
for (uint32_t set = 0; set < pCreateInfo->setLayoutCount; set++) {
TU_FROM_HANDLE(tu_descriptor_set_layout, set_layout,
pCreateInfo->pSetLayouts[set]);
-
- assert(set < MAX_SETS);
layout->set[set].layout = set_layout;
- if (set_layout)
- vk_descriptor_set_layout_ref(&set_layout->vk);
+
+ layout->set[set].dynamic_offset_start = dynamic_offset_count;
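+      /* Count this set's dynamic descriptors and fold its immutable samplers
+       * into the layout hash. */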
+ for (uint32_t b = 0; b < set_layout->binding_count; b++) {
+ dynamic_offset_count += set_layout->binding[b].array_size *
+ set_layout->binding[b].dynamic_offset_count;
+ if (set_layout->binding[b].immutable_samplers_offset)
+ _mesa_sha1_update(
+ &ctx,
+ tu_immutable_samplers(set_layout, set_layout->binding + b),
+ set_layout->binding[b].array_size * 4 * sizeof(uint32_t));
+ }
+ _mesa_sha1_update(
+ &ctx, set_layout->binding,
+ sizeof(set_layout->binding[0]) * set_layout->binding_count);
}
+ layout->dynamic_offset_count = dynamic_offset_count;
layout->push_constant_size = 0;
for (unsigned i = 0; i < pCreateInfo->pushConstantRangeCount; ++i) {
@@ -487,17 +371,15 @@ tu_CreatePipelineLayout(VkDevice _device,
}
layout->push_constant_size = align(layout->push_constant_size, 16);
- layout->independent_sets =
- pCreateInfo->flags & VK_PIPELINE_LAYOUT_CREATE_INDEPENDENT_SETS_BIT_EXT;
-
- tu_pipeline_layout_init(layout);
-
+ _mesa_sha1_update(&ctx, &layout->push_constant_size,
+ sizeof(layout->push_constant_size));
+ _mesa_sha1_final(&ctx, layout->sha1);
*pPipelineLayout = tu_pipeline_layout_to_handle(layout);
return VK_SUCCESS;
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_DestroyPipelineLayout(VkDevice _device,
VkPipelineLayout _pipelineLayout,
const VkAllocationCallbacks *pAllocator)
@@ -507,329 +389,31 @@ tu_DestroyPipelineLayout(VkDevice _device,
if (!pipeline_layout)
return;
-
- for (uint32_t i = 0; i < pipeline_layout->num_sets; i++) {
- if (pipeline_layout->set[i].layout)
- vk_descriptor_set_layout_unref(&device->vk, &pipeline_layout->set[i].layout->vk);
- }
-
- vk_object_free(&device->vk, pAllocator, pipeline_layout);
+ vk_free2(&device->alloc, pAllocator, pipeline_layout);
}
#define EMPTY 1
-static VkResult
-tu_descriptor_set_create(struct tu_device *device,
- struct tu_descriptor_pool *pool,
- struct tu_descriptor_set_layout *layout,
- uint32_t variable_count,
- struct tu_descriptor_set **out_set)
-{
- struct tu_descriptor_set *set;
- unsigned dynamic_offset = sizeof(struct tu_descriptor_set);
- unsigned mem_size = dynamic_offset + layout->dynamic_offset_size;
-
- if (pool->host_memory_base) {
- if (pool->host_memory_end - pool->host_memory_ptr < mem_size)
- return vk_error(device, VK_ERROR_OUT_OF_POOL_MEMORY);
-
- set = (struct tu_descriptor_set*)pool->host_memory_ptr;
- pool->host_memory_ptr += mem_size;
- } else {
- set = vk_alloc2(&device->vk.alloc, NULL, mem_size, 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
-
- if (!set)
- return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
- }
-
- memset(set, 0, mem_size);
- vk_object_base_init(&device->vk, &set->base, VK_OBJECT_TYPE_DESCRIPTOR_SET);
-
- if (layout->dynamic_offset_size) {
- set->dynamic_descriptors = (uint32_t *)((uint8_t*)set + dynamic_offset);
- }
-
- set->layout = layout;
- set->pool = pool;
- uint32_t layout_size = layout->size;
- if (layout->has_variable_descriptors) {
- struct tu_descriptor_set_binding_layout *binding =
- &layout->binding[layout->binding_count - 1];
- if (binding->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
- layout_size = binding->offset + A6XX_TEX_CONST_DWORDS * 4 +
- ALIGN(variable_count, A6XX_TEX_CONST_DWORDS * 4);
- } else {
- uint32_t stride = binding->size;
- layout_size = binding->offset + variable_count * stride;
- }
- }
-
- if (layout_size) {
- set->size = layout_size;
-
- if (!pool->host_memory_base && pool->entry_count == pool->max_entry_count) {
- vk_object_free(&device->vk, NULL, set);
- return vk_error(device, VK_ERROR_OUT_OF_POOL_MEMORY);
- }
-
- /* try to allocate linearly first, so that we don't spend
- * time looking for gaps if the app only allocates &
- * resets via the pool. */
- if (pool->current_offset + layout_size <= pool->size) {
- set->mapped_ptr = (uint32_t*)(pool_base(pool) + pool->current_offset);
- set->va = pool->host_bo ? 0 : pool->bo->iova + pool->current_offset;
-
- if (!pool->host_memory_base) {
- pool->entries[pool->entry_count].offset = pool->current_offset;
- pool->entries[pool->entry_count].size = layout_size;
- pool->entries[pool->entry_count].set = set;
- pool->entry_count++;
- }
- pool->current_offset += layout_size;
- } else if (!pool->host_memory_base) {
- uint64_t offset = 0;
- int index;
-
- for (index = 0; index < pool->entry_count; ++index) {
- if (pool->entries[index].offset - offset >= layout_size)
- break;
- offset = pool->entries[index].offset + pool->entries[index].size;
- }
-
- if (pool->size - offset < layout_size) {
- vk_object_free(&device->vk, NULL, set);
- return vk_error(device, VK_ERROR_OUT_OF_POOL_MEMORY);
- }
-
- set->mapped_ptr = (uint32_t*)(pool_base(pool) + offset);
- set->va = pool->host_bo ? 0 : pool->bo->iova + offset;
-
- memmove(&pool->entries[index + 1], &pool->entries[index],
- sizeof(pool->entries[0]) * (pool->entry_count - index));
- pool->entries[index].offset = offset;
- pool->entries[index].size = layout_size;
- pool->entries[index].set = set;
- pool->entry_count++;
- } else
- return vk_error(device, VK_ERROR_OUT_OF_POOL_MEMORY);
- }
-
- if (layout->has_immutable_samplers) {
- for (unsigned i = 0; i < layout->binding_count; ++i) {
- if (!layout->binding[i].immutable_samplers_offset)
- continue;
-
- unsigned offset = layout->binding[i].offset / 4;
- if (layout->binding[i].type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
- offset += A6XX_TEX_CONST_DWORDS;
-
- const struct tu_sampler *samplers =
- (const struct tu_sampler *)((const char *)layout +
- layout->binding[i].immutable_samplers_offset);
- for (unsigned j = 0; j < layout->binding[i].array_size; ++j) {
- memcpy(set->mapped_ptr + offset, samplers[j].descriptor,
- sizeof(samplers[j].descriptor));
- offset += layout->binding[i].size / 4;
- }
- }
- }
-
- if (layout->has_inline_uniforms) {
- for (unsigned i = 0; i < layout->binding_count; i++) {
- if (layout->binding[i].type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
- continue;
-
- uint32_t *ptr = set->mapped_ptr + layout->binding[i].offset / 4;
- uint64_t va = set->va + layout->binding[i].offset +
- A6XX_TEX_CONST_DWORDS * 4;
- uint32_t size =
- (layout->has_variable_descriptors && i == layout->binding_count - 1) ?
- variable_count : layout->binding[i].size - A6XX_TEX_CONST_DWORDS * 4;
- size = ALIGN_POT(size, 16) / 16;
-
- ptr[0] = A6XX_UBO_0_BASE_LO(va);
- ptr[1] = A6XX_UBO_1_BASE_HI(va >> 32) | A6XX_UBO_1_SIZE(size);
- }
- }
-
- vk_descriptor_set_layout_ref(&layout->vk);
- list_addtail(&set->pool_link, &pool->desc_sets);
-
- *out_set = set;
- return VK_SUCCESS;
-}
-
-static void
-tu_descriptor_set_destroy(struct tu_device *device,
- struct tu_descriptor_pool *pool,
- struct tu_descriptor_set *set,
- bool free_bo)
-{
- assert(!pool->host_memory_base);
-
- if (free_bo && set->size && !pool->host_memory_base) {
- uint32_t offset = (uint8_t*)set->mapped_ptr - pool_base(pool);
-
- for (int i = 0; i < pool->entry_count; ++i) {
- if (pool->entries[i].offset == offset) {
- memmove(&pool->entries[i], &pool->entries[i+1],
- sizeof(pool->entries[i]) * (pool->entry_count - i - 1));
- --pool->entry_count;
- break;
- }
- }
- }
-
- vk_object_free(&device->vk, NULL, set);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_CreateDescriptorPool(VkDevice _device,
const VkDescriptorPoolCreateInfo *pCreateInfo,
const VkAllocationCallbacks *pAllocator,
VkDescriptorPool *pDescriptorPool)
{
TU_FROM_HANDLE(tu_device, device, _device);
- struct tu_descriptor_pool *pool;
- uint64_t size = sizeof(struct tu_descriptor_pool);
- uint64_t bo_size = 0, dynamic_size = 0;
- VkResult ret;
-
- const VkMutableDescriptorTypeCreateInfoEXT *mutable_info =
- vk_find_struct_const( pCreateInfo->pNext,
- MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_EXT);
-
- const VkDescriptorPoolInlineUniformBlockCreateInfo *inline_info =
- vk_find_struct_const(pCreateInfo->pNext,
- DESCRIPTOR_POOL_INLINE_UNIFORM_BLOCK_CREATE_INFO);
-
- if (inline_info) {
- /* In addition to the size of the descriptors, we have to factor in the
- * padding for each binding. The sizes are 4 aligned but we have to
- * align to a descriptor size, and in the worst case each inline
- * binding has a size of 4 bytes and we have to pad each one out.
- */
- bo_size += (2 * 4 * A6XX_TEX_CONST_DWORDS - 4) *
- inline_info->maxInlineUniformBlockBindings;
- }
-
- for (unsigned i = 0; i < pCreateInfo->poolSizeCount; ++i) {
- const VkDescriptorPoolSize *pool_size = &pCreateInfo->pPoolSizes[i];
-
- switch (pool_size->type) {
- case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
- dynamic_size += descriptor_size(device, NULL, pool_size->type) *
- pool_size->descriptorCount;
- break;
- case VK_DESCRIPTOR_TYPE_MUTABLE_EXT:
- if (mutable_info && i < mutable_info->mutableDescriptorTypeListCount &&
- mutable_info->pMutableDescriptorTypeLists[i].descriptorTypeCount > 0) {
- bo_size +=
- mutable_descriptor_size(device, &mutable_info->pMutableDescriptorTypeLists[i]) *
- pool_size->descriptorCount;
- } else {
- /* Allocate the maximum size possible. */
- bo_size += 2 * A6XX_TEX_CONST_DWORDS * 4 *
- pool_size->descriptorCount;
- }
- break;
- case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
- bo_size += pool_size->descriptorCount;
- break;
- default:
- bo_size += descriptor_size(device, NULL, pool_size->type) *
- pool_size->descriptorCount;
- break;
- }
- }
-
- if (!(pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT)) {
- uint64_t host_size = pCreateInfo->maxSets * sizeof(struct tu_descriptor_set);
- host_size += dynamic_size;
- size += host_size;
- } else {
- size += sizeof(struct tu_descriptor_pool_entry) * pCreateInfo->maxSets;
- }
-
- pool = vk_object_zalloc(&device->vk, pAllocator, size,
- VK_OBJECT_TYPE_DESCRIPTOR_POOL);
- if (!pool)
- return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
-
- if (!(pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT)) {
- pool->host_memory_base = (uint8_t*)pool + sizeof(struct tu_descriptor_pool);
- pool->host_memory_ptr = pool->host_memory_base;
- pool->host_memory_end = (uint8_t*)pool + size;
- }
-
- if (bo_size) {
- if (!(pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_HOST_ONLY_BIT_EXT)) {
- ret = tu_bo_init_new(device, &pool->bo, bo_size, TU_BO_ALLOC_ALLOW_DUMP, "descriptor pool");
- if (ret)
- goto fail_alloc;
-
- ret = tu_bo_map(device, pool->bo);
- if (ret)
- goto fail_map;
- } else {
- pool->host_bo = vk_alloc2(&device->vk.alloc, pAllocator, bo_size, 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- if (!pool->host_bo) {
- ret = VK_ERROR_OUT_OF_HOST_MEMORY;
- goto fail_alloc;
- }
- }
- }
- pool->size = bo_size;
- pool->max_entry_count = pCreateInfo->maxSets;
-
- list_inithead(&pool->desc_sets);
-
- *pDescriptorPool = tu_descriptor_pool_to_handle(pool);
+ tu_use_args(device);
+ tu_stub();
return VK_SUCCESS;
-
-fail_map:
- tu_bo_finish(device, pool->bo);
-fail_alloc:
- vk_object_free(&device->vk, pAllocator, pool);
- return ret;
}
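/* Editor's sketch (not part of the diff): the worst-case padding term that the
 * removed tu_CreateDescriptorPool code above reserves for inline uniform
 * blocks, assuming A6XX_TEX_CONST_DWORDS == 16 (a 64-byte descriptor).  With
 * 4-byte minimum bindings, each inline binding can waste up to
 * 2 * 4 * 16 - 4 = 124 bytes of alignment slack.
 */
static uint64_t
sketch_inline_padding(uint32_t max_inline_bindings)
{
   const uint32_t tex_const_dwords = 16;   /* assumed A6XX_TEX_CONST_DWORDS */
   return (uint64_t)(2 * 4 * tex_const_dwords - 4) * max_inline_bindings;
}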
-VKAPI_ATTR void VKAPI_CALL
+void
tu_DestroyDescriptorPool(VkDevice _device,
VkDescriptorPool _pool,
const VkAllocationCallbacks *pAllocator)
{
- TU_FROM_HANDLE(tu_device, device, _device);
- TU_FROM_HANDLE(tu_descriptor_pool, pool, _pool);
-
- if (!pool)
- return;
-
- list_for_each_entry_safe(struct tu_descriptor_set, set,
- &pool->desc_sets, pool_link) {
- vk_descriptor_set_layout_unref(&device->vk, &set->layout->vk);
- }
-
- if (!pool->host_memory_base) {
- for(int i = 0; i < pool->entry_count; ++i) {
- tu_descriptor_set_destroy(device, pool, pool->entries[i].set, false);
- }
- }
-
- if (pool->size) {
- if (pool->host_bo)
- vk_free2(&device->vk.alloc, pAllocator, pool->host_bo);
- else
- tu_bo_finish(device, pool->bo);
- }
-
- vk_object_free(&device->vk, pAllocator, pool);
}
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_ResetDescriptorPool(VkDevice _device,
VkDescriptorPool descriptorPool,
VkDescriptorPoolResetFlags flags)
@@ -837,26 +421,12 @@ tu_ResetDescriptorPool(VkDevice _device,
TU_FROM_HANDLE(tu_device, device, _device);
TU_FROM_HANDLE(tu_descriptor_pool, pool, descriptorPool);
- list_for_each_entry_safe(struct tu_descriptor_set, set,
- &pool->desc_sets, pool_link) {
- vk_descriptor_set_layout_unref(&device->vk, &set->layout->vk);
- }
- list_inithead(&pool->desc_sets);
-
- if (!pool->host_memory_base) {
- for(int i = 0; i < pool->entry_count; ++i) {
- tu_descriptor_set_destroy(device, pool, pool->entries[i].set, false);
- }
- pool->entry_count = 0;
- }
-
- pool->current_offset = 0;
- pool->host_memory_ptr = pool->host_memory_base;
-
+ tu_use_args(device, pool);
+ tu_stub();
return VK_SUCCESS;
}
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_AllocateDescriptorSets(VkDevice _device,
const VkDescriptorSetAllocateInfo *pAllocateInfo,
VkDescriptorSet *pDescriptorSets)
@@ -864,42 +434,12 @@ tu_AllocateDescriptorSets(VkDevice _device,
TU_FROM_HANDLE(tu_device, device, _device);
TU_FROM_HANDLE(tu_descriptor_pool, pool, pAllocateInfo->descriptorPool);
- VkResult result = VK_SUCCESS;
- uint32_t i;
- struct tu_descriptor_set *set = NULL;
-
- const VkDescriptorSetVariableDescriptorCountAllocateInfo *variable_counts =
- vk_find_struct_const(pAllocateInfo->pNext, DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_ALLOCATE_INFO);
- if (variable_counts && !variable_counts->descriptorSetCount)
- variable_counts = NULL;
-
- /* allocate a set of buffers for each shader to contain descriptors */
- for (i = 0; i < pAllocateInfo->descriptorSetCount; i++) {
- TU_FROM_HANDLE(tu_descriptor_set_layout, layout,
- pAllocateInfo->pSetLayouts[i]);
-
- assert(!(layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
-
- result = tu_descriptor_set_create(
- device, pool, layout,
- variable_counts ? variable_counts->pDescriptorCounts[i] : 0, &set);
- if (result != VK_SUCCESS)
- break;
-
- pDescriptorSets[i] = tu_descriptor_set_to_handle(set);
- }
-
- if (result != VK_SUCCESS) {
- tu_FreeDescriptorSets(_device, pAllocateInfo->descriptorPool,
- i, pDescriptorSets);
- for (i = 0; i < pAllocateInfo->descriptorSetCount; i++) {
- pDescriptorSets[i] = VK_NULL_HANDLE;
- }
- }
- return result;
+ tu_use_args(device, pool);
+ tu_stub();
+ return VK_SUCCESS;
}
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_FreeDescriptorSets(VkDevice _device,
VkDescriptorPool descriptorPool,
uint32_t count,
@@ -908,338 +448,23 @@ tu_FreeDescriptorSets(VkDevice _device,
TU_FROM_HANDLE(tu_device, device, _device);
TU_FROM_HANDLE(tu_descriptor_pool, pool, descriptorPool);
- for (uint32_t i = 0; i < count; i++) {
- TU_FROM_HANDLE(tu_descriptor_set, set, pDescriptorSets[i]);
-
- if (set) {
- vk_descriptor_set_layout_unref(&device->vk, &set->layout->vk);
- list_del(&set->pool_link);
- }
-
- if (set && !pool->host_memory_base)
- tu_descriptor_set_destroy(device, pool, set, true);
- }
+ tu_use_args(device, pool);
+ tu_stub();
return VK_SUCCESS;
}
-static void
-write_texel_buffer_descriptor(uint32_t *dst, const VkBufferView buffer_view)
-{
- if (buffer_view == VK_NULL_HANDLE) {
- memset(dst, 0, A6XX_TEX_CONST_DWORDS * sizeof(uint32_t));
- } else {
- TU_FROM_HANDLE(tu_buffer_view, view, buffer_view);
-
- memcpy(dst, view->descriptor, sizeof(view->descriptor));
- }
-}
-
-static void
-write_buffer_descriptor(const struct tu_device *device,
- uint32_t *dst,
- const VkDescriptorBufferInfo *buffer_info)
-{
- bool storage_16bit = device->physical_device->info->a6xx.storage_16bit;
- /* newer a6xx allows using 16-bit descriptor for both 16-bit and 32-bit
- * access, but we need to keep a 32-bit descriptor for readonly access via
- * isam.
- */
- unsigned descriptors = storage_16bit ? 2 : 1;
- if (buffer_info->buffer == VK_NULL_HANDLE) {
- memset(dst, 0, descriptors * A6XX_TEX_CONST_DWORDS * sizeof(uint32_t));
- return;
- }
-
- TU_FROM_HANDLE(tu_buffer, buffer, buffer_info->buffer);
-
- assert((buffer_info->offset & 63) == 0); /* minStorageBufferOffsetAlignment */
- uint64_t va = buffer->iova + buffer_info->offset;
- uint32_t range = vk_buffer_range(&buffer->vk, buffer_info->offset, buffer_info->range);
-
- for (unsigned i = 0; i < descriptors; i++) {
- if (storage_16bit && i == 0) {
- dst[0] = A6XX_TEX_CONST_0_TILE_MODE(TILE6_LINEAR) | A6XX_TEX_CONST_0_FMT(FMT6_16_UINT);
- dst[1] = DIV_ROUND_UP(range, 2);
- } else {
- dst[0] = A6XX_TEX_CONST_0_TILE_MODE(TILE6_LINEAR) | A6XX_TEX_CONST_0_FMT(FMT6_32_UINT);
- dst[1] = DIV_ROUND_UP(range, 4);
- }
- dst[2] =
- A6XX_TEX_CONST_2_BUFFER | A6XX_TEX_CONST_2_TYPE(A6XX_TEX_BUFFER);
- dst[3] = 0;
- dst[4] = A6XX_TEX_CONST_4_BASE_LO(va);
- dst[5] = A6XX_TEX_CONST_5_BASE_HI(va >> 32);
- for (int j = 6; j < A6XX_TEX_CONST_DWORDS; j++)
- dst[j] = 0;
- dst += A6XX_TEX_CONST_DWORDS;
- }
-}
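/* Editor's sketch (not part of the diff): element counts written by the
 * removed write_buffer_descriptor() above.  On GPUs with a6xx.storage_16bit
 * the first descriptor views the buffer as 16-bit texels (usable for 16- and
 * 32-bit stores) and a second 32-bit view is kept for read-only isam access,
 * so a byte range maps to DIV_ROUND_UP(range, 2) and DIV_ROUND_UP(range, 4)
 * elements respectively.
 */
static void
sketch_buffer_elem_counts(uint32_t range_bytes, bool storage_16bit,
                          uint32_t *elems_16, uint32_t *elems_32)
{
   *elems_16 = storage_16bit ? (range_bytes + 1) / 2 : 0;
   *elems_32 = (range_bytes + 3) / 4;
}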
-
-static void
-write_ubo_descriptor(uint32_t *dst, const VkDescriptorBufferInfo *buffer_info)
-{
- if (buffer_info->buffer == VK_NULL_HANDLE) {
- dst[0] = dst[1] = 0;
- return;
- }
-
- TU_FROM_HANDLE(tu_buffer, buffer, buffer_info->buffer);
-
- uint32_t range = vk_buffer_range(&buffer->vk, buffer_info->offset, buffer_info->range);
- /* The HW range is in vec4 units */
- range = ALIGN_POT(range, 16) / 16;
- uint64_t va = buffer->iova + buffer_info->offset;
-
- dst[0] = A6XX_UBO_0_BASE_LO(va);
- dst[1] = A6XX_UBO_1_BASE_HI(va >> 32) | A6XX_UBO_1_SIZE(range);
-}
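/* Editor's sketch (not part of the diff): the UBO size field written above is
 * in vec4 (16-byte) units, e.g. a 100-byte range rounds up to 112 bytes and
 * is programmed as 7.  A minimal helper doing the same conversion:
 */
static uint32_t
sketch_ubo_size_vec4(uint32_t range_bytes)
{
   return (range_bytes + 15) / 16;   /* ALIGN_POT(range, 16) / 16 */
}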
-
-static void
-write_image_descriptor(uint32_t *dst,
- VkDescriptorType descriptor_type,
- const VkDescriptorImageInfo *image_info)
-{
- if (image_info->imageView == VK_NULL_HANDLE) {
- memset(dst, 0, A6XX_TEX_CONST_DWORDS * sizeof(uint32_t));
- return;
- }
-
- TU_FROM_HANDLE(tu_image_view, iview, image_info->imageView);
-
- if (descriptor_type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE) {
- memcpy(dst, iview->view.storage_descriptor, sizeof(iview->view.storage_descriptor));
- } else {
- memcpy(dst, iview->view.descriptor, sizeof(iview->view.descriptor));
- }
-}
-
-static void
-write_combined_image_sampler_descriptor(uint32_t *dst,
- VkDescriptorType descriptor_type,
- const VkDescriptorImageInfo *image_info,
- bool has_sampler)
-{
- write_image_descriptor(dst, descriptor_type, image_info);
- /* copy over sampler state */
- if (has_sampler) {
- TU_FROM_HANDLE(tu_sampler, sampler, image_info->sampler);
- memcpy(dst + A6XX_TEX_CONST_DWORDS, sampler->descriptor, sizeof(sampler->descriptor));
- }
-}
-
-static void
-write_sampler_descriptor(uint32_t *dst, const VkDescriptorImageInfo *image_info)
-{
- TU_FROM_HANDLE(tu_sampler, sampler, image_info->sampler);
-
- memcpy(dst, sampler->descriptor, sizeof(sampler->descriptor));
-}
-
-/* note: this is used with immutable samplers in push descriptors */
-static void
-write_sampler_push(uint32_t *dst, const struct tu_sampler *sampler)
-{
- memcpy(dst, sampler->descriptor, sizeof(sampler->descriptor));
-}
-
void
-tu_update_descriptor_sets(const struct tu_device *device,
+tu_update_descriptor_sets(struct tu_device *device,
+ struct tu_cmd_buffer *cmd_buffer,
VkDescriptorSet dstSetOverride,
uint32_t descriptorWriteCount,
const VkWriteDescriptorSet *pDescriptorWrites,
uint32_t descriptorCopyCount,
const VkCopyDescriptorSet *pDescriptorCopies)
{
- uint32_t i, j;
- for (i = 0; i < descriptorWriteCount; i++) {
- const VkWriteDescriptorSet *writeset = &pDescriptorWrites[i];
- TU_FROM_HANDLE(tu_descriptor_set, set, dstSetOverride ?: writeset->dstSet);
- const struct tu_descriptor_set_binding_layout *binding_layout =
- set->layout->binding + writeset->dstBinding;
- uint32_t *ptr = set->mapped_ptr;
- if (writeset->descriptorType == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
- writeset->descriptorType == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
- ptr = set->dynamic_descriptors;
- ptr += binding_layout->dynamic_offset_offset / 4;
- } else {
- ptr = set->mapped_ptr;
- ptr += binding_layout->offset / 4;
- }
-
- /* for immutable samplers with push descriptors: */
- const bool copy_immutable_samplers =
- dstSetOverride && binding_layout->immutable_samplers_offset;
- const struct tu_sampler *samplers =
- tu_immutable_samplers(set->layout, binding_layout);
-
- if (writeset->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
- /* We need to respect this note:
- *
- * The same behavior applies to bindings with a descriptor type of
- * VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK where descriptorCount
- * specifies the number of bytes to update while dstArrayElement
- * specifies the starting byte offset, thus in this case if the
- * dstBinding has a smaller byte size than the sum of
- * dstArrayElement and descriptorCount, then the remainder will be
- * used to update the subsequent binding - dstBinding+1 starting
- * at offset zero. This falls out as a special case of the above
- * rule.
- *
- * This means we can't just do a straight memcpy, because due to
- * alignment padding and the descriptor itself there are gaps between
- * sequential bindings. We have to loop over each binding updated.
- */
- const VkWriteDescriptorSetInlineUniformBlock *inline_write =
- vk_find_struct_const(writeset->pNext,
- WRITE_DESCRIPTOR_SET_INLINE_UNIFORM_BLOCK);
- uint32_t remaining = inline_write->dataSize;
- const uint8_t *src = inline_write->pData;
- uint32_t dst_offset = writeset->dstArrayElement;
- do {
- uint8_t *dst = (uint8_t *)(ptr + A6XX_TEX_CONST_DWORDS) + dst_offset;
- uint32_t binding_size =
- binding_layout->size - A6XX_TEX_CONST_DWORDS * 4 - dst_offset;
- uint32_t to_write = MIN2(remaining, binding_size);
- memcpy(dst, src, to_write);
-
- binding_layout++;
- ptr = set->mapped_ptr + binding_layout->offset / 4;
- dst_offset = 0;
- src += to_write;
- remaining -= to_write;
- } while (remaining > 0);
-
- continue;
- }
-
- ptr += binding_layout->size / 4 * writeset->dstArrayElement;
- for (j = 0; j < writeset->descriptorCount; ++j) {
- switch(writeset->descriptorType) {
- case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
- case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
- write_ubo_descriptor(ptr, writeset->pBufferInfo + j);
- break;
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
- write_buffer_descriptor(device, ptr, writeset->pBufferInfo + j);
- break;
- case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
- case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
- write_texel_buffer_descriptor(ptr, writeset->pTexelBufferView[j]);
- break;
- case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
- case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
- write_image_descriptor(ptr, writeset->descriptorType, writeset->pImageInfo + j);
- break;
- case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
- write_combined_image_sampler_descriptor(ptr,
- writeset->descriptorType,
- writeset->pImageInfo + j,
- !binding_layout->immutable_samplers_offset);
-
- if (copy_immutable_samplers)
- write_sampler_push(ptr + A6XX_TEX_CONST_DWORDS, &samplers[writeset->dstArrayElement + j]);
- break;
- case VK_DESCRIPTOR_TYPE_SAMPLER:
- if (!binding_layout->immutable_samplers_offset)
- write_sampler_descriptor(ptr, writeset->pImageInfo + j);
- else if (copy_immutable_samplers)
- write_sampler_push(ptr, &samplers[writeset->dstArrayElement + j]);
- break;
- case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
- /* nothing in descriptor set - framebuffer state is used instead */
- if (unlikely(device->instance->debug_flags & TU_DEBUG_DYNAMIC))
- write_image_descriptor(ptr, writeset->descriptorType, writeset->pImageInfo + j);
- break;
- default:
- unreachable("unimplemented descriptor type");
- break;
- }
- ptr += binding_layout->size / 4;
- }
- }
-
- for (i = 0; i < descriptorCopyCount; i++) {
- const VkCopyDescriptorSet *copyset = &pDescriptorCopies[i];
- TU_FROM_HANDLE(tu_descriptor_set, src_set,
- copyset->srcSet);
- TU_FROM_HANDLE(tu_descriptor_set, dst_set,
- copyset->dstSet);
- const struct tu_descriptor_set_binding_layout *src_binding_layout =
- src_set->layout->binding + copyset->srcBinding;
- const struct tu_descriptor_set_binding_layout *dst_binding_layout =
- dst_set->layout->binding + copyset->dstBinding;
- uint32_t *src_ptr = src_set->mapped_ptr;
- uint32_t *dst_ptr = dst_set->mapped_ptr;
- if (src_binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
- src_binding_layout->type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
- src_ptr = src_set->dynamic_descriptors;
- dst_ptr = dst_set->dynamic_descriptors;
- src_ptr += src_binding_layout->dynamic_offset_offset / 4;
- dst_ptr += dst_binding_layout->dynamic_offset_offset / 4;
- } else {
- src_ptr = src_set->mapped_ptr;
- dst_ptr = dst_set->mapped_ptr;
- src_ptr += src_binding_layout->offset / 4;
- dst_ptr += dst_binding_layout->offset / 4;
- }
-
- if (src_binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
- uint32_t remaining = copyset->descriptorCount;
- uint32_t src_start = copyset->srcArrayElement;
- uint32_t dst_start = copyset->dstArrayElement;
- uint8_t *src = (uint8_t *)(src_ptr + A6XX_TEX_CONST_DWORDS) + src_start;
- uint8_t *dst = (uint8_t *)(dst_ptr + A6XX_TEX_CONST_DWORDS) + dst_start;
- uint32_t src_remaining =
- src_binding_layout->size - src_start - 4 * A6XX_TEX_CONST_DWORDS;
- uint32_t dst_remaining =
- dst_binding_layout->size - dst_start - 4 * A6XX_TEX_CONST_DWORDS;
- do {
- uint32_t to_write = MIN3(remaining, src_remaining, dst_remaining);
- memcpy(dst, src, to_write);
-
- src += to_write;
- dst += to_write;
- src_remaining -= to_write;
- dst_remaining -= to_write;
- remaining -= to_write;
-
- if (src_remaining == 0) {
- src_binding_layout++;
- src_ptr = src_set->mapped_ptr + src_binding_layout->offset / 4;
- src = (uint8_t *)(src_ptr + A6XX_TEX_CONST_DWORDS);
- src_remaining = src_binding_layout->size - 4 * A6XX_TEX_CONST_DWORDS;
- }
-
- if (dst_remaining == 0) {
- dst_binding_layout++;
- dst_ptr = dst_set->mapped_ptr + dst_binding_layout->offset / 4;
- dst = (uint8_t *)(dst_ptr + A6XX_TEX_CONST_DWORDS);
- dst_remaining = dst_binding_layout->size - 4 * A6XX_TEX_CONST_DWORDS;
- }
- } while (remaining > 0);
-
- continue;
- }
-
- src_ptr += src_binding_layout->size * copyset->srcArrayElement / 4;
- dst_ptr += dst_binding_layout->size * copyset->dstArrayElement / 4;
-
- /* In case of copies between mutable descriptor types
- * and non-mutable descriptor types.
- */
- uint32_t copy_size = MIN2(src_binding_layout->size, dst_binding_layout->size);
-
- for (j = 0; j < copyset->descriptorCount; ++j) {
- memcpy(dst_ptr, src_ptr, copy_size);
-
- src_ptr += src_binding_layout->size / 4;
- dst_ptr += dst_binding_layout->size / 4;
- }
- }
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_UpdateDescriptorSets(VkDevice _device,
uint32_t descriptorWriteCount,
const VkWriteDescriptorSet *pDescriptorWrites,
@@ -1247,12 +472,13 @@ tu_UpdateDescriptorSets(VkDevice _device,
const VkCopyDescriptorSet *pDescriptorCopies)
{
TU_FROM_HANDLE(tu_device, device, _device);
- tu_update_descriptor_sets(device, VK_NULL_HANDLE,
+
+ tu_update_descriptor_sets(device, NULL, VK_NULL_HANDLE,
descriptorWriteCount, pDescriptorWrites,
descriptorCopyCount, pDescriptorCopies);
}
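/* Editor's sketch (not part of the diff): how the write path above is driven
 * from the API side.  A single VkWriteDescriptorSet for a uniform buffer ends
 * up in write_ubo_descriptor() via tu_update_descriptor_sets().  The handles
 * used here (device, set, buffer) are placeholders, not values from this diff.
 */
static void
sketch_update_one_ubo(VkDevice device, VkDescriptorSet set, VkBuffer buffer)
{
   const VkDescriptorBufferInfo buffer_info = {
      .buffer = buffer,
      .offset = 0,
      .range = VK_WHOLE_SIZE,
   };
   const VkWriteDescriptorSet write = {
      .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
      .dstSet = set,
      .dstBinding = 0,
      .dstArrayElement = 0,
      .descriptorCount = 1,
      .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
      .pBufferInfo = &buffer_info,
   };
   vkUpdateDescriptorSets(device, 1, &write, 0, NULL);
}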
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_CreateDescriptorUpdateTemplate(
VkDevice _device,
const VkDescriptorUpdateTemplateCreateInfo *pCreateInfo,
@@ -1260,142 +486,28 @@ tu_CreateDescriptorUpdateTemplate(
VkDescriptorUpdateTemplate *pDescriptorUpdateTemplate)
{
TU_FROM_HANDLE(tu_device, device, _device);
- struct tu_descriptor_set_layout *set_layout = NULL;
+ TU_FROM_HANDLE(tu_descriptor_set_layout, set_layout,
+ pCreateInfo->descriptorSetLayout);
const uint32_t entry_count = pCreateInfo->descriptorUpdateEntryCount;
- uint32_t dst_entry_count = 0;
-
- if (pCreateInfo->templateType == VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR) {
- TU_FROM_HANDLE(tu_pipeline_layout, pipeline_layout, pCreateInfo->pipelineLayout);
-
- /* descriptorSetLayout should be ignored for push descriptors
- * and instead it refers to pipelineLayout and set.
- */
- assert(pCreateInfo->set < MAX_SETS);
- set_layout = pipeline_layout->set[pCreateInfo->set].layout;
- } else {
- TU_FROM_HANDLE(tu_descriptor_set_layout, _set_layout,
- pCreateInfo->descriptorSetLayout);
- set_layout = _set_layout;
- }
-
- for (uint32_t i = 0; i < entry_count; i++) {
- const VkDescriptorUpdateTemplateEntry *entry = &pCreateInfo->pDescriptorUpdateEntries[i];
- if (entry->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
- dst_entry_count++;
- continue;
- }
-
- /* Calculate how many bindings this update steps over, so we can split
- * up the template entry. This lets the actual update be a simple
- * memcpy.
- */
- uint32_t remaining = entry->descriptorCount;
- const struct tu_descriptor_set_binding_layout *binding_layout =
- set_layout->binding + entry->dstBinding;
- uint32_t dst_start = entry->dstArrayElement;
- do {
- uint32_t size = binding_layout->size - A6XX_TEX_CONST_DWORDS * 4;
- uint32_t count = MIN2(remaining, size - dst_start);
- remaining -= count;
- binding_layout++;
- dst_entry_count++;
- dst_start = 0;
- } while (remaining > 0);
- }
-
const size_t size =
sizeof(struct tu_descriptor_update_template) +
- sizeof(struct tu_descriptor_update_template_entry) * dst_entry_count;
+ sizeof(struct tu_descriptor_update_template_entry) * entry_count;
struct tu_descriptor_update_template *templ;
- templ = vk_object_alloc(&device->vk, pAllocator, size,
- VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE);
+ templ = vk_alloc2(&device->alloc, pAllocator, size, 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!templ)
- return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
-
- templ->entry_count = dst_entry_count;
-
- if (pCreateInfo->templateType == VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR) {
- templ->bind_point = pCreateInfo->pipelineBindPoint;
- }
-
- uint32_t j = 0;
- for (uint32_t i = 0; i < entry_count; i++) {
- const VkDescriptorUpdateTemplateEntry *entry = &pCreateInfo->pDescriptorUpdateEntries[i];
-
- const struct tu_descriptor_set_binding_layout *binding_layout =
- set_layout->binding + entry->dstBinding;
- uint32_t dst_offset, dst_stride;
- const struct tu_sampler *immutable_samplers = NULL;
-
- /* dst_offset is an offset into dynamic_descriptors when the descriptor
- * is dynamic, and an offset into mapped_ptr otherwise.
- */
- switch (entry->descriptorType) {
- case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
- dst_offset = binding_layout->dynamic_offset_offset / 4;
- break;
- case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: {
- uint32_t remaining = entry->descriptorCount;
- uint32_t dst_start = entry->dstArrayElement;
- uint32_t src_offset = entry->offset;
- /* See comment in update_descriptor_sets() */
- do {
- dst_offset =
- binding_layout->offset + A6XX_TEX_CONST_DWORDS * 4 + dst_start;
- uint32_t size = binding_layout->size - A6XX_TEX_CONST_DWORDS * 4;
- uint32_t count = MIN2(remaining, size - dst_start);
- templ->entry[j++] = (struct tu_descriptor_update_template_entry) {
- .descriptor_type = entry->descriptorType,
- .descriptor_count = count,
- .src_offset = src_offset,
- .dst_offset = dst_offset,
- };
- remaining -= count;
- src_offset += count;
- binding_layout++;
- dst_start = 0;
- } while (remaining > 0);
-
- continue;
- }
- case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
- case VK_DESCRIPTOR_TYPE_SAMPLER:
- if (pCreateInfo->templateType == VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR &&
- binding_layout->immutable_samplers_offset) {
- immutable_samplers =
- tu_immutable_samplers(set_layout, binding_layout) + entry->dstArrayElement;
- }
- FALLTHROUGH;
- default:
- dst_offset = binding_layout->offset / 4;
- }
-
- dst_offset += (binding_layout->size * entry->dstArrayElement) / 4;
- dst_stride = binding_layout->size / 4;
-
- templ->entry[j++] = (struct tu_descriptor_update_template_entry) {
- .descriptor_type = entry->descriptorType,
- .descriptor_count = entry->descriptorCount,
- .src_offset = entry->offset,
- .src_stride = entry->stride,
- .dst_offset = dst_offset,
- .dst_stride = dst_stride,
- .has_sampler = !binding_layout->immutable_samplers_offset,
- .immutable_samplers = immutable_samplers,
- };
- }
-
- assert(j == dst_entry_count);
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
*pDescriptorUpdateTemplate =
tu_descriptor_update_template_to_handle(templ);
+ tu_use_args(set_layout);
+ tu_stub();
return VK_SUCCESS;
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_DestroyDescriptorUpdateTemplate(
VkDevice _device,
VkDescriptorUpdateTemplate descriptorUpdateTemplate,
@@ -1408,90 +520,23 @@ tu_DestroyDescriptorUpdateTemplate(
if (!templ)
return;
- vk_object_free(&device->vk, pAllocator, templ);
+ vk_free2(&device->alloc, pAllocator, templ);
}
void
tu_update_descriptor_set_with_template(
- const struct tu_device *device,
+ struct tu_device *device,
+ struct tu_cmd_buffer *cmd_buffer,
struct tu_descriptor_set *set,
VkDescriptorUpdateTemplate descriptorUpdateTemplate,
const void *pData)
{
TU_FROM_HANDLE(tu_descriptor_update_template, templ,
descriptorUpdateTemplate);
-
- for (uint32_t i = 0; i < templ->entry_count; i++) {
- uint32_t *ptr = set->mapped_ptr;
- const void *src = ((const char *) pData) + templ->entry[i].src_offset;
- const struct tu_sampler *samplers = templ->entry[i].immutable_samplers;
-
- if (templ->entry[i].descriptor_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
- memcpy(((uint8_t *) ptr) + templ->entry[i].dst_offset, src,
- templ->entry[i].descriptor_count);
- continue;
- }
-
- ptr += templ->entry[i].dst_offset;
- unsigned dst_offset = templ->entry[i].dst_offset;
- for (unsigned j = 0; j < templ->entry[i].descriptor_count; ++j) {
- switch(templ->entry[i].descriptor_type) {
- case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: {
- assert(!(set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
- write_ubo_descriptor(set->dynamic_descriptors + dst_offset, src);
- break;
- }
- case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
- write_ubo_descriptor(ptr, src);
- break;
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
- assert(!(set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
- write_buffer_descriptor(device, set->dynamic_descriptors + dst_offset, src);
- break;
- }
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
- write_buffer_descriptor(device, ptr, src);
- break;
- case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
- case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
- write_texel_buffer_descriptor(ptr, *(VkBufferView *) src);
- break;
- case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
- case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: {
- write_image_descriptor(ptr, templ->entry[i].descriptor_type, src);
- break;
- }
- case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
- write_combined_image_sampler_descriptor(ptr,
- templ->entry[i].descriptor_type,
- src,
- templ->entry[i].has_sampler);
- if (samplers)
- write_sampler_push(ptr + A6XX_TEX_CONST_DWORDS, &samplers[j]);
- break;
- case VK_DESCRIPTOR_TYPE_SAMPLER:
- if (templ->entry[i].has_sampler)
- write_sampler_descriptor(ptr, src);
- else if (samplers)
- write_sampler_push(ptr, &samplers[j]);
- break;
- case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
- /* nothing in descriptor set - framebuffer state is used instead */
- if (unlikely(device->instance->debug_flags & TU_DEBUG_DYNAMIC))
- write_image_descriptor(ptr, templ->entry[i].descriptor_type, src);
- break;
- default:
- unreachable("unimplemented descriptor type");
- break;
- }
- src = (char *) src + templ->entry[i].src_stride;
- ptr += templ->entry[i].dst_stride;
- dst_offset += templ->entry[i].dst_stride;
- }
- }
+ tu_use_args(templ);
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_UpdateDescriptorSetWithTemplate(
VkDevice _device,
VkDescriptorSet descriptorSet,
@@ -1501,46 +546,25 @@ tu_UpdateDescriptorSetWithTemplate(
TU_FROM_HANDLE(tu_device, device, _device);
TU_FROM_HANDLE(tu_descriptor_set, set, descriptorSet);
- tu_update_descriptor_set_with_template(device, set, descriptorUpdateTemplate, pData);
+ tu_update_descriptor_set_with_template(device, NULL, set,
+ descriptorUpdateTemplate, pData);
}
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_CreateSamplerYcbcrConversion(
- VkDevice _device,
+ VkDevice device,
const VkSamplerYcbcrConversionCreateInfo *pCreateInfo,
const VkAllocationCallbacks *pAllocator,
VkSamplerYcbcrConversion *pYcbcrConversion)
{
- TU_FROM_HANDLE(tu_device, device, _device);
- struct tu_sampler_ycbcr_conversion *conversion;
-
- conversion = vk_object_alloc(&device->vk, pAllocator, sizeof(*conversion),
- VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION);
- if (!conversion)
- return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
-
- conversion->format = pCreateInfo->format;
- conversion->ycbcr_model = pCreateInfo->ycbcrModel;
- conversion->ycbcr_range = pCreateInfo->ycbcrRange;
- conversion->components = pCreateInfo->components;
- conversion->chroma_offsets[0] = pCreateInfo->xChromaOffset;
- conversion->chroma_offsets[1] = pCreateInfo->yChromaOffset;
- conversion->chroma_filter = pCreateInfo->chromaFilter;
-
- *pYcbcrConversion = tu_sampler_ycbcr_conversion_to_handle(conversion);
+ *pYcbcrConversion = VK_NULL_HANDLE;
return VK_SUCCESS;
}
-VKAPI_ATTR void VKAPI_CALL
-tu_DestroySamplerYcbcrConversion(VkDevice _device,
+void
+tu_DestroySamplerYcbcrConversion(VkDevice device,
VkSamplerYcbcrConversion ycbcrConversion,
const VkAllocationCallbacks *pAllocator)
{
- TU_FROM_HANDLE(tu_device, device, _device);
- TU_FROM_HANDLE(tu_sampler_ycbcr_conversion, ycbcr_conversion, ycbcrConversion);
-
- if (!ycbcr_conversion)
- return;
-
- vk_object_free(&device->vk, pAllocator, ycbcr_conversion);
+ /* Do nothing. */
}
diff --git a/lib/mesa/src/freedreno/vulkan/tu_device.c b/lib/mesa/src/freedreno/vulkan/tu_device.c
index 83f782635..901f02486 100644
--- a/lib/mesa/src/freedreno/vulkan/tu_device.c
+++ b/lib/mesa/src/freedreno/vulkan/tu_device.c
@@ -1,401 +1,358 @@
/*
* Copyright © 2016 Red Hat.
* Copyright © 2016 Bas Nieuwenhuizen
- * SPDX-License-Identifier: MIT
*
* based in part on anv driver which is:
* Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
*/
-#include "tu_device.h"
+#include "tu_private.h"
#include <fcntl.h>
-#include <poll.h>
+#include <libsync.h>
+#include <stdbool.h>
+#include <string.h>
+#include <sys/mman.h>
#include <sys/sysinfo.h>
+#include <unistd.h>
+#include <xf86drm.h>
-#include "git_sha1.h"
-#include "util/u_debug.h"
+#include "compiler/glsl_types.h"
+#include "util/debug.h"
#include "util/disk_cache.h"
-#include "util/driconf.h"
-#include "util/os_misc.h"
-#include "vk_shader_module.h"
-#include "vk_sampler.h"
+#include "vk_format.h"
#include "vk_util.h"
-/* for fd_get_driver/device_uuid() */
-#include "freedreno/common/freedreno_uuid.h"
-
-#include "tu_clear_blit.h"
-#include "tu_cmd_buffer.h"
-#include "tu_cs.h"
-#include "tu_descriptor_set.h"
-#include "tu_dynamic_rendering.h"
-#include "tu_image.h"
-#include "tu_pass.h"
-#include "tu_query.h"
-#include "tu_tracepoints.h"
-#include "tu_wsi.h"
-
-#if defined(VK_USE_PLATFORM_WAYLAND_KHR) || \
- defined(VK_USE_PLATFORM_XCB_KHR) || \
- defined(VK_USE_PLATFORM_XLIB_KHR) || \
- defined(VK_USE_PLATFORM_DISPLAY_KHR)
-#define TU_HAS_SURFACE 1
-#else
-#define TU_HAS_SURFACE 0
-#endif
-
+#include "drm-uapi/msm_drm.h"
static int
-tu_device_get_cache_uuid(struct tu_physical_device *device, void *uuid)
-{
- struct mesa_sha1 ctx;
- unsigned char sha1[20];
- /* Note: IR3_SHADER_DEBUG also affects compilation, but it's not
- * initialized until after compiler creation so we have to add it to the
- * shader hash instead, since the compiler is only created with the logical
- * device.
- */
- uint64_t driver_flags = device->instance->debug_flags & TU_DEBUG_NOMULTIPOS;
- uint16_t family = fd_dev_gpu_id(&device->dev_id);
-
+tu_device_get_cache_uuid(uint16_t family, void *uuid)
+{
+ uint32_t mesa_timestamp;
+ uint16_t f = family;
memset(uuid, 0, VK_UUID_SIZE);
- _mesa_sha1_init(&ctx);
-
- if (!disk_cache_get_function_identifier(tu_device_get_cache_uuid, &ctx))
+ if (!disk_cache_get_function_timestamp(tu_device_get_cache_uuid,
+ &mesa_timestamp))
return -1;
- _mesa_sha1_update(&ctx, &family, sizeof(family));
- _mesa_sha1_update(&ctx, &driver_flags, sizeof(driver_flags));
- _mesa_sha1_final(&ctx, sha1);
-
- memcpy(uuid, sha1, VK_UUID_SIZE);
+ memcpy(uuid, &mesa_timestamp, 4);
+ memcpy((char *) uuid + 4, &f, 2);
+ snprintf((char *) uuid + 6, VK_UUID_SIZE - 10, "tu");
return 0;
}
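/* Editor's note (not part of the diff): layout of the cache UUID built above,
 * for reference -- bytes 0-3 hold the Mesa build timestamp, bytes 4-5 the GPU
 * family (e.g. 630), bytes 6+ the literal "tu" tag, and the remainder of the
 * 16-byte VK_UUID_SIZE stays zeroed.  A matching decode sketch:
 */
static void
sketch_decode_cache_uuid(const uint8_t uuid[VK_UUID_SIZE],
                         uint32_t *mesa_timestamp, uint16_t *gpu_family)
{
   memcpy(mesa_timestamp, uuid, 4);
   memcpy(gpu_family, uuid + 4, 2);
}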
-#define TU_API_VERSION VK_MAKE_VERSION(1, 3, VK_HEADER_VERSION)
-
-VKAPI_ATTR VkResult VKAPI_CALL
-tu_EnumerateInstanceVersion(uint32_t *pApiVersion)
-{
- *pApiVersion = TU_API_VERSION;
- return VK_SUCCESS;
-}
-
-static const struct vk_instance_extension_table tu_instance_extensions_supported = {
- .KHR_device_group_creation = true,
- .KHR_external_fence_capabilities = true,
- .KHR_external_memory_capabilities = true,
- .KHR_external_semaphore_capabilities = true,
- .KHR_get_physical_device_properties2 = true,
- .KHR_surface = TU_HAS_SURFACE,
- .KHR_get_surface_capabilities2 = TU_HAS_SURFACE,
- .EXT_debug_report = true,
- .EXT_debug_utils = true,
-#ifdef VK_USE_PLATFORM_WAYLAND_KHR
- .KHR_wayland_surface = true,
-#endif
-#ifdef VK_USE_PLATFORM_XCB_KHR
- .KHR_xcb_surface = true,
-#endif
-#ifdef VK_USE_PLATFORM_XLIB_KHR
- .KHR_xlib_surface = true,
-#endif
-#ifdef VK_USE_PLATFORM_XLIB_XRANDR_EXT
- .EXT_acquire_xlib_display = true,
-#endif
-#ifdef VK_USE_PLATFORM_DISPLAY_KHR
- .KHR_display = true,
- .KHR_get_display_properties2 = true,
- .EXT_direct_mode_display = true,
- .EXT_display_surface_counter = true,
- .EXT_acquire_drm_display = true,
-#endif
-};
+static void
+tu_get_driver_uuid(void *uuid)
+{
+ memset(uuid, 0, VK_UUID_SIZE);
+ snprintf(uuid, VK_UUID_SIZE, "freedreno");
+}
static void
-get_device_extensions(const struct tu_physical_device *device,
- struct vk_device_extension_table *ext)
-{
- *ext = (struct vk_device_extension_table) {
- .KHR_16bit_storage = device->info->a6xx.storage_16bit,
- .KHR_bind_memory2 = true,
- .KHR_copy_commands2 = true,
- .KHR_create_renderpass2 = true,
- .KHR_dedicated_allocation = true,
- .KHR_depth_stencil_resolve = true,
- .KHR_descriptor_update_template = true,
- .KHR_device_group = true,
- .KHR_draw_indirect_count = true,
- .KHR_external_fence = true,
- .KHR_external_fence_fd = true,
- .KHR_external_memory = true,
- .KHR_external_memory_fd = true,
- .KHR_external_semaphore = true,
- .KHR_external_semaphore_fd = true,
- .KHR_format_feature_flags2 = true,
- .KHR_get_memory_requirements2 = true,
- .KHR_global_priority = true,
- .KHR_imageless_framebuffer = true,
- .KHR_incremental_present = TU_HAS_SURFACE,
- .KHR_image_format_list = true,
- .KHR_maintenance1 = true,
- .KHR_maintenance2 = true,
- .KHR_maintenance3 = true,
- .KHR_maintenance4 = true,
- .KHR_multiview = true,
- .KHR_performance_query = device->instance->debug_flags & TU_DEBUG_PERFC,
- .KHR_pipeline_executable_properties = true,
- .KHR_push_descriptor = true,
- .KHR_relaxed_block_layout = true,
- .KHR_sampler_mirror_clamp_to_edge = true,
- .KHR_sampler_ycbcr_conversion = true,
- .KHR_shader_draw_parameters = true,
- .KHR_shader_float_controls = true,
- .KHR_shader_float16_int8 = true,
- .KHR_shader_subgroup_extended_types = true,
- .KHR_shader_terminate_invocation = true,
- .KHR_spirv_1_4 = true,
- .KHR_storage_buffer_storage_class = true,
- .KHR_swapchain = TU_HAS_SURFACE,
- .KHR_swapchain_mutable_format = TU_HAS_SURFACE,
- .KHR_uniform_buffer_standard_layout = true,
- .KHR_variable_pointers = true,
- .KHR_vulkan_memory_model = true,
- .KHR_driver_properties = true,
- .KHR_separate_depth_stencil_layouts = true,
- .KHR_buffer_device_address = true,
- .KHR_shader_integer_dot_product = true,
- .KHR_zero_initialize_workgroup_memory = true,
- .KHR_shader_non_semantic_info = true,
- .KHR_synchronization2 = true,
- .KHR_dynamic_rendering = true,
-#ifndef TU_USE_KGSL
- .KHR_timeline_semaphore = true,
-#endif
-#ifdef VK_USE_PLATFORM_DISPLAY_KHR
- .EXT_display_control = true,
-#endif
- .EXT_external_memory_dma_buf = true,
- .EXT_image_drm_format_modifier = true,
- .EXT_sample_locations = device->info->a6xx.has_sample_locations,
- .EXT_sampler_filter_minmax = true,
- .EXT_transform_feedback = true,
- .EXT_4444_formats = true,
- .EXT_border_color_swizzle = true,
- .EXT_conditional_rendering = true,
- .EXT_custom_border_color = true,
- .EXT_depth_clip_control = true,
- .EXT_depth_clip_enable = true,
- .EXT_descriptor_indexing = true,
- .EXT_extended_dynamic_state = true,
- .EXT_extended_dynamic_state2 = true,
- .EXT_filter_cubic = device->info->a6xx.has_tex_filter_cubic,
- .EXT_global_priority = true,
- .EXT_global_priority_query = true,
- .EXT_host_query_reset = true,
- .EXT_index_type_uint8 = true,
- .EXT_memory_budget = true,
- .EXT_primitive_topology_list_restart = true,
- .EXT_private_data = true,
- .EXT_queue_family_foreign = true,
- .EXT_robustness2 = true,
- .EXT_scalar_block_layout = true,
- .EXT_separate_stencil_usage = true,
- .EXT_shader_demote_to_helper_invocation = true,
- .EXT_shader_stencil_export = true,
- .EXT_shader_viewport_index_layer = true,
- .EXT_shader_module_identifier = true,
- .EXT_texel_buffer_alignment = true,
- .EXT_vertex_attribute_divisor = true,
- .EXT_provoking_vertex = true,
- .EXT_line_rasterization = true,
- .EXT_subgroup_size_control = true,
- .EXT_image_robustness = true,
- .EXT_primitives_generated_query = true,
- .EXT_image_view_min_lod = true,
- .EXT_pipeline_creation_feedback = true,
- .EXT_pipeline_creation_cache_control = true,
- .EXT_vertex_input_dynamic_state = true,
- .EXT_attachment_feedback_loop_layout = true,
- .EXT_rasterization_order_attachment_access = true,
- .EXT_multi_draw = true,
-#ifndef TU_USE_KGSL
- .EXT_physical_device_drm = true,
-#endif
- /* For Graphics Flight Recorder (GFR) */
- .AMD_buffer_marker = true,
- .ARM_rasterization_order_attachment_access = true,
-#ifdef ANDROID
- .ANDROID_native_buffer = true,
-#endif
- .IMG_filter_cubic = device->info->a6xx.has_tex_filter_cubic,
- .VALVE_mutable_descriptor_type = true,
- .EXT_image_2d_view_of_3d = true,
- .EXT_color_write_enable = true,
- .EXT_load_store_op_none = true,
- .EXT_non_seamless_cube_map = true,
- .EXT_tooling_info = true,
- .EXT_inline_uniform_block = true,
- .EXT_mutable_descriptor_type = true,
- .KHR_pipeline_library = true,
- .EXT_graphics_pipeline_library = true,
+tu_get_device_uuid(void *uuid)
+{
+ memset(uuid, 0, VK_UUID_SIZE);
+}
+
+static VkResult
+tu_bo_init(struct tu_device *dev,
+ struct tu_bo *bo,
+ uint32_t gem_handle,
+ uint64_t size)
+{
+ uint64_t iova = tu_gem_info_iova(dev, gem_handle);
+ if (!iova)
+ return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+
+ *bo = (struct tu_bo) {
+ .gem_handle = gem_handle,
+ .size = size,
+ .iova = iova,
};
+
+ return VK_SUCCESS;
}
-static const struct vk_pipeline_cache_object_ops *const cache_import_ops[] = {
- &tu_shaders_ops,
- &tu_nir_shaders_ops,
- NULL,
-};
+VkResult
+tu_bo_init_new(struct tu_device *dev, struct tu_bo *bo, uint64_t size)
+{
+ /* TODO: Choose better flags. As of 2018-11-12, freedreno/drm/msm_bo.c
+ * always sets `flags = MSM_BO_WC`, and we copy that behavior here.
+ */
+ uint32_t gem_handle = tu_gem_new(dev, size, MSM_BO_WC);
+ if (!gem_handle)
+ return vk_error(dev->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+
+ VkResult result = tu_bo_init(dev, bo, gem_handle, size);
+ if (result != VK_SUCCESS) {
+ tu_gem_close(dev, gem_handle);
+ return vk_error(dev->instance, result);
+ }
+
+ return VK_SUCCESS;
+}
+
+VkResult
+tu_bo_init_dmabuf(struct tu_device *dev,
+ struct tu_bo *bo,
+ uint64_t size,
+ int fd)
+{
+ uint32_t gem_handle = tu_gem_import_dmabuf(dev, fd, size);
+ if (!gem_handle)
+ return vk_error(dev->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE);
+
+ VkResult result = tu_bo_init(dev, bo, gem_handle, size);
+ if (result != VK_SUCCESS) {
+ tu_gem_close(dev, gem_handle);
+ return vk_error(dev->instance, result);
+ }
+
+ return VK_SUCCESS;
+}
+
+int
+tu_bo_export_dmabuf(struct tu_device *dev, struct tu_bo *bo)
+{
+ return tu_gem_export_dmabuf(dev, bo->gem_handle);
+}
VkResult
+tu_bo_map(struct tu_device *dev, struct tu_bo *bo)
+{
+ if (bo->map)
+ return VK_SUCCESS;
+
+ uint64_t offset = tu_gem_info_offset(dev, bo->gem_handle);
+ if (!offset)
+ return vk_error(dev->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+
+ /* TODO: Should we use the wrapper os_mmap() like Freedreno does? */
+ void *map = mmap(0, bo->size, PROT_READ | PROT_WRITE, MAP_SHARED,
+ dev->physical_device->local_fd, offset);
+ if (map == MAP_FAILED)
+ return vk_error(dev->instance, VK_ERROR_MEMORY_MAP_FAILED);
+
+ bo->map = map;
+ return VK_SUCCESS;
+}
+
+void
+tu_bo_finish(struct tu_device *dev, struct tu_bo *bo)
+{
+ assert(bo->gem_handle);
+
+ if (bo->map)
+ munmap(bo->map, bo->size);
+
+ tu_gem_close(dev, bo->gem_handle);
+}
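/* Editor's sketch (not part of the diff): typical lifecycle of the tu_bo
 * helpers added above -- allocate a GEM buffer, map it for CPU writes, then
 * release it.  Error handling is abbreviated; the 4096-byte size is a
 * placeholder.
 */
static VkResult
sketch_bo_roundtrip(struct tu_device *dev)
{
   struct tu_bo bo;
   VkResult result = tu_bo_init_new(dev, &bo, 4096);
   if (result != VK_SUCCESS)
      return result;

   result = tu_bo_map(dev, &bo);
   if (result != VK_SUCCESS) {
      tu_bo_finish(dev, &bo);
      return result;
   }

   memset(bo.map, 0, 4096);   /* CPU-visible write-combined mapping */
   tu_bo_finish(dev, &bo);    /* munmap + GEM close */
   return VK_SUCCESS;
}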
+
+static VkResult
tu_physical_device_init(struct tu_physical_device *device,
- struct tu_instance *instance)
+ struct tu_instance *instance,
+ drmDevicePtr drm_device)
{
+ const char *path = drm_device->nodes[DRM_NODE_RENDER];
VkResult result = VK_SUCCESS;
+ drmVersionPtr version;
+ int fd;
+ int master_fd = -1;
+
+ fd = open(path, O_RDWR | O_CLOEXEC);
+ if (fd < 0) {
+ return vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
+ "failed to open device %s", path);
+ }
+
+ /* Version 1.3 added MSM_INFO_IOVA. */
+ const int min_version_major = 1;
+ const int min_version_minor = 3;
+
+ version = drmGetVersion(fd);
+ if (!version) {
+ close(fd);
+ return vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
+ "failed to query kernel driver version for device %s",
+ path);
+ }
+
+ if (strcmp(version->name, "msm")) {
+ drmFreeVersion(version);
+ close(fd);
+ return vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
+ "device %s does not use the msm kernel driver", path);
+ }
+
+ if (version->version_major != min_version_major ||
+ version->version_minor < min_version_minor) {
+ result = vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
+ "kernel driver for device %s has version %d.%d, "
+ "but Vulkan requires version >= %d.%d",
+ path, version->version_major, version->version_minor,
+ min_version_major, min_version_minor);
+ drmFreeVersion(version);
+ close(fd);
+ return result;
+ }
- const char *fd_name = fd_dev_name(&device->dev_id);
- if (strncmp(fd_name, "FD", 2) == 0) {
- device->name = vk_asprintf(&instance->vk.alloc,
- VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE,
- "Turnip Adreno (TM) %s", &fd_name[2]);
- } else {
- device->name = vk_strdup(&instance->vk.alloc, fd_name,
- VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
+ drmFreeVersion(version);
+
+ if (instance->debug_flags & TU_DEBUG_STARTUP)
+ tu_logi("Found compatible device '%s'.", path);
+ device->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
+ device->instance = instance;
+ assert(strlen(path) < ARRAY_SIZE(device->path));
+ strncpy(device->path, path, ARRAY_SIZE(device->path));
+
+ if (instance->enabled_extensions.KHR_display) {
+ master_fd =
+ open(drm_device->nodes[DRM_NODE_PRIMARY], O_RDWR | O_CLOEXEC);
+ if (master_fd >= 0) {
+         /* TODO: free master_fd if accel is not working? */
+ }
}
- if (!device->name) {
- return vk_startup_errorf(instance, VK_ERROR_OUT_OF_HOST_MEMORY,
- "device name alloc fail");
+
+ device->master_fd = master_fd;
+ device->local_fd = fd;
+
+ if (tu_drm_get_gpu_id(device, &device->gpu_id)) {
+ if (instance->debug_flags & TU_DEBUG_STARTUP)
+ tu_logi("Could not query the GPU ID");
+ result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
+ "could not get GPU ID");
+ goto fail;
}
- const struct fd_dev_info *info = fd_dev_info(&device->dev_id);
- if (!info) {
- result = vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
- "device %s is unsupported", device->name);
- goto fail_free_name;
+ if (tu_drm_get_gmem_size(device, &device->gmem_size)) {
+ if (instance->debug_flags & TU_DEBUG_STARTUP)
+ tu_logi("Could not query the GMEM size");
+ result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
+ "could not get GMEM size");
+ goto fail;
}
- switch (fd_dev_gen(&device->dev_id)) {
- case 6:
- device->info = info;
- device->ccu_offset_bypass = device->info->num_ccu * A6XX_CCU_DEPTH_SIZE;
- device->ccu_offset_gmem = (device->gmem_size -
- device->info->num_ccu * A6XX_CCU_GMEM_COLOR_SIZE);
+
+ memset(device->name, 0, sizeof(device->name));
+ sprintf(device->name, "FD%d", device->gpu_id);
+
+ switch (device->gpu_id) {
+ case 630:
+ device->tile_align_w = 32;
+ device->tile_align_h = 32;
break;
default:
- result = vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
- "device %s is unsupported", device->name);
- goto fail_free_name;
- }
- if (tu_device_get_cache_uuid(device, device->cache_uuid)) {
- result = vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
- "cannot generate UUID");
- goto fail_free_name;
+ result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
+ "device %s is unsupported", device->name);
+ goto fail;
}
-
- if (device->has_set_iova) {
- mtx_init(&device->vma_mutex, mtx_plain);
- util_vma_heap_init(&device->vma, device->va_start,
- ROUND_DOWN_TO(device->va_size, 4096));
+ if (tu_device_get_cache_uuid(device->gpu_id, device->cache_uuid)) {
+ result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
+ "cannot generate UUID");
+ goto fail;
}
- fd_get_driver_uuid(device->driver_uuid);
- fd_get_device_uuid(device->device_uuid, &device->dev_id);
+ /* The gpu id is already embedded in the uuid so we just pass "tu"
+ * when creating the cache.
+ */
+ char buf[VK_UUID_SIZE * 2 + 1];
+ disk_cache_format_hex_id(buf, device->cache_uuid, VK_UUID_SIZE * 2);
+ device->disk_cache = disk_cache_create(device->name, buf, 0);
- struct vk_device_extension_table supported_extensions;
- get_device_extensions(device, &supported_extensions);
+ fprintf(stderr, "WARNING: tu is not a conformant vulkan implementation, "
+ "testing use only.\n");
- struct vk_physical_device_dispatch_table dispatch_table;
- vk_physical_device_dispatch_table_from_entrypoints(
- &dispatch_table, &tu_physical_device_entrypoints, true);
- vk_physical_device_dispatch_table_from_entrypoints(
- &dispatch_table, &wsi_physical_device_entrypoints, false);
+   tu_get_driver_uuid(&device->driver_uuid);
+ tu_get_device_uuid(&device->device_uuid);
- result = vk_physical_device_init(&device->vk, &instance->vk,
- &supported_extensions,
- &dispatch_table);
- if (result != VK_SUCCESS)
- goto fail_free_vma;
+ tu_fill_device_extension_table(device, &device->supported_extensions);
- device->vk.supported_sync_types = device->sync_types;
+ if (result != VK_SUCCESS) {
+ vk_error(instance, result);
+ goto fail;
+ }
-#if TU_HAS_SURFACE
result = tu_wsi_init(device);
if (result != VK_SUCCESS) {
- vk_startup_errorf(instance, result, "WSI init failure");
- vk_physical_device_finish(&device->vk);
- goto fail_free_vma;
+ vk_error(instance, result);
+ goto fail;
}
-#endif
-
- /* The gpu id is already embedded in the uuid so we just pass "tu"
- * when creating the cache.
- */
- char buf[VK_UUID_SIZE * 2 + 1];
- disk_cache_format_hex_id(buf, device->cache_uuid, VK_UUID_SIZE * 2);
- device->vk.disk_cache = disk_cache_create(device->name, buf, 0);
-
- device->vk.pipeline_cache_import_ops = cache_import_ops;
return VK_SUCCESS;
-fail_free_vma:
- if (device->has_set_iova)
- util_vma_heap_finish(&device->vma);
-fail_free_name:
- vk_free(&instance->vk.alloc, (void *)device->name);
+fail:
+ close(fd);
+ if (master_fd != -1)
+ close(master_fd);
return result;
}
static void
tu_physical_device_finish(struct tu_physical_device *device)
{
-#if TU_HAS_SURFACE
tu_wsi_finish(device);
-#endif
+ disk_cache_destroy(device->disk_cache);
close(device->local_fd);
if (device->master_fd != -1)
close(device->master_fd);
+}
- if (device->has_set_iova)
- util_vma_heap_finish(&device->vma);
-
- vk_free(&device->instance->vk.alloc, (void *)device->name);
+static void *
+default_alloc_func(void *pUserData,
+ size_t size,
+ size_t align,
+ VkSystemAllocationScope allocationScope)
+{
+ return malloc(size);
+}
- vk_physical_device_finish(&device->vk);
+static void *
+default_realloc_func(void *pUserData,
+ void *pOriginal,
+ size_t size,
+ size_t align,
+ VkSystemAllocationScope allocationScope)
+{
+ return realloc(pOriginal, size);
}
static void
-tu_destroy_physical_device(struct vk_physical_device *device)
+default_free_func(void *pUserData, void *pMemory)
{
- tu_physical_device_finish((struct tu_physical_device *) device);
- vk_free(&device->instance->alloc, device);
+ free(pMemory);
}
+static const VkAllocationCallbacks default_alloc = {
+ .pUserData = NULL,
+ .pfnAllocation = default_alloc_func,
+ .pfnReallocation = default_realloc_func,
+ .pfnFree = default_free_func,
+};
+
static const struct debug_control tu_debug_options[] = {
{ "startup", TU_DEBUG_STARTUP },
{ "nir", TU_DEBUG_NIR },
- { "nobin", TU_DEBUG_NOBIN },
- { "sysmem", TU_DEBUG_SYSMEM },
- { "gmem", TU_DEBUG_GMEM },
- { "forcebin", TU_DEBUG_FORCEBIN },
- { "layout", TU_DEBUG_LAYOUT },
- { "noubwc", TU_DEBUG_NOUBWC },
- { "nomultipos", TU_DEBUG_NOMULTIPOS },
- { "nolrz", TU_DEBUG_NOLRZ },
- { "nolrzfc", TU_DEBUG_NOLRZFC },
- { "perf", TU_DEBUG_PERF },
- { "perfc", TU_DEBUG_PERFC },
- { "flushall", TU_DEBUG_FLUSHALL },
- { "syncdraw", TU_DEBUG_SYNCDRAW },
- { "dontcare_as_load", TU_DEBUG_DONT_CARE_AS_LOAD },
- { "rast_order", TU_DEBUG_RAST_ORDER },
- { "unaligned_store", TU_DEBUG_UNALIGNED_STORE },
- { "log_skip_gmem_ops", TU_DEBUG_LOG_SKIP_GMEM_OPS },
- { "dynamic", TU_DEBUG_DYNAMIC },
- { "bos", TU_DEBUG_BOS },
+ { "ir3", TU_DEBUG_IR3 },
{ NULL, 0 }
};
@@ -406,34 +363,17 @@ tu_get_debug_option_name(int id)
return tu_debug_options[id].string;
}
-static const driOptionDescription tu_dri_options[] = {
- DRI_CONF_SECTION_PERFORMANCE
- DRI_CONF_VK_X11_OVERRIDE_MIN_IMAGE_COUNT(0)
- DRI_CONF_VK_X11_STRICT_IMAGE_COUNT(false)
- DRI_CONF_VK_X11_ENSURE_MIN_IMAGE_COUNT(false)
- DRI_CONF_VK_XWAYLAND_WAIT_READY(true)
- DRI_CONF_SECTION_END
-
- DRI_CONF_SECTION_DEBUG
- DRI_CONF_VK_WSI_FORCE_BGRA8_UNORM_FIRST(false)
- DRI_CONF_VK_DONT_CARE_AS_LOAD(false)
- DRI_CONF_SECTION_END
-};
-
-static void
-tu_init_dri_options(struct tu_instance *instance)
+static int
+tu_get_instance_extension_index(const char *name)
{
- driParseOptionInfo(&instance->available_dri_options, tu_dri_options,
- ARRAY_SIZE(tu_dri_options));
- driParseConfigFiles(&instance->dri_options, &instance->available_dri_options, 0, "turnip", NULL, NULL,
- instance->vk.app_info.app_name, instance->vk.app_info.app_version,
- instance->vk.app_info.engine_name, instance->vk.app_info.engine_version);
-
- if (driQueryOptionb(&instance->dri_options, "vk_dont_care_as_load"))
- instance->debug_flags |= TU_DEBUG_DONT_CARE_AS_LOAD;
+ for (unsigned i = 0; i < TU_INSTANCE_EXTENSION_COUNT; ++i) {
+ if (strcmp(name, tu_instance_extensions[i].extensionName) == 0)
+ return i;
+ }
+ return -1;
}
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_CreateInstance(const VkInstanceCreateInfo *pCreateInfo,
const VkAllocationCallbacks *pAllocator,
VkInstance *pInstance)
@@ -443,66 +383,63 @@ tu_CreateInstance(const VkInstanceCreateInfo *pCreateInfo,
assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO);
- if (pAllocator == NULL)
- pAllocator = vk_default_allocator();
-
- instance = vk_zalloc(pAllocator, sizeof(*instance), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
+ uint32_t client_version;
+ if (pCreateInfo->pApplicationInfo &&
+ pCreateInfo->pApplicationInfo->apiVersion != 0) {
+ client_version = pCreateInfo->pApplicationInfo->apiVersion;
+ } else {
+ tu_EnumerateInstanceVersion(&client_version);
+ }
+ instance = vk_zalloc2(&default_alloc, pAllocator, sizeof(*instance), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
if (!instance)
return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);
- struct vk_instance_dispatch_table dispatch_table;
- vk_instance_dispatch_table_from_entrypoints(
- &dispatch_table, &tu_instance_entrypoints, true);
- vk_instance_dispatch_table_from_entrypoints(
- &dispatch_table, &wsi_instance_entrypoints, false);
+ instance->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
- result = vk_instance_init(&instance->vk,
- &tu_instance_extensions_supported,
- &dispatch_table,
- pCreateInfo, pAllocator);
- if (result != VK_SUCCESS) {
- vk_free(pAllocator, instance);
- return vk_error(NULL, result);
- }
+ if (pAllocator)
+ instance->alloc = *pAllocator;
+ else
+ instance->alloc = default_alloc;
-#ifndef TU_USE_KGSL
- instance->vk.physical_devices.try_create_for_drm =
- tu_physical_device_try_create;
-#else
- instance->vk.physical_devices.enumerate = tu_enumerate_devices;
-#endif
- instance->vk.physical_devices.destroy = tu_destroy_physical_device;
+ instance->api_version = client_version;
+ instance->physical_device_count = -1;
instance->debug_flags =
- parse_debug_string(os_get_option("TU_DEBUG"), tu_debug_options);
-
-#ifdef DEBUG
- /* Enable startup debugging by default on debug drivers. You almost always
- * want to see your startup failures in that case, and it's hard to set
- * this env var on android.
- */
- instance->debug_flags |= TU_DEBUG_STARTUP;
-#endif
+ parse_debug_string(getenv("TU_DEBUG"), tu_debug_options);
if (instance->debug_flags & TU_DEBUG_STARTUP)
- mesa_logi("Created an instance");
+ tu_logi("Created an instance");
- VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false));
+ for (uint32_t i = 0; i < pCreateInfo->enabledExtensionCount; i++) {
+ const char *ext_name = pCreateInfo->ppEnabledExtensionNames[i];
+ int index = tu_get_instance_extension_index(ext_name);
+
+ if (index < 0 || !tu_supported_instance_extensions.extensions[index]) {
+ vk_free2(&default_alloc, pAllocator, instance);
+ return vk_error(instance, VK_ERROR_EXTENSION_NOT_PRESENT);
+ }
- tu_init_dri_options(instance);
+ instance->enabled_extensions.extensions[index] = true;
+ }
- *pInstance = tu_instance_to_handle(instance);
+ result = vk_debug_report_instance_init(&instance->debug_report_callbacks);
+ if (result != VK_SUCCESS) {
+ vk_free2(&default_alloc, pAllocator, instance);
+ return vk_error(instance, result);
+ }
+
+ glsl_type_singleton_init_or_ref();
-#ifdef HAVE_PERFETTO
- tu_perfetto_init();
-#endif
+ VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false));
+
+ *pInstance = tu_instance_to_handle(instance);
return VK_SUCCESS;
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_DestroyInstance(VkInstance _instance,
const VkAllocationCallbacks *pAllocator)
{
@@ -511,623 +448,272 @@ tu_DestroyInstance(VkInstance _instance,
if (!instance)
return;
+ for (int i = 0; i < instance->physical_device_count; ++i) {
+ tu_physical_device_finish(instance->physical_devices + i);
+ }
+
VG(VALGRIND_DESTROY_MEMPOOL(instance));
- driDestroyOptionCache(&instance->dri_options);
- driDestroyOptionInfo(&instance->available_dri_options);
+ glsl_type_singleton_decref();
- vk_instance_finish(&instance->vk);
- vk_free(&instance->vk.alloc, instance);
-}
+ vk_debug_report_instance_destroy(&instance->debug_report_callbacks);
-static void
-tu_get_physical_device_features_1_1(struct tu_physical_device *pdevice,
- VkPhysicalDeviceVulkan11Features *features)
-{
- features->storageBuffer16BitAccess = pdevice->info->a6xx.storage_16bit;
- features->uniformAndStorageBuffer16BitAccess = false;
- features->storagePushConstant16 = false;
- features->storageInputOutput16 = false;
- features->multiview = true;
- features->multiviewGeometryShader = false;
- features->multiviewTessellationShader = false;
- features->variablePointersStorageBuffer = true;
- features->variablePointers = true;
- features->protectedMemory = false;
- features->samplerYcbcrConversion = true;
- features->shaderDrawParameters = true;
+ vk_free(&instance->alloc, instance);
}
-static void
-tu_get_physical_device_features_1_2(struct tu_physical_device *pdevice,
- VkPhysicalDeviceVulkan12Features *features)
-{
- features->samplerMirrorClampToEdge = true;
- features->drawIndirectCount = true;
- features->storageBuffer8BitAccess = false;
- features->uniformAndStorageBuffer8BitAccess = false;
- features->storagePushConstant8 = false;
- features->shaderBufferInt64Atomics = false;
- features->shaderSharedInt64Atomics = false;
- features->shaderFloat16 = true;
- features->shaderInt8 = false;
-
- features->descriptorIndexing = true;
- features->shaderInputAttachmentArrayDynamicIndexing = false;
- features->shaderUniformTexelBufferArrayDynamicIndexing = true;
- features->shaderStorageTexelBufferArrayDynamicIndexing = true;
- features->shaderUniformBufferArrayNonUniformIndexing = true;
- features->shaderSampledImageArrayNonUniformIndexing = true;
- features->shaderStorageBufferArrayNonUniformIndexing = true;
- features->shaderStorageImageArrayNonUniformIndexing = true;
- features->shaderInputAttachmentArrayNonUniformIndexing = false;
- features->shaderUniformTexelBufferArrayNonUniformIndexing = true;
- features->shaderStorageTexelBufferArrayNonUniformIndexing = true;
- features->descriptorBindingUniformBufferUpdateAfterBind = true;
- features->descriptorBindingSampledImageUpdateAfterBind = true;
- features->descriptorBindingStorageImageUpdateAfterBind = true;
- features->descriptorBindingStorageBufferUpdateAfterBind = true;
- features->descriptorBindingUniformTexelBufferUpdateAfterBind = true;
- features->descriptorBindingStorageTexelBufferUpdateAfterBind = true;
- features->descriptorBindingUpdateUnusedWhilePending = true;
- features->descriptorBindingPartiallyBound = true;
- features->descriptorBindingVariableDescriptorCount = true;
- features->runtimeDescriptorArray = true;
-
- features->samplerFilterMinmax = true;
- features->scalarBlockLayout = true;
- features->imagelessFramebuffer = true;
- features->uniformBufferStandardLayout = true;
- features->shaderSubgroupExtendedTypes = true;
- features->separateDepthStencilLayouts = true;
- features->hostQueryReset = true;
- features->timelineSemaphore = true;
- features->bufferDeviceAddress = true;
- features->bufferDeviceAddressCaptureReplay = pdevice->has_set_iova;
- features->bufferDeviceAddressMultiDevice = false;
- features->vulkanMemoryModel = true;
- features->vulkanMemoryModelDeviceScope = true;
- features->vulkanMemoryModelAvailabilityVisibilityChains = true;
- features->shaderOutputViewportIndex = true;
- features->shaderOutputLayer = true;
- features->subgroupBroadcastDynamicId = true;
+static VkResult
+tu_enumerate_devices(struct tu_instance *instance)
+{
+ /* TODO: Check for more devices? */
+ drmDevicePtr devices[8];
+ VkResult result = VK_ERROR_INCOMPATIBLE_DRIVER;
+ int max_devices;
+
+ instance->physical_device_count = 0;
+
+ max_devices = drmGetDevices2(0, devices, ARRAY_SIZE(devices));
+
+ if (instance->debug_flags & TU_DEBUG_STARTUP)
+ tu_logi("Found %d drm nodes", max_devices);
+
+ if (max_devices < 1)
+ return vk_error(instance, VK_ERROR_INCOMPATIBLE_DRIVER);
+
+ for (unsigned i = 0; i < (unsigned) max_devices; i++) {
+ if (devices[i]->available_nodes & 1 << DRM_NODE_RENDER &&
+ devices[i]->bustype == DRM_BUS_PLATFORM) {
+
+ result = tu_physical_device_init(
+ instance->physical_devices + instance->physical_device_count,
+ instance, devices[i]);
+ if (result == VK_SUCCESS)
+ ++instance->physical_device_count;
+ else if (result != VK_ERROR_INCOMPATIBLE_DRIVER)
+ break;
+ }
+ }
+ drmFreeDevices(devices, max_devices);
+
+ return result;
}
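
tu_enumerate_devices above accepts only render nodes that sit on the platform bus, which is how Adreno GPUs are exposed. A self-contained sketch of the same libdrm filtering, using only the public xf86drm.h API (illustration only, not from the Mesa sources):

#include <stdio.h>
#include <xf86drm.h>

/* Print the render-node path of every platform-bus GPU, mirroring the
 * available_nodes/bustype test in tu_enumerate_devices. */
static void list_platform_render_nodes(void)
{
   drmDevicePtr devices[8];
   int n = drmGetDevices2(0, devices, 8);

   for (int i = 0; i < n; i++) {
      if ((devices[i]->available_nodes & (1 << DRM_NODE_RENDER)) &&
          devices[i]->bustype == DRM_BUS_PLATFORM)
         printf("render node: %s\n", devices[i]->nodes[DRM_NODE_RENDER]);
   }
   if (n > 0)
      drmFreeDevices(devices, n);
}
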
-static void
-tu_get_physical_device_features_1_3(struct tu_physical_device *pdevice,
- VkPhysicalDeviceVulkan13Features *features)
-{
- features->robustImageAccess = true;
- features->inlineUniformBlock = true;
- features->descriptorBindingInlineUniformBlockUpdateAfterBind = true;
- features->pipelineCreationCacheControl = true;
- features->privateData = true;
- features->shaderDemoteToHelperInvocation = true;
- features->shaderTerminateInvocation = true;
- features->subgroupSizeControl = true;
- features->computeFullSubgroups = true;
- features->synchronization2 = true;
- features->textureCompressionASTC_HDR = false;
- features->shaderZeroInitializeWorkgroupMemory = true;
- features->dynamicRendering = true;
- features->shaderIntegerDotProduct = true;
- features->maintenance4 = true;
+VkResult
+tu_EnumeratePhysicalDevices(VkInstance _instance,
+ uint32_t *pPhysicalDeviceCount,
+ VkPhysicalDevice *pPhysicalDevices)
+{
+ TU_FROM_HANDLE(tu_instance, instance, _instance);
+ VK_OUTARRAY_MAKE(out, pPhysicalDevices, pPhysicalDeviceCount);
+
+ VkResult result;
+
+ if (instance->physical_device_count < 0) {
+ result = tu_enumerate_devices(instance);
+ if (result != VK_SUCCESS && result != VK_ERROR_INCOMPATIBLE_DRIVER)
+ return result;
+ }
+
+ for (uint32_t i = 0; i < instance->physical_device_count; ++i) {
+ vk_outarray_append(&out, p)
+ {
+ *p = tu_physical_device_to_handle(instance->physical_devices + i);
+ }
+ }
+
+ return vk_outarray_status(&out);
}
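
The VK_OUTARRAY_MAKE/vk_outarray_append pair above implements the standard Vulkan two-call enumeration idiom: with pPhysicalDevices == NULL only the count is written, otherwise up to *pPhysicalDeviceCount handles are filled in. A caller-side sketch, assuming a valid VkInstance (illustration only, not from the Mesa sources):

#include <stdlib.h>
#include <vulkan/vulkan.h>

static VkPhysicalDevice pick_first_gpu(VkInstance instance)
{
   uint32_t count = 0;
   vkEnumeratePhysicalDevices(instance, &count, NULL);   /* count only */
   if (count == 0)
      return VK_NULL_HANDLE;

   VkPhysicalDevice *gpus = calloc(count, sizeof(*gpus));
   vkEnumeratePhysicalDevices(instance, &count, gpus);   /* fill handles */

   VkPhysicalDevice first = gpus[0];
   free(gpus);
   return first;
}
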
-void
-tu_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
- VkPhysicalDeviceFeatures2 *pFeatures)
+VkResult
+tu_EnumeratePhysicalDeviceGroups(
+ VkInstance _instance,
+ uint32_t *pPhysicalDeviceGroupCount,
+ VkPhysicalDeviceGroupProperties *pPhysicalDeviceGroupProperties)
{
- TU_FROM_HANDLE(tu_physical_device, pdevice, physicalDevice);
+ TU_FROM_HANDLE(tu_instance, instance, _instance);
+ VK_OUTARRAY_MAKE(out, pPhysicalDeviceGroupProperties,
+ pPhysicalDeviceGroupCount);
+ VkResult result;
+
+ if (instance->physical_device_count < 0) {
+ result = tu_enumerate_devices(instance);
+ if (result != VK_SUCCESS && result != VK_ERROR_INCOMPATIBLE_DRIVER)
+ return result;
+ }
- pFeatures->features = (VkPhysicalDeviceFeatures) {
- .robustBufferAccess = true,
- .fullDrawIndexUint32 = true,
- .imageCubeArray = true,
- .independentBlend = true,
- .geometryShader = true,
- .tessellationShader = true,
- .sampleRateShading = true,
- .dualSrcBlend = true,
- .logicOp = true,
- .multiDrawIndirect = true,
- .drawIndirectFirstInstance = true,
- .depthClamp = true,
- .depthBiasClamp = true,
- .fillModeNonSolid = true,
- .depthBounds = true,
+ for (uint32_t i = 0; i < instance->physical_device_count; ++i) {
+ vk_outarray_append(&out, p)
+ {
+ p->physicalDeviceCount = 1;
+ p->physicalDevices[0] =
+ tu_physical_device_to_handle(instance->physical_devices + i);
+ p->subsetAllocation = false;
+ }
+ }
+
+ return vk_outarray_status(&out);
+}
+
+void
+tu_GetPhysicalDeviceFeatures(VkPhysicalDevice physicalDevice,
+ VkPhysicalDeviceFeatures *pFeatures)
+{
+ memset(pFeatures, 0, sizeof(*pFeatures));
+
+ *pFeatures = (VkPhysicalDeviceFeatures) {
+ .robustBufferAccess = false,
+ .fullDrawIndexUint32 = false,
+ .imageCubeArray = false,
+ .independentBlend = false,
+ .geometryShader = false,
+ .tessellationShader = false,
+ .sampleRateShading = false,
+ .dualSrcBlend = false,
+ .logicOp = false,
+ .multiDrawIndirect = false,
+ .drawIndirectFirstInstance = false,
+ .depthClamp = false,
+ .depthBiasClamp = false,
+ .fillModeNonSolid = false,
+ .depthBounds = false,
.wideLines = false,
- .largePoints = true,
- .alphaToOne = true,
- .multiViewport = true,
- .samplerAnisotropy = true,
- .textureCompressionETC2 = true,
- .textureCompressionASTC_LDR = true,
- .textureCompressionBC = true,
- .occlusionQueryPrecise = true,
- .pipelineStatisticsQuery = true,
- .vertexPipelineStoresAndAtomics = true,
- .fragmentStoresAndAtomics = true,
- .shaderTessellationAndGeometryPointSize = true,
- .shaderImageGatherExtended = true,
- .shaderStorageImageExtendedFormats = true,
+ .largePoints = false,
+ .alphaToOne = false,
+ .multiViewport = false,
+ .samplerAnisotropy = false,
+ .textureCompressionETC2 = false,
+ .textureCompressionASTC_LDR = false,
+ .textureCompressionBC = false,
+ .occlusionQueryPrecise = false,
+ .pipelineStatisticsQuery = false,
+ .vertexPipelineStoresAndAtomics = false,
+ .fragmentStoresAndAtomics = false,
+ .shaderTessellationAndGeometryPointSize = false,
+ .shaderImageGatherExtended = false,
+ .shaderStorageImageExtendedFormats = false,
.shaderStorageImageMultisample = false,
- .shaderUniformBufferArrayDynamicIndexing = true,
- .shaderSampledImageArrayDynamicIndexing = true,
- .shaderStorageBufferArrayDynamicIndexing = true,
- .shaderStorageImageArrayDynamicIndexing = true,
- .shaderStorageImageReadWithoutFormat = true,
- .shaderStorageImageWriteWithoutFormat = true,
- .shaderClipDistance = true,
- .shaderCullDistance = true,
+ .shaderUniformBufferArrayDynamicIndexing = false,
+ .shaderSampledImageArrayDynamicIndexing = false,
+ .shaderStorageBufferArrayDynamicIndexing = false,
+ .shaderStorageImageArrayDynamicIndexing = false,
+ .shaderStorageImageReadWithoutFormat = false,
+ .shaderStorageImageWriteWithoutFormat = false,
+ .shaderClipDistance = false,
+ .shaderCullDistance = false,
.shaderFloat64 = false,
.shaderInt64 = false,
- .shaderInt16 = true,
+ .shaderInt16 = false,
.sparseBinding = false,
- .variableMultisampleRate = true,
- .inheritedQueries = true,
+ .variableMultisampleRate = false,
+ .inheritedQueries = false,
};
+}
- VkPhysicalDeviceVulkan11Features core_1_1 = {
- .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES,
- };
- tu_get_physical_device_features_1_1(pdevice, &core_1_1);
-
- VkPhysicalDeviceVulkan12Features core_1_2 = {
- .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES,
- };
- tu_get_physical_device_features_1_2(pdevice, &core_1_2);
-
- VkPhysicalDeviceVulkan13Features core_1_3 = {
- .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_FEATURES,
- };
- tu_get_physical_device_features_1_3(pdevice, &core_1_3);
-
+void
+tu_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
+ VkPhysicalDeviceFeatures2 *pFeatures)
+{
vk_foreach_struct(ext, pFeatures->pNext)
{
- if (vk_get_physical_device_core_1_1_feature_ext(ext, &core_1_1))
- continue;
- if (vk_get_physical_device_core_1_2_feature_ext(ext, &core_1_2))
- continue;
- if (vk_get_physical_device_core_1_3_feature_ext(ext, &core_1_3))
- continue;
-
switch (ext->sType) {
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CONDITIONAL_RENDERING_FEATURES_EXT: {
- VkPhysicalDeviceConditionalRenderingFeaturesEXT *features =
- (VkPhysicalDeviceConditionalRenderingFeaturesEXT *) ext;
- features->conditionalRendering = true;
- features->inheritedConditionalRendering = true;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_FEATURES_EXT: {
- VkPhysicalDeviceTransformFeedbackFeaturesEXT *features =
- (VkPhysicalDeviceTransformFeedbackFeaturesEXT *) ext;
- features->transformFeedback = true;
- features->geometryStreams = true;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INDEX_TYPE_UINT8_FEATURES_EXT: {
- VkPhysicalDeviceIndexTypeUint8FeaturesEXT *features =
- (VkPhysicalDeviceIndexTypeUint8FeaturesEXT *)ext;
- features->indexTypeUint8 = true;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_FEATURES_EXT: {
- VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *features =
- (VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *)ext;
- features->vertexAttributeInstanceRateDivisor = true;
- features->vertexAttributeInstanceRateZeroDivisor = true;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEPTH_CLIP_ENABLE_FEATURES_EXT: {
- VkPhysicalDeviceDepthClipEnableFeaturesEXT *features =
- (VkPhysicalDeviceDepthClipEnableFeaturesEXT *)ext;
- features->depthClipEnable = true;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_4444_FORMATS_FEATURES_EXT: {
- VkPhysicalDevice4444FormatsFeaturesEXT *features = (void *)ext;
- features->formatA4R4G4B4 = true;
- features->formatA4B4G4R4 = true;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BORDER_COLOR_SWIZZLE_FEATURES_EXT: {
- VkPhysicalDeviceBorderColorSwizzleFeaturesEXT *features = (void *)ext;
- features->borderColorSwizzle = true;
- features->borderColorSwizzleFromImage = true;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_FEATURES_EXT: {
- VkPhysicalDeviceCustomBorderColorFeaturesEXT *features = (void *) ext;
- features->customBorderColors = true;
- features->customBorderColorWithoutFormat = true;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTENDED_DYNAMIC_STATE_FEATURES_EXT: {
- VkPhysicalDeviceExtendedDynamicStateFeaturesEXT *features = (void *)ext;
- features->extendedDynamicState = true;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTENDED_DYNAMIC_STATE_2_FEATURES_EXT: {
- VkPhysicalDeviceExtendedDynamicState2FeaturesEXT *features =
- (VkPhysicalDeviceExtendedDynamicState2FeaturesEXT *)ext;
- features->extendedDynamicState2 = true;
- features->extendedDynamicState2LogicOp = true;
- features->extendedDynamicState2PatchControlPoints = true;
+ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTERS_FEATURES: {
+ VkPhysicalDeviceVariablePointersFeatures *features = (void *) ext;
+ features->variablePointersStorageBuffer = false;
+ features->variablePointers = false;
break;
}
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_FEATURES_KHR: {
- VkPhysicalDevicePerformanceQueryFeaturesKHR *feature =
- (VkPhysicalDevicePerformanceQueryFeaturesKHR *)ext;
- feature->performanceCounterQueryPools = true;
- feature->performanceCounterMultipleQueryPools = false;
+ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_FEATURES: {
+ VkPhysicalDeviceMultiviewFeatures *features =
+ (VkPhysicalDeviceMultiviewFeatures *) ext;
+ features->multiview = false;
+ features->multiviewGeometryShader = false;
+ features->multiviewTessellationShader = false;
break;
}
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_EXECUTABLE_PROPERTIES_FEATURES_KHR: {
- VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR *features =
- (VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR *)ext;
- features->pipelineExecutableInfo = true;
+ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETERS_FEATURES: {
+ VkPhysicalDeviceShaderDrawParametersFeatures *features =
+ (VkPhysicalDeviceShaderDrawParametersFeatures *) ext;
+ features->shaderDrawParameters = false;
break;
}
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES: {
- VkPhysicalDeviceShaderFloat16Int8Features *features =
- (VkPhysicalDeviceShaderFloat16Int8Features *) ext;
- features->shaderFloat16 = true;
- features->shaderInt8 = false;
+ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROTECTED_MEMORY_FEATURES: {
+ VkPhysicalDeviceProtectedMemoryFeatures *features =
+ (VkPhysicalDeviceProtectedMemoryFeatures *) ext;
+ features->protectedMemory = false;
break;
}
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SCALAR_BLOCK_LAYOUT_FEATURES: {
- VkPhysicalDeviceScalarBlockLayoutFeatures *features = (void *)ext;
- features->scalarBlockLayout = true;
+ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES: {
+ VkPhysicalDevice16BitStorageFeatures *features =
+ (VkPhysicalDevice16BitStorageFeatures *) ext;
+ features->storageBuffer16BitAccess = false;
+ features->uniformAndStorageBuffer16BitAccess = false;
+ features->storagePushConstant16 = false;
+ features->storageInputOutput16 = false;
break;
}
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_FEATURES_EXT: {
- VkPhysicalDeviceRobustness2FeaturesEXT *features = (void *)ext;
- features->robustBufferAccess2 = true;
- features->robustImageAccess2 = true;
- features->nullDescriptor = true;
+ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES: {
+ VkPhysicalDeviceSamplerYcbcrConversionFeatures *features =
+ (VkPhysicalDeviceSamplerYcbcrConversionFeatures *) ext;
+ features->samplerYcbcrConversion = false;
break;
}
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES: {
- VkPhysicalDeviceTimelineSemaphoreFeatures *features =
- (VkPhysicalDeviceTimelineSemaphoreFeatures *) ext;
- features->timelineSemaphore = true;
+ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_FEATURES_EXT: {
+ VkPhysicalDeviceDescriptorIndexingFeaturesEXT *features =
+ (VkPhysicalDeviceDescriptorIndexingFeaturesEXT *) ext;
+ features->shaderInputAttachmentArrayDynamicIndexing = false;
+ features->shaderUniformTexelBufferArrayDynamicIndexing = false;
+ features->shaderStorageTexelBufferArrayDynamicIndexing = false;
+ features->shaderUniformBufferArrayNonUniformIndexing = false;
+ features->shaderSampledImageArrayNonUniformIndexing = false;
+ features->shaderStorageBufferArrayNonUniformIndexing = false;
+ features->shaderStorageImageArrayNonUniformIndexing = false;
+ features->shaderInputAttachmentArrayNonUniformIndexing = false;
+ features->shaderUniformTexelBufferArrayNonUniformIndexing = false;
+ features->shaderStorageTexelBufferArrayNonUniformIndexing = false;
+ features->descriptorBindingUniformBufferUpdateAfterBind = false;
+ features->descriptorBindingSampledImageUpdateAfterBind = false;
+ features->descriptorBindingStorageImageUpdateAfterBind = false;
+ features->descriptorBindingStorageBufferUpdateAfterBind = false;
+ features->descriptorBindingUniformTexelBufferUpdateAfterBind = false;
+ features->descriptorBindingStorageTexelBufferUpdateAfterBind = false;
+ features->descriptorBindingUpdateUnusedWhilePending = false;
+ features->descriptorBindingPartiallyBound = false;
+ features->descriptorBindingVariableDescriptorCount = false;
+ features->runtimeDescriptorArray = false;
break;
}
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROVOKING_VERTEX_FEATURES_EXT: {
- VkPhysicalDeviceProvokingVertexFeaturesEXT *features =
- (VkPhysicalDeviceProvokingVertexFeaturesEXT *)ext;
- features->provokingVertexLast = true;
- features->transformFeedbackPreservesProvokingVertex = true;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MUTABLE_DESCRIPTOR_TYPE_FEATURES_EXT: {
- VkPhysicalDeviceMutableDescriptorTypeFeaturesEXT *features =
- (VkPhysicalDeviceMutableDescriptorTypeFeaturesEXT *)ext;
- features->mutableDescriptorType = true;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LINE_RASTERIZATION_FEATURES_EXT: {
- VkPhysicalDeviceLineRasterizationFeaturesEXT *features =
- (VkPhysicalDeviceLineRasterizationFeaturesEXT *)ext;
- features->rectangularLines = true;
- features->bresenhamLines = true;
- features->smoothLines = false;
- features->stippledRectangularLines = false;
- features->stippledBresenhamLines = false;
- features->stippledSmoothLines = false;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRIMITIVE_TOPOLOGY_LIST_RESTART_FEATURES_EXT: {
- VkPhysicalDevicePrimitiveTopologyListRestartFeaturesEXT *features =
- (VkPhysicalDevicePrimitiveTopologyListRestartFeaturesEXT *)ext;
- features->primitiveTopologyListRestart = true;
- features->primitiveTopologyPatchListRestart = false;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RASTERIZATION_ORDER_ATTACHMENT_ACCESS_FEATURES_EXT: {
- VkPhysicalDeviceRasterizationOrderAttachmentAccessFeaturesEXT *features =
- (VkPhysicalDeviceRasterizationOrderAttachmentAccessFeaturesEXT *)ext;
- features->rasterizationOrderColorAttachmentAccess = true;
- features->rasterizationOrderDepthAttachmentAccess = true;
- features->rasterizationOrderStencilAttachmentAccess = true;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEPTH_CLIP_CONTROL_FEATURES_EXT: {
- VkPhysicalDeviceDepthClipControlFeaturesEXT *features =
- (VkPhysicalDeviceDepthClipControlFeaturesEXT *)ext;
- features->depthClipControl = true;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_FEATURES_EXT: {
- VkPhysicalDeviceTexelBufferAlignmentFeaturesEXT *features =
- (VkPhysicalDeviceTexelBufferAlignmentFeaturesEXT *)ext;
- features->texelBufferAlignment = true;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRIMITIVES_GENERATED_QUERY_FEATURES_EXT: {
- VkPhysicalDevicePrimitivesGeneratedQueryFeaturesEXT *features =
- (VkPhysicalDevicePrimitivesGeneratedQueryFeaturesEXT *)ext;
- features->primitivesGeneratedQuery = true;
- features->primitivesGeneratedQueryWithRasterizerDiscard = false;
- features->primitivesGeneratedQueryWithNonZeroStreams = false;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_VIEW_MIN_LOD_FEATURES_EXT: {
- VkPhysicalDeviceImageViewMinLodFeaturesEXT *features =
- (VkPhysicalDeviceImageViewMinLodFeaturesEXT *)ext;
- features->minLod = true;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_2D_VIEW_OF_3D_FEATURES_EXT: {
- VkPhysicalDeviceImage2DViewOf3DFeaturesEXT *features =
- (VkPhysicalDeviceImage2DViewOf3DFeaturesEXT *)ext;
- features->image2DViewOf3D = true;
- features->sampler2DViewOf3D = true;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COLOR_WRITE_ENABLE_FEATURES_EXT: {
- VkPhysicalDeviceColorWriteEnableFeaturesEXT *features =
- (VkPhysicalDeviceColorWriteEnableFeaturesEXT *)ext;
- features->colorWriteEnable = true;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_MODULE_IDENTIFIER_FEATURES_EXT: {
- VkPhysicalDeviceShaderModuleIdentifierFeaturesEXT *features =
- (VkPhysicalDeviceShaderModuleIdentifierFeaturesEXT *)ext;
- features->shaderModuleIdentifier = true;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_INPUT_DYNAMIC_STATE_FEATURES_EXT: {
- VkPhysicalDeviceVertexInputDynamicStateFeaturesEXT *features =
- (VkPhysicalDeviceVertexInputDynamicStateFeaturesEXT *)ext;
- features->vertexInputDynamicState = true;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_NON_SEAMLESS_CUBE_MAP_FEATURES_EXT: {
- VkPhysicalDeviceNonSeamlessCubeMapFeaturesEXT *features =
- (VkPhysicalDeviceNonSeamlessCubeMapFeaturesEXT *)ext;
- features->nonSeamlessCubeMap = true;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ATTACHMENT_FEEDBACK_LOOP_LAYOUT_FEATURES_EXT: {
- VkPhysicalDeviceAttachmentFeedbackLoopLayoutFeaturesEXT *features =
- (VkPhysicalDeviceAttachmentFeedbackLoopLayoutFeaturesEXT*)ext;
- features->attachmentFeedbackLoopLayout = true;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_GLOBAL_PRIORITY_QUERY_FEATURES_KHR: {
- VkPhysicalDeviceGlobalPriorityQueryFeaturesKHR *features =
- (VkPhysicalDeviceGlobalPriorityQueryFeaturesKHR*)ext;
- features->globalPriorityQuery = true;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTI_DRAW_FEATURES_EXT: {
- VkPhysicalDeviceMultiDrawFeaturesEXT *features =
- (VkPhysicalDeviceMultiDrawFeaturesEXT *)ext;
- features->multiDraw = true;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_GRAPHICS_PIPELINE_LIBRARY_FEATURES_EXT: {
- VkPhysicalDeviceGraphicsPipelineLibraryFeaturesEXT *features =
- (VkPhysicalDeviceGraphicsPipelineLibraryFeaturesEXT *)ext;
- features->graphicsPipelineLibrary = true;
+ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CONDITIONAL_RENDERING_FEATURES_EXT: {
+ VkPhysicalDeviceConditionalRenderingFeaturesEXT *features =
+ (VkPhysicalDeviceConditionalRenderingFeaturesEXT *) ext;
+ features->conditionalRendering = false;
+ features->inheritedConditionalRendering = false;
break;
}
-
default:
break;
}
}
+ return tu_GetPhysicalDeviceFeatures(physicalDevice, &pFeatures->features);
}
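
tu_GetPhysicalDeviceFeatures2 above walks the caller-provided pNext chain and fills in whichever extension structs it recognizes. The matching caller side looks like this (illustration only, not from the Mesa sources; the multiview struct is just one example of a chainable struct):

#include <vulkan/vulkan.h>

static void query_multiview_support(VkPhysicalDevice pdev)
{
   VkPhysicalDeviceMultiviewFeatures multiview = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_FEATURES,
   };
   VkPhysicalDeviceFeatures2 features = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
      .pNext = &multiview,
   };
   /* The driver fills features.features plus every chained struct whose
    * sType it handles in the switch above. */
   vkGetPhysicalDeviceFeatures2(pdev, &features);
}
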
-
-static void
-tu_get_physical_device_properties_1_1(struct tu_physical_device *pdevice,
- VkPhysicalDeviceVulkan11Properties *p)
-{
- assert(p->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES);
-
- memcpy(p->deviceUUID, pdevice->device_uuid, VK_UUID_SIZE);
- memcpy(p->driverUUID, pdevice->driver_uuid, VK_UUID_SIZE);
- memset(p->deviceLUID, 0, VK_LUID_SIZE);
- p->deviceNodeMask = 0;
- p->deviceLUIDValid = false;
-
- p->subgroupSize = 128;
- p->subgroupSupportedStages = VK_SHADER_STAGE_COMPUTE_BIT;
- p->subgroupSupportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT |
- VK_SUBGROUP_FEATURE_VOTE_BIT |
- VK_SUBGROUP_FEATURE_BALLOT_BIT |
- VK_SUBGROUP_FEATURE_SHUFFLE_BIT |
- VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT |
- VK_SUBGROUP_FEATURE_ARITHMETIC_BIT;
- if (pdevice->info->a6xx.has_getfiberid) {
- p->subgroupSupportedStages |= VK_SHADER_STAGE_ALL_GRAPHICS;
- p->subgroupSupportedOperations |= VK_SUBGROUP_FEATURE_QUAD_BIT;
- }
-
- p->subgroupQuadOperationsInAllStages = false;
-
- p->pointClippingBehavior = VK_POINT_CLIPPING_BEHAVIOR_ALL_CLIP_PLANES;
- p->maxMultiviewViewCount = MAX_VIEWS;
- p->maxMultiviewInstanceIndex = INT_MAX;
- p->protectedNoFault = false;
- /* Our largest descriptors are 2 texture descriptors, or a texture and
- * sampler descriptor.
- */
- p->maxPerSetDescriptors = MAX_SET_SIZE / (2 * A6XX_TEX_CONST_DWORDS * 4);
- /* Our buffer size fields allow only this much */
- p->maxMemoryAllocationSize = 0xFFFFFFFFull;
-
-}
-
-
-static const size_t max_descriptor_set_size = MAX_SET_SIZE / (4 * A6XX_TEX_CONST_DWORDS);
-static const VkSampleCountFlags sample_counts =
- VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT;
-
-static void
-tu_get_physical_device_properties_1_2(struct tu_physical_device *pdevice,
- VkPhysicalDeviceVulkan12Properties *p)
-{
- assert(p->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_PROPERTIES);
-
- p->driverID = VK_DRIVER_ID_MESA_TURNIP;
- memset(p->driverName, 0, sizeof(p->driverName));
- snprintf(p->driverName, VK_MAX_DRIVER_NAME_SIZE,
- "turnip Mesa driver");
- memset(p->driverInfo, 0, sizeof(p->driverInfo));
- snprintf(p->driverInfo, VK_MAX_DRIVER_INFO_SIZE,
- "Mesa " PACKAGE_VERSION MESA_GIT_SHA1);
- p->conformanceVersion = (VkConformanceVersion) {
- .major = 1,
- .minor = 2,
- .subminor = 7,
- .patch = 1,
- };
-
- p->denormBehaviorIndependence =
- VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL;
- p->roundingModeIndependence =
- VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL;
-
- p->shaderDenormFlushToZeroFloat16 = true;
- p->shaderDenormPreserveFloat16 = false;
- p->shaderRoundingModeRTEFloat16 = true;
- p->shaderRoundingModeRTZFloat16 = false;
- p->shaderSignedZeroInfNanPreserveFloat16 = true;
-
- p->shaderDenormFlushToZeroFloat32 = true;
- p->shaderDenormPreserveFloat32 = false;
- p->shaderRoundingModeRTEFloat32 = true;
- p->shaderRoundingModeRTZFloat32 = false;
- p->shaderSignedZeroInfNanPreserveFloat32 = true;
-
- p->shaderDenormFlushToZeroFloat64 = false;
- p->shaderDenormPreserveFloat64 = false;
- p->shaderRoundingModeRTEFloat64 = false;
- p->shaderRoundingModeRTZFloat64 = false;
- p->shaderSignedZeroInfNanPreserveFloat64 = false;
-
- p->shaderUniformBufferArrayNonUniformIndexingNative = true;
- p->shaderSampledImageArrayNonUniformIndexingNative = true;
- p->shaderStorageBufferArrayNonUniformIndexingNative = true;
- p->shaderStorageImageArrayNonUniformIndexingNative = true;
- p->shaderInputAttachmentArrayNonUniformIndexingNative = false;
- p->robustBufferAccessUpdateAfterBind = false;
- p->quadDivergentImplicitLod = false;
-
- p->maxUpdateAfterBindDescriptorsInAllPools = max_descriptor_set_size;
- p->maxPerStageDescriptorUpdateAfterBindSamplers = max_descriptor_set_size;
- p->maxPerStageDescriptorUpdateAfterBindUniformBuffers = max_descriptor_set_size;
- p->maxPerStageDescriptorUpdateAfterBindStorageBuffers = max_descriptor_set_size;
- p->maxPerStageDescriptorUpdateAfterBindSampledImages = max_descriptor_set_size;
- p->maxPerStageDescriptorUpdateAfterBindStorageImages = max_descriptor_set_size;
- p->maxPerStageDescriptorUpdateAfterBindInputAttachments = MAX_RTS;
- p->maxPerStageUpdateAfterBindResources = max_descriptor_set_size;
- p->maxDescriptorSetUpdateAfterBindSamplers = max_descriptor_set_size;
- p->maxDescriptorSetUpdateAfterBindUniformBuffers = max_descriptor_set_size;
- p->maxDescriptorSetUpdateAfterBindUniformBuffersDynamic = MAX_DYNAMIC_UNIFORM_BUFFERS;
- p->maxDescriptorSetUpdateAfterBindStorageBuffers = max_descriptor_set_size;
- p->maxDescriptorSetUpdateAfterBindStorageBuffersDynamic = MAX_DYNAMIC_STORAGE_BUFFERS;
- p->maxDescriptorSetUpdateAfterBindSampledImages = max_descriptor_set_size;
- p->maxDescriptorSetUpdateAfterBindStorageImages = max_descriptor_set_size;
- p->maxDescriptorSetUpdateAfterBindInputAttachments = MAX_RTS;
-
- p->supportedDepthResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT;
- p->supportedStencilResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT;
- p->independentResolveNone = false;
- p->independentResolve = false;
-
- p->filterMinmaxSingleComponentFormats = true;
- p->filterMinmaxImageComponentMapping = true;
-
- p->maxTimelineSemaphoreValueDifference = UINT64_MAX;
-
- p->framebufferIntegerColorSampleCounts = sample_counts;
-}
-
-static void
-tu_get_physical_device_properties_1_3(struct tu_physical_device *pdevice,
- VkPhysicalDeviceVulkan13Properties *p)
-{
- /* TODO move threadsize_base and max_waves to fd_dev_info and use them here */
- p->minSubgroupSize = 64; /* threadsize_base */
- p->maxSubgroupSize = 128; /* threadsize_base * 2 */
- p->maxComputeWorkgroupSubgroups = 16; /* max_waves */
- p->requiredSubgroupSizeStages = VK_SHADER_STAGE_ALL;
-
- /* Inline uniform buffers are just normal UBOs */
- p->maxInlineUniformBlockSize = MAX_UNIFORM_BUFFER_RANGE;
-
- /* Halve the normal limit on the number of descriptors, see below. */
- p->maxPerStageDescriptorInlineUniformBlocks = max_descriptor_set_size / 2;
- p->maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks = max_descriptor_set_size / 2;
- p->maxDescriptorSetInlineUniformBlocks = max_descriptor_set_size / 2;
- p->maxDescriptorSetUpdateAfterBindInlineUniformBlocks = max_descriptor_set_size / 2;
- /* Because we halve the normal limit on the number of descriptors, in the
- * worst case each descriptor takes up half the space, leaving the rest for
- * the actual data.
- */
- p->maxInlineUniformTotalSize = MAX_SET_SIZE / 2;
-
- p->integerDotProduct8BitUnsignedAccelerated = false;
- p->integerDotProduct8BitSignedAccelerated = false;
- p->integerDotProduct8BitMixedSignednessAccelerated = false;
- p->integerDotProduct4x8BitPackedUnsignedAccelerated =
- pdevice->info->a6xx.has_dp2acc;
- /* TODO: we should be able to emulate 4x8BitPackedSigned fast enough */
- p->integerDotProduct4x8BitPackedSignedAccelerated = false;
- p->integerDotProduct4x8BitPackedMixedSignednessAccelerated =
- pdevice->info->a6xx.has_dp2acc;
- p->integerDotProduct16BitUnsignedAccelerated = false;
- p->integerDotProduct16BitSignedAccelerated = false;
- p->integerDotProduct16BitMixedSignednessAccelerated = false;
- p->integerDotProduct32BitUnsignedAccelerated = false;
- p->integerDotProduct32BitSignedAccelerated = false;
- p->integerDotProduct32BitMixedSignednessAccelerated = false;
- p->integerDotProduct64BitUnsignedAccelerated = false;
- p->integerDotProduct64BitSignedAccelerated = false;
- p->integerDotProduct64BitMixedSignednessAccelerated = false;
- p->integerDotProductAccumulatingSaturating8BitUnsignedAccelerated = false;
- p->integerDotProductAccumulatingSaturating8BitSignedAccelerated = false;
- p->integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated = false;
- p->integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated =
- pdevice->info->a6xx.has_dp2acc;
- /* TODO: we should be able to emulate Saturating4x8BitPackedSigned fast enough */
- p->integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated = false;
- p->integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated =
- pdevice->info->a6xx.has_dp2acc;
- p->integerDotProductAccumulatingSaturating16BitUnsignedAccelerated = false;
- p->integerDotProductAccumulatingSaturating16BitSignedAccelerated = false;
- p->integerDotProductAccumulatingSaturating16BitMixedSignednessAccelerated = false;
- p->integerDotProductAccumulatingSaturating32BitUnsignedAccelerated = false;
- p->integerDotProductAccumulatingSaturating32BitSignedAccelerated = false;
- p->integerDotProductAccumulatingSaturating32BitMixedSignednessAccelerated = false;
- p->integerDotProductAccumulatingSaturating64BitUnsignedAccelerated = false;
- p->integerDotProductAccumulatingSaturating64BitSignedAccelerated = false;
- p->integerDotProductAccumulatingSaturating64BitMixedSignednessAccelerated = false;
-
- p->storageTexelBufferOffsetAlignmentBytes = 64;
- p->storageTexelBufferOffsetSingleTexelAlignment = false;
- p->uniformTexelBufferOffsetAlignmentBytes = 64;
- p->uniformTexelBufferOffsetSingleTexelAlignment = false;
-
- /* The address space is 4GB for current kernels, so there's no point
- * allowing a larger buffer. Our buffer sizes are 64-bit though, so
- * GetBufferDeviceRequirements won't fall over if someone actually creates
- * a 4GB buffer.
- */
- p->maxBufferSize = 1ull << 32;
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
- VkPhysicalDeviceProperties2 *pProperties)
+void
+tu_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
+ VkPhysicalDeviceProperties *pProperties)
{
TU_FROM_HANDLE(tu_physical_device, pdevice, physicalDevice);
+ VkSampleCountFlags sample_counts = 0xf;
+
+ /* Make sure that the entire descriptor set is addressable with a signed
+ * 32-bit int, so the sum of all limits scaled by descriptor size has to
+ * be at most 2 GiB. A combined image & sampler descriptor counts as one of
+ * each. This limit is for the pipeline layout, not for the set layout, but
+ * there is no set limit, so we just set a pipeline limit. It is unlikely
+ * that any app will hit this soon. */
+ size_t max_descriptor_set_size =
+ ((1ull << 31) - 16 * MAX_DYNAMIC_BUFFERS) /
+ (32 /* uniform buffer, 32 due to potential space wasted on alignment */ +
+ 32 /* storage buffer, 32 due to potential space wasted on alignment */ +
+ 32 /* sampler, largest when combined with image */ +
+ 64 /* sampled image */ + 64 /* storage image */);
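   /* Editorial worked example (not part of the imported source): with the
    * worst-case sizes above, one descriptor of every kind costs
    *    32 + 32 + 32 + 64 + 64 = 224 bytes,
    * so, ignoring the small 16 * MAX_DYNAMIC_BUFFERS reservation,
    *    max_descriptor_set_size ~= 2^31 / 224 ~= 9.6 million,
    * which is the value fed into the per-stage and per-set limits below. */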
VkPhysicalDeviceLimits limits = {
.maxImageDimension1D = (1 << 14),
@@ -1136,20 +722,20 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
.maxImageDimensionCube = (1 << 14),
.maxImageArrayLayers = (1 << 11),
.maxTexelBufferElements = 128 * 1024 * 1024,
- .maxUniformBufferRange = MAX_UNIFORM_BUFFER_RANGE,
- .maxStorageBufferRange = MAX_STORAGE_BUFFER_RANGE,
+ .maxUniformBufferRange = UINT32_MAX,
+ .maxStorageBufferRange = UINT32_MAX,
.maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE,
.maxMemoryAllocationCount = UINT32_MAX,
.maxSamplerAllocationCount = 64 * 1024,
.bufferImageGranularity = 64, /* A cache line */
- .sparseAddressSpaceSize = 0,
+ .sparseAddressSpaceSize = 0xffffffffu, /* buffer max size */
.maxBoundDescriptorSets = MAX_SETS,
.maxPerStageDescriptorSamplers = max_descriptor_set_size,
.maxPerStageDescriptorUniformBuffers = max_descriptor_set_size,
.maxPerStageDescriptorStorageBuffers = max_descriptor_set_size,
.maxPerStageDescriptorSampledImages = max_descriptor_set_size,
.maxPerStageDescriptorStorageImages = max_descriptor_set_size,
- .maxPerStageDescriptorInputAttachments = MAX_RTS,
+ .maxPerStageDescriptorInputAttachments = max_descriptor_set_size,
.maxPerStageResources = max_descriptor_set_size,
.maxDescriptorSetSamplers = max_descriptor_set_size,
.maxDescriptorSetUniformBuffers = max_descriptor_set_size,
@@ -1158,10 +744,10 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
.maxDescriptorSetStorageBuffersDynamic = MAX_DYNAMIC_STORAGE_BUFFERS,
.maxDescriptorSetSampledImages = max_descriptor_set_size,
.maxDescriptorSetStorageImages = max_descriptor_set_size,
- .maxDescriptorSetInputAttachments = MAX_RTS,
+ .maxDescriptorSetInputAttachments = max_descriptor_set_size,
.maxVertexInputAttributes = 32,
.maxVertexInputBindings = 32,
- .maxVertexInputAttributeOffset = 4095,
+ .maxVertexInputAttributeOffset = 2047,
.maxVertexInputBindingStride = 2048,
.maxVertexOutputComponents = 128,
.maxTessellationGenerationLevel = 64,
@@ -1172,41 +758,41 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
.maxTessellationControlTotalOutputComponents = 4096,
.maxTessellationEvaluationInputComponents = 128,
.maxTessellationEvaluationOutputComponents = 128,
- .maxGeometryShaderInvocations = 32,
+ .maxGeometryShaderInvocations = 127,
.maxGeometryInputComponents = 64,
.maxGeometryOutputComponents = 128,
.maxGeometryOutputVertices = 256,
.maxGeometryTotalOutputComponents = 1024,
- .maxFragmentInputComponents = 124,
+ .maxFragmentInputComponents = 128,
.maxFragmentOutputAttachments = 8,
.maxFragmentDualSrcAttachments = 1,
- .maxFragmentCombinedOutputResources = MAX_RTS + max_descriptor_set_size * 2,
+ .maxFragmentCombinedOutputResources = 8,
.maxComputeSharedMemorySize = 32768,
.maxComputeWorkGroupCount = { 65535, 65535, 65535 },
.maxComputeWorkGroupInvocations = 2048,
- .maxComputeWorkGroupSize = { 1024, 1024, 1024 },
- .subPixelPrecisionBits = 8,
- .subTexelPrecisionBits = 8,
- .mipmapPrecisionBits = 8,
+ .maxComputeWorkGroupSize = { 2048, 2048, 2048 },
+ .subPixelPrecisionBits = 4 /* FIXME */,
+ .subTexelPrecisionBits = 4 /* FIXME */,
+ .mipmapPrecisionBits = 4 /* FIXME */,
.maxDrawIndexedIndexValue = UINT32_MAX,
.maxDrawIndirectCount = UINT32_MAX,
- .maxSamplerLodBias = 4095.0 / 256.0, /* [-16, 15.99609375] */
+ .maxSamplerLodBias = 16,
.maxSamplerAnisotropy = 16,
.maxViewports = MAX_VIEWPORTS,
- .maxViewportDimensions = { MAX_VIEWPORT_SIZE, MAX_VIEWPORT_SIZE },
+ .maxViewportDimensions = { (1 << 14), (1 << 14) },
.viewportBoundsRange = { INT16_MIN, INT16_MAX },
.viewportSubPixelBits = 8,
.minMemoryMapAlignment = 4096, /* A page */
- .minTexelBufferOffsetAlignment = 64,
- .minUniformBufferOffsetAlignment = 64,
- .minStorageBufferOffsetAlignment = 64,
- .minTexelOffset = -16,
- .maxTexelOffset = 15,
+ .minTexelBufferOffsetAlignment = 1,
+ .minUniformBufferOffsetAlignment = 4,
+ .minStorageBufferOffsetAlignment = 4,
+ .minTexelOffset = -32,
+ .maxTexelOffset = 31,
.minTexelGatherOffset = -32,
.maxTexelGatherOffset = 31,
- .minInterpolationOffset = -0.5,
- .maxInterpolationOffset = 0.4375,
- .subPixelInterpolationOffsetBits = 4,
+ .minInterpolationOffset = -2,
+ .maxInterpolationOffset = 2,
+ .subPixelInterpolationOffsetBits = 8,
.maxFramebufferWidth = (1 << 14),
.maxFramebufferHeight = (1 << 14),
.maxFramebufferLayers = (1 << 10),
@@ -1216,65 +802,51 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
.framebufferNoAttachmentsSampleCounts = sample_counts,
.maxColorAttachments = MAX_RTS,
.sampledImageColorSampleCounts = sample_counts,
- .sampledImageIntegerSampleCounts = sample_counts,
+ .sampledImageIntegerSampleCounts = VK_SAMPLE_COUNT_1_BIT,
.sampledImageDepthSampleCounts = sample_counts,
.sampledImageStencilSampleCounts = sample_counts,
.storageImageSampleCounts = VK_SAMPLE_COUNT_1_BIT,
.maxSampleMaskWords = 1,
.timestampComputeAndGraphics = true,
- .timestampPeriod = 1000000000.0 / 19200000.0, /* CP_ALWAYS_ON_COUNTER is fixed 19.2MHz */
+ .timestampPeriod = 1,
.maxClipDistances = 8,
.maxCullDistances = 8,
.maxCombinedClipAndCullDistances = 8,
- .discreteQueuePriorities = 2,
- .pointSizeRange = { 1, 4092 },
- .lineWidthRange = { 1.0, 1.0 },
- .pointSizeGranularity = 0.0625,
- .lineWidthGranularity = 0.0,
- .strictLines = true,
+ .discreteQueuePriorities = 1,
+ .pointSizeRange = { 0.125, 255.875 },
+ .lineWidthRange = { 0.0, 7.9921875 },
+ .pointSizeGranularity = (1.0 / 8.0),
+ .lineWidthGranularity = (1.0 / 128.0),
+ .strictLines = false, /* FINISHME */
.standardSampleLocations = true,
.optimalBufferCopyOffsetAlignment = 128,
.optimalBufferCopyRowPitchAlignment = 128,
.nonCoherentAtomSize = 64,
};
- pProperties->properties = (VkPhysicalDeviceProperties) {
- .apiVersion = TU_API_VERSION,
+ *pProperties = (VkPhysicalDeviceProperties) {
+ .apiVersion = tu_physical_device_api_version(pdevice),
.driverVersion = vk_get_driver_version(),
- .vendorID = 0x5143,
- .deviceID = pdevice->dev_id.chip_id,
+ .vendorID = 0, /* TODO */
+ .deviceID = 0,
.deviceType = VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU,
.limits = limits,
.sparseProperties = { 0 },
};
- strcpy(pProperties->properties.deviceName, pdevice->name);
- memcpy(pProperties->properties.pipelineCacheUUID, pdevice->cache_uuid, VK_UUID_SIZE);
-
- VkPhysicalDeviceVulkan11Properties core_1_1 = {
- .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES,
- };
- tu_get_physical_device_properties_1_1(pdevice, &core_1_1);
-
- VkPhysicalDeviceVulkan12Properties core_1_2 = {
- .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_PROPERTIES,
- };
- tu_get_physical_device_properties_1_2(pdevice, &core_1_2);
+ strcpy(pProperties->deviceName, pdevice->name);
+ memcpy(pProperties->pipelineCacheUUID, pdevice->cache_uuid, VK_UUID_SIZE);
+}
- VkPhysicalDeviceVulkan13Properties core_1_3 = {
- .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_PROPERTIES,
- };
- tu_get_physical_device_properties_1_3(pdevice, &core_1_3);
+void
+tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
+ VkPhysicalDeviceProperties2 *pProperties)
+{
+ TU_FROM_HANDLE(tu_physical_device, pdevice, physicalDevice);
+ tu_GetPhysicalDeviceProperties(physicalDevice, &pProperties->properties);
vk_foreach_struct(ext, pProperties->pNext)
{
- if (vk_get_physical_device_core_1_1_property_ext(ext, &core_1_1))
- continue;
- if (vk_get_physical_device_core_1_2_property_ext(ext, &core_1_2))
- continue;
- if (vk_get_physical_device_core_1_3_property_ext(ext, &core_1_3))
- continue;
-
switch (ext->sType) {
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PUSH_DESCRIPTOR_PROPERTIES_KHR: {
VkPhysicalDevicePushDescriptorPropertiesKHR *properties =
@@ -1282,109 +854,36 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
properties->maxPushDescriptors = MAX_PUSH_DESCRIPTORS;
break;
}
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_PROPERTIES_EXT: {
- VkPhysicalDeviceTransformFeedbackPropertiesEXT *properties =
- (VkPhysicalDeviceTransformFeedbackPropertiesEXT *)ext;
-
- properties->maxTransformFeedbackStreams = IR3_MAX_SO_STREAMS;
- properties->maxTransformFeedbackBuffers = IR3_MAX_SO_BUFFERS;
- properties->maxTransformFeedbackBufferSize = UINT32_MAX;
- properties->maxTransformFeedbackStreamDataSize = 512;
- properties->maxTransformFeedbackBufferDataSize = 512;
- properties->maxTransformFeedbackBufferDataStride = 512;
- properties->transformFeedbackQueries = true;
- properties->transformFeedbackStreamsLinesTriangles = true;
- properties->transformFeedbackRasterizationStreamSelect = true;
- properties->transformFeedbackDraw = true;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLE_LOCATIONS_PROPERTIES_EXT: {
- VkPhysicalDeviceSampleLocationsPropertiesEXT *properties =
- (VkPhysicalDeviceSampleLocationsPropertiesEXT *)ext;
- properties->sampleLocationSampleCounts = 0;
- if (pdevice->vk.supported_extensions.EXT_sample_locations) {
- properties->sampleLocationSampleCounts =
- VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT;
- }
- properties->maxSampleLocationGridSize = (VkExtent2D) { 1 , 1 };
- properties->sampleLocationCoordinateRange[0] = 0.0f;
- properties->sampleLocationCoordinateRange[1] = 0.9375f;
- properties->sampleLocationSubPixelBits = 4;
- properties->variableSampleLocations = true;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_PROPERTIES_EXT: {
- VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *props =
- (VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *)ext;
- props->maxVertexAttribDivisor = UINT32_MAX;
+ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES: {
+ VkPhysicalDeviceIDProperties *properties =
+ (VkPhysicalDeviceIDProperties *) ext;
+ memcpy(properties->driverUUID, pdevice->driver_uuid, VK_UUID_SIZE);
+ memcpy(properties->deviceUUID, pdevice->device_uuid, VK_UUID_SIZE);
+ properties->deviceLUIDValid = false;
break;
}
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_PROPERTIES_EXT: {
- VkPhysicalDeviceCustomBorderColorPropertiesEXT *props = (void *)ext;
- props->maxCustomBorderColorSamplers = TU_BORDER_COLOR_COUNT;
+ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_PROPERTIES: {
+ VkPhysicalDeviceMultiviewProperties *properties =
+ (VkPhysicalDeviceMultiviewProperties *) ext;
+ properties->maxMultiviewViewCount = MAX_VIEWS;
+ properties->maxMultiviewInstanceIndex = INT_MAX;
break;
}
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_PROPERTIES_KHR: {
- VkPhysicalDevicePerformanceQueryPropertiesKHR *properties =
- (VkPhysicalDevicePerformanceQueryPropertiesKHR *)ext;
- properties->allowCommandBufferQueryCopies = false;
+ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_POINT_CLIPPING_PROPERTIES: {
+ VkPhysicalDevicePointClippingProperties *properties =
+ (VkPhysicalDevicePointClippingProperties *) ext;
+ properties->pointClippingBehavior =
+ VK_POINT_CLIPPING_BEHAVIOR_ALL_CLIP_PLANES;
break;
}
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_PROPERTIES_EXT: {
- VkPhysicalDeviceRobustness2PropertiesEXT *props = (void *)ext;
- /* see write_buffer_descriptor() */
- props->robustStorageBufferAccessSizeAlignment = 4;
- /* see write_ubo_descriptor() */
- props->robustUniformBufferAccessSizeAlignment = 16;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROVOKING_VERTEX_PROPERTIES_EXT: {
- VkPhysicalDeviceProvokingVertexPropertiesEXT *properties =
- (VkPhysicalDeviceProvokingVertexPropertiesEXT *)ext;
- properties->provokingVertexModePerPipeline = true;
- properties->transformFeedbackPreservesTriangleFanProvokingVertex = false;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LINE_RASTERIZATION_PROPERTIES_EXT: {
- VkPhysicalDeviceLineRasterizationPropertiesEXT *props =
- (VkPhysicalDeviceLineRasterizationPropertiesEXT *)ext;
- props->lineSubPixelPrecisionBits = 8;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRM_PROPERTIES_EXT: {
- VkPhysicalDeviceDrmPropertiesEXT *props =
- (VkPhysicalDeviceDrmPropertiesEXT *)ext;
- props->hasPrimary = pdevice->has_master;
- props->primaryMajor = pdevice->master_major;
- props->primaryMinor = pdevice->master_minor;
-
- props->hasRender = pdevice->has_local;
- props->renderMajor = pdevice->local_major;
- props->renderMinor = pdevice->local_minor;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_MODULE_IDENTIFIER_PROPERTIES_EXT: {
- VkPhysicalDeviceShaderModuleIdentifierPropertiesEXT *props =
- (VkPhysicalDeviceShaderModuleIdentifierPropertiesEXT *)ext;
- STATIC_ASSERT(sizeof(vk_shaderModuleIdentifierAlgorithmUUID) ==
- sizeof(props->shaderModuleIdentifierAlgorithmUUID));
- memcpy(props->shaderModuleIdentifierAlgorithmUUID,
- vk_shaderModuleIdentifierAlgorithmUUID,
- sizeof(props->shaderModuleIdentifierAlgorithmUUID));
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTI_DRAW_PROPERTIES_EXT: {
- VkPhysicalDeviceMultiDrawPropertiesEXT *properties =
- (VkPhysicalDeviceMultiDrawPropertiesEXT *)ext;
- properties->maxMultiDrawCount = 2048;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_GRAPHICS_PIPELINE_LIBRARY_PROPERTIES_EXT: {
- VkPhysicalDeviceGraphicsPipelineLibraryPropertiesEXT *props =
- (VkPhysicalDeviceGraphicsPipelineLibraryPropertiesEXT *)ext;
- props->graphicsPipelineLibraryFastLinking = true;
- props->graphicsPipelineLibraryIndependentInterpolationDecoration = true;
+ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES: {
+ VkPhysicalDeviceMaintenance3Properties *properties =
+ (VkPhysicalDeviceMaintenance3Properties *) ext;
+ /* Make sure everything is addressable by a signed 32-bit int, and
+ * our largest descriptors are 96 bytes. */
+ properties->maxPerSetDescriptors = (1ull << 31) / 96;
+ /* Our buffer size fields allow only this much */
+ properties->maxMemoryAllocationSize = 0xFFFFFFFFull;
break;
}
default:
@@ -1397,99 +896,36 @@ static const VkQueueFamilyProperties tu_queue_family_properties = {
.queueFlags =
VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT,
.queueCount = 1,
- .timestampValidBits = 48,
+ .timestampValidBits = 64,
.minImageTransferGranularity = { 1, 1, 1 },
};
-static void
-tu_physical_device_get_global_priority_properties(const struct tu_physical_device *pdevice,
- VkQueueFamilyGlobalPriorityPropertiesKHR *props)
+void
+tu_GetPhysicalDeviceQueueFamilyProperties(
+ VkPhysicalDevice physicalDevice,
+ uint32_t *pQueueFamilyPropertyCount,
+ VkQueueFamilyProperties *pQueueFamilyProperties)
{
- props->priorityCount = MIN2(pdevice->submitqueue_priority_count, 3);
- switch (props->priorityCount) {
- case 1:
- props->priorities[0] = VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR;
- break;
- case 2:
- props->priorities[0] = VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR;
- props->priorities[1] = VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR;
- break;
- case 3:
- props->priorities[0] = VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR;
- props->priorities[1] = VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR;
- props->priorities[2] = VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR;
- break;
- default:
- unreachable("unexpected priority count");
- break;
- }
-}
+ VK_OUTARRAY_MAKE(out, pQueueFamilyProperties, pQueueFamilyPropertyCount);
-static int
-tu_physical_device_get_submitqueue_priority(const struct tu_physical_device *pdevice,
- VkQueueGlobalPriorityKHR global_priority,
- bool global_priority_query)
-{
- if (global_priority_query) {
- VkQueueFamilyGlobalPriorityPropertiesKHR props;
- tu_physical_device_get_global_priority_properties(pdevice, &props);
-
- bool valid = false;
- for (uint32_t i = 0; i < props.priorityCount; i++) {
- if (props.priorities[i] == global_priority) {
- valid = true;
- break;
- }
- }
-
- if (!valid)
- return -1;
- }
-
- /* Valid values are from 0 to (pdevice->submitqueue_priority_count - 1),
- * with 0 being the highest priority. This matches what freedreno does.
- */
- int priority;
- if (global_priority == VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR)
- priority = pdevice->submitqueue_priority_count / 2;
- else if (global_priority < VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR)
- priority = pdevice->submitqueue_priority_count - 1;
- else
- priority = 0;
-
- return priority;
+ vk_outarray_append(&out, p) { *p = tu_queue_family_properties; }
}
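
The removed tu_physical_device_get_submitqueue_priority maps Vulkan's global priority onto the kernel scale, where 0 is the highest submit-queue priority. A compact sketch of that mapping, assuming submitqueue_priority_count == 3 and a Vulkan header recent enough to define VkQueueGlobalPriorityKHR (illustration only, not from the Mesa sources):

#include <vulkan/vulkan.h>

/* With count == 3: LOW -> 2 (lowest), MEDIUM -> 1, HIGH/REALTIME -> 0. */
static int example_submitqueue_priority(int count, VkQueueGlobalPriorityKHR p)
{
   if (p == VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR)
      return count / 2;
   return p < VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR ? count - 1 : 0;
}
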
-VKAPI_ATTR void VKAPI_CALL
+void
tu_GetPhysicalDeviceQueueFamilyProperties2(
VkPhysicalDevice physicalDevice,
uint32_t *pQueueFamilyPropertyCount,
VkQueueFamilyProperties2 *pQueueFamilyProperties)
{
- TU_FROM_HANDLE(tu_physical_device, pdevice, physicalDevice);
+ VK_OUTARRAY_MAKE(out, pQueueFamilyProperties, pQueueFamilyPropertyCount);
- VK_OUTARRAY_MAKE_TYPED(VkQueueFamilyProperties2, out,
- pQueueFamilyProperties, pQueueFamilyPropertyCount);
-
- vk_outarray_append_typed(VkQueueFamilyProperties2, &out, p)
+ vk_outarray_append(&out, p)
{
p->queueFamilyProperties = tu_queue_family_properties;
-
- vk_foreach_struct(ext, p->pNext) {
- switch (ext->sType) {
- case VK_STRUCTURE_TYPE_QUEUE_FAMILY_GLOBAL_PRIORITY_PROPERTIES_KHR: {
- VkQueueFamilyGlobalPriorityPropertiesKHR *props = (void *)ext;
- tu_physical_device_get_global_priority_properties(pdevice, props);
- break;
- }
- default:
- break;
- }
- }
}
}
-uint64_t
+static uint64_t
tu_get_system_heap_size()
{
struct sysinfo info;
@@ -1509,101 +945,50 @@ tu_get_system_heap_size()
return available_ram;
}
-static VkDeviceSize
-tu_get_budget_memory(struct tu_physical_device *physical_device)
-{
- uint64_t heap_size = physical_device->heap.size;
- uint64_t heap_used = physical_device->heap.used;
- uint64_t sys_available;
- ASSERTED bool has_available_memory =
- os_get_available_system_memory(&sys_available);
- assert(has_available_memory);
-
- /*
- * Let's not incite the app to starve the system: report at most 90% of
- * available system memory.
- */
- uint64_t heap_available = sys_available * 9 / 10;
- return MIN2(heap_size, heap_used + heap_available);
-}
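
The removed tu_get_budget_memory reports at most 90% of currently free system RAM on top of what the application already uses, clamped to the heap size. A worked example with hypothetical numbers (illustration only, not from the Mesa sources):

/* Hypothetical figures, in GiB for readability:
 *   heap.size = 6, heap.used = 1, free system RAM = 4
 *   heap_available = 4 * 9 / 10        = 3.6
 *   budget         = MIN2(6, 1 + 3.6)  = 4.6
 * i.e. the app may grow by roughly 3.6 GiB, but the reported budget never
 * exceeds the heap size itself. */
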
-
-VKAPI_ATTR void VKAPI_CALL
-tu_GetPhysicalDeviceMemoryProperties2(VkPhysicalDevice pdev,
- VkPhysicalDeviceMemoryProperties2 *props2)
+void
+tu_GetPhysicalDeviceMemoryProperties(
+ VkPhysicalDevice physicalDevice,
+ VkPhysicalDeviceMemoryProperties *pMemoryProperties)
{
- TU_FROM_HANDLE(tu_physical_device, physical_device, pdev);
-
- VkPhysicalDeviceMemoryProperties *props = &props2->memoryProperties;
- props->memoryHeapCount = 1;
- props->memoryHeaps[0].size = physical_device->heap.size;
- props->memoryHeaps[0].flags = physical_device->heap.flags;
+ pMemoryProperties->memoryHeapCount = 1;
+ pMemoryProperties->memoryHeaps[0].size = tu_get_system_heap_size();
+ pMemoryProperties->memoryHeaps[0].flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT;
- props->memoryTypeCount = 1;
- props->memoryTypes[0].propertyFlags =
+ pMemoryProperties->memoryTypeCount = 1;
+ pMemoryProperties->memoryTypes[0].propertyFlags =
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
- props->memoryTypes[0].heapIndex = 0;
+ pMemoryProperties->memoryTypes[0].heapIndex = 0;
+}
- vk_foreach_struct(ext, props2->pNext)
- {
- switch (ext->sType) {
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT: {
- VkPhysicalDeviceMemoryBudgetPropertiesEXT *memory_budget_props =
- (VkPhysicalDeviceMemoryBudgetPropertiesEXT *) ext;
- memory_budget_props->heapUsage[0] = physical_device->heap.used;
- memory_budget_props->heapBudget[0] = tu_get_budget_memory(physical_device);
-
- /* The heapBudget and heapUsage values must be zero for array elements
- * greater than or equal to VkPhysicalDeviceMemoryProperties::memoryHeapCount
- */
- for (unsigned i = 1; i < VK_MAX_MEMORY_HEAPS; i++) {
- memory_budget_props->heapBudget[i] = 0u;
- memory_budget_props->heapUsage[i] = 0u;
- }
- break;
- }
- default:
- break;
- }
- }
+void
+tu_GetPhysicalDeviceMemoryProperties2(
+ VkPhysicalDevice physicalDevice,
+ VkPhysicalDeviceMemoryProperties2 *pMemoryProperties)
+{
+ return tu_GetPhysicalDeviceMemoryProperties(
+ physicalDevice, &pMemoryProperties->memoryProperties);
}
static VkResult
tu_queue_init(struct tu_device *device,
struct tu_queue *queue,
+ uint32_t queue_family_index,
int idx,
- const VkDeviceQueueCreateInfo *create_info,
- bool global_priority_query)
-{
- const VkDeviceQueueGlobalPriorityCreateInfoKHR *priority_info =
- vk_find_struct_const(create_info->pNext,
- DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR);
- const enum VkQueueGlobalPriorityKHR global_priority = priority_info ?
- priority_info->globalPriority : VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR;
-
- const int priority = tu_physical_device_get_submitqueue_priority(
- device->physical_device, global_priority, global_priority_query);
- if (priority < 0) {
- return vk_startup_errorf(device->instance, VK_ERROR_INITIALIZATION_FAILED,
- "invalid global priority");
- }
-
- VkResult result = vk_queue_init(&queue->vk, &device->vk, create_info, idx);
- if (result != VK_SUCCESS)
- return result;
-
+ VkDeviceQueueCreateFlags flags)
+{
+ queue->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
queue->device = device;
-#ifndef TU_USE_KGSL
- queue->vk.driver_submit = tu_queue_submit;
-#endif
+ queue->queue_family_index = queue_family_index;
+ queue->queue_idx = idx;
+ queue->flags = flags;
- int ret = tu_drm_submitqueue_new(device, priority, &queue->msm_queue_id);
+ int ret = tu_drm_submitqueue_new(device, 0, &queue->msm_queue_id);
if (ret)
- return vk_startup_errorf(device->instance, VK_ERROR_INITIALIZATION_FAILED,
- "submitqueue create failed");
+ return VK_ERROR_INITIALIZATION_FAILED;
- queue->fence = -1;
+ tu_fence_init(&queue->submit_fence, false);
return VK_SUCCESS;
}
@@ -1611,271 +996,21 @@ tu_queue_init(struct tu_device *device,
static void
tu_queue_finish(struct tu_queue *queue)
{
- vk_queue_finish(&queue->vk);
- if (queue->fence >= 0)
- close(queue->fence);
+ tu_fence_finish(&queue->submit_fence);
tu_drm_submitqueue_close(queue->device, queue->msm_queue_id);
}
-uint64_t
-tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts)
-{
- /* This is based on the 19.2MHz always-on rbbm timer.
- *
- * TODO: we should probably query this value from the kernel.
- */
- return ts * (1000000000 / 19200000);
-}
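
tu_device_ticks_to_ns, removed above, converts the 19.2 MHz always-on counter to nanoseconds with integer math. A short worked check (illustration only, not from the Mesa sources):

/* 1 000 000 000 ns / 19 200 000 ticks = 52.083 ns per tick, but the integer
 * expression (1000000000 / 19200000) evaluates to 52, so one second's worth
 * of ticks (19 200 000) is reported as 0.9984 s, roughly a 0.16% truncation
 * error relative to the float timestampPeriod used earlier. */
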
-
-static void*
-tu_trace_create_ts_buffer(struct u_trace_context *utctx, uint32_t size)
-{
- struct tu_device *device =
- container_of(utctx, struct tu_device, trace_context);
-
- struct tu_bo *bo;
- tu_bo_init_new(device, &bo, size, false, "trace");
-
- return bo;
-}
-
-static void
-tu_trace_destroy_ts_buffer(struct u_trace_context *utctx, void *timestamps)
-{
- struct tu_device *device =
- container_of(utctx, struct tu_device, trace_context);
- struct tu_bo *bo = timestamps;
-
- tu_bo_finish(device, bo);
-}
-
-static void
-tu_trace_record_ts(struct u_trace *ut, void *cs, void *timestamps,
- unsigned idx, bool end_of_pipe)
-{
- struct tu_bo *bo = timestamps;
- struct tu_cs *ts_cs = cs;
-
- unsigned ts_offset = idx * sizeof(uint64_t);
- tu_cs_emit_pkt7(ts_cs, CP_EVENT_WRITE, 4);
- tu_cs_emit(ts_cs, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS) | CP_EVENT_WRITE_0_TIMESTAMP);
- tu_cs_emit_qw(ts_cs, bo->iova + ts_offset);
- tu_cs_emit(ts_cs, 0x00000000);
-}
-
-static uint64_t
-tu_trace_read_ts(struct u_trace_context *utctx,
- void *timestamps, unsigned idx, void *flush_data)
-{
- struct tu_device *device =
- container_of(utctx, struct tu_device, trace_context);
- struct tu_bo *bo = timestamps;
- struct tu_u_trace_submission_data *submission_data = flush_data;
-
- /* Only need to stall on results for the first entry: */
- if (idx == 0) {
- tu_device_wait_u_trace(device, submission_data->syncobj);
- }
-
- if (tu_bo_map(device, bo) != VK_SUCCESS) {
- return U_TRACE_NO_TIMESTAMP;
- }
-
- uint64_t *ts = bo->map;
-
- /* Don't translate the no-timestamp marker: */
- if (ts[idx] == U_TRACE_NO_TIMESTAMP)
- return U_TRACE_NO_TIMESTAMP;
-
- return tu_device_ticks_to_ns(device, ts[idx]);
-}
-
-static void
-tu_trace_delete_flush_data(struct u_trace_context *utctx, void *flush_data)
-{
- struct tu_device *device =
- container_of(utctx, struct tu_device, trace_context);
- struct tu_u_trace_submission_data *submission_data = flush_data;
-
- tu_u_trace_submission_data_finish(device, submission_data);
-}
-
-void
-tu_copy_timestamp_buffer(struct u_trace_context *utctx, void *cmdstream,
- void *ts_from, uint32_t from_offset,
- void *ts_to, uint32_t to_offset,
- uint32_t count)
-{
- struct tu_cs *cs = cmdstream;
- struct tu_bo *bo_from = ts_from;
- struct tu_bo *bo_to = ts_to;
-
- tu_cs_emit_pkt7(cs, CP_MEMCPY, 5);
- tu_cs_emit(cs, count * sizeof(uint64_t) / sizeof(uint32_t));
- tu_cs_emit_qw(cs, bo_from->iova + from_offset * sizeof(uint64_t));
- tu_cs_emit_qw(cs, bo_to->iova + to_offset * sizeof(uint64_t));
-}
-
-/* Special helpers instead of u_trace_begin_iterator()/u_trace_end_iterator()
- * that ignore tracepoints at the beginning/end that are part of a
- * suspend/resume chain.
- */
-static struct u_trace_iterator
-tu_cmd_begin_iterator(struct tu_cmd_buffer *cmdbuf)
-{
- switch (cmdbuf->state.suspend_resume) {
- case SR_IN_PRE_CHAIN:
- return cmdbuf->trace_renderpass_end;
- case SR_AFTER_PRE_CHAIN:
- case SR_IN_CHAIN_AFTER_PRE_CHAIN:
- return cmdbuf->pre_chain.trace_renderpass_end;
- default:
- return u_trace_begin_iterator(&cmdbuf->trace);
- }
-}
-
-static struct u_trace_iterator
-tu_cmd_end_iterator(struct tu_cmd_buffer *cmdbuf)
-{
- switch (cmdbuf->state.suspend_resume) {
- case SR_IN_PRE_CHAIN:
- return cmdbuf->trace_renderpass_end;
- case SR_IN_CHAIN:
- case SR_IN_CHAIN_AFTER_PRE_CHAIN:
- return cmdbuf->trace_renderpass_start;
- default:
- return u_trace_end_iterator(&cmdbuf->trace);
- }
-}
-VkResult
-tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs** cs,
- struct u_trace **trace_copy)
+static int
+tu_get_device_extension_index(const char *name)
{
- *cs = vk_zalloc(&cmdbuf->device->vk.alloc, sizeof(struct tu_cs), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
-
- if (*cs == NULL) {
- return VK_ERROR_OUT_OF_HOST_MEMORY;
- }
-
- tu_cs_init(*cs, cmdbuf->device, TU_CS_MODE_GROW,
- list_length(&cmdbuf->trace.trace_chunks) * 6 + 3, "trace copy timestamp cs");
-
- tu_cs_begin(*cs);
-
- tu_cs_emit_wfi(*cs);
- tu_cs_emit_pkt7(*cs, CP_WAIT_FOR_ME, 0);
-
- *trace_copy = vk_zalloc(&cmdbuf->device->vk.alloc, sizeof(struct u_trace), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
-
- if (*trace_copy == NULL) {
- return VK_ERROR_OUT_OF_HOST_MEMORY;
+ for (unsigned i = 0; i < TU_DEVICE_EXTENSION_COUNT; ++i) {
+ if (strcmp(name, tu_device_extensions[i].extensionName) == 0)
+ return i;
}
-
- u_trace_init(*trace_copy, cmdbuf->trace.utctx);
- u_trace_clone_append(tu_cmd_begin_iterator(cmdbuf),
- tu_cmd_end_iterator(cmdbuf),
- *trace_copy, *cs,
- tu_copy_timestamp_buffer);
-
- tu_cs_emit_wfi(*cs);
-
- tu_cs_end(*cs);
-
- return VK_SUCCESS;
+ return -1;
}
VkResult
-tu_u_trace_submission_data_create(
- struct tu_device *device,
- struct tu_cmd_buffer **cmd_buffers,
- uint32_t cmd_buffer_count,
- struct tu_u_trace_submission_data **submission_data)
-{
- *submission_data =
- vk_zalloc(&device->vk.alloc,
- sizeof(struct tu_u_trace_submission_data), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
-
- if (!(*submission_data)) {
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
- }
-
- struct tu_u_trace_submission_data *data = *submission_data;
-
- data->cmd_trace_data =
- vk_zalloc(&device->vk.alloc,
- cmd_buffer_count * sizeof(struct tu_u_trace_cmd_data), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
-
- if (!data->cmd_trace_data) {
- goto fail;
- }
-
- data->cmd_buffer_count = cmd_buffer_count;
- data->last_buffer_with_tracepoints = -1;
-
- for (uint32_t i = 0; i < cmd_buffer_count; ++i) {
- struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
-
- if (!u_trace_has_points(&cmdbuf->trace))
- continue;
-
- data->last_buffer_with_tracepoints = i;
-
- if (!(cmdbuf->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT)) {
- /* A single command buffer could be submitted several times, but we
- * already baked timestamp iova addresses and trace points are
- * single-use. Therefore we have to copy trace points and create
- * a new timestamp buffer on every submit of a reusable command buffer.
- */
- if (tu_create_copy_timestamp_cs(cmdbuf,
- &data->cmd_trace_data[i].timestamp_copy_cs,
- &data->cmd_trace_data[i].trace) != VK_SUCCESS) {
- goto fail;
- }
-
- assert(data->cmd_trace_data[i].timestamp_copy_cs->entry_count == 1);
- } else {
- data->cmd_trace_data[i].trace = &cmdbuf->trace;
- }
- }
-
- assert(data->last_buffer_with_tracepoints != -1);
-
- return VK_SUCCESS;
-
-fail:
- tu_u_trace_submission_data_finish(device, data);
- *submission_data = NULL;
-
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
-}
-
-void
-tu_u_trace_submission_data_finish(
- struct tu_device *device,
- struct tu_u_trace_submission_data *submission_data)
-{
- for (uint32_t i = 0; i < submission_data->cmd_buffer_count; ++i) {
- /* Only free the trace if we had to create a copy of it */
- struct tu_u_trace_cmd_data *cmd_data = &submission_data->cmd_trace_data[i];
- if (cmd_data->timestamp_copy_cs) {
- tu_cs_finish(cmd_data->timestamp_copy_cs);
- vk_free(&device->vk.alloc, cmd_data->timestamp_copy_cs);
-
- u_trace_fini(cmd_data->trace);
- vk_free(&device->vk.alloc, cmd_data->trace);
- }
- }
-
- vk_free(&device->vk.alloc, submission_data->cmd_trace_data);
- vk_free(&device->vk.alloc, submission_data->syncobj);
- vk_free(&device->vk.alloc, submission_data);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
tu_CreateDevice(VkPhysicalDevice physicalDevice,
const VkDeviceCreateInfo *pCreateInfo,
const VkAllocationCallbacks *pAllocator,
@@ -1884,92 +1019,59 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
TU_FROM_HANDLE(tu_physical_device, physical_device, physicalDevice);
VkResult result;
struct tu_device *device;
- bool custom_border_colors = false;
- bool perf_query_pools = false;
- bool robust_buffer_access2 = false;
- bool border_color_without_format = false;
- bool global_priority_query = false;
- vk_foreach_struct_const(ext, pCreateInfo->pNext) {
- switch (ext->sType) {
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_FEATURES_EXT: {
- const VkPhysicalDeviceCustomBorderColorFeaturesEXT *border_color_features = (const void *)ext;
- custom_border_colors = border_color_features->customBorderColors;
- border_color_without_format =
- border_color_features->customBorderColorWithoutFormat;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_FEATURES_KHR: {
- const VkPhysicalDevicePerformanceQueryFeaturesKHR *feature =
- (VkPhysicalDevicePerformanceQueryFeaturesKHR *)ext;
- perf_query_pools = feature->performanceCounterQueryPools;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_FEATURES_EXT: {
- VkPhysicalDeviceRobustness2FeaturesEXT *features = (void *)ext;
- robust_buffer_access2 = features->robustBufferAccess2;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_GLOBAL_PRIORITY_QUERY_FEATURES_KHR: {
- VkPhysicalDeviceGlobalPriorityQueryFeaturesKHR *features = (void *)ext;
- global_priority_query = features->globalPriorityQuery;
- break;
- }
- default:
- break;
+ /* Check enabled features */
+ if (pCreateInfo->pEnabledFeatures) {
+ VkPhysicalDeviceFeatures supported_features;
+ tu_GetPhysicalDeviceFeatures(physicalDevice, &supported_features);
+ VkBool32 *supported_feature = (VkBool32 *) &supported_features;
+ VkBool32 *enabled_feature = (VkBool32 *) pCreateInfo->pEnabledFeatures;
+ unsigned num_features =
+ sizeof(VkPhysicalDeviceFeatures) / sizeof(VkBool32);
+ for (uint32_t i = 0; i < num_features; i++) {
+ if (enabled_feature[i] && !supported_feature[i])
+ return vk_error(physical_device->instance,
+ VK_ERROR_FEATURE_NOT_PRESENT);
}
}
- device = vk_zalloc2(&physical_device->instance->vk.alloc, pAllocator,
+ device = vk_zalloc2(&physical_device->instance->alloc, pAllocator,
sizeof(*device), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
if (!device)
- return vk_startup_errorf(physical_device->instance, VK_ERROR_OUT_OF_HOST_MEMORY, "OOM");
-
- struct vk_device_dispatch_table dispatch_table;
- vk_device_dispatch_table_from_entrypoints(
- &dispatch_table, &tu_device_entrypoints, true);
- vk_device_dispatch_table_from_entrypoints(
- &dispatch_table, &wsi_device_entrypoints, false);
-
- result = vk_device_init(&device->vk, &physical_device->vk,
- &dispatch_table, pCreateInfo, pAllocator);
- if (result != VK_SUCCESS) {
- vk_free(&device->vk.alloc, device);
- return vk_startup_errorf(physical_device->instance, result,
- "vk_device_init failed");
- }
+ return vk_error(physical_device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ device->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
device->instance = physical_device->instance;
device->physical_device = physical_device;
- device->fd = physical_device->local_fd;
- device->vk.command_buffer_ops = &tu_cmd_buffer_ops;
- device->vk.check_status = tu_device_check_status;
- mtx_init(&device->bo_mutex, mtx_plain);
- mtx_init(&device->pipeline_mutex, mtx_plain);
- mtx_init(&device->autotune_mutex, mtx_plain);
- u_rwlock_init(&device->dma_bo_lock);
- pthread_mutex_init(&device->submit_mutex, NULL);
+ if (pAllocator)
+ device->alloc = *pAllocator;
+ else
+ device->alloc = physical_device->instance->alloc;
- if (device->instance->debug_flags & TU_DEBUG_BOS)
- device->bo_sizes = _mesa_hash_table_create(NULL, _mesa_hash_string, _mesa_key_string_equal);
+ for (uint32_t i = 0; i < pCreateInfo->enabledExtensionCount; i++) {
+ const char *ext_name = pCreateInfo->ppEnabledExtensionNames[i];
+ int index = tu_get_device_extension_index(ext_name);
+ if (index < 0 ||
+ !physical_device->supported_extensions.extensions[index]) {
+ vk_free(&device->alloc, device);
+ return vk_error(physical_device->instance,
+ VK_ERROR_EXTENSION_NOT_PRESENT);
+ }
-#ifndef TU_USE_KGSL
- vk_device_set_drm_fd(&device->vk, device->fd);
-#endif
+ device->enabled_extensions.extensions[index] = true;
+ }
for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
const VkDeviceQueueCreateInfo *queue_create =
&pCreateInfo->pQueueCreateInfos[i];
uint32_t qfi = queue_create->queueFamilyIndex;
device->queues[qfi] = vk_alloc(
- &device->vk.alloc, queue_create->queueCount * sizeof(struct tu_queue),
+ &device->alloc, queue_create->queueCount * sizeof(struct tu_queue),
8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
if (!device->queues[qfi]) {
- result = vk_startup_errorf(physical_device->instance,
- VK_ERROR_OUT_OF_HOST_MEMORY,
- "OOM");
- goto fail_queues;
+ result = VK_ERROR_OUT_OF_HOST_MEMORY;
+ goto fail;
}
memset(device->queues[qfi], 0,
@@ -1978,221 +1080,50 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
device->queue_count[qfi] = queue_create->queueCount;
for (unsigned q = 0; q < queue_create->queueCount; q++) {
- result = tu_queue_init(device, &device->queues[qfi][q], q,
- queue_create, global_priority_query);
- if (result != VK_SUCCESS) {
- device->queue_count[qfi] = q;
- goto fail_queues;
- }
- }
- }
-
- device->compiler =
- ir3_compiler_create(NULL, &physical_device->dev_id,
- &(struct ir3_compiler_options) {
- .robust_buffer_access2 = robust_buffer_access2,
- .push_ubo_with_preamble = true,
- .disable_cache = true,
- });
- if (!device->compiler) {
- result = vk_startup_errorf(physical_device->instance,
- VK_ERROR_INITIALIZATION_FAILED,
- "failed to initialize ir3 compiler");
- goto fail_queues;
- }
-
- /* Initialize sparse array for refcounting imported BOs */
- util_sparse_array_init(&device->bo_map, sizeof(struct tu_bo), 512);
-
- /* initial sizes, these will increase if there is overflow */
- device->vsc_draw_strm_pitch = 0x1000 + VSC_PAD;
- device->vsc_prim_strm_pitch = 0x4000 + VSC_PAD;
-
- uint32_t global_size = sizeof(struct tu6_global);
- if (custom_border_colors)
- global_size += TU_BORDER_COLOR_COUNT * sizeof(struct bcolor_entry);
-
- tu_bo_suballocator_init(&device->pipeline_suballoc, device,
- 128 * 1024, TU_BO_ALLOC_GPU_READ_ONLY | TU_BO_ALLOC_ALLOW_DUMP);
- tu_bo_suballocator_init(&device->autotune_suballoc, device,
- 128 * 1024, 0);
-
- result = tu_bo_init_new(device, &device->global_bo, global_size,
- TU_BO_ALLOC_ALLOW_DUMP, "global");
- if (result != VK_SUCCESS) {
- vk_startup_errorf(device->instance, result, "BO init");
- goto fail_global_bo;
- }
-
- result = tu_bo_map(device, device->global_bo);
- if (result != VK_SUCCESS) {
- vk_startup_errorf(device->instance, result, "BO map");
- goto fail_global_bo_map;
- }
-
- struct tu6_global *global = device->global_bo->map;
- tu_init_clear_blit_shaders(device);
- global->predicate = 0;
- global->vtx_stats_query_not_running = 1;
- global->dbg_one = (uint32_t)-1;
- global->dbg_gmem_total_loads = 0;
- global->dbg_gmem_taken_loads = 0;
- global->dbg_gmem_total_stores = 0;
- global->dbg_gmem_taken_stores = 0;
- for (int i = 0; i < TU_BORDER_COLOR_BUILTIN; i++) {
- VkClearColorValue border_color = vk_border_color_value(i);
- tu6_pack_border_color(&global->bcolor_builtin[i], &border_color,
- vk_border_color_is_int(i));
- }
-
- /* initialize to ones so ffs can be used to find unused slots */
- BITSET_ONES(device->custom_border_color);
-
- result = tu_init_dynamic_rendering(device);
- if (result != VK_SUCCESS) {
- vk_startup_errorf(device->instance, result, "dynamic rendering");
- goto fail_dynamic_rendering;
- }
-
- struct vk_pipeline_cache_create_info pcc_info = { };
- device->mem_cache = vk_pipeline_cache_create(&device->vk, &pcc_info,
- false);
- if (!device->mem_cache) {
- result = VK_ERROR_OUT_OF_HOST_MEMORY;
- vk_startup_errorf(device->instance, result, "create pipeline cache failed");
- goto fail_pipeline_cache;
- }
-
- if (perf_query_pools) {
- /* Prepare command streams setting pass index to the PERF_CNTRS_REG
- * from 0 to 31. One of these will be picked up at cmd submit time
- * when the perf query is executed.
- */
- struct tu_cs *cs;
-
- if (!(device->perfcntrs_pass_cs = calloc(1, sizeof(struct tu_cs)))) {
- result = vk_startup_errorf(device->instance,
- VK_ERROR_OUT_OF_HOST_MEMORY, "OOM");
- goto fail_perfcntrs_pass_alloc;
- }
-
- device->perfcntrs_pass_cs_entries = calloc(32, sizeof(struct tu_cs_entry));
- if (!device->perfcntrs_pass_cs_entries) {
- result = vk_startup_errorf(device->instance,
- VK_ERROR_OUT_OF_HOST_MEMORY, "OOM");
- goto fail_perfcntrs_pass_entries_alloc;
- }
-
- cs = device->perfcntrs_pass_cs;
- tu_cs_init(cs, device, TU_CS_MODE_SUB_STREAM, 96, "perfcntrs cs");
-
- for (unsigned i = 0; i < 32; i++) {
- struct tu_cs sub_cs;
-
- result = tu_cs_begin_sub_stream(cs, 3, &sub_cs);
- if (result != VK_SUCCESS) {
- vk_startup_errorf(device->instance, result,
- "failed to allocate commands streams");
- goto fail_prepare_perfcntrs_pass_cs;
- }
-
- tu_cs_emit_regs(&sub_cs, A6XX_CP_SCRATCH_REG(PERF_CNTRS_REG, 1 << i));
- tu_cs_emit_pkt7(&sub_cs, CP_WAIT_FOR_ME, 0);
-
- device->perfcntrs_pass_cs_entries[i] = tu_cs_end_sub_stream(cs, &sub_cs);
+ result = tu_queue_init(device, &device->queues[qfi][q], qfi, q,
+ queue_create->flags);
+ if (result != VK_SUCCESS)
+ goto fail;
}
}
- /* Initialize a condition variable for timeline semaphore */
- pthread_condattr_t condattr;
- if (pthread_condattr_init(&condattr) != 0) {
- result = vk_startup_errorf(physical_device->instance,
- VK_ERROR_INITIALIZATION_FAILED,
- "pthread condattr init");
- goto fail_timeline_cond;
- }
- if (pthread_condattr_setclock(&condattr, CLOCK_MONOTONIC) != 0) {
- pthread_condattr_destroy(&condattr);
- result = vk_startup_errorf(physical_device->instance,
- VK_ERROR_INITIALIZATION_FAILED,
- "pthread condattr clock setup");
- goto fail_timeline_cond;
- }
- if (pthread_cond_init(&device->timeline_cond, &condattr) != 0) {
- pthread_condattr_destroy(&condattr);
- result = vk_startup_errorf(physical_device->instance,
- VK_ERROR_INITIALIZATION_FAILED,
- "pthread cond init");
- goto fail_timeline_cond;
- }
- pthread_condattr_destroy(&condattr);
-
- result = tu_autotune_init(&device->autotune, device);
- if (result != VK_SUCCESS) {
- goto fail_timeline_cond;
- }
-
- for (unsigned i = 0; i < ARRAY_SIZE(device->scratch_bos); i++)
- mtx_init(&device->scratch_bos[i].construct_mtx, mtx_plain);
-
- mtx_init(&device->fiber_pvtmem_bo.mtx, mtx_plain);
- mtx_init(&device->wave_pvtmem_bo.mtx, mtx_plain);
-
- mtx_init(&device->mutex, mtx_plain);
-
- device->use_z24uint_s8uint =
- physical_device->info->a6xx.has_z24uint_s8uint &&
- !border_color_without_format;
-
- tu_gpu_tracepoint_config_variable();
+ device->compiler = ir3_compiler_create(NULL, physical_device->gpu_id);
+ if (!device->compiler)
+ goto fail;
- device->submit_count = 0;
- u_trace_context_init(&device->trace_context, device,
- tu_trace_create_ts_buffer,
- tu_trace_destroy_ts_buffer,
- tu_trace_record_ts,
- tu_trace_read_ts,
- tu_trace_delete_flush_data);
+ VkPipelineCacheCreateInfo ci;
+ ci.sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO;
+ ci.pNext = NULL;
+ ci.flags = 0;
+ ci.pInitialData = NULL;
+ ci.initialDataSize = 0;
+ VkPipelineCache pc;
+ result =
+ tu_CreatePipelineCache(tu_device_to_handle(device), &ci, NULL, &pc);
+ if (result != VK_SUCCESS)
+ goto fail;
- tu_breadcrumbs_init(device);
+ device->mem_cache = tu_pipeline_cache_from_handle(pc);
*pDevice = tu_device_to_handle(device);
return VK_SUCCESS;
-fail_timeline_cond:
-fail_prepare_perfcntrs_pass_cs:
- free(device->perfcntrs_pass_cs_entries);
- tu_cs_finish(device->perfcntrs_pass_cs);
-fail_perfcntrs_pass_entries_alloc:
- free(device->perfcntrs_pass_cs);
-fail_perfcntrs_pass_alloc:
- vk_pipeline_cache_destroy(device->mem_cache, &device->vk.alloc);
-fail_pipeline_cache:
- tu_destroy_dynamic_rendering(device);
-fail_dynamic_rendering:
- tu_destroy_clear_blit_shaders(device);
-fail_global_bo_map:
- tu_bo_finish(device, device->global_bo);
- vk_free(&device->vk.alloc, device->bo_list);
-fail_global_bo:
- ir3_compiler_destroy(device->compiler);
- util_sparse_array_finish(&device->bo_map);
-
-fail_queues:
+fail:
for (unsigned i = 0; i < TU_MAX_QUEUE_FAMILIES; i++) {
for (unsigned q = 0; q < device->queue_count[i]; q++)
tu_queue_finish(&device->queues[i][q]);
- if (device->queues[i])
- vk_free(&device->vk.alloc, device->queues[i]);
+ if (device->queue_count[i])
+ vk_free(&device->alloc, device->queues[i]);
}
- u_rwlock_destroy(&device->dma_bo_lock);
- vk_device_finish(&device->vk);
- vk_free(&device->vk.alloc, device);
+ if (device->compiler)
+ ralloc_free(device->compiler);
+
+ vk_free(&device->alloc, device);
return result;
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
{
TU_FROM_HANDLE(tu_device, device, _device);
@@ -2200,158 +1131,231 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
if (!device)
return;
- tu_breadcrumbs_finish(device);
-
- u_trace_context_fini(&device->trace_context);
-
for (unsigned i = 0; i < TU_MAX_QUEUE_FAMILIES; i++) {
for (unsigned q = 0; q < device->queue_count[i]; q++)
tu_queue_finish(&device->queues[i][q]);
if (device->queue_count[i])
- vk_free(&device->vk.alloc, device->queues[i]);
+ vk_free(&device->alloc, device->queues[i]);
}
- for (unsigned i = 0; i < ARRAY_SIZE(device->scratch_bos); i++) {
- if (device->scratch_bos[i].initialized)
- tu_bo_finish(device, device->scratch_bos[i].bo);
- }
+ /* the compiler does not use pAllocator */
+ ralloc_free(device->compiler);
- if (device->fiber_pvtmem_bo.bo)
- tu_bo_finish(device, device->fiber_pvtmem_bo.bo);
-
- if (device->wave_pvtmem_bo.bo)
- tu_bo_finish(device, device->wave_pvtmem_bo.bo);
+ VkPipelineCache pc = tu_pipeline_cache_to_handle(device->mem_cache);
+ tu_DestroyPipelineCache(tu_device_to_handle(device), pc, NULL);
- tu_destroy_clear_blit_shaders(device);
-
- tu_destroy_dynamic_rendering(device);
+ vk_free(&device->alloc, device);
+}
- ir3_compiler_destroy(device->compiler);
+VkResult
+tu_EnumerateInstanceLayerProperties(uint32_t *pPropertyCount,
+ VkLayerProperties *pProperties)
+{
+ *pPropertyCount = 0;
+ return VK_SUCCESS;
+}
- vk_pipeline_cache_destroy(device->mem_cache, &device->vk.alloc);
+VkResult
+tu_EnumerateDeviceLayerProperties(VkPhysicalDevice physicalDevice,
+ uint32_t *pPropertyCount,
+ VkLayerProperties *pProperties)
+{
+ *pPropertyCount = 0;
+ return VK_SUCCESS;
+}
- if (device->perfcntrs_pass_cs) {
- free(device->perfcntrs_pass_cs_entries);
- tu_cs_finish(device->perfcntrs_pass_cs);
- free(device->perfcntrs_pass_cs);
+void
+tu_GetDeviceQueue2(VkDevice _device,
+ const VkDeviceQueueInfo2 *pQueueInfo,
+ VkQueue *pQueue)
+{
+ TU_FROM_HANDLE(tu_device, device, _device);
+ struct tu_queue *queue;
+
+ queue =
+ &device->queues[pQueueInfo->queueFamilyIndex][pQueueInfo->queueIndex];
+ if (pQueueInfo->flags != queue->flags) {
+ /* From the Vulkan 1.1.70 spec:
+ *
+ * "The queue returned by vkGetDeviceQueue2 must have the same
+ * flags value from this structure as that used at device
+ * creation time in a VkDeviceQueueCreateInfo instance. If no
+ * matching flags were specified at device creation time then
+ * pQueue will return VK_NULL_HANDLE."
+ */
+ *pQueue = VK_NULL_HANDLE;
+ return;
}
- tu_autotune_fini(&device->autotune, device);
-
- tu_bo_suballocator_finish(&device->pipeline_suballoc);
- tu_bo_suballocator_finish(&device->autotune_suballoc);
+ *pQueue = tu_queue_to_handle(queue);
+}
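For context, a minimal caller-side sketch of the flags-matching rule quoted in the comment above; the device handle and indices are illustrative, not taken from this diff:

    VkDeviceQueueInfo2 info = {
       .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_INFO_2,
       .flags = 0,            /* must equal VkDeviceQueueCreateInfo::flags
                                 used at device creation time */
       .queueFamilyIndex = 0, /* illustrative indices */
       .queueIndex = 0,
    };
    VkQueue queue = VK_NULL_HANDLE;
    vkGetDeviceQueue2(device, &info, &queue);
    /* queue remains VK_NULL_HANDLE if no queue was created with these flags */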
- util_sparse_array_finish(&device->bo_map);
- u_rwlock_destroy(&device->dma_bo_lock);
+void
+tu_GetDeviceQueue(VkDevice _device,
+ uint32_t queueFamilyIndex,
+ uint32_t queueIndex,
+ VkQueue *pQueue)
+{
+ const VkDeviceQueueInfo2 info =
+ (VkDeviceQueueInfo2) { .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_INFO_2,
+ .queueFamilyIndex = queueFamilyIndex,
+ .queueIndex = queueIndex };
- pthread_cond_destroy(&device->timeline_cond);
- _mesa_hash_table_destroy(device->bo_sizes, NULL);
- vk_free(&device->vk.alloc, device->bo_list);
- vk_device_finish(&device->vk);
- vk_free(&device->vk.alloc, device);
+ tu_GetDeviceQueue2(_device, &info, pQueue);
}
VkResult
-tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo)
+tu_QueueSubmit(VkQueue _queue,
+ uint32_t submitCount,
+ const VkSubmitInfo *pSubmits,
+ VkFence _fence)
{
- unsigned size_log2 = MAX2(util_logbase2_ceil64(size), MIN_SCRATCH_BO_SIZE_LOG2);
- unsigned index = size_log2 - MIN_SCRATCH_BO_SIZE_LOG2;
- assert(index < ARRAY_SIZE(dev->scratch_bos));
+ TU_FROM_HANDLE(tu_queue, queue, _queue);
+
+ for (uint32_t i = 0; i < submitCount; ++i) {
+ const VkSubmitInfo *submit = pSubmits + i;
+ const bool last_submit = (i == submitCount - 1);
+ struct tu_bo_list bo_list;
+ tu_bo_list_init(&bo_list);
+
+ uint32_t entry_count = 0;
+ for (uint32_t j = 0; j < submit->commandBufferCount; ++j) {
+ TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, submit->pCommandBuffers[j]);
+ entry_count += cmdbuf->cs.entry_count;
+ }
+
+ struct drm_msm_gem_submit_cmd cmds[entry_count];
+ uint32_t entry_idx = 0;
+ for (uint32_t j = 0; j < submit->commandBufferCount; ++j) {
+ TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, submit->pCommandBuffers[j]);
+ struct tu_cs *cs = &cmdbuf->cs;
+ for (unsigned i = 0; i < cs->entry_count; ++i, ++entry_idx) {
+ cmds[entry_idx].type = MSM_SUBMIT_CMD_BUF;
+ cmds[entry_idx].submit_idx =
+ tu_bo_list_add(&bo_list, cs->entries[i].bo,
+ MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
+ cmds[entry_idx].submit_offset = cs->entries[i].offset;
+ cmds[entry_idx].size = cs->entries[i].size;
+ cmds[entry_idx].pad = 0;
+ cmds[entry_idx].nr_relocs = 0;
+ cmds[entry_idx].relocs = 0;
+ }
- for (unsigned i = index; i < ARRAY_SIZE(dev->scratch_bos); i++) {
- if (p_atomic_read(&dev->scratch_bos[i].initialized)) {
- /* Fast path: just return the already-allocated BO. */
- *bo = dev->scratch_bos[i].bo;
- return VK_SUCCESS;
+ tu_bo_list_merge(&bo_list, &cmdbuf->bo_list);
}
- }
- /* Slow path: actually allocate the BO. We take a per-size lock because the
- * allocation is slow, and we only want to block other users of this
- * scratch BO while it finishes, not the whole device.
- */
- mtx_lock(&dev->scratch_bos[index].construct_mtx);
+ uint32_t flags = MSM_PIPE_3D0;
+ if (last_submit) {
+ flags |= MSM_SUBMIT_FENCE_FD_OUT;
+ }
- /* Another thread may have allocated it already while we were waiting on
- * the lock. We need to check this in order to avoid double-allocating.
- */
- if (dev->scratch_bos[index].initialized) {
- mtx_unlock(&dev->scratch_bos[index].construct_mtx);
- *bo = dev->scratch_bos[index].bo;
- return VK_SUCCESS;
- }
+ struct drm_msm_gem_submit req = {
+ .flags = flags,
+ .queueid = queue->msm_queue_id,
+ .bos = (uint64_t)(uintptr_t) bo_list.bo_infos,
+ .nr_bos = bo_list.count,
+ .cmds = (uint64_t)(uintptr_t)cmds,
+ .nr_cmds = entry_count,
+ };
- unsigned bo_size = 1ull << size_log2;
- VkResult result = tu_bo_init_new(dev, &dev->scratch_bos[index].bo, bo_size,
- TU_BO_ALLOC_NO_FLAGS, "scratch");
- if (result != VK_SUCCESS) {
- mtx_unlock(&dev->scratch_bos[index].construct_mtx);
- return result;
- }
+ int ret = drmCommandWriteRead(queue->device->physical_device->local_fd,
+ DRM_MSM_GEM_SUBMIT,
+ &req, sizeof(req));
+ if (ret) {
+ fprintf(stderr, "submit failed: %s\n", strerror(errno));
+ abort();
+ }
- p_atomic_set(&dev->scratch_bos[index].initialized, true);
+ tu_bo_list_destroy(&bo_list);
- mtx_unlock(&dev->scratch_bos[index].construct_mtx);
+ if (last_submit) {
+ /* no need to merge fences as queue execution is serialized */
+ tu_fence_update_fd(&queue->submit_fence, req.fence_fd);
+ }
+ }
- *bo = dev->scratch_bos[index].bo;
- return VK_SUCCESS;
-}
+ if (_fence != VK_NULL_HANDLE) {
+ TU_FROM_HANDLE(tu_fence, fence, _fence);
+ tu_fence_copy(fence, &queue->submit_fence);
+ }
-VKAPI_ATTR VkResult VKAPI_CALL
-tu_EnumerateInstanceLayerProperties(uint32_t *pPropertyCount,
- VkLayerProperties *pProperties)
-{
- *pPropertyCount = 0;
return VK_SUCCESS;
}
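The removed tu_get_scratch_bo interleaved above relies on a double-checked lazy-allocation pattern: fast-path atomic read, lock, re-check under the lock, allocate, then publish. A minimal self-contained sketch of that general pattern, using C11 threads/atomics and hypothetical names (lazy_buf, get_lazy_buf); it is not driver code:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdlib.h>
    #include <threads.h>

    struct lazy_buf {
       atomic_bool initialized;   /* starts false */
       mtx_t construct_mtx;       /* caller must mtx_init() this up front */
       void *ptr;
    };

    static void *
    get_lazy_buf(struct lazy_buf *b, size_t size)
    {
       /* Fast path: already allocated, no lock needed. */
       if (atomic_load(&b->initialized))
          return b->ptr;

       mtx_lock(&b->construct_mtx);
       /* Re-check under the lock: another thread may have won the race. */
       if (!atomic_load(&b->initialized)) {
          b->ptr = malloc(size);               /* the slow allocation */
          atomic_store(&b->initialized, true); /* publish only after ptr is set */
       }
       mtx_unlock(&b->construct_mtx);
       return b->ptr;
    }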
-/* Only used for kgsl since drm started using common implementation */
-#ifdef TU_USE_KGSL
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_QueueWaitIdle(VkQueue _queue)
{
TU_FROM_HANDLE(tu_queue, queue, _queue);
- if (vk_device_is_lost(&queue->device->vk))
- return VK_ERROR_DEVICE_LOST;
+ tu_fence_wait_idle(&queue->submit_fence);
- if (queue->fence < 0)
- return VK_SUCCESS;
-
- struct pollfd fds = { .fd = queue->fence, .events = POLLIN };
- int ret;
- do {
- ret = poll(&fds, 1, -1);
- } while (ret == -1 && (errno == EINTR || errno == EAGAIN));
+ return VK_SUCCESS;
+}
- /* TODO: otherwise set device lost ? */
- assert(ret == 1 && !(fds.revents & (POLLERR | POLLNVAL)));
+VkResult
+tu_DeviceWaitIdle(VkDevice _device)
+{
+ TU_FROM_HANDLE(tu_device, device, _device);
- close(queue->fence);
- queue->fence = -1;
+ for (unsigned i = 0; i < TU_MAX_QUEUE_FAMILIES; i++) {
+ for (unsigned q = 0; q < device->queue_count[i]; q++) {
+ tu_QueueWaitIdle(tu_queue_to_handle(&device->queues[i][q]));
+ }
+ }
return VK_SUCCESS;
}
-#endif
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_EnumerateInstanceExtensionProperties(const char *pLayerName,
uint32_t *pPropertyCount,
VkExtensionProperties *pProperties)
{
+ VK_OUTARRAY_MAKE(out, pProperties, pPropertyCount);
+
+ /* We support no layers */
if (pLayerName)
return vk_error(NULL, VK_ERROR_LAYER_NOT_PRESENT);
- return vk_enumerate_instance_extension_properties(
- &tu_instance_extensions_supported, pPropertyCount, pProperties);
+ for (int i = 0; i < TU_INSTANCE_EXTENSION_COUNT; i++) {
+ if (tu_supported_instance_extensions.extensions[i]) {
+ vk_outarray_append(&out, prop) { *prop = tu_instance_extensions[i]; }
+ }
+ }
+
+ return vk_outarray_status(&out);
}
-VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL
+VkResult
+tu_EnumerateDeviceExtensionProperties(VkPhysicalDevice physicalDevice,
+ const char *pLayerName,
+ uint32_t *pPropertyCount,
+ VkExtensionProperties *pProperties)
+{
+ TU_FROM_HANDLE(tu_physical_device, device, physicalDevice);
+ VK_OUTARRAY_MAKE(out, pProperties, pPropertyCount);
+
+ /* We support no layers */
+ if (pLayerName)
+ return vk_error(NULL, VK_ERROR_LAYER_NOT_PRESENT);
+
+ for (int i = 0; i < TU_DEVICE_EXTENSION_COUNT; i++) {
+ if (device->supported_extensions.extensions[i]) {
+ vk_outarray_append(&out, prop) { *prop = tu_device_extensions[i]; }
+ }
+ }
+
+ return vk_outarray_status(&out);
+}
+
+PFN_vkVoidFunction
tu_GetInstanceProcAddr(VkInstance _instance, const char *pName)
{
TU_FROM_HANDLE(tu_instance, instance, _instance);
- return vk_instance_get_proc_addr(&instance->vk,
- &tu_instance_entrypoints,
- pName);
+
+ return tu_lookup_entrypoint_checked(
+ pName, instance ? instance->api_version : 0,
+ instance ? &instance->enabled_extensions : NULL, NULL);
}
/* The loader wants us to expose a second GetInstanceProcAddr function
@@ -2368,30 +1372,22 @@ vk_icdGetInstanceProcAddr(VkInstance instance, const char *pName)
return tu_GetInstanceProcAddr(instance, pName);
}
-/* With version 4+ of the loader interface the ICD should expose
- * vk_icdGetPhysicalDeviceProcAddr()
- */
-PUBLIC
-VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL
-vk_icdGetPhysicalDeviceProcAddr(VkInstance _instance,
- const char* pName);
-
PFN_vkVoidFunction
-vk_icdGetPhysicalDeviceProcAddr(VkInstance _instance,
- const char* pName)
+tu_GetDeviceProcAddr(VkDevice _device, const char *pName)
{
- TU_FROM_HANDLE(tu_instance, instance, _instance);
+ TU_FROM_HANDLE(tu_device, device, _device);
- return vk_instance_get_physical_device_proc_addr(&instance->vk, pName);
+ return tu_lookup_entrypoint_checked(pName, device->instance->api_version,
+ &device->instance->enabled_extensions,
+ &device->enabled_extensions);
}
-VKAPI_ATTR VkResult VKAPI_CALL
-tu_AllocateMemory(VkDevice _device,
- const VkMemoryAllocateInfo *pAllocateInfo,
- const VkAllocationCallbacks *pAllocator,
- VkDeviceMemory *pMem)
+static VkResult
+tu_alloc_memory(struct tu_device *device,
+ const VkMemoryAllocateInfo *pAllocateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ VkDeviceMemory *pMem)
{
- TU_FROM_HANDLE(tu_device, device, _device);
struct tu_device_memory *mem;
VkResult result;
@@ -2403,15 +1399,10 @@ tu_AllocateMemory(VkDevice _device,
return VK_SUCCESS;
}
- struct tu_memory_heap *mem_heap = &device->physical_device->heap;
- uint64_t mem_heap_used = p_atomic_read(&mem_heap->used);
- if (mem_heap_used > mem_heap->size)
- return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
-
- mem = vk_object_alloc(&device->vk, pAllocator, sizeof(*mem),
- VK_OBJECT_TYPE_DEVICE_MEMORY);
+ mem = vk_alloc2(&device->alloc, pAllocator, sizeof(*mem), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (mem == NULL)
- return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
const VkImportMemoryFdInfoKHR *fd_info =
vk_find_struct_const(pAllocateInfo->pNext, IMPORT_MEMORY_FD_INFO_KHR);
@@ -2436,69 +1427,37 @@ tu_AllocateMemory(VkDevice _device,
close(fd_info->fd);
}
} else {
- uint64_t client_address = 0;
- enum tu_bo_alloc_flags alloc_flags = TU_BO_ALLOC_NO_FLAGS;
-
- const VkMemoryOpaqueCaptureAddressAllocateInfo *replay_info =
- vk_find_struct_const(pAllocateInfo->pNext,
- MEMORY_OPAQUE_CAPTURE_ADDRESS_ALLOCATE_INFO);
- if (replay_info && replay_info->opaqueCaptureAddress) {
- client_address = replay_info->opaqueCaptureAddress;
- alloc_flags |= TU_BO_ALLOC_REPLAYABLE;
- }
-
- const VkMemoryAllocateFlagsInfo *flags_info = vk_find_struct_const(
- pAllocateInfo->pNext, MEMORY_ALLOCATE_FLAGS_INFO);
- if (flags_info &&
- (flags_info->flags &
- VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT)) {
- alloc_flags |= TU_BO_ALLOC_REPLAYABLE;
- }
-
- char name[64] = "vkAllocateMemory()";
- if (device->bo_sizes)
- snprintf(name, ARRAY_SIZE(name), "vkAllocateMemory(%ldkb)",
- (long)DIV_ROUND_UP(pAllocateInfo->allocationSize, 1024));
- result = tu_bo_init_new_explicit_iova(
- device, &mem->bo, pAllocateInfo->allocationSize, client_address,
- alloc_flags, name);
- }
-
- if (result == VK_SUCCESS) {
- mem_heap_used = p_atomic_add_return(&mem_heap->used, mem->bo->size);
- if (mem_heap_used > mem_heap->size) {
- p_atomic_add(&mem_heap->used, -mem->bo->size);
- tu_bo_finish(device, mem->bo);
- result = vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY,
- "Out of heap memory");
- }
+ result =
+ tu_bo_init_new(device, &mem->bo, pAllocateInfo->allocationSize);
}
if (result != VK_SUCCESS) {
- vk_object_free(&device->vk, pAllocator, mem);
+ vk_free2(&device->alloc, pAllocator, mem);
return result;
}
- /* Track in the device whether our BO list contains any implicit-sync BOs, so
- * we can suppress implicit sync on non-WSI usage.
- */
- const struct wsi_memory_allocate_info *wsi_info =
- vk_find_struct_const(pAllocateInfo->pNext, WSI_MEMORY_ALLOCATE_INFO_MESA);
- if (wsi_info && wsi_info->implicit_sync) {
- mtx_lock(&device->bo_mutex);
- if (!mem->bo->implicit_sync) {
- mem->bo->implicit_sync = true;
- device->implicit_sync_bo_count++;
- }
- mtx_unlock(&device->bo_mutex);
- }
+ mem->size = pAllocateInfo->allocationSize;
+ mem->type_index = pAllocateInfo->memoryTypeIndex;
+
+ mem->map = NULL;
+ mem->user_ptr = NULL;
*pMem = tu_device_memory_to_handle(mem);
return VK_SUCCESS;
}
-VKAPI_ATTR void VKAPI_CALL
+VkResult
+tu_AllocateMemory(VkDevice _device,
+ const VkMemoryAllocateInfo *pAllocateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ VkDeviceMemory *pMem)
+{
+ TU_FROM_HANDLE(tu_device, device, _device);
+ return tu_alloc_memory(device, pAllocateInfo, pAllocator, pMem);
+}
+
+void
tu_FreeMemory(VkDevice _device,
VkDeviceMemory _mem,
const VkAllocationCallbacks *pAllocator)
@@ -2509,12 +1468,11 @@ tu_FreeMemory(VkDevice _device,
if (mem == NULL)
return;
- p_atomic_add(&device->physical_device->heap.used, -mem->bo->size);
- tu_bo_finish(device, mem->bo);
- vk_object_free(&device->vk, pAllocator, mem);
+ tu_bo_finish(device, &mem->bo);
+ vk_free2(&device->alloc, pAllocator, mem);
}
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_MapMemory(VkDevice _device,
VkDeviceMemory _memory,
VkDeviceSize offset,
@@ -2531,23 +1489,31 @@ tu_MapMemory(VkDevice _device,
return VK_SUCCESS;
}
- if (!mem->bo->map) {
- result = tu_bo_map(device, mem->bo);
+ if (mem->user_ptr) {
+ *ppData = mem->user_ptr;
+ } else if (!mem->map) {
+ result = tu_bo_map(device, &mem->bo);
if (result != VK_SUCCESS)
return result;
+ *ppData = mem->map = mem->bo.map;
+ } else
+ *ppData = mem->map;
+
+ if (*ppData) {
+ *ppData += offset;
+ return VK_SUCCESS;
}
- *ppData = mem->bo->map + offset;
- return VK_SUCCESS;
+ return vk_error(device->instance, VK_ERROR_MEMORY_MAP_FAILED);
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_UnmapMemory(VkDevice _device, VkDeviceMemory _memory)
{
- /* TODO: unmap here instead of waiting for FreeMemory */
+ /* I do not see any unmapping done by the freedreno Gallium driver. */
}
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_FlushMappedMemoryRanges(VkDevice _device,
uint32_t memoryRangeCount,
const VkMappedMemoryRange *pMemoryRanges)
@@ -2555,7 +1521,7 @@ tu_FlushMappedMemoryRanges(VkDevice _device,
return VK_SUCCESS;
}
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_InvalidateMappedMemoryRanges(VkDevice _device,
uint32_t memoryRangeCount,
const VkMappedMemoryRange *pMemoryRanges)
@@ -2563,52 +1529,71 @@ tu_InvalidateMappedMemoryRanges(VkDevice _device,
return VK_SUCCESS;
}
-static void
-tu_get_buffer_memory_requirements(uint64_t size,
- VkMemoryRequirements2 *pMemoryRequirements)
+void
+tu_GetBufferMemoryRequirements(VkDevice _device,
+ VkBuffer _buffer,
+ VkMemoryRequirements *pMemoryRequirements)
{
- pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) {
- .memoryTypeBits = 1,
- .alignment = 64,
- .size = MAX2(align64(size, 64), size),
- };
+ TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
- vk_foreach_struct(ext, pMemoryRequirements->pNext) {
- switch (ext->sType) {
- case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: {
- VkMemoryDedicatedRequirements *req =
- (VkMemoryDedicatedRequirements *) ext;
- req->requiresDedicatedAllocation = false;
- req->prefersDedicatedAllocation = req->requiresDedicatedAllocation;
- break;
- }
- default:
- break;
- }
- }
+ pMemoryRequirements->memoryTypeBits = 1;
+ pMemoryRequirements->alignment = 16;
+ pMemoryRequirements->size =
+ align64(buffer->size, pMemoryRequirements->alignment);
}
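A small worked example of the requirement computed just above (values are illustrative): with the alignment fixed at 16, a 100-byte buffer reports size = align64(100, 16) = 112 bytes, i.e. the buffer size rounded up to the next multiple of 16, and memoryTypeBits = 1 restricts it to the single exposed memory type.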
-VKAPI_ATTR void VKAPI_CALL
+void
tu_GetBufferMemoryRequirements2(
VkDevice device,
const VkBufferMemoryRequirementsInfo2 *pInfo,
VkMemoryRequirements2 *pMemoryRequirements)
{
- TU_FROM_HANDLE(tu_buffer, buffer, pInfo->buffer);
+ tu_GetBufferMemoryRequirements(device, pInfo->buffer,
+ &pMemoryRequirements->memoryRequirements);
+}
+
+void
+tu_GetImageMemoryRequirements(VkDevice _device,
+ VkImage _image,
+ VkMemoryRequirements *pMemoryRequirements)
+{
+ TU_FROM_HANDLE(tu_image, image, _image);
+
+ pMemoryRequirements->memoryTypeBits = 1;
+ pMemoryRequirements->size = image->size;
+ pMemoryRequirements->alignment = image->alignment;
+}
+
+void
+tu_GetImageMemoryRequirements2(VkDevice device,
+ const VkImageMemoryRequirementsInfo2 *pInfo,
+ VkMemoryRequirements2 *pMemoryRequirements)
+{
+ tu_GetImageMemoryRequirements(device, pInfo->image,
+ &pMemoryRequirements->memoryRequirements);
+}
- tu_get_buffer_memory_requirements(buffer->vk.size, pMemoryRequirements);
+void
+tu_GetImageSparseMemoryRequirements(
+ VkDevice device,
+ VkImage image,
+ uint32_t *pSparseMemoryRequirementCount,
+ VkSparseImageMemoryRequirements *pSparseMemoryRequirements)
+{
+ tu_stub();
}
-VKAPI_ATTR void VKAPI_CALL
-tu_GetDeviceBufferMemoryRequirements(
+void
+tu_GetImageSparseMemoryRequirements2(
VkDevice device,
- const VkDeviceBufferMemoryRequirements *pInfo,
- VkMemoryRequirements2 *pMemoryRequirements)
+ const VkImageSparseMemoryRequirementsInfo2 *pInfo,
+ uint32_t *pSparseMemoryRequirementCount,
+ VkSparseImageMemoryRequirements2 *pSparseMemoryRequirements)
{
- tu_get_buffer_memory_requirements(pInfo->pCreateInfo->size, pMemoryRequirements);
+ tu_stub();
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_GetDeviceMemoryCommitment(VkDevice device,
VkDeviceMemory memory,
VkDeviceSize *pCommittedMemoryInBytes)
@@ -2616,7 +1601,7 @@ tu_GetDeviceMemoryCommitment(VkDevice device,
*pCommittedMemoryInBytes = 0;
}
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_BindBufferMemory2(VkDevice device,
uint32_t bindInfoCount,
const VkBindBufferMemoryInfo *pBindInfos)
@@ -2626,8 +1611,8 @@ tu_BindBufferMemory2(VkDevice device,
TU_FROM_HANDLE(tu_buffer, buffer, pBindInfos[i].buffer);
if (mem) {
- buffer->bo = mem->bo;
- buffer->iova = mem->bo->iova + pBindInfos[i].memoryOffset;
+ buffer->bo = &mem->bo;
+ buffer->bo_offset = pBindInfos[i].memoryOffset;
} else {
buffer->bo = NULL;
}
@@ -2635,7 +1620,23 @@ tu_BindBufferMemory2(VkDevice device,
return VK_SUCCESS;
}
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
+tu_BindBufferMemory(VkDevice device,
+ VkBuffer buffer,
+ VkDeviceMemory memory,
+ VkDeviceSize memoryOffset)
+{
+ const VkBindBufferMemoryInfo info = {
+ .sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO,
+ .buffer = buffer,
+ .memory = memory,
+ .memoryOffset = memoryOffset
+ };
+
+ return tu_BindBufferMemory2(device, 1, &info);
+}
+
+VkResult
tu_BindImageMemory2(VkDevice device,
uint32_t bindInfoCount,
const VkBindImageMemoryInfo *pBindInfos)
@@ -2645,18 +1646,34 @@ tu_BindImageMemory2(VkDevice device,
TU_FROM_HANDLE(tu_device_memory, mem, pBindInfos[i].memory);
if (mem) {
- image->bo = mem->bo;
- image->iova = mem->bo->iova + pBindInfos[i].memoryOffset;
+ image->bo = &mem->bo;
+ image->bo_offset = pBindInfos[i].memoryOffset;
} else {
image->bo = NULL;
- image->iova = 0;
+ image->bo_offset = 0;
}
}
return VK_SUCCESS;
}
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
+tu_BindImageMemory(VkDevice device,
+ VkImage image,
+ VkDeviceMemory memory,
+ VkDeviceSize memoryOffset)
+{
+ const VkBindImageMemoryInfo info = {
+ .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO,
+ .image = image,
+ .memory = memory,
+ .memoryOffset = memoryOffset
+ };
+
+ return tu_BindImageMemory2(device, 1, &info);
+}
+
+VkResult
tu_QueueBindSparse(VkQueue _queue,
uint32_t bindInfoCount,
const VkBindSparseInfo *pBindInfo,
@@ -2665,41 +1682,59 @@ tu_QueueBindSparse(VkQueue _queue,
return VK_SUCCESS;
}
-VKAPI_ATTR VkResult VKAPI_CALL
+// Queue semaphore functions
+
+VkResult
+tu_CreateSemaphore(VkDevice _device,
+ const VkSemaphoreCreateInfo *pCreateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ VkSemaphore *pSemaphore)
+{
+ TU_FROM_HANDLE(tu_device, device, _device);
+
+ struct tu_semaphore *sem =
+ vk_alloc2(&device->alloc, pAllocator, sizeof(*sem), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+ if (!sem)
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ *pSemaphore = tu_semaphore_to_handle(sem);
+ return VK_SUCCESS;
+}
+
+void
+tu_DestroySemaphore(VkDevice _device,
+ VkSemaphore _semaphore,
+ const VkAllocationCallbacks *pAllocator)
+{
+ TU_FROM_HANDLE(tu_device, device, _device);
+ TU_FROM_HANDLE(tu_semaphore, sem, _semaphore);
+ if (!_semaphore)
+ return;
+
+ vk_free2(&device->alloc, pAllocator, sem);
+}
+
+VkResult
tu_CreateEvent(VkDevice _device,
const VkEventCreateInfo *pCreateInfo,
const VkAllocationCallbacks *pAllocator,
VkEvent *pEvent)
{
TU_FROM_HANDLE(tu_device, device, _device);
-
struct tu_event *event =
- vk_object_alloc(&device->vk, pAllocator, sizeof(*event),
- VK_OBJECT_TYPE_EVENT);
- if (!event)
- return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
-
- VkResult result = tu_bo_init_new(device, &event->bo, 0x1000,
- TU_BO_ALLOC_NO_FLAGS, "event");
- if (result != VK_SUCCESS)
- goto fail_alloc;
+ vk_alloc2(&device->alloc, pAllocator, sizeof(*event), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- result = tu_bo_map(device, event->bo);
- if (result != VK_SUCCESS)
- goto fail_map;
+ if (!event)
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
*pEvent = tu_event_to_handle(event);
return VK_SUCCESS;
-
-fail_map:
- tu_bo_finish(device, event->bo);
-fail_alloc:
- vk_object_free(&device->vk, pAllocator, event);
- return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_DestroyEvent(VkDevice _device,
VkEvent _event,
const VkAllocationCallbacks *pAllocator)
@@ -2709,40 +1744,38 @@ tu_DestroyEvent(VkDevice _device,
if (!event)
return;
-
- tu_bo_finish(device, event->bo);
- vk_object_free(&device->vk, pAllocator, event);
+ vk_free2(&device->alloc, pAllocator, event);
}
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_GetEventStatus(VkDevice _device, VkEvent _event)
{
TU_FROM_HANDLE(tu_event, event, _event);
- if (*(uint64_t*) event->bo->map == 1)
+ if (*event->map == 1)
return VK_EVENT_SET;
return VK_EVENT_RESET;
}
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_SetEvent(VkDevice _device, VkEvent _event)
{
TU_FROM_HANDLE(tu_event, event, _event);
- *(uint64_t*) event->bo->map = 1;
+ *event->map = 1;
return VK_SUCCESS;
}
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_ResetEvent(VkDevice _device, VkEvent _event)
{
TU_FROM_HANDLE(tu_event, event, _event);
- *(uint64_t*) event->bo->map = 0;
+ *event->map = 0;
return VK_SUCCESS;
}
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_CreateBuffer(VkDevice _device,
const VkBufferCreateInfo *pCreateInfo,
const VkAllocationCallbacks *pAllocator,
@@ -2751,17 +1784,23 @@ tu_CreateBuffer(VkDevice _device,
TU_FROM_HANDLE(tu_device, device, _device);
struct tu_buffer *buffer;
- buffer = vk_buffer_create(&device->vk, pCreateInfo, pAllocator,
- sizeof(*buffer));
+ assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO);
+
+ buffer = vk_alloc2(&device->alloc, pAllocator, sizeof(*buffer), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (buffer == NULL)
- return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ buffer->size = pCreateInfo->size;
+ buffer->usage = pCreateInfo->usage;
+ buffer->flags = pCreateInfo->flags;
*pBuffer = tu_buffer_to_handle(buffer);
return VK_SUCCESS;
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_DestroyBuffer(VkDevice _device,
VkBuffer _buffer,
const VkAllocationCallbacks *pAllocator)
@@ -2772,90 +1811,65 @@ tu_DestroyBuffer(VkDevice _device,
if (!buffer)
return;
- vk_buffer_destroy(&device->vk, pAllocator, &buffer->vk);
+ vk_free2(&device->alloc, pAllocator, buffer);
}
-VKAPI_ATTR VkResult VKAPI_CALL
+static uint32_t
+tu_surface_max_layer_count(struct tu_image_view *iview)
+{
+ return iview->type == VK_IMAGE_VIEW_TYPE_3D
+ ? iview->extent.depth
+ : (iview->base_layer + iview->layer_count);
+}
+
+VkResult
tu_CreateFramebuffer(VkDevice _device,
const VkFramebufferCreateInfo *pCreateInfo,
const VkAllocationCallbacks *pAllocator,
VkFramebuffer *pFramebuffer)
{
TU_FROM_HANDLE(tu_device, device, _device);
-
- if (unlikely(device->instance->debug_flags & TU_DEBUG_DYNAMIC))
- return vk_common_CreateFramebuffer(_device, pCreateInfo, pAllocator,
- pFramebuffer);
-
- TU_FROM_HANDLE(tu_render_pass, pass, pCreateInfo->renderPass);
struct tu_framebuffer *framebuffer;
assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO);
- bool imageless = pCreateInfo->flags & VK_FRAMEBUFFER_CREATE_IMAGELESS_BIT;
-
- size_t size = sizeof(*framebuffer);
- if (!imageless)
- size += sizeof(struct tu_attachment_info) * pCreateInfo->attachmentCount;
- framebuffer = vk_object_alloc(&device->vk, pAllocator, size,
- VK_OBJECT_TYPE_FRAMEBUFFER);
+ size_t size = sizeof(*framebuffer) + sizeof(struct tu_attachment_info) *
+ pCreateInfo->attachmentCount;
+ framebuffer = vk_alloc2(&device->alloc, pAllocator, size, 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (framebuffer == NULL)
- return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
framebuffer->attachment_count = pCreateInfo->attachmentCount;
framebuffer->width = pCreateInfo->width;
framebuffer->height = pCreateInfo->height;
framebuffer->layers = pCreateInfo->layers;
+ for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) {
+ VkImageView _iview = pCreateInfo->pAttachments[i];
+ struct tu_image_view *iview = tu_image_view_from_handle(_iview);
+ framebuffer->attachments[i].attachment = iview;
- if (!imageless) {
- for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) {
- VkImageView _iview = pCreateInfo->pAttachments[i];
- struct tu_image_view *iview = tu_image_view_from_handle(_iview);
- framebuffer->attachments[i].attachment = iview;
- }
+ framebuffer->width = MIN2(framebuffer->width, iview->extent.width);
+ framebuffer->height = MIN2(framebuffer->height, iview->extent.height);
+ framebuffer->layers =
+ MIN2(framebuffer->layers, tu_surface_max_layer_count(iview));
}
- tu_framebuffer_tiling_config(framebuffer, device, pass);
-
*pFramebuffer = tu_framebuffer_to_handle(framebuffer);
return VK_SUCCESS;
}
void
-tu_setup_dynamic_framebuffer(struct tu_cmd_buffer *cmd_buffer,
- const VkRenderingInfo *pRenderingInfo)
-{
- struct tu_render_pass *pass = &cmd_buffer->dynamic_pass;
- struct tu_framebuffer *framebuffer = &cmd_buffer->dynamic_framebuffer;
-
- framebuffer->attachment_count = pass->attachment_count;
- framebuffer->width = pRenderingInfo->renderArea.offset.x +
- pRenderingInfo->renderArea.extent.width;
- framebuffer->height = pRenderingInfo->renderArea.offset.y +
- pRenderingInfo->renderArea.extent.height;
- framebuffer->layers = pRenderingInfo->layerCount;
-
- tu_framebuffer_tiling_config(framebuffer, cmd_buffer->device, pass);
-}
-
-VKAPI_ATTR void VKAPI_CALL
tu_DestroyFramebuffer(VkDevice _device,
VkFramebuffer _fb,
const VkAllocationCallbacks *pAllocator)
{
TU_FROM_HANDLE(tu_device, device, _device);
-
- if (unlikely(device->instance->debug_flags & TU_DEBUG_DYNAMIC)) {
- vk_common_DestroyFramebuffer(_device, _fb, pAllocator);
- return;
- }
-
TU_FROM_HANDLE(tu_framebuffer, fb, _fb);
if (!fb)
return;
-
- vk_object_free(&device->vk, pAllocator, fb);
+ vk_free2(&device->alloc, pAllocator, fb);
}
static void
@@ -2863,89 +1877,9 @@ tu_init_sampler(struct tu_device *device,
struct tu_sampler *sampler,
const VkSamplerCreateInfo *pCreateInfo)
{
- const struct VkSamplerReductionModeCreateInfo *reduction =
- vk_find_struct_const(pCreateInfo->pNext, SAMPLER_REDUCTION_MODE_CREATE_INFO);
- const struct VkSamplerYcbcrConversionInfo *ycbcr_conversion =
- vk_find_struct_const(pCreateInfo->pNext, SAMPLER_YCBCR_CONVERSION_INFO);
- const VkSamplerCustomBorderColorCreateInfoEXT *custom_border_color =
- vk_find_struct_const(pCreateInfo->pNext, SAMPLER_CUSTOM_BORDER_COLOR_CREATE_INFO_EXT);
- /* for non-custom border colors, the VK enum is translated directly to an offset in
- * the border color buffer. custom border colors are located immediately after the
- * builtin colors, and thus an offset of TU_BORDER_COLOR_BUILTIN is added.
- */
- uint32_t border_color = (unsigned) pCreateInfo->borderColor;
- if (pCreateInfo->borderColor == VK_BORDER_COLOR_FLOAT_CUSTOM_EXT ||
- pCreateInfo->borderColor == VK_BORDER_COLOR_INT_CUSTOM_EXT) {
- mtx_lock(&device->mutex);
- border_color = BITSET_FFS(device->custom_border_color) - 1;
- assert(border_color < TU_BORDER_COLOR_COUNT);
- BITSET_CLEAR(device->custom_border_color, border_color);
- mtx_unlock(&device->mutex);
-
- VkClearColorValue color = custom_border_color->customBorderColor;
- if (custom_border_color->format == VK_FORMAT_D24_UNORM_S8_UINT &&
- pCreateInfo->borderColor == VK_BORDER_COLOR_INT_CUSTOM_EXT &&
- device->use_z24uint_s8uint) {
- /* When sampling stencil using the special Z24UINT_S8UINT format, the
- * border color is in the second component. Note: if
- * customBorderColorWithoutFormat is enabled, we may miss doing this
- * here if the format isn't specified, which is why we don't use that
- * format.
- */
- color.uint32[1] = color.uint32[0];
- }
-
- tu6_pack_border_color(device->global_bo->map + gb_offset(bcolor[border_color]),
- &color,
- pCreateInfo->borderColor == VK_BORDER_COLOR_INT_CUSTOM_EXT);
- border_color += TU_BORDER_COLOR_BUILTIN;
- }
-
- unsigned aniso = pCreateInfo->anisotropyEnable ?
- util_last_bit(MIN2((uint32_t)pCreateInfo->maxAnisotropy >> 1, 8)) : 0;
- bool miplinear = (pCreateInfo->mipmapMode == VK_SAMPLER_MIPMAP_MODE_LINEAR);
- float min_lod = CLAMP(pCreateInfo->minLod, 0.0f, 4095.0f / 256.0f);
- float max_lod = CLAMP(pCreateInfo->maxLod, 0.0f, 4095.0f / 256.0f);
-
- sampler->descriptor[0] =
- COND(miplinear, A6XX_TEX_SAMP_0_MIPFILTER_LINEAR_NEAR) |
- A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(pCreateInfo->magFilter, aniso)) |
- A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(pCreateInfo->minFilter, aniso)) |
- A6XX_TEX_SAMP_0_ANISO(aniso) |
- A6XX_TEX_SAMP_0_WRAP_S(tu6_tex_wrap(pCreateInfo->addressModeU)) |
- A6XX_TEX_SAMP_0_WRAP_T(tu6_tex_wrap(pCreateInfo->addressModeV)) |
- A6XX_TEX_SAMP_0_WRAP_R(tu6_tex_wrap(pCreateInfo->addressModeW)) |
- A6XX_TEX_SAMP_0_LOD_BIAS(pCreateInfo->mipLodBias);
- sampler->descriptor[1] =
- COND(pCreateInfo->flags & VK_SAMPLER_CREATE_NON_SEAMLESS_CUBE_MAP_BIT_EXT,
- A6XX_TEX_SAMP_1_CUBEMAPSEAMLESSFILTOFF) |
- COND(pCreateInfo->unnormalizedCoordinates, A6XX_TEX_SAMP_1_UNNORM_COORDS) |
- A6XX_TEX_SAMP_1_MIN_LOD(min_lod) |
- A6XX_TEX_SAMP_1_MAX_LOD(max_lod) |
- COND(pCreateInfo->compareEnable,
- A6XX_TEX_SAMP_1_COMPARE_FUNC(tu6_compare_func(pCreateInfo->compareOp)));
- sampler->descriptor[2] = A6XX_TEX_SAMP_2_BCOLOR(border_color);
- sampler->descriptor[3] = 0;
-
- if (reduction) {
- sampler->descriptor[2] |= A6XX_TEX_SAMP_2_REDUCTION_MODE(
- tu6_reduction_mode(reduction->reductionMode));
- }
-
- sampler->ycbcr_sampler = ycbcr_conversion ?
- tu_sampler_ycbcr_conversion_from_handle(ycbcr_conversion->conversion) : NULL;
-
- if (sampler->ycbcr_sampler &&
- sampler->ycbcr_sampler->chroma_filter == VK_FILTER_LINEAR) {
- sampler->descriptor[2] |= A6XX_TEX_SAMP_2_CHROMA_LINEAR;
- }
-
- /* TODO:
- * A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR disables mipmapping, but vk has no NONE mipfilter?
- */
}
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_CreateSampler(VkDevice _device,
const VkSamplerCreateInfo *pCreateInfo,
const VkAllocationCallbacks *pAllocator,
@@ -2956,10 +1890,10 @@ tu_CreateSampler(VkDevice _device,
assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO);
- sampler = vk_object_alloc(&device->vk, pAllocator, sizeof(*sampler),
- VK_OBJECT_TYPE_SAMPLER);
+ sampler = vk_alloc2(&device->alloc, pAllocator, sizeof(*sampler), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!sampler)
- return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
tu_init_sampler(device, sampler, pCreateInfo);
*pSampler = tu_sampler_to_handle(sampler);
@@ -2967,29 +1901,17 @@ tu_CreateSampler(VkDevice _device,
return VK_SUCCESS;
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_DestroySampler(VkDevice _device,
VkSampler _sampler,
const VkAllocationCallbacks *pAllocator)
{
TU_FROM_HANDLE(tu_device, device, _device);
TU_FROM_HANDLE(tu_sampler, sampler, _sampler);
- uint32_t border_color;
if (!sampler)
return;
-
- border_color = (sampler->descriptor[2] & A6XX_TEX_SAMP_2_BCOLOR__MASK) >> A6XX_TEX_SAMP_2_BCOLOR__SHIFT;
- if (border_color >= TU_BORDER_COLOR_BUILTIN) {
- border_color -= TU_BORDER_COLOR_BUILTIN;
- /* if the sampler had a custom border color, free it. TODO: no lock */
- mtx_lock(&device->mutex);
- assert(!BITSET_TEST(device->custom_border_color, border_color));
- BITSET_SET(device->custom_border_color, border_color);
- mtx_unlock(&device->mutex);
- }
-
- vk_object_free(&device->vk, pAllocator, sampler);
+ vk_free2(&device->alloc, pAllocator, sampler);
}
/* vk_icd.h does not declare this function, so we declare it here to
@@ -3032,21 +1954,12 @@ vk_icdNegotiateLoaderICDInterfaceVersion(uint32_t *pSupportedVersion)
* - The ICD must implement vkCreate{PLATFORM}SurfaceKHR(),
* vkDestroySurfaceKHR(), and other API which uses VKSurfaceKHR,
* because the loader no longer does so.
- *
- * - Loader interface v4 differs from v3 in:
- * - The ICD must implement vk_icdGetPhysicalDeviceProcAddr().
- *
- * - Loader interface v5 differs from v4 in:
- * - The ICD must support Vulkan API version 1.1 and must not return
- * VK_ERROR_INCOMPATIBLE_DRIVER from vkCreateInstance() unless a
- * Vulkan Loader with interface v4 or smaller is being used and the
- * application provides an API version that is greater than 1.0.
*/
- *pSupportedVersion = MIN2(*pSupportedVersion, 5u);
+ *pSupportedVersion = MIN2(*pSupportedVersion, 3u);
return VK_SUCCESS;
}
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_GetMemoryFdKHR(VkDevice _device,
const VkMemoryGetFdInfoKHR *pGetFdInfo,
int *pFd)
@@ -3062,15 +1975,15 @@ tu_GetMemoryFdKHR(VkDevice _device,
pGetFdInfo->handleType ==
VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT);
- int prime_fd = tu_bo_export_dmabuf(device, memory->bo);
+ int prime_fd = tu_bo_export_dmabuf(device, &memory->bo);
if (prime_fd < 0)
- return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
*pFd = prime_fd;
return VK_SUCCESS;
}
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_GetMemoryFdPropertiesKHR(VkDevice _device,
VkExternalMemoryHandleTypeFlagBits handleType,
int fd,
@@ -3081,160 +1994,78 @@ tu_GetMemoryFdPropertiesKHR(VkDevice _device,
return VK_SUCCESS;
}
-VKAPI_ATTR void VKAPI_CALL
-tu_GetDeviceGroupPeerMemoryFeatures(
- VkDevice device,
- uint32_t heapIndex,
- uint32_t localDeviceIndex,
- uint32_t remoteDeviceIndex,
- VkPeerMemoryFeatureFlags *pPeerMemoryFeatures)
-{
- assert(localDeviceIndex == remoteDeviceIndex);
-
- *pPeerMemoryFeatures = VK_PEER_MEMORY_FEATURE_COPY_SRC_BIT |
- VK_PEER_MEMORY_FEATURE_COPY_DST_BIT |
- VK_PEER_MEMORY_FEATURE_GENERIC_SRC_BIT |
- VK_PEER_MEMORY_FEATURE_GENERIC_DST_BIT;
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_GetPhysicalDeviceMultisamplePropertiesEXT(
- VkPhysicalDevice physicalDevice,
- VkSampleCountFlagBits samples,
- VkMultisamplePropertiesEXT* pMultisampleProperties)
-{
- TU_FROM_HANDLE(tu_physical_device, pdevice, physicalDevice);
-
- if (samples <= VK_SAMPLE_COUNT_4_BIT && pdevice->vk.supported_extensions.EXT_sample_locations)
- pMultisampleProperties->maxSampleLocationGridSize = (VkExtent2D){ 1, 1 };
- else
- pMultisampleProperties->maxSampleLocationGridSize = (VkExtent2D){ 0, 0 };
-}
-
-VkDeviceAddress
-tu_GetBufferDeviceAddress(VkDevice _device,
- const VkBufferDeviceAddressInfo* pInfo)
-{
- TU_FROM_HANDLE(tu_buffer, buffer, pInfo->buffer);
-
- return buffer->iova;
-}
-
-uint64_t tu_GetBufferOpaqueCaptureAddress(
- VkDevice device,
- const VkBufferDeviceAddressInfo* pInfo)
+void
+tu_GetPhysicalDeviceExternalSemaphoreProperties(
+ VkPhysicalDevice physicalDevice,
+ const VkPhysicalDeviceExternalSemaphoreInfo *pExternalSemaphoreInfo,
+ VkExternalSemaphoreProperties *pExternalSemaphoreProperties)
{
- /* We care only about memory allocation opaque addresses */
- return 0;
+ pExternalSemaphoreProperties->exportFromImportedHandleTypes = 0;
+ pExternalSemaphoreProperties->compatibleHandleTypes = 0;
+ pExternalSemaphoreProperties->externalSemaphoreFeatures = 0;
}
-uint64_t tu_GetDeviceMemoryOpaqueCaptureAddress(
- VkDevice device,
- const VkDeviceMemoryOpaqueCaptureAddressInfo* pInfo)
+void
+tu_GetPhysicalDeviceExternalFenceProperties(
+ VkPhysicalDevice physicalDevice,
+ const VkPhysicalDeviceExternalFenceInfo *pExternalFenceInfo,
+ VkExternalFenceProperties *pExternalFenceProperties)
{
- TU_FROM_HANDLE(tu_device_memory, mem, pInfo->memory);
- return mem->bo->iova;
+ pExternalFenceProperties->exportFromImportedHandleTypes = 0;
+ pExternalFenceProperties->compatibleHandleTypes = 0;
+ pExternalFenceProperties->externalFenceFeatures = 0;
}
-struct tu_debug_bos_entry {
- uint32_t count;
- uint64_t size;
- const char *name;
-};
-
-const char *
-tu_debug_bos_add(struct tu_device *dev, uint64_t size, const char *name)
+VkResult
+tu_CreateDebugReportCallbackEXT(
+ VkInstance _instance,
+ const VkDebugReportCallbackCreateInfoEXT *pCreateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ VkDebugReportCallbackEXT *pCallback)
{
- assert(name);
-
- if (likely(!dev->bo_sizes))
- return NULL;
-
- mtx_lock(&dev->bo_mutex);
- struct hash_entry *entry = _mesa_hash_table_search(dev->bo_sizes, name);
- struct tu_debug_bos_entry *debug_bos;
-
- if (!entry) {
- debug_bos = calloc(1, sizeof(struct tu_debug_bos_entry));
- debug_bos->name = strdup(name);
- _mesa_hash_table_insert(dev->bo_sizes, debug_bos->name, debug_bos);
- } else {
- debug_bos = entry->data;
- }
-
- debug_bos->count++;
- debug_bos->size += align(size, 4096);
- mtx_unlock(&dev->bo_mutex);
-
- return debug_bos->name;
+ TU_FROM_HANDLE(tu_instance, instance, _instance);
+ return vk_create_debug_report_callback(&instance->debug_report_callbacks,
+ pCreateInfo, pAllocator,
+ &instance->alloc, pCallback);
}
void
-tu_debug_bos_del(struct tu_device *dev, struct tu_bo *bo)
+tu_DestroyDebugReportCallbackEXT(VkInstance _instance,
+ VkDebugReportCallbackEXT _callback,
+ const VkAllocationCallbacks *pAllocator)
{
- if (likely(!dev->bo_sizes) || !bo->name)
- return;
-
- mtx_lock(&dev->bo_mutex);
- struct hash_entry *entry =
- _mesa_hash_table_search(dev->bo_sizes, bo->name);
- /* If we're finishing the BO, it should have been added already */
- assert(entry);
-
- struct tu_debug_bos_entry *debug_bos = entry->data;
- debug_bos->count--;
- debug_bos->size -= align(bo->size, 4096);
- if (!debug_bos->count) {
- _mesa_hash_table_remove(dev->bo_sizes, entry);
- free((void *) debug_bos->name);
- free(debug_bos);
- }
- mtx_unlock(&dev->bo_mutex);
+ TU_FROM_HANDLE(tu_instance, instance, _instance);
+ vk_destroy_debug_report_callback(&instance->debug_report_callbacks,
+ _callback, pAllocator, &instance->alloc);
}
-static int debug_bos_count_compare(const void *in_a, const void *in_b)
+void
+tu_DebugReportMessageEXT(VkInstance _instance,
+ VkDebugReportFlagsEXT flags,
+ VkDebugReportObjectTypeEXT objectType,
+ uint64_t object,
+ size_t location,
+ int32_t messageCode,
+ const char *pLayerPrefix,
+ const char *pMessage)
{
- struct tu_debug_bos_entry *a = *(struct tu_debug_bos_entry **)in_a;
- struct tu_debug_bos_entry *b = *(struct tu_debug_bos_entry **)in_b;
- return a->count - b->count;
+ TU_FROM_HANDLE(tu_instance, instance, _instance);
+ vk_debug_report(&instance->debug_report_callbacks, flags, objectType,
+ object, location, messageCode, pLayerPrefix, pMessage);
}
void
-tu_debug_bos_print_stats(struct tu_device *dev)
+tu_GetDeviceGroupPeerMemoryFeatures(
+ VkDevice device,
+ uint32_t heapIndex,
+ uint32_t localDeviceIndex,
+ uint32_t remoteDeviceIndex,
+ VkPeerMemoryFeatureFlags *pPeerMemoryFeatures)
{
- if (likely(!dev->bo_sizes))
- return;
-
- mtx_lock(&dev->bo_mutex);
-
- /* Put the HT's sizes data in an array so we can sort by number of allocations. */
- struct util_dynarray dyn;
- util_dynarray_init(&dyn, NULL);
-
- uint32_t size = 0;
- uint32_t count = 0;
- hash_table_foreach(dev->bo_sizes, entry)
- {
- struct tu_debug_bos_entry *debug_bos = (void *) entry->data;
- util_dynarray_append(&dyn, struct tu_debug_bos_entry *, debug_bos);
- size += debug_bos->size / 1024;
- count += debug_bos->count;
- }
-
- qsort(dyn.data,
- util_dynarray_num_elements(&dyn, struct tu_debug_bos_entry *),
-         sizeof(struct tu_debug_bos_entry *), debug_bos_count_compare);
-
- util_dynarray_foreach(&dyn, struct tu_debug_bos_entry *, entryp)
- {
- struct tu_debug_bos_entry *debug_bos = *entryp;
- mesa_logi("%30s: %4d bos, %lld kb\n", debug_bos->name, debug_bos->count,
- (long long) (debug_bos->size / 1024));
- }
-
- mesa_logi("submitted %d bos (%d MB)\n", count, DIV_ROUND_UP(size, 1024));
-
- util_dynarray_fini(&dyn);
+ assert(localDeviceIndex == remoteDeviceIndex);
- mtx_unlock(&dev->bo_mutex);
+ *pPeerMemoryFeatures = VK_PEER_MEMORY_FEATURE_COPY_SRC_BIT |
+ VK_PEER_MEMORY_FEATURE_COPY_DST_BIT |
+ VK_PEER_MEMORY_FEATURE_GENERIC_SRC_BIT |
+ VK_PEER_MEMORY_FEATURE_GENERIC_DST_BIT;
}
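For reference, a minimal standalone sketch of the per-name BO accounting pattern that the removed tu_debug_bos_* helpers above implement: allocations are grouped under a name label, counted, size-accumulated with 4 KiB rounding, and printed sorted by count. It deliberately uses a plain fixed-size array instead of Mesa's hash table and dynarray utilities, so the bo_stats names here are illustrative only and not part of the driver.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct bo_stats {
   const char *name;   /* allocation-site label */
   uint32_t count;     /* live BOs carrying this label */
   uint64_t size;      /* total bytes, rounded up to 4 KiB pages */
};

static struct bo_stats stats[64];
static unsigned num_stats;

static void
bo_stats_add(const char *name, uint64_t size)
{
   uint64_t aligned = (size + 4095) & ~(uint64_t) 4095;
   for (unsigned i = 0; i < num_stats; i++) {
      if (!strcmp(stats[i].name, name)) {
         stats[i].count++;
         stats[i].size += aligned;
         return;
      }
   }
   if (num_stats < 64)
      stats[num_stats++] = (struct bo_stats) { name, 1, aligned };
}

static int
bo_stats_cmp(const void *a, const void *b)
{
   /* ascending by allocation count, like debug_bos_count_compare above */
   return (int) ((const struct bo_stats *) a)->count -
          (int) ((const struct bo_stats *) b)->count;
}

static void
bo_stats_print(void)
{
   qsort(stats, num_stats, sizeof(stats[0]), bo_stats_cmp);
   for (unsigned i = 0; i < num_stats; i++)
      printf("%30s: %4u bos, %llu kb\n", stats[i].name, stats[i].count,
             (unsigned long long) (stats[i].size / 1024));
}

int
main(void)
{
   bo_stats_add("cmd stream", 8192);
   bo_stats_add("cmd stream", 100);
   bo_stats_add("descriptor pool", 65536);
   bo_stats_print();
   return 0;
}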
diff --git a/lib/mesa/src/freedreno/vulkan/tu_drm.c b/lib/mesa/src/freedreno/vulkan/tu_drm.c
index 9a57c6644..9b2e6f788 100644
--- a/lib/mesa/src/freedreno/vulkan/tu_drm.c
+++ b/lib/mesa/src/freedreno/vulkan/tu_drm.c
@@ -1,61 +1,36 @@
/*
* Copyright © 2018 Google, Inc.
* Copyright © 2015 Intel Corporation
- * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
*/
-#include "tu_drm.h"
+#include "tu_private.h"
#include <errno.h>
#include <fcntl.h>
+#include <stdint.h>
#include <sys/ioctl.h>
-#include <sys/mman.h>
#include <xf86drm.h>
-#ifdef MAJOR_IN_MKDEV
-#include <sys/mkdev.h>
-#endif
-#ifdef MAJOR_IN_SYSMACROS
-#include <sys/sysmacros.h>
-#endif
-
-#include "vk_util.h"
-
#include "drm-uapi/msm_drm.h"
-#include "util/u_debug.h"
-#include "util/hash_table.h"
-#include "util/timespec.h"
-#include "util/os_time.h"
-
-#include "tu_cmd_buffer.h"
-#include "tu_cs.h"
-#include "tu_device.h"
-#include "tu_dynamic_rendering.h"
-
-struct tu_queue_submit
-{
- struct vk_queue_submit *vk_submit;
- struct tu_u_trace_submission_data *u_trace_submission_data;
-
- struct tu_cmd_buffer **cmd_buffers;
- struct drm_msm_gem_submit_cmd *cmds;
- struct drm_msm_gem_submit_syncobj *in_syncobjs;
- struct drm_msm_gem_submit_syncobj *out_syncobjs;
-
- uint32_t nr_cmd_buffers;
- uint32_t nr_in_syncobjs;
- uint32_t nr_out_syncobjs;
- uint32_t entry_count;
- uint32_t perf_pass_index;
-
- bool autotune_fence;
-};
-
-struct tu_u_trace_syncobj
-{
- uint32_t msm_queue_id;
- uint32_t fence;
-};
static int
tu_drm_get_param(const struct tu_physical_device *dev,
@@ -80,7 +55,7 @@ tu_drm_get_param(const struct tu_physical_device *dev,
return 0;
}
-static int
+int
tu_drm_get_gpu_id(const struct tu_physical_device *dev, uint32_t *id)
{
uint64_t value;
@@ -92,7 +67,7 @@ tu_drm_get_gpu_id(const struct tu_physical_device *dev, uint32_t *id)
return 0;
}
-static int
+int
tu_drm_get_gmem_size(const struct tu_physical_device *dev, uint32_t *size)
{
uint64_t value;
@@ -104,85 +79,17 @@ tu_drm_get_gmem_size(const struct tu_physical_device *dev, uint32_t *size)
return 0;
}
-static int
-tu_drm_get_gmem_base(const struct tu_physical_device *dev, uint64_t *base)
-{
- return tu_drm_get_param(dev, MSM_PARAM_GMEM_BASE, base);
-}
-
-static int
-tu_drm_get_va_prop(const struct tu_physical_device *dev,
- uint64_t *va_start, uint64_t *va_size)
-{
- uint64_t value;
- int ret = tu_drm_get_param(dev, MSM_PARAM_VA_START, &value);
- if (ret)
- return ret;
-
- *va_start = value;
-
- ret = tu_drm_get_param(dev, MSM_PARAM_VA_SIZE, &value);
- if (ret)
- return ret;
-
- *va_size = value;
-
- return 0;
-}
-
-static uint32_t
-tu_drm_get_priorities(const struct tu_physical_device *dev)
-{
- uint64_t val = 1;
- tu_drm_get_param(dev, MSM_PARAM_PRIORITIES, &val);
- assert(val >= 1);
-
- return val;
-}
-
-int
-tu_device_get_gpu_timestamp(struct tu_device *dev, uint64_t *ts)
-{
- return tu_drm_get_param(dev->physical_device, MSM_PARAM_TIMESTAMP, ts);
-}
-
-int
-tu_device_get_suspend_count(struct tu_device *dev, uint64_t *suspend_count)
-{
- int ret = tu_drm_get_param(dev->physical_device, MSM_PARAM_SUSPENDS, suspend_count);
- return ret;
-}
-
-VkResult
-tu_device_check_status(struct vk_device *vk_device)
-{
- struct tu_device *device = container_of(vk_device, struct tu_device, vk);
- struct tu_physical_device *physical_device = device->physical_device;
-
- uint64_t last_fault_count = physical_device->fault_count;
- int ret = tu_drm_get_param(physical_device, MSM_PARAM_FAULTS, &physical_device->fault_count);
- if (ret != 0)
- return vk_device_set_lost(&device->vk, "error getting GPU fault count: %d", ret);
-
- if (last_fault_count != physical_device->fault_count)
- return vk_device_set_lost(&device->vk, "GPU faulted or hung");
-
- return VK_SUCCESS;
-}
-
int
tu_drm_submitqueue_new(const struct tu_device *dev,
int priority,
uint32_t *queue_id)
{
- assert(priority >= 0 &&
- priority < dev->physical_device->submitqueue_priority_count);
struct drm_msm_submitqueue req = {
.flags = 0,
.prio = priority,
};
- int ret = drmCommandWriteRead(dev->fd,
+ int ret = drmCommandWriteRead(dev->physical_device->local_fd,
DRM_MSM_SUBMITQUEUE_NEW, &req, sizeof(req));
if (ret)
return ret;
@@ -194,1156 +101,94 @@ tu_drm_submitqueue_new(const struct tu_device *dev,
void
tu_drm_submitqueue_close(const struct tu_device *dev, uint32_t queue_id)
{
- drmCommandWrite(dev->fd, DRM_MSM_SUBMITQUEUE_CLOSE,
+ drmCommandWrite(dev->physical_device->local_fd, DRM_MSM_SUBMITQUEUE_CLOSE,
&queue_id, sizeof(uint32_t));
}
-static void
-tu_gem_close(const struct tu_device *dev, uint32_t gem_handle)
-{
- struct drm_gem_close req = {
- .handle = gem_handle,
- };
-
- drmIoctl(dev->fd, DRM_IOCTL_GEM_CLOSE, &req);
-}
-
-/** Helper for DRM_MSM_GEM_INFO, returns 0 on error. */
-static uint64_t
-tu_gem_info(const struct tu_device *dev, uint32_t gem_handle, uint32_t info)
-{
- struct drm_msm_gem_info req = {
- .handle = gem_handle,
- .info = info,
- };
-
- int ret = drmCommandWriteRead(dev->fd,
- DRM_MSM_GEM_INFO, &req, sizeof(req));
- if (ret < 0)
- return 0;
-
- return req.value;
-}
-
-
-static VkResult
-tu_allocate_userspace_iova(struct tu_device *dev,
- uint32_t gem_handle,
- uint64_t size,
- uint64_t client_iova,
- enum tu_bo_alloc_flags flags,
- uint64_t *iova)
-{
- mtx_lock(&dev->physical_device->vma_mutex);
-
- *iova = 0;
-
- if (flags & TU_BO_ALLOC_REPLAYABLE) {
- if (client_iova) {
- if (util_vma_heap_alloc_addr(&dev->physical_device->vma, client_iova,
- size)) {
- *iova = client_iova;
- } else {
- return VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS;
- }
- } else {
-         /* We have to separate replayable IOVAs from ordinary ones so that
-          * they don't clash. The easiest way to do this is to allocate them
-          * from the other end of the address space.
-          */
- dev->physical_device->vma.alloc_high = true;
- *iova =
- util_vma_heap_alloc(&dev->physical_device->vma, size, 0x1000);
- }
- } else {
- dev->physical_device->vma.alloc_high = false;
- *iova = util_vma_heap_alloc(&dev->physical_device->vma, size, 0x1000);
- }
-
- mtx_unlock(&dev->physical_device->vma_mutex);
-
- if (!*iova)
- return VK_ERROR_OUT_OF_DEVICE_MEMORY;
-
- struct drm_msm_gem_info req = {
- .handle = gem_handle,
- .info = MSM_INFO_SET_IOVA,
- .value = *iova,
- };
-
- int ret =
- drmCommandWriteRead(dev->fd, DRM_MSM_GEM_INFO, &req, sizeof(req));
- if (ret < 0)
- return VK_ERROR_OUT_OF_HOST_MEMORY;
-
- return VK_SUCCESS;
-}
-
-static VkResult
-tu_allocate_kernel_iova(struct tu_device *dev,
- uint32_t gem_handle,
- uint64_t *iova)
-{
- *iova = tu_gem_info(dev, gem_handle, MSM_INFO_GET_IOVA);
- if (!*iova)
- return VK_ERROR_OUT_OF_DEVICE_MEMORY;
-
- return VK_SUCCESS;
-}
-
-static VkResult
-tu_bo_init(struct tu_device *dev,
- struct tu_bo *bo,
- uint32_t gem_handle,
- uint64_t size,
- uint64_t client_iova,
- enum tu_bo_alloc_flags flags,
- const char *name)
-{
- VkResult result = VK_SUCCESS;
- uint64_t iova = 0;
-
- assert(!client_iova || dev->physical_device->has_set_iova);
-
- if (dev->physical_device->has_set_iova) {
- result = tu_allocate_userspace_iova(dev, gem_handle, size, client_iova,
- flags, &iova);
- } else {
- result = tu_allocate_kernel_iova(dev, gem_handle, &iova);
- }
-
- if (result != VK_SUCCESS)
- goto fail_bo_list;
-
- name = tu_debug_bos_add(dev, size, name);
-
- mtx_lock(&dev->bo_mutex);
- uint32_t idx = dev->bo_count++;
-
- /* grow the bo list if needed */
- if (idx >= dev->bo_list_size) {
- uint32_t new_len = idx + 64;
- struct drm_msm_gem_submit_bo *new_ptr =
- vk_realloc(&dev->vk.alloc, dev->bo_list, new_len * sizeof(*dev->bo_list),
- 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
- if (!new_ptr) {
- result = VK_ERROR_OUT_OF_HOST_MEMORY;
- goto fail_bo_list;
- }
-
- dev->bo_list = new_ptr;
- dev->bo_list_size = new_len;
- }
-
- bool dump = flags & TU_BO_ALLOC_ALLOW_DUMP;
- dev->bo_list[idx] = (struct drm_msm_gem_submit_bo) {
- .flags = MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE |
- COND(dump, MSM_SUBMIT_BO_DUMP),
- .handle = gem_handle,
- .presumed = iova,
- };
-
- *bo = (struct tu_bo) {
- .gem_handle = gem_handle,
- .size = size,
- .iova = iova,
- .refcnt = 1,
- .bo_list_idx = idx,
- .name = name,
- };
-
- mtx_unlock(&dev->bo_mutex);
-
- return VK_SUCCESS;
-
-fail_bo_list:
- tu_gem_close(dev, gem_handle);
- return result;
-}
-
/**
- * Sets the name in the kernel so that the contents of /debug/dri/0/gem are more
- * useful.
- *
- * We skip this on release builds (when we're also not doing BO debugging) to
- * reduce overhead.
+ * Return gem handle on success. Return 0 on failure.
*/
-static void
-tu_bo_set_kernel_name(struct tu_device *dev, struct tu_bo *bo, const char *name)
+uint32_t
+tu_gem_new(const struct tu_device *dev, uint64_t size, uint32_t flags)
{
- bool kernel_bo_names = dev->bo_sizes != NULL;
-#ifdef DEBUG
- kernel_bo_names = true;
-#endif
- if (!kernel_bo_names)
- return;
-
- struct drm_msm_gem_info req = {
- .handle = bo->gem_handle,
- .info = MSM_INFO_SET_NAME,
- .value = (uintptr_t)(void *)name,
- .len = strlen(name),
- };
-
- int ret = drmCommandWrite(dev->fd, DRM_MSM_GEM_INFO, &req, sizeof(req));
- if (ret) {
- mesa_logw_once("Failed to set BO name with DRM_MSM_GEM_INFO: %d",
- ret);
- }
-}
-
-VkResult
-tu_bo_init_new_explicit_iova(struct tu_device *dev,
- struct tu_bo **out_bo,
- uint64_t size,
- uint64_t client_iova,
- enum tu_bo_alloc_flags flags,
- const char *name)
-{
- /* TODO: Choose better flags. As of 2018-11-12, freedreno/drm/msm_bo.c
- * always sets `flags = MSM_BO_WC`, and we copy that behavior here.
- */
struct drm_msm_gem_new req = {
.size = size,
- .flags = MSM_BO_WC
+ .flags = flags,
};
- if (flags & TU_BO_ALLOC_GPU_READ_ONLY)
- req.flags |= MSM_BO_GPU_READONLY;
-
- int ret = drmCommandWriteRead(dev->fd,
+ int ret = drmCommandWriteRead(dev->physical_device->local_fd,
DRM_MSM_GEM_NEW, &req, sizeof(req));
if (ret)
- return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
-
- struct tu_bo* bo = tu_device_lookup_bo(dev, req.handle);
- assert(bo && bo->gem_handle == 0);
-
- VkResult result =
- tu_bo_init(dev, bo, req.handle, size, client_iova, flags, name);
-
- if (result != VK_SUCCESS)
- memset(bo, 0, sizeof(*bo));
- else
- *out_bo = bo;
-
- /* We don't use bo->name here because for the !TU_DEBUG=bo case bo->name is NULL. */
- tu_bo_set_kernel_name(dev, bo, name);
+ return 0;
- return result;
+ return req.handle;
}
-VkResult
-tu_bo_init_dmabuf(struct tu_device *dev,
- struct tu_bo **out_bo,
- uint64_t size,
- int prime_fd)
+uint32_t
+tu_gem_import_dmabuf(const struct tu_device *dev, int prime_fd, uint64_t size)
{
/* lseek() to get the real size */
off_t real_size = lseek(prime_fd, 0, SEEK_END);
lseek(prime_fd, 0, SEEK_SET);
if (real_size < 0 || (uint64_t) real_size < size)
- return vk_error(dev, VK_ERROR_INVALID_EXTERNAL_HANDLE);
-
- /* Importing the same dmabuf several times would yield the same
- * gem_handle. Thus there could be a race when destroying
- * BO and importing the same dmabuf from different threads.
- * We must not permit the creation of dmabuf BO and its release
- * to happen in parallel.
- */
- u_rwlock_wrlock(&dev->dma_bo_lock);
+ return 0;
uint32_t gem_handle;
- int ret = drmPrimeFDToHandle(dev->fd, prime_fd,
+ int ret = drmPrimeFDToHandle(dev->physical_device->local_fd, prime_fd,
&gem_handle);
- if (ret) {
- u_rwlock_wrunlock(&dev->dma_bo_lock);
- return vk_error(dev, VK_ERROR_INVALID_EXTERNAL_HANDLE);
- }
-
- struct tu_bo* bo = tu_device_lookup_bo(dev, gem_handle);
-
- if (bo->refcnt != 0) {
- p_atomic_inc(&bo->refcnt);
- u_rwlock_wrunlock(&dev->dma_bo_lock);
-
- *out_bo = bo;
- return VK_SUCCESS;
- }
-
- VkResult result =
- tu_bo_init(dev, bo, gem_handle, size, 0, TU_BO_ALLOC_NO_FLAGS, "dmabuf");
-
- if (result != VK_SUCCESS)
- memset(bo, 0, sizeof(*bo));
- else
- *out_bo = bo;
-
- u_rwlock_wrunlock(&dev->dma_bo_lock);
+ if (ret)
+ return 0;
- return result;
+ return gem_handle;
}
int
-tu_bo_export_dmabuf(struct tu_device *dev, struct tu_bo *bo)
+tu_gem_export_dmabuf(const struct tu_device *dev, uint32_t gem_handle)
{
int prime_fd;
- int ret = drmPrimeHandleToFD(dev->fd, bo->gem_handle,
- DRM_CLOEXEC | DRM_RDWR, &prime_fd);
+ int ret = drmPrimeHandleToFD(dev->physical_device->local_fd, gem_handle,
+ DRM_CLOEXEC, &prime_fd);
return ret == 0 ? prime_fd : -1;
}
-VkResult
-tu_bo_map(struct tu_device *dev, struct tu_bo *bo)
-{
- if (bo->map)
- return VK_SUCCESS;
-
- uint64_t offset = tu_gem_info(dev, bo->gem_handle, MSM_INFO_GET_OFFSET);
- if (!offset)
- return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
-
- /* TODO: Should we use the wrapper os_mmap() like Freedreno does? */
- void *map = mmap(0, bo->size, PROT_READ | PROT_WRITE, MAP_SHARED,
- dev->fd, offset);
- if (map == MAP_FAILED)
- return vk_error(dev, VK_ERROR_MEMORY_MAP_FAILED);
-
- bo->map = map;
- return VK_SUCCESS;
-}
-
void
-tu_bo_finish(struct tu_device *dev, struct tu_bo *bo)
-{
- assert(bo->gem_handle);
-
- u_rwlock_rdlock(&dev->dma_bo_lock);
-
- if (!p_atomic_dec_zero(&bo->refcnt)) {
- u_rwlock_rdunlock(&dev->dma_bo_lock);
- return;
- }
-
- if (bo->map)
- munmap(bo->map, bo->size);
-
- tu_debug_bos_del(dev, bo);
-
- mtx_lock(&dev->bo_mutex);
- dev->bo_count--;
- dev->bo_list[bo->bo_list_idx] = dev->bo_list[dev->bo_count];
-
- struct tu_bo* exchanging_bo = tu_device_lookup_bo(dev, dev->bo_list[bo->bo_list_idx].handle);
- exchanging_bo->bo_list_idx = bo->bo_list_idx;
-
- if (bo->implicit_sync)
- dev->implicit_sync_bo_count--;
-
- mtx_unlock(&dev->bo_mutex);
-
- if (dev->physical_device->has_set_iova) {
- mtx_lock(&dev->physical_device->vma_mutex);
- util_vma_heap_free(&dev->physical_device->vma, bo->iova, bo->size);
- mtx_unlock(&dev->physical_device->vma_mutex);
- }
-
-   /* Our BO structs are stored in a sparse array in the physical device,
-    * so we don't want to free the BO pointer; instead we reset it to 0
-    * to mark that array entry as free.
-    */
- uint32_t gem_handle = bo->gem_handle;
- memset(bo, 0, sizeof(*bo));
-
- tu_gem_close(dev, gem_handle);
-
- u_rwlock_rdunlock(&dev->dma_bo_lock);
-}
-
-extern const struct vk_sync_type tu_timeline_sync_type;
-
-static inline bool
-vk_sync_is_tu_timeline_sync(const struct vk_sync *sync)
-{
- return sync->type == &tu_timeline_sync_type;
-}
-
-static struct tu_timeline_sync *
-to_tu_timeline_sync(struct vk_sync *sync)
-{
- assert(sync->type == &tu_timeline_sync_type);
- return container_of(sync, struct tu_timeline_sync, base);
-}
-
-static uint32_t
-tu_syncobj_from_vk_sync(struct vk_sync *sync)
-{
- uint32_t syncobj = -1;
- if (vk_sync_is_tu_timeline_sync(sync)) {
- syncobj = to_tu_timeline_sync(sync)->syncobj;
- } else if (vk_sync_type_is_drm_syncobj(sync->type)) {
- syncobj = vk_sync_as_drm_syncobj(sync)->syncobj;
- }
-
- assert(syncobj != -1);
-
- return syncobj;
-}
-
-static VkResult
-tu_timeline_sync_init(struct vk_device *vk_device,
- struct vk_sync *vk_sync,
- uint64_t initial_value)
-{
- struct tu_device *device = container_of(vk_device, struct tu_device, vk);
- struct tu_timeline_sync *sync = to_tu_timeline_sync(vk_sync);
- uint32_t flags = 0;
-
- assert(device->fd >= 0);
-
- int err = drmSyncobjCreate(device->fd, flags, &sync->syncobj);
-
- if (err < 0) {
- return vk_error(device, VK_ERROR_DEVICE_LOST);
- }
-
- sync->state = initial_value ? TU_TIMELINE_SYNC_STATE_SIGNALED :
- TU_TIMELINE_SYNC_STATE_RESET;
-
- return VK_SUCCESS;
-}
-
-static void
-tu_timeline_sync_finish(struct vk_device *vk_device,
- struct vk_sync *vk_sync)
-{
- struct tu_device *dev = container_of(vk_device, struct tu_device, vk);
- struct tu_timeline_sync *sync = to_tu_timeline_sync(vk_sync);
-
- assert(dev->fd >= 0);
- ASSERTED int err = drmSyncobjDestroy(dev->fd, sync->syncobj);
- assert(err == 0);
-}
-
-static VkResult
-tu_timeline_sync_reset(struct vk_device *vk_device,
- struct vk_sync *vk_sync)
-{
- struct tu_device *dev = container_of(vk_device, struct tu_device, vk);
- struct tu_timeline_sync *sync = to_tu_timeline_sync(vk_sync);
-
- int err = drmSyncobjReset(dev->fd, &sync->syncobj, 1);
- if (err) {
- return vk_errorf(dev, VK_ERROR_UNKNOWN,
- "DRM_IOCTL_SYNCOBJ_RESET failed: %m");
- } else {
- sync->state = TU_TIMELINE_SYNC_STATE_RESET;
- }
-
- return VK_SUCCESS;
-}
-
-static VkResult
-drm_syncobj_wait(struct tu_device *device,
- uint32_t *handles, uint32_t count_handles,
- uint64_t timeout_nsec, bool wait_all)
-{
- uint32_t syncobj_wait_flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT;
- if (wait_all) syncobj_wait_flags |= DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL;
-
- /* syncobj absolute timeouts are signed. clamp OS_TIMEOUT_INFINITE down. */
- timeout_nsec = MIN2(timeout_nsec, (uint64_t)INT64_MAX);
-
- int err = drmSyncobjWait(device->fd, handles,
- count_handles, timeout_nsec,
- syncobj_wait_flags,
- NULL /* first_signaled */);
- if (err && errno == ETIME) {
- return VK_TIMEOUT;
- } else if (err) {
- return vk_errorf(device, VK_ERROR_UNKNOWN,
- "DRM_IOCTL_SYNCOBJ_WAIT failed: %m");
- }
-
- return VK_SUCCESS;
-}
-
-/* Based on anv_bo_sync_wait */
-static VkResult
-tu_timeline_sync_wait(struct vk_device *vk_device,
- uint32_t wait_count,
- const struct vk_sync_wait *waits,
- enum vk_sync_wait_flags wait_flags,
- uint64_t abs_timeout_ns)
-{
- struct tu_device *dev = container_of(vk_device, struct tu_device, vk);
- bool wait_all = !(wait_flags & VK_SYNC_WAIT_ANY);
-
- uint32_t handles[wait_count];
- uint32_t submit_count;
- VkResult ret = VK_SUCCESS;
- uint32_t pending = wait_count;
- struct tu_timeline_sync *submitted_syncs[wait_count];
-
- while (pending) {
- pending = 0;
- submit_count = 0;
-
- for (unsigned i = 0; i < wait_count; ++i) {
- struct tu_timeline_sync *sync = to_tu_timeline_sync(waits[i].sync);
-
- if (sync->state == TU_TIMELINE_SYNC_STATE_RESET) {
- assert(!(wait_flags & VK_SYNC_WAIT_PENDING));
- pending++;
- } else if (sync->state == TU_TIMELINE_SYNC_STATE_SIGNALED) {
- if (wait_flags & VK_SYNC_WAIT_ANY)
- return VK_SUCCESS;
- } else if (sync->state == TU_TIMELINE_SYNC_STATE_SUBMITTED) {
- if (!(wait_flags & VK_SYNC_WAIT_PENDING)) {
- handles[submit_count] = sync->syncobj;
- submitted_syncs[submit_count++] = sync;
- }
- }
- }
-
- if (submit_count > 0) {
- do {
- ret = drm_syncobj_wait(dev, handles, submit_count, abs_timeout_ns, wait_all);
- } while (ret == VK_TIMEOUT && os_time_get_nano() < abs_timeout_ns);
-
- if (ret == VK_SUCCESS) {
- for (unsigned i = 0; i < submit_count; ++i) {
- struct tu_timeline_sync *sync = submitted_syncs[i];
- sync->state = TU_TIMELINE_SYNC_STATE_SIGNALED;
- }
- } else {
- /* return error covering timeout */
- return ret;
- }
- } else if (pending > 0) {
- /* If we've hit this then someone decided to vkWaitForFences before
- * they've actually submitted any of them to a queue. This is a
- * fairly pessimal case, so it's ok to lock here and use a standard
- * pthreads condition variable.
- */
- pthread_mutex_lock(&dev->submit_mutex);
-
- /* It's possible that some of the fences have changed state since the
- * last time we checked. Now that we have the lock, check for
- * pending fences again and don't wait if it's changed.
- */
- uint32_t now_pending = 0;
- for (uint32_t i = 0; i < wait_count; i++) {
- struct tu_timeline_sync *sync = to_tu_timeline_sync(waits[i].sync);
- if (sync->state == TU_TIMELINE_SYNC_STATE_RESET)
- now_pending++;
- }
- assert(now_pending <= pending);
-
- if (now_pending == pending) {
- struct timespec abstime = {
- .tv_sec = abs_timeout_ns / NSEC_PER_SEC,
- .tv_nsec = abs_timeout_ns % NSEC_PER_SEC,
- };
-
- ASSERTED int ret;
- ret = pthread_cond_timedwait(&dev->timeline_cond,
- &dev->submit_mutex, &abstime);
- assert(ret != EINVAL);
- if (os_time_get_nano() >= abs_timeout_ns) {
- pthread_mutex_unlock(&dev->submit_mutex);
- return VK_TIMEOUT;
- }
- }
-
- pthread_mutex_unlock(&dev->submit_mutex);
- }
- }
-
- return ret;
-}
-
-const struct vk_sync_type tu_timeline_sync_type = {
- .size = sizeof(struct tu_timeline_sync),
- .features = VK_SYNC_FEATURE_BINARY |
- VK_SYNC_FEATURE_GPU_WAIT |
- VK_SYNC_FEATURE_GPU_MULTI_WAIT |
- VK_SYNC_FEATURE_CPU_WAIT |
- VK_SYNC_FEATURE_CPU_RESET |
- VK_SYNC_FEATURE_WAIT_ANY |
- VK_SYNC_FEATURE_WAIT_PENDING,
- .init = tu_timeline_sync_init,
- .finish = tu_timeline_sync_finish,
- .reset = tu_timeline_sync_reset,
- .wait_many = tu_timeline_sync_wait,
-};
-
-VkResult
-tu_physical_device_try_create(struct vk_instance *vk_instance,
- struct _drmDevice *drm_device,
- struct vk_physical_device **out)
-{
- struct tu_instance *instance =
- container_of(vk_instance, struct tu_instance, vk);
-
- if (!(drm_device->available_nodes & (1 << DRM_NODE_RENDER)) ||
- drm_device->bustype != DRM_BUS_PLATFORM)
- return VK_ERROR_INCOMPATIBLE_DRIVER;
-
- const char *primary_path = drm_device->nodes[DRM_NODE_PRIMARY];
- const char *path = drm_device->nodes[DRM_NODE_RENDER];
- VkResult result = VK_SUCCESS;
- drmVersionPtr version;
- int fd;
- int master_fd = -1;
-
- fd = open(path, O_RDWR | O_CLOEXEC);
- if (fd < 0) {
- return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
- "failed to open device %s", path);
- }
-
- /* Version 1.6 added SYNCOBJ support. */
- const int min_version_major = 1;
- const int min_version_minor = 6;
-
- version = drmGetVersion(fd);
- if (!version) {
- close(fd);
- return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
- "failed to query kernel driver version for device %s",
- path);
- }
-
- if (strcmp(version->name, "msm")) {
- drmFreeVersion(version);
- close(fd);
- return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
- "device %s does not use the msm kernel driver",
- path);
- }
-
- if (version->version_major != min_version_major ||
- version->version_minor < min_version_minor) {
- result = vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
- "kernel driver for device %s has version %d.%d, "
- "but Vulkan requires version >= %d.%d",
- path,
- version->version_major, version->version_minor,
- min_version_major, min_version_minor);
- drmFreeVersion(version);
- close(fd);
- return result;
- }
-
- struct tu_physical_device *device =
- vk_zalloc(&instance->vk.alloc, sizeof(*device), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
- if (!device) {
- result = vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);
- drmFreeVersion(version);
- goto fail;
- }
-
- device->msm_major_version = version->version_major;
- device->msm_minor_version = version->version_minor;
-
- drmFreeVersion(version);
-
- if (instance->debug_flags & TU_DEBUG_STARTUP)
- mesa_logi("Found compatible device '%s'.", path);
-
- device->instance = instance;
-
- if (instance->vk.enabled_extensions.KHR_display) {
- master_fd = open(primary_path, O_RDWR | O_CLOEXEC);
- if (master_fd >= 0) {
-         /* TODO: free master_fd if accel is not working? */
- }
- }
-
- device->master_fd = master_fd;
- device->local_fd = fd;
-
- if (tu_drm_get_gpu_id(device, &device->dev_id.gpu_id)) {
- result = vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
- "could not get GPU ID");
- goto fail;
- }
-
- if (tu_drm_get_param(device, MSM_PARAM_CHIP_ID, &device->dev_id.chip_id)) {
- result = vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
- "could not get CHIP ID");
- goto fail;
- }
-
- if (tu_drm_get_gmem_size(device, &device->gmem_size)) {
- result = vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
- "could not get GMEM size");
- goto fail;
- }
- device->gmem_size = debug_get_num_option("TU_GMEM", device->gmem_size);
-
- if (tu_drm_get_gmem_base(device, &device->gmem_base)) {
- result = vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
-                                 "could not get GMEM base");
- goto fail;
- }
-
- /*
- * device->has_set_iova = !tu_drm_get_va_prop(device, &device->va_start,
- * &device->va_size);
- *
-    * If a BO is freed while the kernel still considers it busy, our VMA
-    * state gets desynchronized from the kernel's, because the kernel waits
-    * until the BO stops being busy, and busyness is decided only at
-    * submission granularity.
-    *
-    * Disable this capability until a solution is found.
-    */
- device->has_set_iova = false;
-
- struct stat st;
-
- if (stat(primary_path, &st) == 0) {
- device->has_master = true;
- device->master_major = major(st.st_rdev);
- device->master_minor = minor(st.st_rdev);
- } else {
- device->has_master = false;
- device->master_major = 0;
- device->master_minor = 0;
- }
-
- if (stat(path, &st) == 0) {
- device->has_local = true;
- device->local_major = major(st.st_rdev);
- device->local_minor = minor(st.st_rdev);
- } else {
- result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
- "failed to stat DRM render node %s", path);
- goto fail;
- }
-
- int ret = tu_drm_get_param(device, MSM_PARAM_FAULTS, &device->fault_count);
- if (ret != 0) {
- result = vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
- "Failed to get initial fault count: %d", ret);
- goto fail;
- }
-
- device->submitqueue_priority_count = tu_drm_get_priorities(device);
-
- device->syncobj_type = vk_drm_syncobj_get_type(fd);
- /* we don't support DRM_CAP_SYNCOBJ_TIMELINE, but drm-shim does */
- if (!(device->syncobj_type.features & VK_SYNC_FEATURE_TIMELINE))
- device->timeline_type = vk_sync_timeline_get_type(&tu_timeline_sync_type);
-
- device->sync_types[0] = &device->syncobj_type;
- device->sync_types[1] = &device->timeline_type.sync;
- device->sync_types[2] = NULL;
-
- device->heap.size = tu_get_system_heap_size();
- device->heap.used = 0u;
- device->heap.flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT;
-
- result = tu_physical_device_init(device, instance);
-
- if (result == VK_SUCCESS) {
- *out = &device->vk;
- return result;
- }
-
-fail:
- if (device)
- vk_free(&instance->vk.alloc, device);
- close(fd);
- if (master_fd != -1)
- close(master_fd);
- return result;
-}
-
-static VkResult
-tu_queue_submit_create_locked(struct tu_queue *queue,
- struct vk_queue_submit *vk_submit,
- const uint32_t nr_in_syncobjs,
- const uint32_t nr_out_syncobjs,
- uint32_t perf_pass_index,
- struct tu_queue_submit *new_submit)
-{
- VkResult result;
-
- bool u_trace_enabled = u_trace_context_actively_tracing(&queue->device->trace_context);
- bool has_trace_points = false;
-
- struct vk_command_buffer **vk_cmd_buffers = vk_submit->command_buffers;
-
- memset(new_submit, 0, sizeof(struct tu_queue_submit));
-
- new_submit->cmd_buffers = (void *)vk_cmd_buffers;
- new_submit->nr_cmd_buffers = vk_submit->command_buffer_count;
- tu_insert_dynamic_cmdbufs(queue->device, &new_submit->cmd_buffers,
- &new_submit->nr_cmd_buffers);
-
- uint32_t entry_count = 0;
- for (uint32_t j = 0; j < new_submit->nr_cmd_buffers; ++j) {
- struct tu_cmd_buffer *cmdbuf = new_submit->cmd_buffers[j];
-
- if (perf_pass_index != ~0)
- entry_count++;
-
- entry_count += cmdbuf->cs.entry_count;
-
- if (u_trace_enabled && u_trace_has_points(&cmdbuf->trace)) {
- if (!(cmdbuf->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT))
- entry_count++;
-
- has_trace_points = true;
- }
- }
-
- new_submit->autotune_fence =
- tu_autotune_submit_requires_fence(new_submit->cmd_buffers, new_submit->nr_cmd_buffers);
- if (new_submit->autotune_fence)
- entry_count++;
-
- new_submit->cmds = vk_zalloc(&queue->device->vk.alloc,
- entry_count * sizeof(*new_submit->cmds), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
-
- if (new_submit->cmds == NULL) {
- result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY);
- goto fail_cmds;
- }
-
- if (has_trace_points) {
- result =
- tu_u_trace_submission_data_create(
- queue->device, new_submit->cmd_buffers,
- new_submit->nr_cmd_buffers,
- &new_submit->u_trace_submission_data);
-
- if (result != VK_SUCCESS) {
- goto fail_u_trace_submission_data;
- }
- }
-
- /* Allocate without wait timeline semaphores */
- new_submit->in_syncobjs = vk_zalloc(&queue->device->vk.alloc,
- nr_in_syncobjs * sizeof(*new_submit->in_syncobjs), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
-
- if (new_submit->in_syncobjs == NULL) {
- result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY);
- goto fail_in_syncobjs;
- }
-
- /* Allocate with signal timeline semaphores considered */
- new_submit->out_syncobjs = vk_zalloc(&queue->device->vk.alloc,
- nr_out_syncobjs * sizeof(*new_submit->out_syncobjs), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
-
- if (new_submit->out_syncobjs == NULL) {
- result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY);
- goto fail_out_syncobjs;
- }
-
- new_submit->entry_count = entry_count;
- new_submit->nr_in_syncobjs = nr_in_syncobjs;
- new_submit->nr_out_syncobjs = nr_out_syncobjs;
- new_submit->perf_pass_index = perf_pass_index;
- new_submit->vk_submit = vk_submit;
-
- return VK_SUCCESS;
-
-fail_out_syncobjs:
- vk_free(&queue->device->vk.alloc, new_submit->in_syncobjs);
-fail_in_syncobjs:
- if (new_submit->u_trace_submission_data)
- tu_u_trace_submission_data_finish(queue->device,
- new_submit->u_trace_submission_data);
-fail_u_trace_submission_data:
- vk_free(&queue->device->vk.alloc, new_submit->cmds);
-fail_cmds:
- return result;
-}
-
-static void
-tu_queue_submit_finish(struct tu_queue *queue, struct tu_queue_submit *submit)
-{
- vk_free(&queue->device->vk.alloc, submit->cmds);
- vk_free(&queue->device->vk.alloc, submit->in_syncobjs);
- vk_free(&queue->device->vk.alloc, submit->out_syncobjs);
- if (submit->cmd_buffers != (void *) submit->vk_submit->command_buffers)
- vk_free(&queue->device->vk.alloc, submit->cmd_buffers);
-}
-
-static void
-tu_fill_msm_gem_submit(struct tu_device *dev,
- struct drm_msm_gem_submit_cmd *cmd,
- struct tu_cs_entry *cs_entry)
-{
- cmd->type = MSM_SUBMIT_CMD_BUF;
- cmd->submit_idx = cs_entry->bo->bo_list_idx;
- cmd->submit_offset = cs_entry->offset;
- cmd->size = cs_entry->size;
- cmd->pad = 0;
- cmd->nr_relocs = 0;
- cmd->relocs = 0;
-}
-
-static void
-tu_queue_build_msm_gem_submit_cmds(struct tu_queue *queue,
- struct tu_queue_submit *submit,
- struct tu_cs *autotune_cs)
+tu_gem_close(const struct tu_device *dev, uint32_t gem_handle)
{
- struct tu_device *dev = queue->device;
- struct drm_msm_gem_submit_cmd *cmds = submit->cmds;
-
- uint32_t entry_idx = 0;
- for (uint32_t j = 0; j < submit->nr_cmd_buffers; ++j) {
- struct tu_device *dev = queue->device;
- struct tu_cmd_buffer *cmdbuf = submit->cmd_buffers[j];
- struct tu_cs *cs = &cmdbuf->cs;
-
- if (submit->perf_pass_index != ~0) {
- struct tu_cs_entry *perf_cs_entry =
- &dev->perfcntrs_pass_cs_entries[submit->perf_pass_index];
-
- tu_fill_msm_gem_submit(dev, &cmds[entry_idx], perf_cs_entry);
- entry_idx++;
- }
-
- for (unsigned i = 0; i < cs->entry_count; ++i, ++entry_idx) {
- tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &cs->entries[i]);
- }
-
- if (submit->u_trace_submission_data) {
- struct tu_cs *ts_cs =
- submit->u_trace_submission_data->cmd_trace_data[j].timestamp_copy_cs;
- if (ts_cs) {
- tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &ts_cs->entries[0]);
- entry_idx++;
- }
- }
- }
+ struct drm_gem_close req = {
+ .handle = gem_handle,
+ };
- if (autotune_cs) {
- assert(autotune_cs->entry_count == 1);
- tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &autotune_cs->entries[0]);
- entry_idx++;
- }
+ drmIoctl(dev->physical_device->local_fd, DRM_IOCTL_GEM_CLOSE, &req);
}
-static VkResult
-tu_queue_submit_locked(struct tu_queue *queue, struct tu_queue_submit *submit)
+/** Return UINT64_MAX on error. */
+static uint64_t
+tu_gem_info(const struct tu_device *dev, uint32_t gem_handle, uint32_t info)
{
- queue->device->submit_count++;
-
- struct tu_cs *autotune_cs = NULL;
- if (submit->autotune_fence) {
- autotune_cs = tu_autotune_on_submit(queue->device,
- &queue->device->autotune,
- submit->cmd_buffers,
- submit->nr_cmd_buffers);
- }
-
- uint32_t flags = MSM_PIPE_3D0;
-
- if (submit->vk_submit->wait_count)
- flags |= MSM_SUBMIT_SYNCOBJ_IN;
-
- if (submit->vk_submit->signal_count)
- flags |= MSM_SUBMIT_SYNCOBJ_OUT;
-
- mtx_lock(&queue->device->bo_mutex);
-
- if (queue->device->implicit_sync_bo_count == 0)
- flags |= MSM_SUBMIT_NO_IMPLICIT;
-
-   /* drm_msm_gem_submit_cmd requires the index of a bo, which could change
-    * at any time while bo_mutex is not locked. So we build the submit cmds
-    * here, at the real point of submission.
-    */
- tu_queue_build_msm_gem_submit_cmds(queue, submit, autotune_cs);
-
- struct drm_msm_gem_submit req = {
- .flags = flags,
- .queueid = queue->msm_queue_id,
- .bos = (uint64_t)(uintptr_t) queue->device->bo_list,
- .nr_bos = submit->entry_count ? queue->device->bo_count : 0,
- .cmds = (uint64_t)(uintptr_t)submit->cmds,
- .nr_cmds = submit->entry_count,
- .in_syncobjs = (uint64_t)(uintptr_t)submit->in_syncobjs,
- .out_syncobjs = (uint64_t)(uintptr_t)submit->out_syncobjs,
- .nr_in_syncobjs = submit->nr_in_syncobjs,
- .nr_out_syncobjs = submit->nr_out_syncobjs,
- .syncobj_stride = sizeof(struct drm_msm_gem_submit_syncobj),
+ struct drm_msm_gem_info req = {
+ .handle = gem_handle,
+ .info = info,
};
- int ret = drmCommandWriteRead(queue->device->fd,
- DRM_MSM_GEM_SUBMIT,
- &req, sizeof(req));
-
- mtx_unlock(&queue->device->bo_mutex);
-
- tu_debug_bos_print_stats(queue->device);
-
- if (ret)
- return vk_device_set_lost(&queue->device->vk, "submit failed: %m");
-
-#if HAVE_PERFETTO
- tu_perfetto_submit(queue->device, queue->device->submit_count);
-#endif
-
- if (submit->u_trace_submission_data) {
- struct tu_u_trace_submission_data *submission_data =
- submit->u_trace_submission_data;
- submission_data->submission_id = queue->device->submit_count;
- /* We have to allocate it here since it is different between drm/kgsl */
- submission_data->syncobj =
- vk_alloc(&queue->device->vk.alloc, sizeof(struct tu_u_trace_syncobj),
- 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
- submission_data->syncobj->fence = req.fence;
- submission_data->syncobj->msm_queue_id = queue->msm_queue_id;
-
- submit->u_trace_submission_data = NULL;
-
- for (uint32_t i = 0; i < submission_data->cmd_buffer_count; i++) {
- bool free_data = i == submission_data->last_buffer_with_tracepoints;
- if (submission_data->cmd_trace_data[i].trace)
- u_trace_flush(submission_data->cmd_trace_data[i].trace,
- submission_data, free_data);
-
- if (!submission_data->cmd_trace_data[i].timestamp_copy_cs) {
- /* u_trace is owned by cmd_buffer */
- submission_data->cmd_trace_data[i].trace = NULL;
- }
- }
- }
-
- for (uint32_t i = 0; i < submit->vk_submit->wait_count; i++) {
- if (!vk_sync_is_tu_timeline_sync(submit->vk_submit->waits[i].sync))
- continue;
-
- struct tu_timeline_sync *sync =
- container_of(submit->vk_submit->waits[i].sync, struct tu_timeline_sync, base);
-
- assert(sync->state != TU_TIMELINE_SYNC_STATE_RESET);
-
-      /* Set the wait timeline sync's state to SIGNALED since this means the
-       * syncobj is done and ready again, so it can be garbage-collected
-       * later.
-       */
- sync->state = TU_TIMELINE_SYNC_STATE_SIGNALED;
- }
-
- for (uint32_t i = 0; i < submit->vk_submit->signal_count; i++) {
- if (!vk_sync_is_tu_timeline_sync(submit->vk_submit->signals[i].sync))
- continue;
-
- struct tu_timeline_sync *sync =
- container_of(submit->vk_submit->signals[i].sync, struct tu_timeline_sync, base);
-
- assert(sync->state == TU_TIMELINE_SYNC_STATE_RESET);
-      /* Set the signal timeline sync's state to SUBMITTED so we can wait on
-       * this timeline sync until it completes, if necessary.
-       */
- sync->state = TU_TIMELINE_SYNC_STATE_SUBMITTED;
- }
-
- pthread_cond_broadcast(&queue->device->timeline_cond);
-
- return VK_SUCCESS;
-}
+ int ret = drmCommandWriteRead(dev->physical_device->local_fd,
+ DRM_MSM_GEM_INFO, &req, sizeof(req));
+ if (ret == -1)
+ return UINT64_MAX;
-static inline void
-get_abs_timeout(struct drm_msm_timespec *tv, uint64_t ns)
-{
- struct timespec t;
- clock_gettime(CLOCK_MONOTONIC, &t);
- tv->tv_sec = t.tv_sec + ns / 1000000000;
- tv->tv_nsec = t.tv_nsec + ns % 1000000000;
+ return req.value;
}
-VkResult
-tu_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj)
+/** Return UINT64_MAX on error. */
+uint64_t
+tu_gem_info_offset(const struct tu_device *dev, uint32_t gem_handle)
{
- struct drm_msm_wait_fence req = {
- .fence = syncobj->fence,
- .queueid = syncobj->msm_queue_id,
- };
- int ret;
-
- get_abs_timeout(&req.timeout, 1000000000);
-
- ret = drmCommandWrite(dev->fd, DRM_MSM_WAIT_FENCE, &req, sizeof(req));
- if (ret && (ret != -ETIMEDOUT)) {
- fprintf(stderr, "wait-fence failed! %d (%s)", ret, strerror(errno));
- return VK_TIMEOUT;
- }
-
- return VK_SUCCESS;
+ return tu_gem_info(dev, gem_handle, MSM_INFO_GET_OFFSET);
}
-VkResult
-tu_queue_submit(struct vk_queue *vk_queue, struct vk_queue_submit *submit)
+/** Return UINT64_MAX on error. */
+uint64_t
+tu_gem_info_iova(const struct tu_device *dev, uint32_t gem_handle)
{
- MESA_TRACE_FUNC();
- struct tu_queue *queue = container_of(vk_queue, struct tu_queue, vk);
- uint32_t perf_pass_index = queue->device->perfcntrs_pass_cs ?
- submit->perf_pass_index : ~0;
- struct tu_queue_submit submit_req;
-
- if (unlikely(queue->device->physical_device->instance->debug_flags &
- TU_DEBUG_LOG_SKIP_GMEM_OPS)) {
- tu_dbg_log_gmem_load_store_skips(queue->device);
- }
-
- pthread_mutex_lock(&queue->device->submit_mutex);
-
- VkResult ret = tu_queue_submit_create_locked(queue, submit,
- submit->wait_count, submit->signal_count,
- perf_pass_index, &submit_req);
-
- if (ret != VK_SUCCESS) {
- pthread_mutex_unlock(&queue->device->submit_mutex);
- return ret;
- }
-
- /* note: assuming there won't be any very large semaphore counts */
- struct drm_msm_gem_submit_syncobj *in_syncobjs = submit_req.in_syncobjs;
- struct drm_msm_gem_submit_syncobj *out_syncobjs = submit_req.out_syncobjs;
-
- uint32_t nr_in_syncobjs = 0, nr_out_syncobjs = 0;
-
- for (uint32_t i = 0; i < submit->wait_count; i++) {
- struct vk_sync *sync = submit->waits[i].sync;
-
- in_syncobjs[nr_in_syncobjs++] = (struct drm_msm_gem_submit_syncobj) {
- .handle = tu_syncobj_from_vk_sync(sync),
- .flags = 0,
- };
- }
-
- for (uint32_t i = 0; i < submit->signal_count; i++) {
- struct vk_sync *sync = submit->signals[i].sync;
-
- out_syncobjs[nr_out_syncobjs++] = (struct drm_msm_gem_submit_syncobj) {
- .handle = tu_syncobj_from_vk_sync(sync),
- .flags = 0,
- };
- }
-
- ret = tu_queue_submit_locked(queue, &submit_req);
-
- pthread_mutex_unlock(&queue->device->submit_mutex);
- tu_queue_submit_finish(queue, &submit_req);
-
- if (ret != VK_SUCCESS)
- return ret;
-
- u_trace_context_process(&queue->device->trace_context, true);
-
- return VK_SUCCESS;
+ return tu_gem_info(dev, gem_handle, MSM_INFO_GET_IOVA);
}
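For context, a hedged usage sketch of the GEM helpers reintroduced above (tu_gem_new, tu_gem_info_iova, tu_gem_info_offset, tu_gem_close): allocate a write-combined BO, query its GPU address and mmap offset, then map it through the render-node fd, mirroring roughly how the old tu_device.c consumes these helpers. The caller name example_alloc_and_map is hypothetical, and the snippet assumes the tu_private.h declarations plus <sys/mman.h> are available.

/* Hypothetical caller, not part of the commit. */
static void *
example_alloc_and_map(struct tu_device *dev, uint64_t size, uint64_t *iova)
{
   uint32_t handle = tu_gem_new(dev, size, MSM_BO_WC);
   if (!handle)
      return NULL;   /* tu_gem_new() returns 0 on failure */

   *iova = tu_gem_info_iova(dev, handle);
   uint64_t offset = tu_gem_info_offset(dev, handle);
   if (*iova == UINT64_MAX || offset == UINT64_MAX) {
      tu_gem_close(dev, handle);
      return NULL;   /* the info queries return UINT64_MAX on error */
   }

   void *map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
                    dev->physical_device->local_fd, offset);
   if (map == MAP_FAILED) {
      tu_gem_close(dev, handle);
      return NULL;
   }
   return map;
}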
diff --git a/lib/mesa/src/freedreno/vulkan/tu_formats.c b/lib/mesa/src/freedreno/vulkan/tu_formats.c
index 75a3ce74e..537b59d25 100644
--- a/lib/mesa/src/freedreno/vulkan/tu_formats.c
+++ b/lib/mesa/src/freedreno/vulkan/tu_formats.c
@@ -1,395 +1,661 @@
+
/*
* Copyright © 2016 Red Hat.
* Copyright © 2016 Bas Nieuwenhuizen
- * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
*/
-#include "tu_formats.h"
+#include "tu_private.h"
-#include "fdl/fd6_format_table.h"
+#include "registers/adreno_common.xml.h"
+#include "registers/a6xx.xml.h"
+#include "util/format_r11g11b10f.h"
+#include "util/format_srgb.h"
+#include "util/u_half.h"
+#include "vk_format.h"
#include "vk_util.h"
-#include "drm-uapi/drm_fourcc.h"
-#include "tu_device.h"
-#include "tu_image.h"
+/**
+ * Declare a format table. A format table is an array of tu_native_format.
+ * It can map a consecutive range of VkFormat to the corresponding
+ * tu_native_format.
+ *
+ * TU_FORMAT_TABLE_FIRST and TU_FORMAT_TABLE_LAST must already be defined and
+ * have the values of the first and last VkFormat of the array respectively.
+ */
+#define TU_FORMAT_TABLE(var) \
+ static const VkFormat var##_first = TU_FORMAT_TABLE_FIRST; \
+ static const VkFormat var##_last = TU_FORMAT_TABLE_LAST; \
+ static const struct tu_native_format var[TU_FORMAT_TABLE_LAST - TU_FORMAT_TABLE_FIRST + 1]
+#undef TU_FORMAT_TABLE_FIRST
+#undef TU_FORMAT_TABLE_LAST
+
+#define VFMT6_x -1
+#define TFMT6_x -1
+#define RB6_x -1
+
+#define TU6_FMT(vkfmt, vtxfmt, texfmt, rbfmt, swapfmt, valid) \
+ [VK_FORMAT_##vkfmt - TU_FORMAT_TABLE_FIRST] = { \
+ .vtx = VFMT6_##vtxfmt, \
+ .tex = TFMT6_##texfmt, \
+ .rb = RB6_##rbfmt, \
+ .swap = swapfmt, \
+ .present = valid, \
+ }
-struct tu_native_format
-tu6_format_vtx(VkFormat vk_format)
-{
- enum pipe_format format = vk_format_to_pipe_format(vk_format);
- struct tu_native_format fmt = {
- .fmt = fd6_vertex_format(format),
- .swap = fd6_vertex_swap(format),
- };
- assert(fmt.fmt != FMT6_NONE);
- return fmt;
-}
+/**
+ * fmt/alias/swap are derived from VkFormat mechanically (and might not even
+ * exist). It is the macro of choice that decides whether a VkFormat is
+ * supported and how.
+ */
+#define TU6_VTC(vk, fmt, alias, swap) TU6_FMT(vk, fmt, fmt, alias, swap, true)
+#define TU6_xTC(vk, fmt, alias, swap) TU6_FMT(vk, x, fmt, alias, swap, true)
+#define TU6_VTx(vk, fmt, alias, swap) TU6_FMT(vk, fmt, fmt, x, swap, true)
+#define TU6_Vxx(vk, fmt, alias, swap) TU6_FMT(vk, fmt, x, x, swap, true)
+#define TU6_xTx(vk, fmt, alias, swap) TU6_FMT(vk, x, fmt, x, swap, true)
+#define TU6_xxx(vk, fmt, alias, swap) TU6_FMT(vk, x, x, x, WZYX, false)
+
+#define TU_FORMAT_TABLE_FIRST VK_FORMAT_UNDEFINED
+#define TU_FORMAT_TABLE_LAST VK_FORMAT_ASTC_12x12_SRGB_BLOCK
+TU_FORMAT_TABLE(tu6_format_table0) = {
+ TU6_xxx(UNDEFINED, x, x, x), /* 0 */
+
+ /* 8-bit packed */
+ TU6_xxx(R4G4_UNORM_PACK8, 4_4_UNORM, R4G4_UNORM, WZXY), /* 1 */
+
+ /* 16-bit packed */
+ TU6_xTC(R4G4B4A4_UNORM_PACK16, 4_4_4_4_UNORM, R4G4B4A4_UNORM, XYZW), /* 2 */
+ TU6_xTC(B4G4R4A4_UNORM_PACK16, 4_4_4_4_UNORM, R4G4B4A4_UNORM, ZYXW), /* 3 */
+ TU6_xTC(R5G6B5_UNORM_PACK16, 5_6_5_UNORM, R5G6B5_UNORM, WXYZ), /* 4 */
+ TU6_xTC(B5G6R5_UNORM_PACK16, 5_6_5_UNORM, R5G6B5_UNORM, WXYZ), /* 5 */
+ TU6_xxx(R5G5B5A1_UNORM_PACK16, 1_5_5_5_UNORM, A1R5G5B5_UNORM, XYZW), /* 6 */
+ TU6_xxx(B5G5R5A1_UNORM_PACK16, 1_5_5_5_UNORM, A1R5G5B5_UNORM, XYZW), /* 7 */
+ TU6_xTC(A1R5G5B5_UNORM_PACK16, 5_5_5_1_UNORM, R5G5B5A1_UNORM, WXYZ), /* 8 */
+
+ /* 8-bit R */
+ TU6_VTC(R8_UNORM, 8_UNORM, R8_UNORM, WZYX), /* 9 */
+ TU6_VTC(R8_SNORM, 8_SNORM, R8_SNORM, WZYX), /* 10 */
+ TU6_Vxx(R8_USCALED, 8_UINT, R8_UINT, WZYX), /* 11 */
+ TU6_Vxx(R8_SSCALED, 8_SINT, R8_SINT, WZYX), /* 12 */
+ TU6_VTC(R8_UINT, 8_UINT, R8_UINT, WZYX), /* 13 */
+ TU6_VTC(R8_SINT, 8_SINT, R8_SINT, WZYX), /* 14 */
+ TU6_xTC(R8_SRGB, 8_UNORM, R8_UNORM, WZYX), /* 15 */
+
+ /* 16-bit RG */
+ TU6_VTC(R8G8_UNORM, 8_8_UNORM, R8G8_UNORM, WZYX), /* 16 */
+ TU6_VTC(R8G8_SNORM, 8_8_SNORM, R8G8_SNORM, WZYX), /* 17 */
+ TU6_Vxx(R8G8_USCALED, 8_8_UINT, R8G8_UINT, WZYX), /* 18 */
+ TU6_Vxx(R8G8_SSCALED, 8_8_SINT, R8G8_SINT, WZYX), /* 19 */
+ TU6_VTC(R8G8_UINT, 8_8_UINT, R8G8_UINT, WZYX), /* 20 */
+ TU6_VTC(R8G8_SINT, 8_8_SINT, R8G8_SINT, WZYX), /* 21 */
+ TU6_xTC(R8G8_SRGB, 8_8_UNORM, R8G8_UNORM, WZYX), /* 22 */
+
+ /* 24-bit RGB */
+ TU6_Vxx(R8G8B8_UNORM, 8_8_8_UNORM, R8G8B8_UNORM, WZYX), /* 23 */
+ TU6_Vxx(R8G8B8_SNORM, 8_8_8_SNORM, R8G8B8_SNORM, WZYX), /* 24 */
+ TU6_Vxx(R8G8B8_USCALED, 8_8_8_UINT, R8G8B8_UINT, WZYX), /* 25 */
+ TU6_Vxx(R8G8B8_SSCALED, 8_8_8_SINT, R8G8B8_SINT, WZYX), /* 26 */
+ TU6_Vxx(R8G8B8_UINT, 8_8_8_UINT, R8G8B8_UINT, WZYX), /* 27 */
+ TU6_Vxx(R8G8B8_SINT, 8_8_8_SINT, R8G8B8_SINT, WZYX), /* 28 */
+ TU6_xxx(R8G8B8_SRGB, 8_8_8_UNORM, R8G8B8_UNORM, WZYX), /* 29 */
+
+ /* 24-bit BGR */
+ TU6_Vxx(B8G8R8_UNORM, 8_8_8_UNORM, R8G8B8_UNORM, WXYZ), /* 30 */
+ TU6_Vxx(B8G8R8_SNORM, 8_8_8_SNORM, R8G8B8_SNORM, WXYZ), /* 31 */
+ TU6_Vxx(B8G8R8_USCALED, 8_8_8_UINT, R8G8B8_UINT, WXYZ), /* 32 */
+ TU6_Vxx(B8G8R8_SSCALED, 8_8_8_SINT, R8G8B8_SINT, WXYZ), /* 33 */
+ TU6_Vxx(B8G8R8_UINT, 8_8_8_UINT, R8G8B8_UINT, WXYZ), /* 34 */
+ TU6_Vxx(B8G8R8_SINT, 8_8_8_SINT, R8G8B8_SINT, WXYZ), /* 35 */
+ TU6_xxx(B8G8R8_SRGB, 8_8_8_UNORM, R8G8B8_UNORM, WXYZ), /* 36 */
+
+ /* 32-bit RGBA */
+ TU6_VTC(R8G8B8A8_UNORM, 8_8_8_8_UNORM, R8G8B8A8_UNORM, WZYX), /* 37 */
+ TU6_VTC(R8G8B8A8_SNORM, 8_8_8_8_SNORM, R8G8B8A8_SNORM, WZYX), /* 38 */
+ TU6_Vxx(R8G8B8A8_USCALED, 8_8_8_8_UINT, R8G8B8A8_UINT, WZYX), /* 39 */
+ TU6_Vxx(R8G8B8A8_SSCALED, 8_8_8_8_SINT, R8G8B8A8_SINT, WZYX), /* 40 */
+ TU6_VTC(R8G8B8A8_UINT, 8_8_8_8_UINT, R8G8B8A8_UINT, WZYX), /* 41 */
+ TU6_VTC(R8G8B8A8_SINT, 8_8_8_8_SINT, R8G8B8A8_SINT, WZYX), /* 42 */
+ TU6_xTC(R8G8B8A8_SRGB, 8_8_8_8_UNORM, R8G8B8A8_UNORM, WZYX), /* 43 */
+
+ /* 32-bit BGRA */
+ TU6_VTC(B8G8R8A8_UNORM, 8_8_8_8_UNORM, R8G8B8A8_UNORM, WXYZ), /* 44 */
+ TU6_VTC(B8G8R8A8_SNORM, 8_8_8_8_SNORM, R8G8B8A8_SNORM, WXYZ), /* 45 */
+ TU6_Vxx(B8G8R8A8_USCALED, 8_8_8_8_UINT, R8G8B8A8_UINT, WXYZ), /* 46 */
+ TU6_Vxx(B8G8R8A8_SSCALED, 8_8_8_8_SINT, R8G8B8A8_SINT, WXYZ), /* 47 */
+ TU6_VTC(B8G8R8A8_UINT, 8_8_8_8_UINT, R8G8B8A8_UINT, WXYZ), /* 48 */
+ TU6_VTC(B8G8R8A8_SINT, 8_8_8_8_SINT, R8G8B8A8_SINT, WXYZ), /* 49 */
+ TU6_xTC(B8G8R8A8_SRGB, 8_8_8_8_UNORM, R8G8B8A8_UNORM, WXYZ), /* 50 */
+
+ /* 32-bit packed */
+ TU6_VTC(A8B8G8R8_UNORM_PACK32, 8_8_8_8_UNORM, R8G8B8A8_UNORM, WZYX), /* 51 */
+ TU6_VTC(A8B8G8R8_SNORM_PACK32, 8_8_8_8_SNORM, R8G8B8A8_SNORM, WZYX), /* 52 */
+ TU6_Vxx(A8B8G8R8_USCALED_PACK32, 8_8_8_8_UINT, R8G8B8A8_UINT, WZYX), /* 53 */
+ TU6_Vxx(A8B8G8R8_SSCALED_PACK32, 8_8_8_8_SINT, R8G8B8A8_SINT, WZYX), /* 54 */
+ TU6_VTC(A8B8G8R8_UINT_PACK32, 8_8_8_8_UINT, R8G8B8A8_UINT, WZYX), /* 55 */
+ TU6_VTC(A8B8G8R8_SINT_PACK32, 8_8_8_8_SINT, R8G8B8A8_SINT, WZYX), /* 56 */
+ TU6_xTC(A8B8G8R8_SRGB_PACK32, 8_8_8_8_UNORM, R8G8B8A8_UNORM, WZYX), /* 57 */
+ TU6_VTC(A2R10G10B10_UNORM_PACK32, 10_10_10_2_UNORM, R10G10B10A2_UNORM, WXYZ), /* 58 */
+ TU6_Vxx(A2R10G10B10_SNORM_PACK32, 10_10_10_2_SNORM, R10G10B10A2_SNORM, WXYZ), /* 59 */
+ TU6_Vxx(A2R10G10B10_USCALED_PACK32, 10_10_10_2_UINT, R10G10B10A2_UINT, WXYZ), /* 60 */
+ TU6_Vxx(A2R10G10B10_SSCALED_PACK32, 10_10_10_2_SINT, R10G10B10A2_SINT, WXYZ), /* 61 */
+ TU6_VTC(A2R10G10B10_UINT_PACK32, 10_10_10_2_UINT, R10G10B10A2_UINT, WXYZ), /* 62 */
+ TU6_Vxx(A2R10G10B10_SINT_PACK32, 10_10_10_2_SINT, R10G10B10A2_SINT, WXYZ), /* 63 */
+ TU6_VTC(A2B10G10R10_UNORM_PACK32, 10_10_10_2_UNORM, R10G10B10A2_UNORM, WZYX), /* 64 */
+ TU6_Vxx(A2B10G10R10_SNORM_PACK32, 10_10_10_2_SNORM, R10G10B10A2_SNORM, WZYX), /* 65 */
+ TU6_Vxx(A2B10G10R10_USCALED_PACK32, 10_10_10_2_UINT, R10G10B10A2_UINT, WZYX), /* 66 */
+ TU6_Vxx(A2B10G10R10_SSCALED_PACK32, 10_10_10_2_SINT, R10G10B10A2_SINT, WZYX), /* 67 */
+ TU6_VTC(A2B10G10R10_UINT_PACK32, 10_10_10_2_UINT, R10G10B10A2_UINT, WZYX), /* 68 */
+ TU6_Vxx(A2B10G10R10_SINT_PACK32, 10_10_10_2_SINT, R10G10B10A2_SINT, WZYX), /* 69 */
+
+ /* 16-bit R */
+ TU6_VTC(R16_UNORM, 16_UNORM, R16_UNORM, WZYX), /* 70 */
+ TU6_VTC(R16_SNORM, 16_SNORM, R16_SNORM, WZYX), /* 71 */
+ TU6_Vxx(R16_USCALED, 16_UINT, R16_UINT, WZYX), /* 72 */
+ TU6_Vxx(R16_SSCALED, 16_SINT, R16_SINT, WZYX), /* 73 */
+ TU6_VTC(R16_UINT, 16_UINT, R16_UINT, WZYX), /* 74 */
+ TU6_VTC(R16_SINT, 16_SINT, R16_SINT, WZYX), /* 75 */
+ TU6_VTC(R16_SFLOAT, 16_FLOAT, R16_FLOAT, WZYX), /* 76 */
+
+ /* 32-bit RG */
+ TU6_VTC(R16G16_UNORM, 16_16_UNORM, R16G16_UNORM, WZYX), /* 77 */
+ TU6_VTC(R16G16_SNORM, 16_16_SNORM, R16G16_SNORM, WZYX), /* 78 */
+ TU6_VTx(R16G16_USCALED, 16_16_UINT, R16G16_UINT, WZYX), /* 79 */
+ TU6_VTx(R16G16_SSCALED, 16_16_SINT, R16G16_SINT, WZYX), /* 80 */
+ TU6_VTC(R16G16_UINT, 16_16_UINT, R16G16_UINT, WZYX), /* 81 */
+ TU6_VTC(R16G16_SINT, 16_16_SINT, R16G16_SINT, WZYX), /* 82 */
+ TU6_VTC(R16G16_SFLOAT, 16_16_FLOAT, R16G16_FLOAT, WZYX), /* 83 */
+
+ /* 48-bit RGB */
+ TU6_Vxx(R16G16B16_UNORM, 16_16_16_UNORM, R16G16B16_UNORM, WZYX), /* 84 */
+ TU6_Vxx(R16G16B16_SNORM, 16_16_16_SNORM, R16G16B16_SNORM, WZYX), /* 85 */
+ TU6_Vxx(R16G16B16_USCALED, 16_16_16_UINT, R16G16B16_UINT, WZYX), /* 86 */
+ TU6_Vxx(R16G16B16_SSCALED, 16_16_16_SINT, R16G16B16_SINT, WZYX), /* 87 */
+ TU6_Vxx(R16G16B16_UINT, 16_16_16_UINT, R16G16B16_UINT, WZYX), /* 88 */
+ TU6_Vxx(R16G16B16_SINT, 16_16_16_SINT, R16G16B16_SINT, WZYX), /* 89 */
+ TU6_Vxx(R16G16B16_SFLOAT, 16_16_16_FLOAT, R16G16B16_FLOAT, WZYX), /* 90 */
+
+ /* 64-bit RGBA */
+ TU6_VTC(R16G16B16A16_UNORM, 16_16_16_16_UNORM, R16G16B16A16_UNORM, WZYX), /* 91 */
+ TU6_VTC(R16G16B16A16_SNORM, 16_16_16_16_SNORM, R16G16B16A16_SNORM, WZYX), /* 92 */
+ TU6_VTx(R16G16B16A16_USCALED, 16_16_16_16_UINT, R16G16B16A16_UINT, WZYX), /* 93 */
+ TU6_VTx(R16G16B16A16_SSCALED, 16_16_16_16_SINT, R16G16B16A16_SINT, WZYX), /* 94 */
+ TU6_VTC(R16G16B16A16_UINT, 16_16_16_16_UINT, R16G16B16A16_UINT, WZYX), /* 95 */
+ TU6_VTC(R16G16B16A16_SINT, 16_16_16_16_SINT, R16G16B16A16_SINT, WZYX), /* 96 */
+ TU6_VTC(R16G16B16A16_SFLOAT, 16_16_16_16_FLOAT, R16G16B16A16_FLOAT, WZYX), /* 97 */
+
+ /* 32-bit R */
+ TU6_VTC(R32_UINT, 32_UINT, R32_UINT, WZYX), /* 98 */
+ TU6_VTC(R32_SINT, 32_SINT, R32_SINT, WZYX), /* 99 */
+ TU6_VTC(R32_SFLOAT, 32_FLOAT, R32_FLOAT, WZYX), /* 100 */
+
+ /* 64-bit RG */
+ TU6_VTC(R32G32_UINT, 32_32_UINT, R32G32_UINT, WZYX), /* 101 */
+ TU6_VTC(R32G32_SINT, 32_32_SINT, R32G32_SINT, WZYX), /* 102 */
+ TU6_VTC(R32G32_SFLOAT, 32_32_FLOAT, R32G32_FLOAT, WZYX), /* 103 */
+
+ /* 96-bit RGB */
+ TU6_VTx(R32G32B32_UINT, 32_32_32_UINT, R32G32B32_UINT, WZYX), /* 104 */
+ TU6_VTx(R32G32B32_SINT, 32_32_32_SINT, R32G32B32_SINT, WZYX), /* 105 */
+ TU6_VTx(R32G32B32_SFLOAT, 32_32_32_FLOAT, R32G32B32_FLOAT, WZYX), /* 106 */
+
+ /* 128-bit RGBA */
+ TU6_VTC(R32G32B32A32_UINT, 32_32_32_32_UINT, R32G32B32A32_UINT, WZYX), /* 107 */
+ TU6_VTC(R32G32B32A32_SINT, 32_32_32_32_SINT, R32G32B32A32_SINT, WZYX), /* 108 */
+ TU6_VTC(R32G32B32A32_SFLOAT, 32_32_32_32_FLOAT, R32G32B32A32_FLOAT, WZYX), /* 109 */
+
+ /* 64-bit R */
+ TU6_xxx(R64_UINT, 64_UINT, R64_UINT, WZYX), /* 110 */
+ TU6_xxx(R64_SINT, 64_SINT, R64_SINT, WZYX), /* 111 */
+ TU6_xxx(R64_SFLOAT, 64_FLOAT, R64_FLOAT, WZYX), /* 112 */
+
+ /* 128-bit RG */
+ TU6_xxx(R64G64_UINT, 64_64_UINT, R64G64_UINT, WZYX), /* 113 */
+ TU6_xxx(R64G64_SINT, 64_64_SINT, R64G64_SINT, WZYX), /* 114 */
+ TU6_xxx(R64G64_SFLOAT, 64_64_FLOAT, R64G64_FLOAT, WZYX), /* 115 */
+
+ /* 192-bit RGB */
+ TU6_xxx(R64G64B64_UINT, 64_64_64_UINT, R64G64B64_UINT, WZYX), /* 116 */
+ TU6_xxx(R64G64B64_SINT, 64_64_64_SINT, R64G64B64_SINT, WZYX), /* 117 */
+ TU6_xxx(R64G64B64_SFLOAT, 64_64_64_FLOAT, R64G64B64_FLOAT, WZYX), /* 118 */
+
+ /* 256-bit RGBA */
+ TU6_xxx(R64G64B64A64_UINT, 64_64_64_64_UINT, R64G64B64A64_UINT, WZYX), /* 119 */
+ TU6_xxx(R64G64B64A64_SINT, 64_64_64_64_SINT, R64G64B64A64_SINT, WZYX), /* 120 */
+ TU6_xxx(R64G64B64A64_SFLOAT, 64_64_64_64_FLOAT, R64G64B64A64_FLOAT, WZYX), /* 121 */
+
+ /* 32-bit packed float */
+ TU6_VTC(B10G11R11_UFLOAT_PACK32, 11_11_10_FLOAT, R11G11B10_FLOAT, WZYX), /* 122 */
+ TU6_xTx(E5B9G9R9_UFLOAT_PACK32, 9_9_9_E5_FLOAT, R9G9B9E5_FLOAT, WZYX), /* 123 */
+
+ /* depth/stencil */
+ TU6_xTC(D16_UNORM, 16_UNORM, R16_UNORM, WZYX), /* 124 */
+ TU6_xTC(X8_D24_UNORM_PACK32, X8Z24_UNORM, X8Z24_UNORM, WZYX), /* 125 */
+ TU6_xTC(D32_SFLOAT, 32_FLOAT, R32_FLOAT, WZYX), /* 126 */
+ TU6_xTC(S8_UINT, 8_UINT, R8_UNORM, WZYX), /* 127 */
+ TU6_xxx(D16_UNORM_S8_UINT, X8Z16_UNORM, X8Z16_UNORM, WZYX), /* 128 */
+ TU6_xTC(D24_UNORM_S8_UINT, X8Z24_UNORM, X8Z24_UNORM, WZYX), /* 129 */
+ TU6_xTC(D32_SFLOAT_S8_UINT, 32_FLOAT, R32_FLOAT, WZYX), /* 130 */
+
+ /* compressed */
+ TU6_xTx(BC1_RGB_UNORM_BLOCK, DXT1, DXT1, WZYX), /* 131 */
+ TU6_xTx(BC1_RGB_SRGB_BLOCK, DXT1, DXT1, WZYX), /* 132 */
+ TU6_xTx(BC1_RGBA_UNORM_BLOCK, DXT1, DXT1, WZYX), /* 133 */
+ TU6_xTx(BC1_RGBA_SRGB_BLOCK, DXT1, DXT1, WZYX), /* 134 */
+ TU6_xTx(BC2_UNORM_BLOCK, DXT3, DXT3, WZYX), /* 135 */
+ TU6_xTx(BC2_SRGB_BLOCK, DXT3, DXT3, WZYX), /* 136 */
+ TU6_xTx(BC3_UNORM_BLOCK, DXT5, DXT5, WZYX), /* 137 */
+ TU6_xTx(BC3_SRGB_BLOCK, DXT5, DXT5, WZYX), /* 138 */
+ TU6_xTx(BC4_UNORM_BLOCK, RGTC1_UNORM, RGTC1_UNORM, WZYX), /* 139 */
+ TU6_xTx(BC4_SNORM_BLOCK, RGTC1_SNORM, RGTC1_SNORM, WZYX), /* 140 */
+ TU6_xTx(BC5_UNORM_BLOCK, RGTC2_UNORM, RGTC2_UNORM, WZYX), /* 141 */
+ TU6_xTx(BC5_SNORM_BLOCK, RGTC2_SNORM, RGTC2_SNORM, WZYX), /* 142 */
+ TU6_xTx(BC6H_UFLOAT_BLOCK, BPTC_UFLOAT, BPTC_UFLOAT, WZYX), /* 143 */
+ TU6_xTx(BC6H_SFLOAT_BLOCK, BPTC_FLOAT, BPTC_FLOAT, WZYX), /* 144 */
+ TU6_xTx(BC7_UNORM_BLOCK, BPTC, BPTC, WZYX), /* 145 */
+ TU6_xTx(BC7_SRGB_BLOCK, BPTC, BPTC, WZYX), /* 146 */
+ TU6_xTx(ETC2_R8G8B8_UNORM_BLOCK, ETC2_RGB8, ETC2_RGB8, WZYX), /* 147 */
+ TU6_xTx(ETC2_R8G8B8_SRGB_BLOCK, ETC2_RGB8, ETC2_RGB8, WZYX), /* 148 */
+ TU6_xTx(ETC2_R8G8B8A1_UNORM_BLOCK, ETC2_RGB8A1, ETC2_RGB8A1, WZYX), /* 149 */
+ TU6_xTx(ETC2_R8G8B8A1_SRGB_BLOCK, ETC2_RGB8A1, ETC2_RGB8A1, WZYX), /* 150 */
+ TU6_xTx(ETC2_R8G8B8A8_UNORM_BLOCK, ETC2_RGBA8, ETC2_RGBA8, WZYX), /* 151 */
+ TU6_xTx(ETC2_R8G8B8A8_SRGB_BLOCK, ETC2_RGBA8, ETC2_RGBA8, WZYX), /* 152 */
+ TU6_xTx(EAC_R11_UNORM_BLOCK, ETC2_R11_UNORM, ETC2_R11_UNORM, WZYX), /* 153 */
+ TU6_xTx(EAC_R11_SNORM_BLOCK, ETC2_R11_SNORM, ETC2_R11_SNORM, WZYX), /* 154 */
+ TU6_xTx(EAC_R11G11_UNORM_BLOCK, ETC2_RG11_UNORM, ETC2_RG11_UNORM, WZYX), /* 155 */
+ TU6_xTx(EAC_R11G11_SNORM_BLOCK, ETC2_RG11_SNORM, ETC2_RG11_SNORM, WZYX), /* 156 */
+ TU6_xTx(ASTC_4x4_UNORM_BLOCK, ASTC_4x4, ASTC_4x4, WZYX), /* 157 */
+ TU6_xTx(ASTC_4x4_SRGB_BLOCK, ASTC_4x4, ASTC_4x4, WZYX), /* 158 */
+ TU6_xTx(ASTC_5x4_UNORM_BLOCK, ASTC_5x4, ASTC_5x4, WZYX), /* 159 */
+ TU6_xTx(ASTC_5x4_SRGB_BLOCK, ASTC_5x4, ASTC_5x4, WZYX), /* 160 */
+ TU6_xTx(ASTC_5x5_UNORM_BLOCK, ASTC_5x5, ASTC_5x5, WZYX), /* 161 */
+ TU6_xTx(ASTC_5x5_SRGB_BLOCK, ASTC_5x5, ASTC_5x5, WZYX), /* 162 */
+ TU6_xTx(ASTC_6x5_UNORM_BLOCK, ASTC_6x5, ASTC_6x5, WZYX), /* 163 */
+ TU6_xTx(ASTC_6x5_SRGB_BLOCK, ASTC_6x5, ASTC_6x5, WZYX), /* 164 */
+ TU6_xTx(ASTC_6x6_UNORM_BLOCK, ASTC_6x6, ASTC_6x6, WZYX), /* 165 */
+ TU6_xTx(ASTC_6x6_SRGB_BLOCK, ASTC_6x6, ASTC_6x6, WZYX), /* 166 */
+ TU6_xTx(ASTC_8x5_UNORM_BLOCK, ASTC_8x5, ASTC_8x5, WZYX), /* 167 */
+ TU6_xTx(ASTC_8x5_SRGB_BLOCK, ASTC_8x5, ASTC_8x5, WZYX), /* 168 */
+ TU6_xTx(ASTC_8x6_UNORM_BLOCK, ASTC_8x6, ASTC_8x6, WZYX), /* 169 */
+ TU6_xTx(ASTC_8x6_SRGB_BLOCK, ASTC_8x6, ASTC_8x6, WZYX), /* 170 */
+ TU6_xTx(ASTC_8x8_UNORM_BLOCK, ASTC_8x8, ASTC_8x8, WZYX), /* 171 */
+ TU6_xTx(ASTC_8x8_SRGB_BLOCK, ASTC_8x8, ASTC_8x8, WZYX), /* 172 */
+ TU6_xTx(ASTC_10x5_UNORM_BLOCK, ASTC_10x5, ASTC_10x5, WZYX), /* 173 */
+ TU6_xTx(ASTC_10x5_SRGB_BLOCK, ASTC_10x5, ASTC_10x5, WZYX), /* 174 */
+ TU6_xTx(ASTC_10x6_UNORM_BLOCK, ASTC_10x6, ASTC_10x6, WZYX), /* 175 */
+ TU6_xTx(ASTC_10x6_SRGB_BLOCK, ASTC_10x6, ASTC_10x6, WZYX), /* 176 */
+ TU6_xTx(ASTC_10x8_UNORM_BLOCK, ASTC_10x8, ASTC_10x8, WZYX), /* 177 */
+ TU6_xTx(ASTC_10x8_SRGB_BLOCK, ASTC_10x8, ASTC_10x8, WZYX), /* 178 */
+ TU6_xTx(ASTC_10x10_UNORM_BLOCK, ASTC_10x10, ASTC_10x10, WZYX), /* 179 */
+ TU6_xTx(ASTC_10x10_SRGB_BLOCK, ASTC_10x10, ASTC_10x10, WZYX), /* 180 */
+ TU6_xTx(ASTC_12x10_UNORM_BLOCK, ASTC_12x10, ASTC_12x10, WZYX), /* 181 */
+ TU6_xTx(ASTC_12x10_SRGB_BLOCK, ASTC_12x10, ASTC_12x10, WZYX), /* 182 */
+ TU6_xTx(ASTC_12x12_UNORM_BLOCK, ASTC_12x12, ASTC_12x12, WZYX), /* 183 */
+ TU6_xTx(ASTC_12x12_SRGB_BLOCK, ASTC_12x12, ASTC_12x12, WZYX), /* 184 */
+};
+#undef TU_FORMAT_TABLE_FIRST
+#undef TU_FORMAT_TABLE_LAST
-bool
-tu6_format_vtx_supported(VkFormat vk_format)
+const struct tu_native_format *
+tu6_get_native_format(VkFormat format)
{
- enum pipe_format format = vk_format_to_pipe_format(vk_format);
- return fd6_vertex_format(format) != FMT6_NONE;
+ const struct tu_native_format *fmt = NULL;
+
+ if (format >= tu6_format_table0_first && format <= tu6_format_table0_last)
+ fmt = &tu6_format_table0[format - tu6_format_table0_first];
+
+ return (fmt && fmt->present) ? fmt : NULL;
}
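/* Illustrative usage sketch (hypothetical helper, not part of this import):
 * the lookup above returns NULL for formats outside the table range or for
 * rows with no native a6xx encoding, so a NULL result simply means
 * "unsupported".
 */
static bool
example_format_has_native_encoding(VkFormat format)
{
   return tu6_get_native_format(format) != NULL;
}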
-/* Map non-colorspace-converted YUV formats to RGB pipe formats where we can,
- * since our hardware doesn't support colorspace conversion.
- *
- * Really, we should probably be returning the RGB formats in
- * vk_format_to_pipe_format, but we don't have all the equivalent pipe formats
- * for VK RGB formats yet, and we'd have to switch all consumers of that
- * function at once.
- */
-enum pipe_format
-tu_vk_format_to_pipe_format(VkFormat vk_format)
+enum a6xx_2d_ifmt
+tu6_rb_fmt_to_ifmt(enum a6xx_color_fmt fmt)
{
- switch (vk_format) {
- case VK_FORMAT_G8B8G8R8_422_UNORM: /* YUYV */
- return PIPE_FORMAT_R8G8_R8B8_UNORM;
- case VK_FORMAT_B8G8R8G8_422_UNORM: /* UYVY */
- return PIPE_FORMAT_G8R8_B8R8_UNORM;
- case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
- return PIPE_FORMAT_G8_B8R8_420_UNORM;
- case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
- return PIPE_FORMAT_G8_B8_R8_420_UNORM;
+ switch (fmt) {
+ case RB6_A8_UNORM:
+ case RB6_R8_UNORM:
+ case RB6_R8_SNORM:
+ case RB6_R8G8_UNORM:
+ case RB6_R8G8_SNORM:
+ case RB6_R8G8B8A8_UNORM:
+ case RB6_R8G8B8_UNORM:
+ case RB6_R8G8B8A8_SNORM:
+ return R2D_UNORM8;
+
+ case RB6_R32_UINT:
+ case RB6_R32_SINT:
+ case RB6_R32G32_UINT:
+ case RB6_R32G32_SINT:
+ case RB6_R32G32B32A32_UINT:
+ case RB6_R32G32B32A32_SINT:
+ return R2D_INT32;
+
+ case RB6_R16_UINT:
+ case RB6_R16_SINT:
+ case RB6_R16G16_UINT:
+ case RB6_R16G16_SINT:
+ case RB6_R16G16B16A16_UINT:
+ case RB6_R16G16B16A16_SINT:
+ return R2D_INT16;
+
+ case RB6_R8_UINT:
+ case RB6_R8_SINT:
+ case RB6_R8G8_UINT:
+ case RB6_R8G8_SINT:
+ case RB6_R8G8B8A8_UINT:
+ case RB6_R8G8B8A8_SINT:
+ return R2D_INT8;
+
+ case RB6_R16_UNORM:
+ case RB6_R16_SNORM:
+ case RB6_R16G16_UNORM:
+ case RB6_R16G16_SNORM:
+ case RB6_R16G16B16A16_UNORM:
+ case RB6_R16G16B16A16_SNORM:
+ case RB6_R32_FLOAT:
+ case RB6_R32G32_FLOAT:
+ case RB6_R32G32B32A32_FLOAT:
+ return R2D_FLOAT32;
+
+ case RB6_R16_FLOAT:
+ case RB6_R16G16_FLOAT:
+ case RB6_R16G16B16A16_FLOAT:
+ return R2D_FLOAT16;
+
+ case RB6_R4G4B4A4_UNORM:
+ case RB6_R5G5B5A1_UNORM:
+ case RB6_R5G6B5_UNORM:
+ case RB6_R10G10B10A2_UNORM:
+ case RB6_R10G10B10A2_UINT:
+ case RB6_R11G11B10_FLOAT:
+ case RB6_X8Z24_UNORM:
+ // ???
+ return 0;
default:
- return vk_format_to_pipe_format(vk_format);
+ unreachable("bad format");
+ return 0;
}
}
-static struct tu_native_format
-tu6_format_color_unchecked(enum pipe_format format, enum a6xx_tile_mode tile_mode)
+static uint32_t
+tu_pack_mask(int bits)
{
- struct tu_native_format fmt = {
- .fmt = fd6_color_format(format, tile_mode),
- .swap = fd6_color_swap(format, tile_mode),
- };
-
- switch (format) {
- case PIPE_FORMAT_Z24X8_UNORM:
- case PIPE_FORMAT_Z24_UNORM_S8_UINT:
- fmt.fmt = FMT6_8_8_8_8_UNORM;
- break;
-
- default:
- break;
- }
-
- return fmt;
+ assert(bits <= 32);
+ return (1ull << bits) - 1;
}
-bool
-tu6_format_color_supported(enum pipe_format format)
+static uint32_t
+tu_pack_float32_for_unorm(float val, int bits)
{
- return tu6_format_color_unchecked(format, TILE6_LINEAR).fmt != FMT6_NONE;
+ const uint32_t max = tu_pack_mask(bits);
+ if (val < 0.0f)
+ return 0;
+ else if (val > 1.0f)
+ return max;
+ else
+ return _mesa_lroundevenf(val * (float) max);
}
-struct tu_native_format
-tu6_format_color(enum pipe_format format, enum a6xx_tile_mode tile_mode)
+static uint32_t
+tu_pack_float32_for_snorm(float val, int bits)
{
- struct tu_native_format fmt = tu6_format_color_unchecked(format, tile_mode);
- assert(fmt.fmt != FMT6_NONE);
- return fmt;
+ const int32_t max = tu_pack_mask(bits - 1);
+ int32_t tmp;
+ if (val < -1.0f)
+ tmp = -max;
+ else if (val > 1.0f)
+ tmp = max;
+ else
+ tmp = _mesa_lroundevenf(val * (float) max);
+
+ return tmp & tu_pack_mask(bits);
}
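/* A few worked values for the two helpers above (bits = 8), assuming the
 * round-to-nearest-even behaviour of _mesa_lroundevenf:
 *
 *   tu_pack_float32_for_unorm( 1.0f, 8) -> 0xff   (clamped to max = 255)
 *   tu_pack_float32_for_unorm( 0.5f, 8) -> 0x80   (127.5 rounds to 128)
 *   tu_pack_float32_for_unorm(-0.3f, 8) -> 0x00   (clamped to 0)
 *
 *   tu_pack_float32_for_snorm( 1.0f, 8) -> 0x7f   (clamped to +127)
 *   tu_pack_float32_for_snorm(-1.0f, 8) -> 0x81   (-127, masked to 8 bits)
 */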
-static struct tu_native_format
-tu6_format_texture_unchecked(enum pipe_format format, enum a6xx_tile_mode tile_mode)
+static uint32_t
+tu_pack_float32_for_uscaled(float val, int bits)
{
- struct tu_native_format fmt = {
- .fmt = fd6_texture_format(format, tile_mode),
- .swap = fd6_texture_swap(format, tile_mode),
- };
-
- switch (format) {
- case PIPE_FORMAT_Z24X8_UNORM:
- case PIPE_FORMAT_Z24_UNORM_S8_UINT:
- /* freedreno uses Z24_UNORM_S8_UINT (sampling) or
- * FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 (blits) for this format, while we use
- * FMT6_8_8_8_8_UNORM or FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8
- */
- fmt.fmt = FMT6_8_8_8_8_UNORM;
- break;
+ const uint32_t max = tu_pack_mask(bits);
+ if (val < 0.0f)
+ return 0;
+ else if (val > (float) max)
+ return max;
+ else
+ return (uint32_t) val;
+}
- default:
- break;
- }
+static uint32_t
+tu_pack_float32_for_sscaled(float val, int bits)
+{
+ const int32_t max = tu_pack_mask(bits - 1);
+ const int32_t min = -max - 1;
+ int32_t tmp;
+ if (val < (float) min)
+ tmp = min;
+ else if (val > (float) max)
+ tmp = max;
+ else
+ tmp = (int32_t) val;
+
+ return tmp & tu_pack_mask(bits);
+}
- return fmt;
+static uint32_t
+tu_pack_uint32_for_uint(uint32_t val, int bits)
+{
+ return val & tu_pack_mask(bits);
}
-struct tu_native_format
-tu6_format_texture(enum pipe_format format, enum a6xx_tile_mode tile_mode)
+static uint32_t
+tu_pack_int32_for_sint(int32_t val, int bits)
{
- struct tu_native_format fmt = tu6_format_texture_unchecked(format, tile_mode);
- assert(fmt.fmt != FMT6_NONE);
- return fmt;
+ return val & tu_pack_mask(bits);
}
-bool
-tu6_format_texture_supported(enum pipe_format format)
+static uint32_t
+tu_pack_float32_for_sfloat(float val, int bits)
{
- return tu6_format_texture_unchecked(format, TILE6_LINEAR).fmt != FMT6_NONE;
+ assert(bits == 16 || bits == 32);
+ return bits == 16 ? util_float_to_half(val) : fui(val);
}
-enum tu6_ubwc_compat_type {
- TU6_UBWC_UNKNOWN_COMPAT,
- TU6_UBWC_R8G8_UNORM,
- TU6_UBWC_R8G8_INT,
- TU6_UBWC_R8G8B8A8_UNORM,
- TU6_UBWC_R8G8B8A8_INT,
- TU6_UBWC_B8G8R8A8_UNORM,
- TU6_UBWC_R16G16_INT,
- TU6_UBWC_R16G16B16A16_INT,
- TU6_UBWC_R32_INT,
- TU6_UBWC_R32G32_INT,
- TU6_UBWC_R32G32B32A32_INT,
- TU6_UBWC_R32_FLOAT,
+union tu_clear_component_value {
+ float float32;
+ int32_t int32;
+ uint32_t uint32;
};
-static enum tu6_ubwc_compat_type
-tu6_ubwc_compat_mode(VkFormat format)
+static uint32_t
+tu_pack_clear_component_value(union tu_clear_component_value val,
+ const struct vk_format_channel_description *ch)
{
- switch (format) {
- case VK_FORMAT_R8G8_UNORM:
- case VK_FORMAT_R8G8_SRGB:
- return TU6_UBWC_R8G8_UNORM;
-
- case VK_FORMAT_R8G8_UINT:
- case VK_FORMAT_R8G8_SINT:
- return TU6_UBWC_R8G8_INT;
-
- case VK_FORMAT_R8G8B8A8_UNORM:
- case VK_FORMAT_R8G8B8A8_SRGB:
- case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
- case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
- return TU6_UBWC_R8G8B8A8_UNORM;
-
- case VK_FORMAT_R8G8B8A8_UINT:
- case VK_FORMAT_R8G8B8A8_SINT:
- case VK_FORMAT_A8B8G8R8_UINT_PACK32:
- case VK_FORMAT_A8B8G8R8_SINT_PACK32:
- return TU6_UBWC_R8G8B8A8_INT;
-
- case VK_FORMAT_R16G16_UINT:
- case VK_FORMAT_R16G16_SINT:
- return TU6_UBWC_R16G16_INT;
-
- case VK_FORMAT_R16G16B16A16_UINT:
- case VK_FORMAT_R16G16B16A16_SINT:
- return TU6_UBWC_R16G16B16A16_INT;
-
- case VK_FORMAT_R32_UINT:
- case VK_FORMAT_R32_SINT:
- return TU6_UBWC_R32_INT;
-
- case VK_FORMAT_R32G32_UINT:
- case VK_FORMAT_R32G32_SINT:
- return TU6_UBWC_R32G32_INT;
-
- case VK_FORMAT_R32G32B32A32_UINT:
- case VK_FORMAT_R32G32B32A32_SINT:
- return TU6_UBWC_R32G32B32A32_INT;
-
- case VK_FORMAT_D32_SFLOAT:
- case VK_FORMAT_R32_SFLOAT:
- /* TODO: a630 blob allows these, but not a660. When is it legal? */
- return TU6_UBWC_UNKNOWN_COMPAT;
-
- case VK_FORMAT_B8G8R8A8_UNORM:
- case VK_FORMAT_B8G8R8A8_SRGB:
- /* The blob doesn't list these as compatible, but they surely are.
- * freedreno's happy to cast between them, and zink would really like
- * to.
- */
- return TU6_UBWC_B8G8R8A8_UNORM;
-
+ uint32_t packed;
+
+ switch (ch->type) {
+ case VK_FORMAT_TYPE_UNSIGNED:
+ /* normalized, scaled, or pure integer */
+ assert(ch->normalized + ch->scaled + ch->pure_integer == 1);
+ if (ch->normalized)
+ packed = tu_pack_float32_for_unorm(val.float32, ch->size);
+ else if (ch->scaled)
+ packed = tu_pack_float32_for_uscaled(val.float32, ch->size);
+ else
+ packed = tu_pack_uint32_for_uint(val.uint32, ch->size);
+ break;
+ case VK_FORMAT_TYPE_SIGNED:
+ /* normalized, scaled, or pure integer */
+ assert(ch->normalized + ch->scaled + ch->pure_integer == 1);
+ if (ch->normalized)
+ packed = tu_pack_float32_for_snorm(val.float32, ch->size);
+ else if (ch->scaled)
+ packed = tu_pack_float32_for_sscaled(val.float32, ch->size);
+ else
+ packed = tu_pack_int32_for_sint(val.int32, ch->size);
+ break;
+ case VK_FORMAT_TYPE_FLOAT:
+ packed = tu_pack_float32_for_sfloat(val.float32, ch->size);
+ break;
default:
- return TU6_UBWC_UNKNOWN_COMPAT;
+ unreachable("unexpected channel type");
+ packed = 0;
+ break;
}
+
+ assert((packed & tu_pack_mask(ch->size)) == packed);
+ return packed;
}
-bool
-tu6_mutable_format_list_ubwc_compatible(const VkImageFormatListCreateInfo *fmt_list)
+static const struct vk_format_channel_description *
+tu_get_format_channel_description(const struct vk_format_description *desc,
+ int comp)
{
- if (!fmt_list || !fmt_list->viewFormatCount)
- return false;
-
- /* We're only looking at format list cross compatibility here, check
- * ubwc_possible() for the base "is the format UBWC-able at all?"
- */
- if (fmt_list->viewFormatCount == 1)
- return true;
-
- enum tu6_ubwc_compat_type type =
- tu6_ubwc_compat_mode(fmt_list->pViewFormats[0]);
- if (type == TU6_UBWC_UNKNOWN_COMPAT)
- return false;
-
- for (uint32_t i = 1; i < fmt_list->viewFormatCount; i++) {
- if (tu6_ubwc_compat_mode(fmt_list->pViewFormats[i]) != type)
- return false;
+ switch (desc->swizzle[comp]) {
+ case VK_SWIZZLE_X:
+ return &desc->channel[0];
+ case VK_SWIZZLE_Y:
+ return &desc->channel[1];
+ case VK_SWIZZLE_Z:
+ return &desc->channel[2];
+ case VK_SWIZZLE_W:
+ return &desc->channel[3];
+ default:
+ return NULL;
}
-
- return true;
}
-static void
-tu_physical_device_get_format_properties(
- struct tu_physical_device *physical_device,
- VkFormat vk_format,
- VkFormatProperties3 *out_properties)
+static union tu_clear_component_value
+tu_get_clear_component_value(const VkClearValue *val, int comp, bool color)
{
- VkFormatFeatureFlags2 linear = 0, optimal = 0, buffer = 0;
- enum pipe_format format = tu_vk_format_to_pipe_format(vk_format);
- const struct util_format_description *desc = util_format_description(format);
-
- bool supported_vtx = tu6_format_vtx_supported(vk_format);
- bool supported_color = tu6_format_color_supported(format);
- bool supported_tex = tu6_format_texture_supported(format);
- bool is_npot = !util_is_power_of_two_or_zero(desc->block.bits);
-
- if (format == PIPE_FORMAT_NONE ||
- !(supported_vtx || supported_color || supported_tex)) {
- goto end;
+ union tu_clear_component_value tmp;
+ if (color) {
+ assert(comp < 4);
+ tmp.uint32 = val->color.uint32[comp];
+ } else {
+ assert(comp < 2);
+ if (comp == 0)
+ tmp.float32 = val->depthStencil.depth;
+ else
+ tmp.uint32 = val->depthStencil.stencil;
}
- /* We don't support BufferToImage/ImageToBuffer for npot formats */
- if (!is_npot)
- buffer |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT | VK_FORMAT_FEATURE_TRANSFER_DST_BIT;
+ return tmp;
+}
- if (supported_vtx)
- buffer |= VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT;
+/**
+ * Pack a VkClearValue into a 128-bit buffer. \a format is respected except
+ * for the component order. The components are always packed in WZYX order
+ * (i.e., msb is white and lsb is red).
+ *
+ * Return the number of uint32_t's used.
+ */
+int
+tu_pack_clear_value(const VkClearValue *val, VkFormat format, uint32_t buf[4])
+{
+ const struct vk_format_description *desc = vk_format_description(format);
+ assert(desc && desc->layout == VK_FORMAT_LAYOUT_PLAIN);
+
+ /* S8_UINT is special and has no depth */
+ const int max_components =
+ format == VK_FORMAT_S8_UINT ? 2 : desc->nr_channels;
+
+ int buf_offset = 0;
+ int bit_shift = 0;
+ for (int comp = 0; comp < max_components; comp++) {
+ const struct vk_format_channel_description *ch =
+ tu_get_format_channel_description(desc, comp);
+ if (!ch) {
+ assert(format == VK_FORMAT_S8_UINT && comp == 0);
+ continue;
+ }
- if (supported_tex)
- buffer |= VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT;
+ union tu_clear_component_value v = tu_get_clear_component_value(
+ val, comp, desc->colorspace != VK_FORMAT_COLORSPACE_ZS);
- /* Don't support anything but texel buffers for non-power-of-two formats
- * with 3 components. We'd need several workarounds for copying and
- * clearing them because they're not renderable.
- */
- if (supported_tex && !is_npot) {
- optimal |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT |
- VK_FORMAT_FEATURE_TRANSFER_DST_BIT |
- VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT |
- VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT |
- VK_FORMAT_FEATURE_COSITED_CHROMA_SAMPLES_BIT |
- VK_FORMAT_FEATURE_MIDPOINT_CHROMA_SAMPLES_BIT;
-
- /* no blit src bit for YUYV/NV12/I420 formats */
- if (desc->layout != UTIL_FORMAT_LAYOUT_SUBSAMPLED &&
- desc->layout != UTIL_FORMAT_LAYOUT_PLANAR2 &&
- desc->layout != UTIL_FORMAT_LAYOUT_PLANAR3)
- optimal |= VK_FORMAT_FEATURE_BLIT_SRC_BIT;
-
- if (desc->layout != UTIL_FORMAT_LAYOUT_SUBSAMPLED)
- optimal |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_YCBCR_CONVERSION_LINEAR_FILTER_BIT;
-
- if (!vk_format_is_int(vk_format)) {
- optimal |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT;
-
- if (physical_device->vk.supported_extensions.EXT_filter_cubic)
- optimal |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_CUBIC_BIT_EXT;
+ /* move to the next uint32_t when there is not enough space */
+ assert(ch->size <= 32);
+ if (bit_shift + ch->size > 32) {
+ buf_offset++;
+ bit_shift = 0;
}
- }
- if (supported_color) {
- assert(supported_tex);
- optimal |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT |
- VK_FORMAT_FEATURE_BLIT_DST_BIT |
- VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT |
- VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT |
- VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT;
+ if (bit_shift == 0)
+ buf[buf_offset] = 0;
- buffer |= VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT |
- VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT |
- VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT;
+ buf[buf_offset] |= tu_pack_clear_component_value(v, ch) << bit_shift;
+ bit_shift += ch->size;
+ }
- /* TODO: The blob also exposes these for R16G16_UINT/R16G16_SINT, but we
- * don't have any tests for those.
- */
- if (vk_format == VK_FORMAT_R32_UINT || vk_format == VK_FORMAT_R32_SINT) {
- optimal |= VK_FORMAT_FEATURE_STORAGE_IMAGE_ATOMIC_BIT;
- buffer |= VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_ATOMIC_BIT;
- }
+ return buf_offset + 1;
+}
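/* Minimal usage sketch (illustrative values, not part of this import):
 * packing an opaque-red clear for VK_FORMAT_R8G8B8A8_UNORM fills a single
 * dword in WZYX order, alpha in the msb and red in the lsb.
 */
static void
example_pack_red_clear(void)
{
   VkClearValue clear = { .color = { .float32 = { 1.0f, 0.0f, 0.0f, 1.0f } } };
   uint32_t buf[4];
   int dwords = tu_pack_clear_value(&clear, VK_FORMAT_R8G8B8A8_UNORM, buf);

   assert(dwords == 1);
   assert(buf[0] == 0xff0000ffu);
}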
- if (!util_format_is_pure_integer(format))
- optimal |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT;
+static void
+tu_physical_device_get_format_properties(
+ struct tu_physical_device *physical_device,
+ VkFormat format,
+ VkFormatProperties *out_properties)
+{
+ VkFormatFeatureFlags linear = 0, tiled = 0, buffer = 0;
+ const struct vk_format_description *desc = vk_format_description(format);
+ const struct tu_native_format *native_fmt = tu6_get_native_format(format);
+ if (!desc || !native_fmt) {
+ out_properties->linearTilingFeatures = linear;
+ out_properties->optimalTilingFeatures = tiled;
+ out_properties->bufferFeatures = buffer;
+ return;
}
- /* For the most part, we can do anything with a linear image that we could
- * do with a tiled image. However, we can't support sysmem rendering with a
- * linear depth texture, because we don't know if there's a bit to control
- * the tiling of the depth buffer in BYPASS mode, and the blob also
- * disables linear depth rendering, so there's no way to discover it. We
- * also can't force GMEM mode, because there are other situations where we
- * have to use sysmem rendering. So follow the blob here, and only enable
- * DEPTH_STENCIL_ATTACHMENT_BIT for the optimal features.
- */
- linear = optimal;
- if (tu6_pipe2depth(vk_format) != (enum a6xx_depth_format)~0)
- optimal |= VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT;
-
- if (!tiling_possible(vk_format) &&
- /* We don't actually support tiling for this format, but we need to
- * fake it as it's required by VK_KHR_sampler_ycbcr_conversion.
- */
- vk_format != VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM) {
- optimal = 0;
- }
+ linear |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT | VK_FORMAT_FEATURE_TRANSFER_DST_BIT;
+ tiled |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT | VK_FORMAT_FEATURE_TRANSFER_DST_BIT;
+ buffer |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT | VK_FORMAT_FEATURE_TRANSFER_DST_BIT;
- if (vk_format == VK_FORMAT_G8B8G8R8_422_UNORM ||
- vk_format == VK_FORMAT_B8G8R8G8_422_UNORM ||
- vk_format == VK_FORMAT_G8_B8R8_2PLANE_420_UNORM ||
- vk_format == VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM) {
- /* Disable buffer texturing of subsampled (422) and planar YUV textures.
- * The subsampling requirement comes from "If format is a block-compressed
- * format, then bufferFeatures must not support any features for the
- * format" plus the specification of subsampled as 2x1 compressed block
- * format. I couldn't find the citation for planar, but 1D access of
- * planar YUV would be really silly.
- */
- buffer = 0;
+ if (native_fmt->tex >= 0) {
+ linear |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT;
+ tiled |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT;
+ buffer |= VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT;
}
- /* We don't support writing into VK_FORMAT_*_PACK16 images/buffers */
- if (desc->nr_channels > 2 && desc->block.bits == 16) {
- buffer &= VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT;
- linear &= ~(VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT |
- VK_FORMAT_FEATURE_STORAGE_IMAGE_ATOMIC_BIT);
- optimal &= ~(VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT |
- VK_FORMAT_FEATURE_STORAGE_IMAGE_ATOMIC_BIT);
+ if (native_fmt->rb >= 0) {
+ linear |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT;
+ tiled |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT;
}
- /* All our depth formats support shadow comparisons. */
- if (vk_format_has_depth(vk_format) && (optimal & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT)) {
- optimal |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_DEPTH_COMPARISON_BIT;
- linear |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_DEPTH_COMPARISON_BIT;
+ if (native_fmt->vtx >= 0) {
+ buffer |= VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT;
}
- /* From the Vulkan 1.3.205 spec, section 19.3 "43.3. Required Format Support":
- *
- * Mandatory format support: depth/stencil with VkImageType
- * VK_IMAGE_TYPE_2D
- * [...]
- * bufferFeatures must not support any features for these formats
- */
- if (vk_format_is_depth_or_stencil(vk_format))
- buffer = 0;
-
- /* D32_SFLOAT_S8_UINT is tiled as two images, so there is no linear format.
- * The blob enables some linear features, but it's not useful, so don't bother.
- */
- if (vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT)
- linear = 0;
-
-end:
out_properties->linearTilingFeatures = linear;
- out_properties->optimalTilingFeatures = optimal;
+ out_properties->optimalTilingFeatures = tiled;
out_properties->bufferFeatures = buffer;
}
-VKAPI_ATTR void VKAPI_CALL
+void
+tu_GetPhysicalDeviceFormatProperties(VkPhysicalDevice physicalDevice,
+ VkFormat format,
+ VkFormatProperties *pFormatProperties)
+{
+ TU_FROM_HANDLE(tu_physical_device, physical_device, physicalDevice);
+
+ tu_physical_device_get_format_properties(physical_device, format,
+ pFormatProperties);
+}
+
+void
tu_GetPhysicalDeviceFormatProperties2(
VkPhysicalDevice physicalDevice,
VkFormat format,
@@ -397,61 +663,18 @@ tu_GetPhysicalDeviceFormatProperties2(
{
TU_FROM_HANDLE(tu_physical_device, physical_device, physicalDevice);
- VkFormatProperties3 local_props3;
- VkFormatProperties3 *props3 =
- vk_find_struct(pFormatProperties->pNext, FORMAT_PROPERTIES_3);
- if (!props3)
- props3 = &local_props3;
-
tu_physical_device_get_format_properties(
- physical_device, format, props3);
-
- pFormatProperties->formatProperties = (VkFormatProperties) {
- .linearTilingFeatures = props3->linearTilingFeatures,
- .optimalTilingFeatures = props3->optimalTilingFeatures,
- .bufferFeatures = props3->bufferFeatures,
- };
-
- VkDrmFormatModifierPropertiesListEXT *list =
- vk_find_struct(pFormatProperties->pNext, DRM_FORMAT_MODIFIER_PROPERTIES_LIST_EXT);
- if (list) {
- VK_OUTARRAY_MAKE_TYPED(VkDrmFormatModifierPropertiesEXT, out,
- list->pDrmFormatModifierProperties,
- &list->drmFormatModifierCount);
-
- if (pFormatProperties->formatProperties.linearTilingFeatures) {
- vk_outarray_append_typed(VkDrmFormatModifierPropertiesEXT, &out, mod_props) {
- mod_props->drmFormatModifier = DRM_FORMAT_MOD_LINEAR;
- mod_props->drmFormatModifierPlaneCount = tu6_plane_count(format);
- mod_props->drmFormatModifierTilingFeatures =
- pFormatProperties->formatProperties.linearTilingFeatures;
- }
- }
-
- /* note: all ubwc_possible() arguments other than the format are placeholders here */
- if (pFormatProperties->formatProperties.optimalTilingFeatures &&
- tiling_possible(format) &&
- ubwc_possible(NULL, format, VK_IMAGE_TYPE_2D, 0, 0,
- physical_device->info, VK_SAMPLE_COUNT_1_BIT,
- false)) {
- vk_outarray_append_typed(VkDrmFormatModifierPropertiesEXT, &out, mod_props) {
- mod_props->drmFormatModifier = DRM_FORMAT_MOD_QCOM_COMPRESSED;
- mod_props->drmFormatModifierPlaneCount = tu6_plane_count(format);
- mod_props->drmFormatModifierTilingFeatures =
- pFormatProperties->formatProperties.optimalTilingFeatures;
- }
- }
- }
+ physical_device, format, &pFormatProperties->formatProperties);
}
static VkResult
tu_get_image_format_properties(
struct tu_physical_device *physical_device,
const VkPhysicalDeviceImageFormatInfo2 *info,
- VkImageFormatProperties *pImageFormatProperties,
- VkFormatFeatureFlags *p_feature_flags)
+ VkImageFormatProperties *pImageFormatProperties)
+
{
- VkFormatProperties3 format_props;
+ VkFormatProperties format_props;
VkFormatFeatureFlags format_feature_flags;
VkExtent3D maxExtent;
uint32_t maxMipLevels;
@@ -460,53 +683,12 @@ tu_get_image_format_properties(
tu_physical_device_get_format_properties(physical_device, info->format,
&format_props);
-
- switch (info->tiling) {
- case VK_IMAGE_TILING_LINEAR:
+ if (info->tiling == VK_IMAGE_TILING_LINEAR) {
format_feature_flags = format_props.linearTilingFeatures;
- break;
-
- case VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT: {
- const VkPhysicalDeviceImageDrmFormatModifierInfoEXT *drm_info =
- vk_find_struct_const(info->pNext, PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT);
-
- switch (drm_info->drmFormatModifier) {
- case DRM_FORMAT_MOD_QCOM_COMPRESSED:
- /* falling back to linear/non-UBWC isn't possible with explicit modifier */
-
- /* formats which don't support tiling */
- if (!format_props.optimalTilingFeatures ||
- !tiling_possible(info->format))
- return VK_ERROR_FORMAT_NOT_SUPPORTED;
-
- if (info->flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT) {
- const VkImageFormatListCreateInfo *format_list =
- vk_find_struct_const(info->pNext,
- IMAGE_FORMAT_LIST_CREATE_INFO);
- if (!tu6_mutable_format_list_ubwc_compatible(format_list))
- return VK_ERROR_FORMAT_NOT_SUPPORTED;
- }
-
- if (!ubwc_possible(NULL, info->format, info->type, info->usage,
- info->usage, physical_device->info, sampleCounts,
- false)) {
- return VK_ERROR_FORMAT_NOT_SUPPORTED;
- }
-
- format_feature_flags = format_props.optimalTilingFeatures;
- break;
- case DRM_FORMAT_MOD_LINEAR:
- format_feature_flags = format_props.linearTilingFeatures;
- break;
- default:
- return VK_ERROR_FORMAT_NOT_SUPPORTED;
- }
- } break;
- case VK_IMAGE_TILING_OPTIMAL:
+ } else if (info->tiling == VK_IMAGE_TILING_OPTIMAL) {
format_feature_flags = format_props.optimalTilingFeatures;
- break;
- default:
- unreachable("bad VkPhysicalDeviceImageFormatInfo2");
+ } else {
+ unreachable("bad VkImageTiling");
}
if (format_feature_flags == 0)
@@ -549,50 +731,29 @@ tu_get_image_format_properties(
VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT)) &&
!(info->flags & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT) &&
!(info->usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
- sampleCounts |= VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT;
- /* note: most operations support 8 samples (GMEM render/resolve do, at least)
- * but some do not (which ones?), so just disable 8 samples completely;
- * having no 8x msaa matches the blob driver behavior
- */
+ sampleCounts |= VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT |
+ VK_SAMPLE_COUNT_8_BIT;
}
- /* From the Vulkan 1.3.206 spec:
- *
- * "VK_IMAGE_CREATE_EXTENDED_USAGE_BIT specifies that the image can be
- * created with usage flags that are not supported for the format the image
- * is created with but are supported for at least one format a VkImageView
- * created from the image can have."
- *
- * This means we should relax checks that only depend on the
- * format_feature_flags, to allow the user to create images that may be
- * e.g. reinterpreted as storage when the original format doesn't allow it.
- * The user will have to check against the format features anyway.
- * Otherwise we'd unnecessarily disallow it.
- */
-
- VkImageUsageFlags image_usage = info->usage;
- if (info->flags & VK_IMAGE_CREATE_EXTENDED_USAGE_BIT)
- image_usage = 0;
-
- if (image_usage & VK_IMAGE_USAGE_SAMPLED_BIT) {
+ if (info->usage & VK_IMAGE_USAGE_SAMPLED_BIT) {
if (!(format_feature_flags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT)) {
goto unsupported;
}
}
- if (image_usage & VK_IMAGE_USAGE_STORAGE_BIT) {
+ if (info->usage & VK_IMAGE_USAGE_STORAGE_BIT) {
if (!(format_feature_flags & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT)) {
goto unsupported;
}
}
- if (image_usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
+ if (info->usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
if (!(format_feature_flags & VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT)) {
goto unsupported;
}
}
- if (image_usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {
+ if (info->usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {
if (!(format_feature_flags &
VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT)) {
goto unsupported;
@@ -611,9 +772,6 @@ tu_get_image_format_properties(
.maxResourceSize = UINT32_MAX,
};
- if (p_feature_flags)
- *p_feature_flags = format_feature_flags;
-
return VK_SUCCESS;
unsupported:
*pImageFormatProperties = (VkImageFormatProperties) {
@@ -627,12 +785,38 @@ unsupported:
return VK_ERROR_FORMAT_NOT_SUPPORTED;
}
+VkResult
+tu_GetPhysicalDeviceImageFormatProperties(
+ VkPhysicalDevice physicalDevice,
+ VkFormat format,
+ VkImageType type,
+ VkImageTiling tiling,
+ VkImageUsageFlags usage,
+ VkImageCreateFlags createFlags,
+ VkImageFormatProperties *pImageFormatProperties)
+{
+ TU_FROM_HANDLE(tu_physical_device, physical_device, physicalDevice);
+
+ const VkPhysicalDeviceImageFormatInfo2 info = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2,
+ .pNext = NULL,
+ .format = format,
+ .type = type,
+ .tiling = tiling,
+ .usage = usage,
+ .flags = createFlags,
+ };
+
+ return tu_get_image_format_properties(physical_device, &info,
+ pImageFormatProperties);
+}
+
static VkResult
tu_get_external_image_format_properties(
const struct tu_physical_device *physical_device,
const VkPhysicalDeviceImageFormatInfo2 *pImageFormatInfo,
VkExternalMemoryHandleTypeFlagBits handleType,
- VkExternalImageFormatProperties *external_properties)
+ VkExternalMemoryProperties *external_properties)
{
VkExternalMemoryFeatureFlagBits flags = 0;
VkExternalMemoryHandleTypeFlags export_flags = 0;
@@ -659,7 +843,7 @@ tu_get_external_image_format_properties(
VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT;
break;
default:
- return vk_errorf(physical_device, VK_ERROR_FORMAT_NOT_SUPPORTED,
+ return vk_errorf(physical_device->instance, VK_ERROR_FORMAT_NOT_SUPPORTED,
"VkExternalMemoryTypeFlagBits(0x%x) unsupported for VkImageType(%d)",
handleType, pImageFormatInfo->type);
}
@@ -669,24 +853,21 @@ tu_get_external_image_format_properties(
compat_flags = VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT;
break;
default:
- return vk_errorf(physical_device, VK_ERROR_FORMAT_NOT_SUPPORTED,
+ return vk_errorf(physical_device->instance, VK_ERROR_FORMAT_NOT_SUPPORTED,
"VkExternalMemoryTypeFlagBits(0x%x) unsupported",
handleType);
}
- if (external_properties) {
- external_properties->externalMemoryProperties =
- (VkExternalMemoryProperties) {
- .externalMemoryFeatures = flags,
- .exportFromImportedHandleTypes = export_flags,
- .compatibleHandleTypes = compat_flags,
- };
- }
+ *external_properties = (VkExternalMemoryProperties) {
+ .externalMemoryFeatures = flags,
+ .exportFromImportedHandleTypes = export_flags,
+ .compatibleHandleTypes = compat_flags,
+ };
return VK_SUCCESS;
}
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_GetPhysicalDeviceImageFormatProperties2(
VkPhysicalDevice physicalDevice,
const VkPhysicalDeviceImageFormatInfo2 *base_info,
@@ -694,15 +875,11 @@ tu_GetPhysicalDeviceImageFormatProperties2(
{
TU_FROM_HANDLE(tu_physical_device, physical_device, physicalDevice);
const VkPhysicalDeviceExternalImageFormatInfo *external_info = NULL;
- const VkPhysicalDeviceImageViewImageFormatInfoEXT *image_view_info = NULL;
VkExternalImageFormatProperties *external_props = NULL;
- VkFilterCubicImageViewImageFormatPropertiesEXT *cubic_props = NULL;
- VkFormatFeatureFlags format_feature_flags;
- VkSamplerYcbcrConversionImageFormatProperties *ycbcr_props = NULL;
VkResult result;
- result = tu_get_image_format_properties(physical_device,
- base_info, &base_props->imageFormatProperties, &format_feature_flags);
+ result = tu_get_image_format_properties(
+ physical_device, base_info, &base_props->imageFormatProperties);
if (result != VK_SUCCESS)
return result;
@@ -713,9 +890,6 @@ tu_GetPhysicalDeviceImageFormatProperties2(
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO:
external_info = (const void *) s;
break;
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_VIEW_IMAGE_FORMAT_INFO_EXT:
- image_view_info = (const void *) s;
- break;
default:
break;
}
@@ -728,12 +902,6 @@ tu_GetPhysicalDeviceImageFormatProperties2(
case VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES:
external_props = (void *) s;
break;
- case VK_STRUCTURE_TYPE_FILTER_CUBIC_IMAGE_VIEW_IMAGE_FORMAT_PROPERTIES_EXT:
- cubic_props = (void *) s;
- break;
- case VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_IMAGE_FORMAT_PROPERTIES:
- ycbcr_props = (void *) s;
- break;
default:
break;
}
@@ -748,29 +916,11 @@ tu_GetPhysicalDeviceImageFormatProperties2(
if (external_info && external_info->handleType != 0) {
result = tu_get_external_image_format_properties(
physical_device, base_info, external_info->handleType,
- external_props);
+ &external_props->externalMemoryProperties);
if (result != VK_SUCCESS)
goto fail;
}
- if (cubic_props) {
- /* note: the blob only allows cubic filtering for 2D and 2D array views;
- * it's likely we can enable it for 1D and CUBE, but that needs testing
- */
- if ((image_view_info->imageViewType == VK_IMAGE_VIEW_TYPE_2D ||
- image_view_info->imageViewType == VK_IMAGE_VIEW_TYPE_2D_ARRAY) &&
- (format_feature_flags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_CUBIC_BIT_EXT)) {
- cubic_props->filterCubic = true;
- cubic_props->filterCubicMinmax = true;
- } else {
- cubic_props->filterCubic = false;
- cubic_props->filterCubicMinmax = false;
- }
- }
-
- if (ycbcr_props)
- ycbcr_props->combinedImageSamplerDescriptorCount = 1;
-
return VK_SUCCESS;
fail:
@@ -782,13 +932,28 @@ fail:
* the implementation for use in vkCreateImage, then all members of
* imageFormatProperties will be filled with zero.
*/
- base_props->imageFormatProperties = (VkImageFormatProperties) {};
+ base_props->imageFormatProperties = (VkImageFormatProperties) { 0 };
}
return result;
}
-VKAPI_ATTR void VKAPI_CALL
+void
+tu_GetPhysicalDeviceSparseImageFormatProperties(
+ VkPhysicalDevice physicalDevice,
+ VkFormat format,
+ VkImageType type,
+ uint32_t samples,
+ VkImageUsageFlags usage,
+ VkImageTiling tiling,
+ uint32_t *pNumProperties,
+ VkSparseImageFormatProperties *pProperties)
+{
+ /* Sparse images are not yet supported. */
+ *pNumProperties = 0;
+}
+
+void
tu_GetPhysicalDeviceSparseImageFormatProperties2(
VkPhysicalDevice physicalDevice,
const VkPhysicalDeviceSparseImageFormatInfo2 *pFormatInfo,
@@ -799,7 +964,7 @@ tu_GetPhysicalDeviceSparseImageFormatProperties2(
*pPropertyCount = 0;
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_GetPhysicalDeviceExternalBufferProperties(
VkPhysicalDevice physicalDevice,
const VkPhysicalDeviceExternalBufferInfo *pExternalBufferInfo,
diff --git a/lib/mesa/src/freedreno/vulkan/tu_image.c b/lib/mesa/src/freedreno/vulkan/tu_image.c
index 15a0649a2..657612d42 100644
--- a/lib/mesa/src/freedreno/vulkan/tu_image.c
+++ b/lib/mesa/src/freedreno/vulkan/tu_image.c
@@ -1,733 +1,266 @@
/*
* Copyright © 2016 Red Hat.
* Copyright © 2016 Bas Nieuwenhuizen
- * SPDX-License-Identifier: MIT
*
* based in part on anv driver which is:
* Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
*/
-#include "tu_image.h"
-
-#include "fdl/fd6_format_table.h"
+#include "tu_private.h"
-#include "util/u_debug.h"
-#include "util/format/u_format.h"
+#include "util/debug.h"
+#include "util/u_atomic.h"
+#include "vk_format.h"
#include "vk_util.h"
-#include "drm-uapi/drm_fourcc.h"
-#include "tu_android.h"
-#include "tu_cs.h"
-#include "tu_descriptor_set.h"
-#include "tu_device.h"
-#include "tu_formats.h"
-
-uint32_t
-tu6_plane_count(VkFormat format)
+static inline bool
+image_level_linear(struct tu_image *image, int level)
{
- switch (format) {
- default:
- return 1;
- case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
- case VK_FORMAT_D32_SFLOAT_S8_UINT:
- return 2;
- case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
- return 3;
- }
-}
-
-enum pipe_format
-tu6_plane_format(VkFormat format, uint32_t plane)
-{
- switch (format) {
- case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
- return plane ? PIPE_FORMAT_R8G8_UNORM : PIPE_FORMAT_Y8_UNORM;
- case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
- return PIPE_FORMAT_R8_UNORM;
- case VK_FORMAT_D32_SFLOAT_S8_UINT:
- return plane ? PIPE_FORMAT_S8_UINT : PIPE_FORMAT_Z32_FLOAT;
- default:
- return tu_vk_format_to_pipe_format(format);
- }
-}
-
-uint32_t
-tu6_plane_index(VkFormat format, VkImageAspectFlags aspect_mask)
-{
- switch (aspect_mask) {
- default:
- assert(aspect_mask != VK_IMAGE_ASPECT_MEMORY_PLANE_3_BIT_EXT);
- return 0;
- case VK_IMAGE_ASPECT_PLANE_1_BIT:
- case VK_IMAGE_ASPECT_MEMORY_PLANE_1_BIT_EXT:
- return 1;
- case VK_IMAGE_ASPECT_PLANE_2_BIT:
- case VK_IMAGE_ASPECT_MEMORY_PLANE_2_BIT_EXT:
- return 2;
- case VK_IMAGE_ASPECT_STENCIL_BIT:
- return format == VK_FORMAT_D32_SFLOAT_S8_UINT;
- }
-}
-
-enum pipe_format
-tu_format_for_aspect(enum pipe_format format, VkImageAspectFlags aspect_mask)
-{
- switch (format) {
- case PIPE_FORMAT_Z24_UNORM_S8_UINT:
- if (aspect_mask & VK_IMAGE_ASPECT_COLOR_BIT)
- return PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
- if (aspect_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
- if (aspect_mask & VK_IMAGE_ASPECT_DEPTH_BIT)
- return PIPE_FORMAT_Z24_UNORM_S8_UINT;
- else
- return PIPE_FORMAT_X24S8_UINT;
- } else {
- return PIPE_FORMAT_Z24X8_UNORM;
- }
- case PIPE_FORMAT_Z24X8_UNORM:
- if (aspect_mask & VK_IMAGE_ASPECT_COLOR_BIT)
- return PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
- return PIPE_FORMAT_Z24X8_UNORM;
- default:
- return format;
- }
+ unsigned w = u_minify(image->extent.width, level);
+ return w < 16;
}
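/* Example (illustrative): for a 128x128 tiled image, levels 0-3 have widths
 * 128/64/32/16 and stay tiled, while level 4 and smaller (width < 16) are
 * laid out linearly by setup_slices() below.
 */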
-static bool
-tu_is_r8g8(enum pipe_format format)
+/* indexed by cpp: */
+static const struct
{
- return (util_format_get_blocksize(format) == 2) &&
- (util_format_get_nr_components(format) == 2);
-}
-
-static bool
-tu_is_r8g8_compatible(enum pipe_format format)
-{
- return (util_format_get_blocksize(format) == 2) &&
- !util_format_is_depth_or_stencil(format);
-}
-
-void
-tu_cs_image_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer)
-{
- tu_cs_emit(cs, iview->PITCH);
- tu_cs_emit(cs, iview->layer_size >> 6);
- tu_cs_emit_qw(cs, iview->base_addr + iview->layer_size * layer);
-}
-
-void
-tu_cs_image_stencil_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
-{
- tu_cs_emit(cs, iview->stencil_PITCH);
- tu_cs_emit(cs, iview->stencil_layer_size >> 6);
- tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
-}
-
-void
-tu_cs_image_depth_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
-{
- tu_cs_emit(cs, iview->depth_PITCH);
- tu_cs_emit(cs, iview->depth_layer_size >> 6);
- tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer);
-}
-
-void
-tu_cs_image_ref_2d(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer, bool src)
-{
- tu_cs_emit_qw(cs, iview->base_addr + iview->layer_size * layer);
- /* SP_PS_2D_SRC_PITCH has shifted pitch field */
- tu_cs_emit(cs, iview->PITCH << (src ? 9 : 0));
-}
-
-void
-tu_cs_image_flag_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer)
-{
- tu_cs_emit_qw(cs, iview->ubwc_addr + iview->ubwc_layer_size * layer);
- tu_cs_emit(cs, iview->FLAG_BUFFER_PITCH);
-}
+ unsigned pitchalign;
+ unsigned heightalign;
+} tile_alignment[] = {
+ [1] = { 128, 32 }, [2] = { 128, 16 }, [3] = { 128, 16 }, [4] = { 64, 16 },
+ [8] = { 64, 16 }, [12] = { 64, 16 }, [16] = { 64, 16 },
+};
static void
-tu_image_view_init(struct tu_device *device,
- struct tu_image_view *iview,
- const VkImageViewCreateInfo *pCreateInfo,
- bool has_z24uint_s8uint)
+setup_slices(struct tu_image *image, const VkImageCreateInfo *pCreateInfo)
{
- TU_FROM_HANDLE(tu_image, image, pCreateInfo->image);
- const VkImageSubresourceRange *range = &pCreateInfo->subresourceRange;
- VkFormat vk_format = pCreateInfo->format;
- VkImageAspectFlagBits aspect_mask = pCreateInfo->subresourceRange.aspectMask;
-
- const struct VkSamplerYcbcrConversionInfo *ycbcr_conversion =
- vk_find_struct_const(pCreateInfo->pNext, SAMPLER_YCBCR_CONVERSION_INFO);
- const struct tu_sampler_ycbcr_conversion *conversion = ycbcr_conversion ?
- tu_sampler_ycbcr_conversion_from_handle(ycbcr_conversion->conversion) : NULL;
-
- vk_image_view_init(&device->vk, &iview->vk, false, pCreateInfo);
-
- iview->image = image;
-
- const struct fdl_layout *layouts[3];
-
- layouts[0] = &image->layout[tu6_plane_index(image->vk.format, aspect_mask)];
-
- enum pipe_format format;
- if (aspect_mask != VK_IMAGE_ASPECT_COLOR_BIT)
- format = tu6_plane_format(vk_format, tu6_plane_index(vk_format, aspect_mask));
- else
- format = tu_vk_format_to_pipe_format(vk_format);
-
- if (image->vk.format == VK_FORMAT_G8_B8R8_2PLANE_420_UNORM &&
- aspect_mask == VK_IMAGE_ASPECT_PLANE_0_BIT) {
- if (vk_format == VK_FORMAT_R8_UNORM) {
- /* The 0'th plane of this format has a different UBWC compression. */
- format = PIPE_FORMAT_Y8_UNORM;
+ enum vk_format_layout layout =
+ vk_format_description(pCreateInfo->format)->layout;
+ uint32_t layer_size = 0;
+ uint32_t width = pCreateInfo->extent.width;
+ uint32_t height = pCreateInfo->extent.height;
+ uint32_t depth = pCreateInfo->extent.depth;
+ bool layer_first = pCreateInfo->imageType != VK_IMAGE_TYPE_3D;
+ uint32_t alignment = pCreateInfo->imageType == VK_IMAGE_TYPE_3D ? 4096 : 1;
+ uint32_t cpp = vk_format_get_blocksize(pCreateInfo->format);
+
+ uint32_t heightalign = tile_alignment[cpp].heightalign;
+
+ for (unsigned level = 0; level < pCreateInfo->mipLevels; level++) {
+ struct tu_image_level *slice = &image->levels[level];
+ bool linear_level = image_level_linear(image, level);
+ uint32_t aligned_height = height;
+ uint32_t blocks;
+ uint32_t pitchalign;
+
+ if (image->tile_mode && !linear_level) {
+ pitchalign = tile_alignment[cpp].pitchalign;
+ aligned_height = align(aligned_height, heightalign);
} else {
- /* If the user wants to reinterpret this plane, then they should've
- * set MUTABLE_FORMAT_BIT which should disable UBWC and tiling.
+ pitchalign = 64;
+
+ /* The blits used for mem<->gmem work at a granularity of
+ * 32x32, which can cause faults due to over-fetch on the
+ * last level. The simple solution is to over-allocate the
+ * last level a bit to ensure any over-fetch is harmless.
+ * The pitch is already sufficiently aligned, but height
+ * may not be:
*/
- assert(!layouts[0]->ubwc);
+ if ((level + 1 == pCreateInfo->mipLevels))
+ aligned_height = align(aligned_height, 32);
}
- }
- if (aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT &&
- (vk_format == VK_FORMAT_G8_B8R8_2PLANE_420_UNORM ||
- vk_format == VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM)) {
- layouts[1] = &image->layout[1];
- layouts[2] = &image->layout[2];
- }
-
- struct fdl_view_args args = {};
- args.iova = image->iova;
- args.base_array_layer = range->baseArrayLayer;
- args.base_miplevel = range->baseMipLevel;
- args.layer_count = vk_image_subresource_layer_count(&image->vk, range);
- args.level_count = vk_image_subresource_level_count(&image->vk, range);
- args.min_lod_clamp = iview->vk.min_lod;
- args.format = tu_format_for_aspect(format, aspect_mask);
- vk_component_mapping_to_pipe_swizzle(pCreateInfo->components, args.swiz);
- if (conversion) {
- unsigned char conversion_swiz[4], create_swiz[4];
- memcpy(create_swiz, args.swiz, sizeof(create_swiz));
- vk_component_mapping_to_pipe_swizzle(conversion->components,
- conversion_swiz);
- util_format_compose_swizzles(create_swiz, conversion_swiz, args.swiz);
- }
-
- switch (pCreateInfo->viewType) {
- case VK_IMAGE_VIEW_TYPE_1D:
- case VK_IMAGE_VIEW_TYPE_1D_ARRAY:
- args.type = FDL_VIEW_TYPE_1D;
- break;
- case VK_IMAGE_VIEW_TYPE_2D:
- case VK_IMAGE_VIEW_TYPE_2D_ARRAY:
- args.type = FDL_VIEW_TYPE_2D;
- break;
- case VK_IMAGE_VIEW_TYPE_CUBE:
- case VK_IMAGE_VIEW_TYPE_CUBE_ARRAY:
- args.type = FDL_VIEW_TYPE_CUBE;
- break;
- case VK_IMAGE_VIEW_TYPE_3D:
- args.type = FDL_VIEW_TYPE_3D;
- break;
- default:
- unreachable("unknown view type");
- }
+ if (layout == VK_FORMAT_LAYOUT_ASTC)
+ slice->pitch = util_align_npot(
+ width,
+ pitchalign * vk_format_get_blockwidth(pCreateInfo->format));
+ else
+ slice->pitch = align(width, pitchalign);
- STATIC_ASSERT((unsigned)VK_CHROMA_LOCATION_COSITED_EVEN == (unsigned)FDL_CHROMA_LOCATION_COSITED_EVEN);
- STATIC_ASSERT((unsigned)VK_CHROMA_LOCATION_MIDPOINT == (unsigned)FDL_CHROMA_LOCATION_MIDPOINT);
- if (conversion) {
- args.chroma_offsets[0] = (enum fdl_chroma_location) conversion->chroma_offsets[0];
- args.chroma_offsets[1] = (enum fdl_chroma_location) conversion->chroma_offsets[1];
- }
+ slice->offset = layer_size;
+ blocks = vk_format_get_block_count(pCreateInfo->format, slice->pitch,
+ aligned_height);
- fdl6_view_init(&iview->view, layouts, &args, has_z24uint_s8uint);
+ /* 1d array and 2d array textures must all have the same layer size
+ * for each miplevel on a3xx. 3d textures can have different layer
+ * sizes for high levels, but the hw auto-sizer is buggy (or at least
+ * different than what this code does), so once the slice size drops
+ * below the 0xf000 threshold we stop reducing it for smaller levels.
+ */
+ if (pCreateInfo->imageType == VK_IMAGE_TYPE_3D &&
+ (level == 1 ||
+ (level > 1 && image->levels[level - 1].size > 0xf000)))
+ slice->size = align(blocks * cpp, alignment);
+ else if (level == 0 || layer_first || alignment == 1)
+ slice->size = align(blocks * cpp, alignment);
+ else
+ slice->size = image->levels[level - 1].size;
- if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
- struct fdl_layout *layout = &image->layout[0];
- iview->depth_base_addr = image->iova +
- fdl_surface_offset(layout, range->baseMipLevel, range->baseArrayLayer);
- iview->depth_layer_size = fdl_layer_stride(layout, range->baseMipLevel);
- iview->depth_PITCH = A6XX_RB_DEPTH_BUFFER_PITCH(fdl_pitch(layout, range->baseMipLevel)).value;
+ layer_size += slice->size * depth;
- layout = &image->layout[1];
- iview->stencil_base_addr = image->iova +
- fdl_surface_offset(layout, range->baseMipLevel, range->baseArrayLayer);
- iview->stencil_layer_size = fdl_layer_stride(layout, range->baseMipLevel);
- iview->stencil_PITCH = A6XX_RB_STENCIL_BUFFER_PITCH(fdl_pitch(layout, range->baseMipLevel)).value;
+ width = u_minify(width, 1);
+ height = u_minify(height, 1);
+ depth = u_minify(depth, 1);
}
-}
-bool
-tiling_possible(VkFormat format)
-{
- if (format == VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM ||
- format == VK_FORMAT_G8B8G8R8_422_UNORM ||
- format == VK_FORMAT_B8G8R8G8_422_UNORM)
- return false;
-
- return true;
+ image->layer_size = layer_size;
}
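/* Rough worked example (illustrative, assuming vk_format_get_block_count()
 * returns pitch * height for non-compressed formats): a linear 2D
 * 100x100 VK_FORMAT_R8G8B8A8_UNORM image with a single mip level gives
 *
 *   cpp               = 4
 *   slice->pitch      = align(100, 64) = 128 texels
 *   aligned height    = align(100, 32) = 128 (last-level over-allocation)
 *   slice->size       = 128 * 128 * 4  = 65536 bytes
 *   image->layer_size = 65536, image->size = 65536 * arrayLayers
 */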
-/* Checks if we should advertise UBWC support for the given usage.
- *
- * Used by both vkCreateImage and vkGetPhysicalDeviceFormatProperties2, so the
- * logical tu_device may be NULL.
- */
-bool
-ubwc_possible(struct tu_device *device,
- VkFormat format,
- VkImageType type,
- VkImageUsageFlags usage,
- VkImageUsageFlags stencil_usage,
- const struct fd_dev_info *info,
- VkSampleCountFlagBits samples,
- bool use_z24uint_s8uint)
+VkResult
+tu_image_create(VkDevice _device,
+ const struct tu_image_create_info *create_info,
+ const VkAllocationCallbacks *alloc,
+ VkImage *pImage)
{
- /* no UBWC with compressed formats, E5B9G9R9, S8_UINT
- * (S8_UINT because separate stencil doesn't have UBWC-enable bit)
- */
- if (vk_format_is_compressed(format) ||
- format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 ||
- format == VK_FORMAT_S8_UINT)
- return false;
-
- /* In copy_format, we treat snorm as unorm to avoid clamping. But snorm
- * and unorm are UBWC incompatible for special values such as all 0's or
- * all 1's. Disable UBWC for snorm.
- */
- if (vk_format_is_snorm(format))
- return false;
-
- if (!info->a6xx.has_8bpp_ubwc &&
- (format == VK_FORMAT_R8_UNORM ||
- format == VK_FORMAT_R8_SNORM ||
- format == VK_FORMAT_R8_UINT ||
- format == VK_FORMAT_R8_SINT ||
- format == VK_FORMAT_R8_SRGB))
- return false;
-
- if (type == VK_IMAGE_TYPE_3D) {
- if (device) {
- perf_debug(device,
- "Disabling UBWC for %s 3D image, but it should be "
- "possible to support.",
- util_format_name(vk_format_to_pipe_format(format)));
- }
- return false;
- }
-
- /* Disable UBWC for storage images.
- *
- * The closed GL driver skips UBWC for storage images (and additionally
- * uses linear for writeonly images). We seem to have image tiling working
- * in freedreno in general, so turnip matches that. freedreno also enables
- * UBWC on images, but it's not really tested due to the lack of
- * UBWC-enabled mipmaps in freedreno currently. Just match the closed GL
- * behavior of no UBWC.
- */
- if ((usage | stencil_usage) & VK_IMAGE_USAGE_STORAGE_BIT) {
- if (device) {
- perf_debug(device,
- "Disabling UBWC for %s storage image, but should be "
- "possible to support",
- util_format_name(vk_format_to_pipe_format(format)));
- }
- return false;
+ TU_FROM_HANDLE(tu_device, device, _device);
+ const VkImageCreateInfo *pCreateInfo = create_info->vk_info;
+ struct tu_image *image = NULL;
+ assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO);
+
+ tu_assert(pCreateInfo->mipLevels > 0);
+ tu_assert(pCreateInfo->arrayLayers > 0);
+ tu_assert(pCreateInfo->samples > 0);
+ tu_assert(pCreateInfo->extent.width > 0);
+ tu_assert(pCreateInfo->extent.height > 0);
+ tu_assert(pCreateInfo->extent.depth > 0);
+
+ image = vk_zalloc2(&device->alloc, alloc, sizeof(*image), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+ if (!image)
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ image->type = pCreateInfo->imageType;
+
+ image->vk_format = pCreateInfo->format;
+ image->tiling = pCreateInfo->tiling;
+ image->usage = pCreateInfo->usage;
+ image->flags = pCreateInfo->flags;
+ image->extent = pCreateInfo->extent;
+ image->level_count = pCreateInfo->mipLevels;
+ image->layer_count = pCreateInfo->arrayLayers;
+
+ image->exclusive = pCreateInfo->sharingMode == VK_SHARING_MODE_EXCLUSIVE;
+ if (pCreateInfo->sharingMode == VK_SHARING_MODE_CONCURRENT) {
+ for (uint32_t i = 0; i < pCreateInfo->queueFamilyIndexCount; ++i)
+ if (pCreateInfo->pQueueFamilyIndices[i] ==
+ VK_QUEUE_FAMILY_EXTERNAL)
+ image->queue_family_mask |= (1u << TU_MAX_QUEUE_FAMILIES) - 1u;
+ else
+ image->queue_family_mask |=
+ 1u << pCreateInfo->pQueueFamilyIndices[i];
}
- /* Disable UBWC for D24S8 on A630 in some cases
- *
- * A VK_IMAGE_ASPECT_STENCIL_BIT image view requires being able to sample
- * the stencil component as UINT, however no format allows this
- * on a630 (the special FMT6_Z24_UINT_S8_UINT format is missing)
- *
- * It must be sampled as FMT6_8_8_8_8_UINT, which is not UBWC-compatible
- *
- * If we wish to get the border colors correct without knowing the format
- * when creating the sampler, we also have to use the A630 workaround.
- *
- * Additionally, the special AS_R8G8B8A8 format is broken without UBWC,
- * so we have to fallback to 8_8_8_8_UNORM when UBWC is disabled
- */
- if (!use_z24uint_s8uint &&
- format == VK_FORMAT_D24_UNORM_S8_UINT &&
- (stencil_usage & (VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)))
- return false;
-
- /* This was meant to disable UBWC for MSAA z24s8, but accidentally disables it
- * for all MSAA. https://gitlab.freedesktop.org/mesa/mesa/-/issues/7438
- */
- if (!info->a6xx.has_z24uint_s8uint && samples > VK_SAMPLE_COUNT_1_BIT) {
- if (device) {
- perf_debug(device,
- "Disabling UBWC for %d-sample %s image, but it should be "
- "possible to support",
- samples,
- util_format_name(vk_format_to_pipe_format(format)));
- }
- return false;
- }
+ image->shareable =
+ vk_find_struct_const(pCreateInfo->pNext,
+ EXTERNAL_MEMORY_IMAGE_CREATE_INFO) != NULL;
- return true;
-}
+ image->tile_mode = pCreateInfo->tiling == VK_IMAGE_TILING_OPTIMAL ? 3 : 0;
+ setup_slices(image, pCreateInfo);
-/* R8G8 has a different block width/height and height alignment from other
- * formats that would normally be compatible (like R16), and so if we are
- * trying to, for example, sample R16 as R8G8 we need to demote to linear.
- */
-static bool
-format_list_reinterprets_r8g8_r16(enum pipe_format format, const VkImageFormatListCreateInfo *fmt_list)
-{
- /* Check if it's actually a 2-cpp color format. */
- if (!tu_is_r8g8_compatible(format))
- return false;
-
- /* If there's no format list, then the app may reinterpret to any compatible
- * format.
- */
- if (!fmt_list || !fmt_list->viewFormatCount)
- return true;
-
- bool has_r8g8 = false;
- bool has_non_r8g8 = false;
- for (uint32_t i = 0; i < fmt_list->viewFormatCount; i++) {
- enum pipe_format format =
- tu_vk_format_to_pipe_format(fmt_list->pViewFormats[i]);
- if (tu_is_r8g8(format))
- has_r8g8 = true;
- else
- has_non_r8g8 = true;
- }
- return has_r8g8 && has_non_r8g8;
-}
+ image->size = image->layer_size * pCreateInfo->arrayLayers;
+ *pImage = tu_image_to_handle(image);
-static bool
-format_list_has_swaps(const VkImageFormatListCreateInfo *fmt_list)
-{
- /* If there's no format list, then the app may reinterpret to any compatible
- * format, and presumably one would have the swap set.
- */
- if (!fmt_list || !fmt_list->viewFormatCount)
- return true;
-
- for (uint32_t i = 0; i < fmt_list->viewFormatCount; i++) {
- enum pipe_format format =
- tu_vk_format_to_pipe_format(fmt_list->pViewFormats[i]);
-
- if (tu6_format_texture(format, TILE6_LINEAR).swap)
- return true;
- }
- return false;
+ return VK_SUCCESS;
}
-static VkResult
-tu_image_init(struct tu_device *device, struct tu_image *image,
- const VkImageCreateInfo *pCreateInfo, uint64_t modifier,
- const VkSubresourceLayout *plane_layouts)
+void
+tu_image_view_init(struct tu_image_view *iview,
+ struct tu_device *device,
+ const VkImageViewCreateInfo *pCreateInfo)
{
- vk_image_init(&device->vk, &image->vk, pCreateInfo);
- image->vk.drm_format_mod = modifier;
-
- enum a6xx_tile_mode tile_mode = TILE6_3;
- bool ubwc_enabled = true;
-
- /* use linear tiling if requested */
- if (pCreateInfo->tiling == VK_IMAGE_TILING_LINEAR || modifier == DRM_FORMAT_MOD_LINEAR) {
- tile_mode = TILE6_LINEAR;
- ubwc_enabled = false;
- }
-
- /* Force linear tiling for formats with "fake" optimalTilingFeatures */
- if (!tiling_possible(image->vk.format)) {
- tile_mode = TILE6_LINEAR;
- ubwc_enabled = false;
- }
-
-   /* No sense in tiling a 1D image; you'd just waste space and cache locality. */
- if (pCreateInfo->imageType == VK_IMAGE_TYPE_1D) {
- tile_mode = TILE6_LINEAR;
- ubwc_enabled = false;
- }
-
- enum pipe_format format =
- tu_vk_format_to_pipe_format(image->vk.format);
- /* Whether a view of the image with an R8G8 format could be made. */
- bool has_r8g8 = tu_is_r8g8(format);
-
- if (ubwc_enabled &&
- !ubwc_possible(device, image->vk.format, pCreateInfo->imageType,
- pCreateInfo->usage, image->vk.stencil_usage,
- device->physical_device->info, pCreateInfo->samples,
- device->use_z24uint_s8uint))
- ubwc_enabled = false;
-
- /* Mutable images can be reinterpreted as any other compatible format.
- * This is a problem with UBWC (compression for different formats is different),
-    * but also tiling ("swap" affects how tiled formats are stored in memory).
-    * Depth and stencil formats cannot be reinterpreted as another format, and
- * cannot be linear with sysmem rendering, so don't fall back for those.
- *
- * TODO:
- * - if the fmt_list contains only formats which are swapped, but compatible
- * with each other (B8G8R8A8_UNORM and B8G8R8A8_UINT for example), then
- * tiling is still possible
- * - figure out which UBWC compressions are compatible to keep it enabled
- */
- if ((pCreateInfo->flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT) &&
- !vk_format_is_depth_or_stencil(image->vk.format)) {
- const VkImageFormatListCreateInfo *fmt_list =
- vk_find_struct_const(pCreateInfo->pNext, IMAGE_FORMAT_LIST_CREATE_INFO);
- if (!tu6_mutable_format_list_ubwc_compatible(fmt_list)) {
- if (ubwc_enabled) {
- if (fmt_list && fmt_list->viewFormatCount == 2) {
- perf_debug(
- device,
- "Disabling UBWC on %dx%d %s resource due to mutable formats "
- "(fmt list %s, %s)",
- image->vk.extent.width, image->vk.extent.height,
- util_format_name(vk_format_to_pipe_format(image->vk.format)),
- util_format_name(vk_format_to_pipe_format(fmt_list->pViewFormats[0])),
- util_format_name(vk_format_to_pipe_format(fmt_list->pViewFormats[1])));
- } else {
- perf_debug(
- device,
- "Disabling UBWC on %dx%d %s resource due to mutable formats "
- "(fmt list %s)",
- image->vk.extent.width, image->vk.extent.height,
- util_format_name(vk_format_to_pipe_format(image->vk.format)),
- fmt_list ? "present" : "missing");
- }
- ubwc_enabled = false;
- }
-
- if (format_list_reinterprets_r8g8_r16(format, fmt_list) ||
- format_list_has_swaps(fmt_list)) {
- tile_mode = TILE6_LINEAR;
- }
- }
- }
+ TU_FROM_HANDLE(tu_image, image, pCreateInfo->image);
+ const VkImageSubresourceRange *range = &pCreateInfo->subresourceRange;
- /* expect UBWC enabled if we asked for it */
- if (modifier == DRM_FORMAT_MOD_QCOM_COMPRESSED)
- assert(ubwc_enabled);
- else if (device->physical_device->instance->debug_flags & TU_DEBUG_NOUBWC)
- ubwc_enabled = false;
-
- /* Non-UBWC tiled R8G8 is probably buggy since media formats are always
- * either linear or UBWC. There is no simple test to reproduce the bug.
-    * However, it was observed in the wild, leading to an unrecoverable hang
- * on a650/a660.
- */
- if (has_r8g8 && tile_mode == TILE6_3 && !ubwc_enabled) {
- tile_mode = TILE6_LINEAR;
+ switch (image->type) {
+ case VK_IMAGE_TYPE_1D:
+ case VK_IMAGE_TYPE_2D:
+ assert(range->baseArrayLayer + tu_get_layerCount(image, range) <=
+ image->layer_count);
+ break;
+ case VK_IMAGE_TYPE_3D:
+ assert(range->baseArrayLayer + tu_get_layerCount(image, range) <=
+ tu_minify(image->extent.depth, range->baseMipLevel));
+ break;
+ default:
+ unreachable("bad VkImageType");
}
- for (uint32_t i = 0; i < tu6_plane_count(image->vk.format); i++) {
- struct fdl_layout *layout = &image->layout[i];
- enum pipe_format format = tu6_plane_format(image->vk.format, i);
- uint32_t width0 = pCreateInfo->extent.width;
- uint32_t height0 = pCreateInfo->extent.height;
-
- if (i > 0) {
- switch (image->vk.format) {
- case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
- case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
- /* half width/height on chroma planes */
- width0 = (width0 + 1) >> 1;
- height0 = (height0 + 1) >> 1;
- break;
- case VK_FORMAT_D32_SFLOAT_S8_UINT:
- /* no UBWC for separate stencil */
- ubwc_enabled = false;
- break;
- default:
- break;
- }
- }
-
- struct fdl_explicit_layout plane_layout;
-
- if (plane_layouts) {
- /* only expect simple 2D images for now */
- if (pCreateInfo->mipLevels != 1 ||
- pCreateInfo->arrayLayers != 1 ||
- pCreateInfo->extent.depth != 1)
- return vk_error(device, VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT);
-
- plane_layout.offset = plane_layouts[i].offset;
- plane_layout.pitch = plane_layouts[i].rowPitch;
- /* note: use plane_layouts[0].arrayPitch to support array formats */
- }
-
- layout->tile_mode = tile_mode;
- layout->ubwc = ubwc_enabled;
-
- if (!fdl6_layout(layout, format,
- pCreateInfo->samples,
- width0, height0,
- pCreateInfo->extent.depth,
- pCreateInfo->mipLevels,
- pCreateInfo->arrayLayers,
- pCreateInfo->imageType == VK_IMAGE_TYPE_3D,
- plane_layouts ? &plane_layout : NULL)) {
- assert(plane_layouts); /* can only fail with explicit layout */
- return vk_error(device, VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT);
- }
-
- if (device->instance->debug_flags & TU_DEBUG_LAYOUT)
- fdl_dump_layout(layout);
-
-      /* fdl6_layout can't take an explicit offset without an explicit pitch,
-       * so add the offset manually for the extra plane layouts.
- */
- if (!plane_layouts && i > 0) {
- uint32_t offset = ALIGN_POT(image->total_size, 4096);
- for (int i = 0; i < pCreateInfo->mipLevels; i++) {
- layout->slices[i].offset += offset;
- layout->ubwc_slices[i].offset += offset;
- }
- layout->size += offset;
- }
-
- image->total_size = MAX2(image->total_size, layout->size);
+ iview->image = image;
+ iview->type = pCreateInfo->viewType;
+ iview->vk_format = pCreateInfo->format;
+ iview->aspect_mask = pCreateInfo->subresourceRange.aspectMask;
+
+ if (iview->aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) {
+ iview->vk_format = vk_format_stencil_only(iview->vk_format);
+ } else if (iview->aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT) {
+ iview->vk_format = vk_format_depth_only(iview->vk_format);
}
- const struct util_format_description *desc = util_format_description(image->layout[0].format);
- if (util_format_has_depth(desc) && !(device->instance->debug_flags & TU_DEBUG_NOLRZ))
- {
- /* Depth plane is the first one */
- struct fdl_layout *layout = &image->layout[0];
- unsigned width = layout->width0;
- unsigned height = layout->height0;
-
- /* LRZ buffer is super-sampled */
- switch (layout->nr_samples) {
- case 4:
- width *= 2;
- FALLTHROUGH;
- case 2:
- height *= 2;
- break;
- default:
- break;
- }
-
- unsigned lrz_pitch = align(DIV_ROUND_UP(width, 8), 32);
- unsigned lrz_height = align(DIV_ROUND_UP(height, 8), 16);
-
- image->lrz_height = lrz_height;
- image->lrz_pitch = lrz_pitch;
- image->lrz_offset = image->total_size;
- unsigned lrz_size = lrz_pitch * lrz_height * 2;
- image->total_size += lrz_size;
-
- unsigned nblocksx = DIV_ROUND_UP(DIV_ROUND_UP(width, 8), 16);
- unsigned nblocksy = DIV_ROUND_UP(DIV_ROUND_UP(height, 8), 4);
-
- /* Fast-clear buffer is 1bit/block */
- image->lrz_fc_size = DIV_ROUND_UP(nblocksx * nblocksy, 8);
-
- /* Fast-clear buffer cannot be larger than 512 bytes (HW limitation) */
- bool has_lrz_fc = image->lrz_fc_size <= 512 &&
- device->physical_device->info->a6xx.enable_lrz_fast_clear &&
- !unlikely(device->physical_device->instance->debug_flags & TU_DEBUG_NOLRZFC);
+ // should we minify?
+ iview->extent = image->extent;
- if (has_lrz_fc || device->physical_device->info->a6xx.has_lrz_dir_tracking) {
- image->lrz_fc_offset = image->total_size;
- image->total_size += 512;
-
- if (device->physical_device->info->a6xx.has_lrz_dir_tracking) {
- /* Direction tracking uses 1 byte */
- image->total_size += 1;
- /* GRAS_LRZ_DEPTH_VIEW needs 5 bytes: 4 for view data and 1 for padding */
- image->total_size += 5;
- }
- }
-
- if (!has_lrz_fc) {
- image->lrz_fc_size = 0;
- }
- } else {
- image->lrz_height = 0;
- }
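/* Worked example of the LRZ sizing above for a hypothetical 1920x1080,
 * single-sampled depth attachment (the numbers follow directly from the
 * code; they are not taken from the patch):
 *   lrz_pitch   = align(DIV_ROUND_UP(1920, 8), 32) = align(240, 32) = 256
 *   lrz_height  = align(DIV_ROUND_UP(1080, 8), 16) = align(135, 16) = 144
 *   lrz_size    = 256 * 144 * 2 = 73728 bytes
 *   nblocksx    = DIV_ROUND_UP(240, 16) = 15
 *   nblocksy    = DIV_ROUND_UP(135, 4)  = 34
 *   lrz_fc_size = DIV_ROUND_UP(15 * 34, 8) = 64 bytes
 * 64 <= 512, so the fast-clear buffer stays within the hardware limit and
 * remains enabled when the GPU supports LRZ fast-clear.
 */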
+ iview->base_layer = range->baseArrayLayer;
+ iview->layer_count = tu_get_layerCount(image, range);
+ iview->base_mip = range->baseMipLevel;
+ iview->level_count = tu_get_levelCount(image, range);
+}
- return VK_SUCCESS;
+unsigned
+tu_image_queue_family_mask(const struct tu_image *image,
+ uint32_t family,
+ uint32_t queue_family)
+{
+ if (!image->exclusive)
+ return image->queue_family_mask;
+ if (family == VK_QUEUE_FAMILY_EXTERNAL)
+ return (1u << TU_MAX_QUEUE_FAMILIES) - 1u;
+ if (family == VK_QUEUE_FAMILY_IGNORED)
+ return 1u << queue_family;
+ return 1u << family;
}
-VKAPI_ATTR VkResult VKAPI_CALL
-tu_CreateImage(VkDevice _device,
+VkResult
+tu_CreateImage(VkDevice device,
const VkImageCreateInfo *pCreateInfo,
- const VkAllocationCallbacks *alloc,
+ const VkAllocationCallbacks *pAllocator,
VkImage *pImage)
{
- uint64_t modifier = DRM_FORMAT_MOD_INVALID;
- const VkSubresourceLayout *plane_layouts = NULL;
-
- TU_FROM_HANDLE(tu_device, device, _device);
- struct tu_image *image =
- vk_object_zalloc(&device->vk, alloc, sizeof(*image), VK_OBJECT_TYPE_IMAGE);
-
- if (!image)
- return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
-
- if (pCreateInfo->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) {
- const VkImageDrmFormatModifierListCreateInfoEXT *mod_info =
- vk_find_struct_const(pCreateInfo->pNext,
- IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT);
- const VkImageDrmFormatModifierExplicitCreateInfoEXT *drm_explicit_info =
- vk_find_struct_const(pCreateInfo->pNext,
- IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT);
-
- assert(mod_info || drm_explicit_info);
-
- if (mod_info) {
- modifier = DRM_FORMAT_MOD_LINEAR;
- for (unsigned i = 0; i < mod_info->drmFormatModifierCount; i++) {
- if (mod_info->pDrmFormatModifiers[i] == DRM_FORMAT_MOD_QCOM_COMPRESSED)
- modifier = DRM_FORMAT_MOD_QCOM_COMPRESSED;
- }
- } else {
- modifier = drm_explicit_info->drmFormatModifier;
- assert(modifier == DRM_FORMAT_MOD_LINEAR ||
- modifier == DRM_FORMAT_MOD_QCOM_COMPRESSED);
- plane_layouts = drm_explicit_info->pPlaneLayouts;
- }
- } else {
- const struct wsi_image_create_info *wsi_info =
- vk_find_struct_const(pCreateInfo->pNext, WSI_IMAGE_CREATE_INFO_MESA);
- if (wsi_info && wsi_info->scanout)
- modifier = DRM_FORMAT_MOD_LINEAR;
- }
-
#ifdef ANDROID
const VkNativeBufferANDROID *gralloc_info =
vk_find_struct_const(pCreateInfo->pNext, NATIVE_BUFFER_ANDROID);
- int dma_buf;
- if (gralloc_info) {
- VkResult result = tu_gralloc_info(device, gralloc_info, &dma_buf, &modifier);
- if (result != VK_SUCCESS)
- return result;
- }
-#endif
- VkResult result = tu_image_init(device, image, pCreateInfo, modifier,
- plane_layouts);
- if (result != VK_SUCCESS) {
- vk_object_free(&device->vk, alloc, image);
- return result;
- }
-
- *pImage = tu_image_to_handle(image);
-
-#ifdef ANDROID
if (gralloc_info)
- return tu_import_memory_from_gralloc_handle(_device, dma_buf, alloc,
- *pImage);
+ return tu_image_from_gralloc(device, pCreateInfo, gralloc_info,
+ pAllocator, pImage);
#endif
- return VK_SUCCESS;
+
+ return tu_image_create(device,
+ &(struct tu_image_create_info) {
+ .vk_info = pCreateInfo,
+ .scanout = false,
+ },
+ pAllocator, pImage);
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_DestroyImage(VkDevice _device,
VkImage _image,
const VkAllocationCallbacks *pAllocator)
@@ -738,87 +271,13 @@ tu_DestroyImage(VkDevice _device,
if (!image)
return;
-#ifdef ANDROID
if (image->owned_memory != VK_NULL_HANDLE)
tu_FreeMemory(_device, image->owned_memory, pAllocator);
-#endif
-
- vk_object_free(&device->vk, pAllocator, image);
-}
-
-static void
-tu_get_image_memory_requirements(struct tu_image *image,
- VkMemoryRequirements2 *pMemoryRequirements)
-{
- pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) {
- .memoryTypeBits = 1,
- .alignment = image->layout[0].base_align,
- .size = image->total_size
- };
-
- vk_foreach_struct(ext, pMemoryRequirements->pNext) {
- switch (ext->sType) {
- case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: {
- VkMemoryDedicatedRequirements *req =
- (VkMemoryDedicatedRequirements *) ext;
- req->requiresDedicatedAllocation =
- image->vk.external_handle_types != 0;
- req->prefersDedicatedAllocation = req->requiresDedicatedAllocation;
- break;
- }
- default:
- break;
- }
- }
-}
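/* A minimal caller-side sketch of how the pNext walk above is exercised
 * (hypothetical helper and handles, not part of the patch): chaining
 * VkMemoryDedicatedRequirements reports whether the image's external
 * handle types force a dedicated allocation.
 */
void
example_query_image_reqs(VkDevice vk_device, VkImage vk_image)
{
   VkMemoryDedicatedRequirements dedicated = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS,
   };
   VkMemoryRequirements2 reqs = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2,
      .pNext = &dedicated,
   };
   const VkImageMemoryRequirementsInfo2 info = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2,
      .image = vk_image,
   };

   vkGetImageMemoryRequirements2(vk_device, &info, &reqs);
   /* reqs.memoryRequirements.size ends up as image->total_size, and
    * dedicated.requiresDedicatedAllocation mirrors external_handle_types.
    */
}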
-
-VKAPI_ATTR void VKAPI_CALL
-tu_GetImageMemoryRequirements2(VkDevice device,
- const VkImageMemoryRequirementsInfo2 *pInfo,
- VkMemoryRequirements2 *pMemoryRequirements)
-{
- TU_FROM_HANDLE(tu_image, image, pInfo->image);
-
- tu_get_image_memory_requirements(image, pMemoryRequirements);
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_GetImageSparseMemoryRequirements2(
- VkDevice device,
- const VkImageSparseMemoryRequirementsInfo2 *pInfo,
- uint32_t *pSparseMemoryRequirementCount,
- VkSparseImageMemoryRequirements2 *pSparseMemoryRequirements)
-{
- tu_stub();
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_GetDeviceImageMemoryRequirements(
- VkDevice _device,
- const VkDeviceImageMemoryRequirements *pInfo,
- VkMemoryRequirements2 *pMemoryRequirements)
-{
- TU_FROM_HANDLE(tu_device, device, _device);
-
- struct tu_image image = {0};
- tu_image_init(device, &image, pInfo->pCreateInfo, DRM_FORMAT_MOD_INVALID,
- NULL);
-
- tu_get_image_memory_requirements(&image, pMemoryRequirements);
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_GetDeviceImageSparseMemoryRequirements(
- VkDevice device,
- const VkDeviceImageMemoryRequirements *pInfo,
- uint32_t *pSparseMemoryRequirementCount,
- VkSparseImageMemoryRequirements2 *pSparseMemoryRequirements)
-{
- tu_stub();
+ vk_free2(&device->alloc, pAllocator, image);
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_GetImageSubresourceLayout(VkDevice _device,
VkImage _image,
const VkImageSubresource *pSubresource,
@@ -826,26 +285,19 @@ tu_GetImageSubresourceLayout(VkDevice _device,
{
TU_FROM_HANDLE(tu_image, image, _image);
- struct fdl_layout *layout =
- &image->layout[tu6_plane_index(image->vk.format, pSubresource->aspectMask)];
- const struct fdl_slice *slice = layout->slices + pSubresource->mipLevel;
-
- pLayout->offset =
- fdl_surface_offset(layout, pSubresource->mipLevel, pSubresource->arrayLayer);
- pLayout->rowPitch = fdl_pitch(layout, pSubresource->mipLevel);
- pLayout->arrayPitch = fdl_layer_stride(layout, pSubresource->mipLevel);
- pLayout->depthPitch = slice->size0;
- pLayout->size = pLayout->depthPitch * layout->depth0;
-
- if (fdl_ubwc_enabled(layout, pSubresource->mipLevel)) {
- /* UBWC starts at offset 0 */
- pLayout->offset = 0;
- /* UBWC scanout won't match what the kernel wants if we have levels/layers */
- assert(image->vk.mip_levels == 1 && image->vk.array_layers == 1);
- }
+ const uint32_t layer_offset = image->layer_size * pSubresource->arrayLayer;
+ const struct tu_image_level *level =
+ image->levels + pSubresource->mipLevel;
+
+ pLayout->offset = layer_offset + level->offset;
+ pLayout->size = level->size;
+ pLayout->rowPitch =
+ level->pitch * vk_format_get_blocksize(image->vk_format);
+ pLayout->arrayPitch = image->layer_size;
+ pLayout->depthPitch = level->size;
}
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_CreateImageView(VkDevice _device,
const VkImageViewCreateInfo *pCreateInfo,
const VkAllocationCallbacks *pAllocator,
@@ -854,19 +306,19 @@ tu_CreateImageView(VkDevice _device,
TU_FROM_HANDLE(tu_device, device, _device);
struct tu_image_view *view;
- view = vk_object_alloc(&device->vk, pAllocator, sizeof(*view),
- VK_OBJECT_TYPE_IMAGE_VIEW);
+ view = vk_alloc2(&device->alloc, pAllocator, sizeof(*view), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (view == NULL)
- return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
- tu_image_view_init(device, view, pCreateInfo, device->use_z24uint_s8uint);
+ tu_image_view_init(view, device, pCreateInfo);
*pView = tu_image_view_to_handle(view);
return VK_SUCCESS;
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_DestroyImageView(VkDevice _device,
VkImageView _iview,
const VkAllocationCallbacks *pAllocator)
@@ -876,8 +328,7 @@ tu_DestroyImageView(VkDevice _device,
if (!iview)
return;
-
- vk_object_free(&device->vk, pAllocator, iview);
+ vk_free2(&device->alloc, pAllocator, iview);
}
void
@@ -887,19 +338,13 @@ tu_buffer_view_init(struct tu_buffer_view *view,
{
TU_FROM_HANDLE(tu_buffer, buffer, pCreateInfo->buffer);
- view->buffer = buffer;
-
- uint32_t range = vk_buffer_range(&buffer->vk, pCreateInfo->offset,
- pCreateInfo->range);
- uint8_t swiz[4] = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z,
- PIPE_SWIZZLE_W };
-
- fdl6_buffer_view_init(
- view->descriptor, tu_vk_format_to_pipe_format(pCreateInfo->format),
- swiz, buffer->iova + pCreateInfo->offset, range);
+ view->range = pCreateInfo->range == VK_WHOLE_SIZE
+ ? buffer->size - pCreateInfo->offset
+ : pCreateInfo->range;
+ view->vk_format = pCreateInfo->format;
}
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_CreateBufferView(VkDevice _device,
const VkBufferViewCreateInfo *pCreateInfo,
const VkAllocationCallbacks *pAllocator,
@@ -908,10 +353,10 @@ tu_CreateBufferView(VkDevice _device,
TU_FROM_HANDLE(tu_device, device, _device);
struct tu_buffer_view *view;
- view = vk_object_alloc(&device->vk, pAllocator, sizeof(*view),
- VK_OBJECT_TYPE_BUFFER_VIEW);
+ view = vk_alloc2(&device->alloc, pAllocator, sizeof(*view), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!view)
- return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
tu_buffer_view_init(view, device, pCreateInfo);
@@ -920,7 +365,7 @@ tu_CreateBufferView(VkDevice _device,
return VK_SUCCESS;
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_DestroyBufferView(VkDevice _device,
VkBufferView bufferView,
const VkAllocationCallbacks *pAllocator)
@@ -931,5 +376,5 @@ tu_DestroyBufferView(VkDevice _device,
if (!view)
return;
- vk_object_free(&device->vk, pAllocator, view);
+ vk_free2(&device->alloc, pAllocator, view);
}
diff --git a/lib/mesa/src/freedreno/vulkan/tu_pass.c b/lib/mesa/src/freedreno/vulkan/tu_pass.c
index 84c1c3061..e3d9f23df 100644
--- a/lib/mesa/src/freedreno/vulkan/tu_pass.c
+++ b/lib/mesa/src/freedreno/vulkan/tu_pass.c
@@ -1,796 +1,245 @@
/*
* Copyright © 2016 Red Hat.
* Copyright © 2016 Bas Nieuwenhuizen
- * SPDX-License-Identifier: MIT
*
* based in part on anv driver which is:
* Copyright © 2015 Intel Corporation
- */
-
-#include "tu_pass.h"
-
-#include "vk_util.h"
-
-#include "tu_cmd_buffer.h"
-#include "tu_device.h"
-#include "tu_image.h"
-
-/* Return true if we have to fall back to sysmem rendering because the
- * dependency can't be satisfied with tiled rendering.
- */
-
-static bool
-dep_invalid_for_gmem(const VkSubpassDependency2 *dep,
- VkPipelineStageFlags2 src_stage_mask,
- VkPipelineStageFlags2 dst_stage_mask)
-{
- /* External dependencies don't matter here. */
- if (dep->srcSubpass == VK_SUBPASS_EXTERNAL ||
- dep->dstSubpass == VK_SUBPASS_EXTERNAL)
- return false;
-
- /* We can conceptually break down the process of rewriting a sysmem
- * renderpass into a gmem one into two parts:
- *
- * 1. Split each draw and multisample resolve into N copies, one for each
- * bin. (If hardware binning, add one more copy where the FS is disabled
- * for the binning pass). This is always allowed because the vertex stage
- * is allowed to run an arbitrary number of times and there are no extra
- * ordering constraints within a draw.
- * 2. Take the last copy of the second-to-last draw and slide it down to
- * before the last copy of the last draw. Repeat for each earlier draw
- * until the draw pass for the last bin is complete, then repeat for each
- * earlier bin until we finish with the first bin.
- *
- * During this rearranging process, we can't slide draws past each other in
- * a way that breaks the subpass dependencies. For each draw, we must slide
- * it past (copies of) the rest of the draws in the renderpass. We can
- * slide a draw past another if there isn't a dependency between them, or
-  * if the dependencies between them involve only framebuffer-space stages
-  * and have the BY_REGION bit set. Note that this includes
- * self-dependencies, since these may result in pipeline barriers that also
- * break the rearranging process.
- */
-
- /* This is straight from the Vulkan 1.2 spec, section 6.1.4 "Framebuffer
- * Region Dependencies":
- */
- const VkPipelineStageFlags2 framebuffer_space_stages =
- VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT |
- VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
- VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT |
- VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT;
-
- return
- (src_stage_mask & ~(framebuffer_space_stages | VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT)) ||
- (dst_stage_mask & ~(framebuffer_space_stages | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT)) ||
- !(dep->dependencyFlags & VK_DEPENDENCY_BY_REGION_BIT);
-}
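/* As a sketch of what the check above allows (hypothetical values, not
 * taken from any real render pass): a dependency whose stage masks are
 * purely framebuffer-space and which carries BY_REGION keeps gmem
 * rendering possible; widening either stage mask (e.g. to the vertex
 * shader stage) or dropping BY_REGION makes dep_invalid_for_gmem() return
 * true and forces the sysmem fallback.
 */
static const VkSubpassDependency2 gmem_friendly_dep = {
   .sType = VK_STRUCTURE_TYPE_SUBPASS_DEPENDENCY_2,
   .srcSubpass = 0,
   .dstSubpass = 1,
   .srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
   .dstStageMask = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
   .srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
   .dstAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT,
   .dependencyFlags = VK_DEPENDENCY_BY_REGION_BIT,
};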
-
-static void
-tu_render_pass_add_subpass_dep(struct tu_render_pass *pass,
- const VkSubpassDependency2 *dep)
-{
- uint32_t src = dep->srcSubpass;
- uint32_t dst = dep->dstSubpass;
-
- /* Ignore subpass self-dependencies as they allow the app to call
- * vkCmdPipelineBarrier() inside the render pass and the driver should only
- * do the barrier when called, not when starting the render pass.
- *
- * We cannot decide whether to allow gmem rendering before a barrier
- * is actually emitted, so we delay the decision until then.
- */
- if (src == dst)
- return;
-
- /* From the Vulkan 1.2.195 spec:
- *
- * "If an instance of VkMemoryBarrier2 is included in the pNext chain, srcStageMask,
- * dstStageMask, srcAccessMask, and dstAccessMask parameters are ignored. The synchronization
- * and access scopes instead are defined by the parameters of VkMemoryBarrier2."
- */
- const VkMemoryBarrier2 *barrier =
- vk_find_struct_const(dep->pNext, MEMORY_BARRIER_2);
- VkPipelineStageFlags2 src_stage_mask = barrier ? barrier->srcStageMask : dep->srcStageMask;
- VkAccessFlags2 src_access_mask = barrier ? barrier->srcAccessMask : dep->srcAccessMask;
- VkPipelineStageFlags2 dst_stage_mask = barrier ? barrier->dstStageMask : dep->dstStageMask;
- VkAccessFlags2 dst_access_mask = barrier ? barrier->dstAccessMask : dep->dstAccessMask;
-
- if (dep_invalid_for_gmem(dep, src_stage_mask, dst_stage_mask)) {
- perf_debug((struct tu_device *)pass->base.device, "Disabling gmem rendering due to invalid subpass dependency");
- for (int i = 0; i < ARRAY_SIZE(pass->gmem_pixels); i++)
- pass->gmem_pixels[i] = 0;
- }
-
- struct tu_subpass_barrier *dst_barrier;
- if (dst == VK_SUBPASS_EXTERNAL) {
- dst_barrier = &pass->end_barrier;
- } else {
- dst_barrier = &pass->subpasses[dst].start_barrier;
- }
-
- dst_barrier->src_stage_mask |= src_stage_mask;
- dst_barrier->dst_stage_mask |= dst_stage_mask;
- dst_barrier->src_access_mask |= src_access_mask;
- dst_barrier->dst_access_mask |= dst_access_mask;
-}
-
-/* We currently only care about undefined layouts, because we have to
- * flush/invalidate CCU for those. PREINITIALIZED is the same thing as
- * UNDEFINED for anything not linear tiled, but we don't know yet whether the
- * images used are tiled, so just assume they are.
- */
-
-static bool
-layout_undefined(VkImageLayout layout)
-{
- return layout == VK_IMAGE_LAYOUT_UNDEFINED ||
- layout == VK_IMAGE_LAYOUT_PREINITIALIZED;
-}
-
-/* This implements the following bit of spec text:
- *
- * If there is no subpass dependency from VK_SUBPASS_EXTERNAL to the
- * first subpass that uses an attachment, then an implicit subpass
- * dependency exists from VK_SUBPASS_EXTERNAL to the first subpass it is
- * used in. The implicit subpass dependency only exists if there
- * exists an automatic layout transition away from initialLayout.
- * The subpass dependency operates as if defined with the
- * following parameters:
- *
- * VkSubpassDependency implicitDependency = {
- * .srcSubpass = VK_SUBPASS_EXTERNAL;
- * .dstSubpass = firstSubpass; // First subpass attachment is used in
- * .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
- * .dstStageMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
- * .srcAccessMask = 0;
- * .dstAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT |
- * VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
- * VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
- * VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
- * VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
- * .dependencyFlags = 0;
- * };
*
- * Similarly, if there is no subpass dependency from the last subpass
- * that uses an attachment to VK_SUBPASS_EXTERNAL, then an implicit
- * subpass dependency exists from the last subpass it is used in to
- * VK_SUBPASS_EXTERNAL. The implicit subpass dependency only exists
- * if there exists an automatic layout transition into finalLayout.
- * The subpass dependency operates as if defined with the following
- * parameters:
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
*
- * VkSubpassDependency implicitDependency = {
- * .srcSubpass = lastSubpass; // Last subpass attachment is used in
- * .dstSubpass = VK_SUBPASS_EXTERNAL;
- * .srcStageMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
- * .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
- * .srcAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT |
- * VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
- * VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
- * VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
- * VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
- * .dstAccessMask = 0;
- * .dependencyFlags = 0;
- * };
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
*
- * Note: currently this is the only use we have for layout transitions,
- * besides needing to invalidate CCU at the beginning, so we also flag
- * transitions from UNDEFINED here.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
*/
-static void
-tu_render_pass_add_implicit_deps(struct tu_render_pass *pass,
- const VkRenderPassCreateInfo2 *info)
-{
- const VkAttachmentDescription2* att = info->pAttachments;
- bool has_external_src[info->subpassCount];
- bool has_external_dst[info->subpassCount];
- bool att_used[pass->attachment_count];
-
- memset(has_external_src, 0, sizeof(has_external_src));
- memset(has_external_dst, 0, sizeof(has_external_dst));
-
- for (uint32_t i = 0; i < info->dependencyCount; i++) {
- uint32_t src = info->pDependencies[i].srcSubpass;
- uint32_t dst = info->pDependencies[i].dstSubpass;
-
- if (src == dst)
- continue;
+#include "tu_private.h"
- if (src == VK_SUBPASS_EXTERNAL)
- has_external_src[dst] = true;
- if (dst == VK_SUBPASS_EXTERNAL)
- has_external_dst[src] = true;
- }
-
- memset(att_used, 0, sizeof(att_used));
-
- for (unsigned i = 0; i < info->subpassCount; i++) {
- const VkSubpassDescription2 *subpass = &info->pSubpasses[i];
- bool src_implicit_dep = false;
-
- for (unsigned j = 0; j < subpass->inputAttachmentCount; j++) {
- uint32_t a = subpass->pInputAttachments[j].attachment;
-
- if (a == VK_ATTACHMENT_UNUSED)
- continue;
-
- uint32_t stencil_layout = vk_format_has_stencil(att[a].format) ?
- vk_att_ref_stencil_layout(&subpass->pInputAttachments[j], att) :
- VK_IMAGE_LAYOUT_UNDEFINED;
- uint32_t stencil_initial_layout = vk_att_desc_stencil_layout(&att[a], false);
-
- if ((att[a].initialLayout != subpass->pInputAttachments[j].layout ||
- stencil_initial_layout != stencil_layout) &&
- !att_used[a] && !has_external_src[i])
- src_implicit_dep = true;
- att_used[a] = true;
- }
-
- for (unsigned j = 0; j < subpass->colorAttachmentCount; j++) {
- uint32_t a = subpass->pColorAttachments[j].attachment;
- if (a == VK_ATTACHMENT_UNUSED)
- continue;
- if (att[a].initialLayout != subpass->pColorAttachments[j].layout &&
- !att_used[a] && !has_external_src[i])
- src_implicit_dep = true;
- att_used[a] = true;
- }
-
- if (subpass->pDepthStencilAttachment &&
- subpass->pDepthStencilAttachment->attachment != VK_ATTACHMENT_UNUSED) {
- uint32_t a = subpass->pDepthStencilAttachment->attachment;
- uint32_t stencil_layout = vk_att_ref_stencil_layout(subpass->pDepthStencilAttachment, att);
- uint32_t stencil_initial_layout = vk_att_desc_stencil_layout(&att[a], false);
+#include "vk_util.h"
- if ((att[a].initialLayout != subpass->pDepthStencilAttachment->layout ||
- stencil_initial_layout != stencil_layout) &&
- !att_used[a] && !has_external_src[i]) {
- src_implicit_dep = true;
- }
- att_used[a] = true;
- }
+VkResult
+tu_CreateRenderPass(VkDevice _device,
+ const VkRenderPassCreateInfo *pCreateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ VkRenderPass *pRenderPass)
+{
+ TU_FROM_HANDLE(tu_device, device, _device);
+ struct tu_render_pass *pass;
+ size_t size;
+ size_t attachments_offset;
+ VkRenderPassMultiviewCreateInfo *multiview_info = NULL;
- if (subpass->pResolveAttachments) {
- for (unsigned j = 0; j < subpass->colorAttachmentCount; j++) {
- uint32_t a = subpass->pResolveAttachments[j].attachment;
- if (a == VK_ATTACHMENT_UNUSED)
- continue;
- if (att[a].initialLayout != subpass->pResolveAttachments[j].layout &&
- !att_used[a] && !has_external_src[i])
- src_implicit_dep = true;
- att_used[a] = true;
- }
- }
+ assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO);
- const VkSubpassDescriptionDepthStencilResolve *ds_resolve =
- vk_find_struct_const(subpass->pNext, SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE);
+ size = sizeof(*pass);
+ size += pCreateInfo->subpassCount * sizeof(pass->subpasses[0]);
+ attachments_offset = size;
+ size += pCreateInfo->attachmentCount * sizeof(pass->attachments[0]);
- if (ds_resolve && ds_resolve->pDepthStencilResolveAttachment &&
- ds_resolve->pDepthStencilResolveAttachment->attachment != VK_ATTACHMENT_UNUSED) {
- uint32_t a = ds_resolve->pDepthStencilResolveAttachment->attachment;
- uint32_t stencil_layout = vk_att_ref_stencil_layout(ds_resolve->pDepthStencilResolveAttachment, att);
- uint32_t stencil_initial_layout = vk_att_desc_stencil_layout(&att[a], false);
+ pass = vk_alloc2(&device->alloc, pAllocator, size, 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+ if (pass == NULL)
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
- if ((att[a].initialLayout != subpass->pDepthStencilAttachment->layout ||
- stencil_initial_layout != stencil_layout) &&
- !att_used[a] && !has_external_src[i])
- src_implicit_dep = true;
- att_used[a] = true;
- }
+ memset(pass, 0, size);
+ pass->attachment_count = pCreateInfo->attachmentCount;
+ pass->subpass_count = pCreateInfo->subpassCount;
+ pass->attachments = (void *) pass + attachments_offset;
- if (src_implicit_dep) {
- tu_render_pass_add_subpass_dep(pass, &(VkSubpassDependency2) {
- .srcSubpass = VK_SUBPASS_EXTERNAL,
- .dstSubpass = i,
- .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
- .dstStageMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
- .srcAccessMask = 0,
- .dstAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT |
- VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
- VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
- VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
- VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
- .dependencyFlags = 0,
- });
+ vk_foreach_struct(ext, pCreateInfo->pNext)
+ {
+ switch (ext->sType) {
+ case VK_STRUCTURE_TYPE_RENDER_PASS_MULTIVIEW_CREATE_INFO:
+ multiview_info = (VkRenderPassMultiviewCreateInfo *) ext;
+ break;
+ default:
+ break;
}
}
- memset(att_used, 0, sizeof(att_used));
-
- for (int i = info->subpassCount - 1; i >= 0; i--) {
- const VkSubpassDescription2 *subpass = &info->pSubpasses[i];
- bool dst_implicit_dep = false;
-
- for (unsigned j = 0; j < subpass->inputAttachmentCount; j++) {
- uint32_t a = subpass->pInputAttachments[j].attachment;
- if (a == VK_ATTACHMENT_UNUSED)
- continue;
-
- uint32_t stencil_layout = vk_format_has_stencil(att[a].format) ?
- vk_att_ref_stencil_layout(&subpass->pInputAttachments[j], att) :
- VK_IMAGE_LAYOUT_UNDEFINED;
- uint32_t stencil_final_layout = vk_att_desc_stencil_layout(&att[a], true);
-
- if ((att[a].finalLayout != subpass->pInputAttachments[j].layout ||
- stencil_final_layout != stencil_layout) &&
- !att_used[a] && !has_external_dst[i])
- dst_implicit_dep = true;
- att_used[a] = true;
- }
-
- for (unsigned j = 0; j < subpass->colorAttachmentCount; j++) {
- uint32_t a = subpass->pColorAttachments[j].attachment;
- if (a == VK_ATTACHMENT_UNUSED)
- continue;
- if (att[a].finalLayout != subpass->pColorAttachments[j].layout &&
- !att_used[a] && !has_external_dst[i])
- dst_implicit_dep = true;
- att_used[a] = true;
- }
-
- if (subpass->pDepthStencilAttachment &&
- subpass->pDepthStencilAttachment->attachment != VK_ATTACHMENT_UNUSED) {
- uint32_t a = subpass->pDepthStencilAttachment->attachment;
- uint32_t stencil_layout = vk_att_ref_stencil_layout(subpass->pDepthStencilAttachment, att);
- uint32_t stencil_final_layout = vk_att_desc_stencil_layout(&att[a], true);
-
- if ((att[a].finalLayout != subpass->pDepthStencilAttachment->layout ||
- stencil_final_layout != stencil_layout) &&
- !att_used[a] && !has_external_dst[i]) {
- dst_implicit_dep = true;
- }
- att_used[a] = true;
- }
-
- if (subpass->pResolveAttachments) {
- for (unsigned j = 0; j < subpass->colorAttachmentCount; j++) {
- uint32_t a = subpass->pResolveAttachments[j].attachment;
- if (a == VK_ATTACHMENT_UNUSED)
- continue;
- if (att[a].finalLayout != subpass->pResolveAttachments[j].layout &&
- !att_used[a] && !has_external_dst[i])
- dst_implicit_dep = true;
- att_used[a] = true;
- }
- }
-
- const VkSubpassDescriptionDepthStencilResolve *ds_resolve =
- vk_find_struct_const(subpass->pNext, SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE);
-
- if (ds_resolve && ds_resolve->pDepthStencilResolveAttachment &&
- ds_resolve->pDepthStencilResolveAttachment->attachment != VK_ATTACHMENT_UNUSED) {
- uint32_t a = ds_resolve->pDepthStencilResolveAttachment->attachment;
- uint32_t stencil_layout = vk_att_ref_stencil_layout(ds_resolve->pDepthStencilResolveAttachment, att);
- uint32_t stencil_final_layout = vk_att_desc_stencil_layout(&att[a], true);
-
- if ((att[a].finalLayout != subpass->pDepthStencilAttachment->layout ||
- stencil_final_layout != stencil_layout) &&
- !att_used[a] && !has_external_src[i])
- dst_implicit_dep = true;
- att_used[a] = true;
- }
+ for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) {
+ struct tu_render_pass_attachment *att = &pass->attachments[i];
- if (dst_implicit_dep) {
- tu_render_pass_add_subpass_dep(pass, &(VkSubpassDependency2) {
- .srcSubpass = i,
- .dstSubpass = VK_SUBPASS_EXTERNAL,
- .srcStageMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
- .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
- .srcAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT |
- VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
- VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
- VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
- VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
- .dstAccessMask = 0,
- .dependencyFlags = 0,
- });
- }
+ att->format = pCreateInfo->pAttachments[i].format;
+ att->samples = pCreateInfo->pAttachments[i].samples;
+ att->load_op = pCreateInfo->pAttachments[i].loadOp;
+ att->stencil_load_op = pCreateInfo->pAttachments[i].stencilLoadOp;
+ att->initial_layout = pCreateInfo->pAttachments[i].initialLayout;
+ att->final_layout = pCreateInfo->pAttachments[i].finalLayout;
+ // att->store_op = pCreateInfo->pAttachments[i].storeOp;
+ // att->stencil_store_op = pCreateInfo->pAttachments[i].stencilStoreOp;
}
+ uint32_t subpass_attachment_count = 0;
+ struct tu_subpass_attachment *p;
+ for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) {
+ const VkSubpassDescription *desc = &pCreateInfo->pSubpasses[i];
- /* Handle UNDEFINED transitions, similar to the handling in tu_barrier().
- * Assume that if an attachment has an initial layout of UNDEFINED, it gets
- * transitioned eventually.
- */
- for (unsigned i = 0; i < info->attachmentCount; i++) {
- if (layout_undefined(att[i].initialLayout)) {
- if (vk_format_is_depth_or_stencil(att[i].format)) {
- pass->subpasses[0].start_barrier.incoherent_ccu_depth = true;
- } else {
- pass->subpasses[0].start_barrier.incoherent_ccu_color = true;
- }
- }
+ subpass_attachment_count +=
+ desc->inputAttachmentCount + desc->colorAttachmentCount +
+ (desc->pResolveAttachments ? desc->colorAttachmentCount : 0) +
+ (desc->pDepthStencilAttachment != NULL);
}
-}
-/* If an input attachment is used without an intervening write to the same
- * attachment, then we can just use the original image, even in GMEM mode.
- * This is an optimization, but it's also important because it allows us to
- * avoid having to invalidate UCHE at the beginning of each tile due to it
- * becoming invalid. The only reads of GMEM via UCHE should be after an
- * earlier subpass modified it, which only works if there's already an
- * appropriate dependency that will add the CACHE_INVALIDATE anyway. We
- * don't consider this in the dependency code, so this is also required for
- * correctness.
- */
-static void
-tu_render_pass_patch_input_gmem(struct tu_render_pass *pass)
-{
- bool written[pass->attachment_count];
-
- memset(written, 0, sizeof(written));
-
- for (unsigned i = 0; i < pass->subpass_count; i++) {
- struct tu_subpass *subpass = &pass->subpasses[i];
-
- for (unsigned j = 0; j < subpass->input_count; j++) {
- uint32_t a = subpass->input_attachments[j].attachment;
- if (a == VK_ATTACHMENT_UNUSED)
- continue;
- subpass->input_attachments[j].patch_input_gmem = written[a];
+ if (subpass_attachment_count) {
+ pass->subpass_attachments = vk_alloc2(
+ &device->alloc, pAllocator,
+ subpass_attachment_count * sizeof(struct tu_subpass_attachment), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+ if (pass->subpass_attachments == NULL) {
+ vk_free2(&device->alloc, pAllocator, pass);
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
}
+ } else
+ pass->subpass_attachments = NULL;
- for (unsigned j = 0; j < subpass->color_count; j++) {
- uint32_t a = subpass->color_attachments[j].attachment;
- if (a == VK_ATTACHMENT_UNUSED)
- continue;
- written[a] = true;
+ p = pass->subpass_attachments;
+ for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) {
+ const VkSubpassDescription *desc = &pCreateInfo->pSubpasses[i];
+ uint32_t color_sample_count = 1, depth_sample_count = 1;
+ struct tu_subpass *subpass = &pass->subpasses[i];
- for (unsigned k = 0; k < subpass->input_count; k++) {
- if (subpass->input_attachments[k].attachment == a &&
- !subpass->input_attachments[k].patch_input_gmem) {
- /* For render feedback loops, we have no idea whether the use
- * as a color attachment or input attachment will come first,
- * so we have to always use GMEM in case the color attachment
- * comes first and defensively invalidate UCHE in case the
- * input attachment comes first.
- */
- subpass->feedback_invalidate = true;
- subpass->input_attachments[k].patch_input_gmem = true;
- }
- }
- }
+ subpass->input_count = desc->inputAttachmentCount;
+ subpass->color_count = desc->colorAttachmentCount;
+ if (multiview_info)
+ subpass->view_mask = multiview_info->pViewMasks[i];
- for (unsigned j = 0; j < subpass->resolve_count; j++) {
- uint32_t a = subpass->resolve_attachments[j].attachment;
- if (a == VK_ATTACHMENT_UNUSED)
- continue;
- written[a] = true;
- }
+ if (desc->inputAttachmentCount > 0) {
+ subpass->input_attachments = p;
+ p += desc->inputAttachmentCount;
- if (subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) {
- written[subpass->depth_stencil_attachment.attachment] = true;
- for (unsigned k = 0; k < subpass->input_count; k++) {
- if (subpass->input_attachments[k].attachment ==
- subpass->depth_stencil_attachment.attachment &&
- !subpass->input_attachments[k].patch_input_gmem) {
- subpass->feedback_invalidate = true;
- subpass->input_attachments[k].patch_input_gmem = true;
- }
+ for (uint32_t j = 0; j < desc->inputAttachmentCount; j++) {
+ subpass->input_attachments[j] = (struct tu_subpass_attachment) {
+ .attachment = desc->pInputAttachments[j].attachment,
+ .layout = desc->pInputAttachments[j].layout,
+ };
+ if (desc->pInputAttachments[j].attachment != VK_ATTACHMENT_UNUSED)
+ pass->attachments[desc->pInputAttachments[j].attachment]
+ .view_mask |= subpass->view_mask;
}
}
- }
-}
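/* Two hypothetical cases to make the pass above concrete:
 *  - Attachment A is only ever read as an input attachment: written[A]
 *    stays false, patch_input_gmem stays false, and the subpass samples
 *    the original image directly with no UCHE invalidate needed.
 *  - Subpass 0 writes A as a color attachment and subpass 1 reads it as
 *    an input attachment: written[A] is true by then, so the input is
 *    patched to read through gmem, relying on the existing dependency to
 *    provide the cache invalidate.
 */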
-
-static void
-tu_render_pass_check_feedback_loop(struct tu_render_pass *pass)
-{
- for (unsigned i = 0; i < pass->subpass_count; i++) {
- struct tu_subpass *subpass = &pass->subpasses[i];
- for (unsigned j = 0; j < subpass->color_count; j++) {
- uint32_t a = subpass->color_attachments[j].attachment;
- if (a == VK_ATTACHMENT_UNUSED)
- continue;
- for (unsigned k = 0; k < subpass->input_count; k++) {
- if (subpass->input_attachments[k].attachment == a) {
- subpass->feedback_loop_color = true;
- break;
- }
- }
- }
+ if (desc->colorAttachmentCount > 0) {
+ subpass->color_attachments = p;
+ p += desc->colorAttachmentCount;
- if (subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) {
- for (unsigned k = 0; k < subpass->input_count; k++) {
- if (subpass->input_attachments[k].attachment ==
- subpass->depth_stencil_attachment.attachment) {
- subpass->feedback_loop_ds = true;
- break;
+ for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) {
+ subpass->color_attachments[j] = (struct tu_subpass_attachment) {
+ .attachment = desc->pColorAttachments[j].attachment,
+ .layout = desc->pColorAttachments[j].layout,
+ };
+ if (desc->pColorAttachments[j].attachment !=
+ VK_ATTACHMENT_UNUSED) {
+ pass->attachments[desc->pColorAttachments[j].attachment]
+ .view_mask |= subpass->view_mask;
+ color_sample_count =
+ pCreateInfo
+ ->pAttachments[desc->pColorAttachments[j].attachment]
+ .samples;
}
}
}
- }
-}
-
-static void update_samples(struct tu_subpass *subpass,
- VkSampleCountFlagBits samples)
-{
- assert(subpass->samples == 0 || subpass->samples == samples);
- subpass->samples = samples;
-}
-static void
-tu_render_pass_calc_hash(struct tu_render_pass *pass)
-{
- #define HASH(hash, data) XXH64(&(data), sizeof(data), hash)
-
- uint64_t hash = HASH(0, pass->attachment_count);
- hash = XXH64(pass->attachments,
- pass->attachment_count * sizeof(pass->attachments[0]), hash);
- hash = HASH(hash, pass->subpass_count);
- for (unsigned i = 0; i < pass->subpass_count; i++) {
- hash = HASH(hash, pass->subpasses[i].samples);
- hash = HASH(hash, pass->subpasses[i].input_count);
- hash = HASH(hash, pass->subpasses[i].color_count);
- hash = HASH(hash, pass->subpasses[i].resolve_count);
- }
-
- pass->autotune_hash = hash;
-
- #undef HASH
-}
-
-static void
-tu_render_pass_cond_config(struct tu_render_pass *pass)
-{
- for (uint32_t i = 0; i < pass->attachment_count; i++) {
- struct tu_render_pass_attachment *att = &pass->attachments[i];
-
- att->cond_load_allowed =
- (att->load || att->load_stencil) && !att->clear_mask && !att->will_be_resolved;
- att->cond_store_allowed =
- (att->store || att->store_stencil) && !att->clear_mask;
- }
-}
-
-static void
-tu_render_pass_gmem_config(struct tu_render_pass *pass,
- const struct tu_physical_device *phys_dev)
-{
- for (enum tu_gmem_layout layout = 0; layout < TU_GMEM_LAYOUT_COUNT;
- layout++) {
- /* From the VK_KHR_multiview spec:
- *
- * Multiview is all-or-nothing for a render pass - that is, either all
- * subpasses must have a non-zero view mask (though some subpasses may
- * have only one view) or all must be zero.
- *
- * This means we only have to check one of the view masks.
- */
- if (pass->subpasses[0].multiview_mask) {
- /* It seems multiview must use sysmem rendering. */
- pass->gmem_pixels[layout] = 0;
- continue;
- }
-
- /* log2(gmem_align/(tile_align_w*tile_align_h)) */
- uint32_t block_align_shift = 3;
- uint32_t tile_align_w = phys_dev->info->tile_align_w;
- uint32_t gmem_align = (1 << block_align_shift) * tile_align_w *
- phys_dev->info->tile_align_h;
-
- /* calculate total bytes per pixel */
- uint32_t cpp_total = 0;
- for (uint32_t i = 0; i < pass->attachment_count; i++) {
- struct tu_render_pass_attachment *att = &pass->attachments[i];
- bool cpp1 = (att->cpp == 1);
- if (att->gmem) {
- cpp_total += att->cpp;
-
- /* take into account the separate stencil: */
- if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
- cpp1 = (att->samples == 1);
- cpp_total += att->samples;
- }
+ subpass->has_resolve = false;
+ if (desc->pResolveAttachments) {
+ subpass->resolve_attachments = p;
+ p += desc->colorAttachmentCount;
- /* texture pitch must be aligned to 64, use a tile_align_w that is
-          * a multiple of 64 for a cpp==1 attachment to work as an input
- * attachment
- */
- if (cpp1 && tile_align_w % 64 != 0) {
- tile_align_w *= 2;
- block_align_shift -= 1;
+ for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) {
+ uint32_t a = desc->pResolveAttachments[j].attachment;
+ subpass->resolve_attachments[j] = (struct tu_subpass_attachment) {
+ .attachment = desc->pResolveAttachments[j].attachment,
+ .layout = desc->pResolveAttachments[j].layout,
+ };
+ if (a != VK_ATTACHMENT_UNUSED) {
+ subpass->has_resolve = true;
+ pass->attachments[desc->pResolveAttachments[j].attachment]
+ .view_mask |= subpass->view_mask;
}
}
}
- pass->tile_align_w = tile_align_w;
-
- /* no gmem attachments */
- if (cpp_total == 0) {
-         /* any non-zero value so the tiling config works with no
- * attachments
- */
- pass->gmem_pixels[layout] = 1024 * 1024;
- continue;
- }
-
- /* TODO: this algorithm isn't optimal
- * for example, two attachments with cpp = {1, 4}
- * result: nblocks = {12, 52}, pixels = 196608
- * optimal: nblocks = {13, 51}, pixels = 208896
- */
- uint32_t gmem_size = layout == TU_GMEM_LAYOUT_FULL
- ? phys_dev->gmem_size
- : phys_dev->ccu_offset_gmem;
- uint32_t gmem_blocks = gmem_size / gmem_align;
- uint32_t offset = 0, pixels = ~0u, i;
- for (i = 0; i < pass->attachment_count; i++) {
- struct tu_render_pass_attachment *att = &pass->attachments[i];
- if (!att->gmem)
- continue;
-
- att->gmem_offset[layout] = offset;
-
- uint32_t align = MAX2(1, att->cpp >> block_align_shift);
- uint32_t nblocks =
- MAX2((gmem_blocks * att->cpp / cpp_total) & ~(align - 1), align);
-
- if (nblocks > gmem_blocks)
- break;
-
- gmem_blocks -= nblocks;
- cpp_total -= att->cpp;
- offset += nblocks * gmem_align;
- pixels = MIN2(pixels, nblocks * gmem_align / att->cpp);
-
- /* repeat the same for separate stencil */
- if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
- att->gmem_offset_stencil[layout] = offset;
-
- /* note: for s8_uint, block align is always 1 */
- uint32_t nblocks = gmem_blocks * att->samples / cpp_total;
- if (nblocks > gmem_blocks)
- break;
-
- gmem_blocks -= nblocks;
- cpp_total -= att->samples;
- offset += nblocks * gmem_align;
- pixels = MIN2(pixels, nblocks * gmem_align / att->samples);
+ if (desc->pDepthStencilAttachment) {
+ subpass->depth_stencil_attachment = (struct tu_subpass_attachment) {
+ .attachment = desc->pDepthStencilAttachment->attachment,
+ .layout = desc->pDepthStencilAttachment->layout,
+ };
+ if (desc->pDepthStencilAttachment->attachment !=
+ VK_ATTACHMENT_UNUSED) {
+ pass->attachments[desc->pDepthStencilAttachment->attachment]
+ .view_mask |= subpass->view_mask;
+ depth_sample_count =
+ pCreateInfo
+ ->pAttachments[desc->pDepthStencilAttachment->attachment]
+ .samples;
}
+ } else {
+ subpass->depth_stencil_attachment.attachment = VK_ATTACHMENT_UNUSED;
}
- /* if the loop didn't complete then the gmem config is impossible */
- if (i == pass->attachment_count)
- pass->gmem_pixels[layout] = pixels;
+ subpass->max_sample_count =
+ MAX2(color_sample_count, depth_sample_count);
}
-}
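/* Worked example matching the numbers in the TODO above, assuming
 * (hypothetically) 1 MiB of usable gmem and gmem_align = 16 KiB, i.e.
 * gmem_blocks = 64, with two gmem attachments of cpp = {1, 4}
 * (cpp_total = 5):
 *   cpp=1: nblocks = 64 * 1 / 5 = 12, pixels = 12 * 16384 / 1 = 196608
 *   cpp=4: nblocks = 52 * 4 / 4 = 52, pixels = 52 * 16384 / 4 = 212992
 * gmem_pixels = MIN2(196608, 212992) = 196608, whereas the optimal split
 * {13, 51} would give MIN2(212992, 208896) = 208896 pixels.
 */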
-static void
-tu_render_pass_bandwidth_config(struct tu_render_pass *pass)
-{
- pass->gmem_bandwidth_per_pixel = 0;
- pass->sysmem_bandwidth_per_pixel = 0;
-
- for (uint32_t i = 0; i < pass->attachment_count; i++) {
- const struct tu_render_pass_attachment *att = &pass->attachments[i];
-
- /* approximate tu_load_gmem_attachment */
- if (att->load)
- pass->gmem_bandwidth_per_pixel += att->cpp;
-
- /* approximate tu_store_gmem_attachment */
- if (att->store)
- pass->gmem_bandwidth_per_pixel += att->cpp;
-
- /* approximate tu_clear_sysmem_attachment */
- if (att->clear_mask)
- pass->sysmem_bandwidth_per_pixel += att->cpp;
-
- /* approximate tu6_emit_sysmem_resolves */
- if (att->will_be_resolved) {
- pass->sysmem_bandwidth_per_pixel +=
- att->cpp + att->cpp / att->samples;
+ for (unsigned i = 0; i < pCreateInfo->dependencyCount; ++i) {
+ uint32_t dst = pCreateInfo->pDependencies[i].dstSubpass;
+ if (dst == VK_SUBPASS_EXTERNAL) {
+ pass->end_barrier.src_stage_mask =
+ pCreateInfo->pDependencies[i].srcStageMask;
+ pass->end_barrier.src_access_mask =
+ pCreateInfo->pDependencies[i].srcAccessMask;
+ pass->end_barrier.dst_access_mask =
+ pCreateInfo->pDependencies[i].dstAccessMask;
+ } else {
+ pass->subpasses[dst].start_barrier.src_stage_mask =
+ pCreateInfo->pDependencies[i].srcStageMask;
+ pass->subpasses[dst].start_barrier.src_access_mask =
+ pCreateInfo->pDependencies[i].srcAccessMask;
+ pass->subpasses[dst].start_barrier.dst_access_mask =
+ pCreateInfo->pDependencies[i].dstAccessMask;
}
}
-}
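/* Rough example of the accounting above for a hypothetical 4xMSAA RGBA8
 * color attachment (cpp = 4 * 4 = 16):
 *   loaded and stored:    gmem_bandwidth_per_pixel   += 16 + 16          = 32
 *   cleared and resolved: sysmem_bandwidth_per_pixel += 16 + (16 + 16/4) = 36
 * (Load and clear are mutually exclusive for one attachment, so the two
 * rows are alternative configurations of the same attachment.)
 */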
-
-static void
-attachment_set_ops(struct tu_device *device,
- struct tu_render_pass_attachment *att,
- VkAttachmentLoadOp load_op,
- VkAttachmentLoadOp stencil_load_op,
- VkAttachmentStoreOp store_op,
- VkAttachmentStoreOp stencil_store_op)
-{
- if (device->instance->debug_flags & TU_DEBUG_DONT_CARE_AS_LOAD) {
- if (load_op == VK_ATTACHMENT_LOAD_OP_DONT_CARE)
- load_op = VK_ATTACHMENT_LOAD_OP_LOAD;
- if (stencil_load_op == VK_ATTACHMENT_LOAD_OP_DONT_CARE)
- stencil_load_op = VK_ATTACHMENT_LOAD_OP_LOAD;
- }
-
- /* load/store ops */
- att->clear_mask =
- (load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) ? VK_IMAGE_ASPECT_COLOR_BIT : 0;
- att->load = (load_op == VK_ATTACHMENT_LOAD_OP_LOAD);
- att->store = (store_op == VK_ATTACHMENT_STORE_OP_STORE);
-
- bool stencil_clear = (stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR);
- bool stencil_load = (stencil_load_op == VK_ATTACHMENT_LOAD_OP_LOAD);
- bool stencil_store = (stencil_store_op == VK_ATTACHMENT_STORE_OP_STORE);
- switch (att->format) {
- case VK_FORMAT_D24_UNORM_S8_UINT: /* || stencil load/store */
- if (att->clear_mask)
- att->clear_mask = VK_IMAGE_ASPECT_DEPTH_BIT;
- if (stencil_clear)
- att->clear_mask |= VK_IMAGE_ASPECT_STENCIL_BIT;
- if (stencil_load)
- att->load = true;
- if (stencil_store)
- att->store = true;
- break;
- case VK_FORMAT_S8_UINT: /* replace load/store with stencil load/store */
- att->clear_mask = stencil_clear ? VK_IMAGE_ASPECT_COLOR_BIT : 0;
- att->load = stencil_load;
- att->store = stencil_store;
- break;
- case VK_FORMAT_D32_SFLOAT_S8_UINT: /* separate stencil */
- if (att->clear_mask)
- att->clear_mask = VK_IMAGE_ASPECT_DEPTH_BIT;
- if (stencil_clear)
- att->clear_mask |= VK_IMAGE_ASPECT_STENCIL_BIT;
- if (stencil_load)
- att->load_stencil = true;
- if (stencil_store)
- att->store_stencil = true;
- break;
- default:
- break;
- }
-}
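/* Example of the packed depth/stencil handling above (hypothetical ops,
 * not from the patch): a D24_UNORM_S8_UINT attachment created with
 * loadOp=CLEAR, stencilLoadOp=LOAD, storeOp=STORE and
 * stencilStoreOp=DONT_CARE ends up with clear_mask = DEPTH_BIT,
 * load = true (the stencil load forces a full load of the packed surface)
 * and store = true.
 */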
-
-static bool
-is_depth_stencil_resolve_enabled(const VkSubpassDescriptionDepthStencilResolve *depth_stencil_resolve)
-{
- if (depth_stencil_resolve &&
- depth_stencil_resolve->pDepthStencilResolveAttachment &&
- depth_stencil_resolve->pDepthStencilResolveAttachment->attachment != VK_ATTACHMENT_UNUSED) {
- return true;
- }
- return false;
-}
-
-static void
-tu_subpass_use_attachment(struct tu_render_pass *pass, int i, uint32_t a, const VkRenderPassCreateInfo2 *pCreateInfo)
-{
- struct tu_subpass *subpass = &pass->subpasses[i];
+ *pRenderPass = tu_render_pass_to_handle(pass);
- pass->attachments[a].gmem = true;
- update_samples(subpass, pCreateInfo->pAttachments[a].samples);
- pass->attachments[a].clear_views |= subpass->multiview_mask;
+ return VK_SUCCESS;
}
-VKAPI_ATTR VkResult VKAPI_CALL
-tu_CreateRenderPass2(VkDevice _device,
- const VkRenderPassCreateInfo2 *pCreateInfo,
- const VkAllocationCallbacks *pAllocator,
- VkRenderPass *pRenderPass)
+VkResult
+tu_CreateRenderPass2KHR(VkDevice _device,
+ const VkRenderPassCreateInfo2KHR *pCreateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ VkRenderPass *pRenderPass)
{
TU_FROM_HANDLE(tu_device, device, _device);
-
- if (unlikely(device->instance->debug_flags & TU_DEBUG_DYNAMIC))
- return vk_common_CreateRenderPass2(_device, pCreateInfo, pAllocator,
- pRenderPass);
-
struct tu_render_pass *pass;
size_t size;
size_t attachments_offset;
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2);
+ assert(pCreateInfo->sType ==
+ VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2_KHR);
size = sizeof(*pass);
size += pCreateInfo->subpassCount * sizeof(pass->subpasses[0]);
attachments_offset = size;
size += pCreateInfo->attachmentCount * sizeof(pass->attachments[0]);
- pass = vk_object_zalloc(&device->vk, pAllocator, size,
- VK_OBJECT_TYPE_RENDER_PASS);
+ pass = vk_alloc2(&device->alloc, pAllocator, size, 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (pass == NULL)
- return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ memset(pass, 0, size);
pass->attachment_count = pCreateInfo->attachmentCount;
pass->subpass_count = pCreateInfo->subpassCount;
pass->attachments = (void *) pass + attachments_offset;
@@ -800,82 +249,58 @@ tu_CreateRenderPass2(VkDevice _device,
att->format = pCreateInfo->pAttachments[i].format;
att->samples = pCreateInfo->pAttachments[i].samples;
- /* for d32s8, cpp is for the depth image, and
- * att->samples will be used as the cpp for the stencil image
- */
- if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT)
- att->cpp = 4 * att->samples;
- else
- att->cpp = vk_format_get_blocksize(att->format) * att->samples;
-      /* Initially not allocated into gmem; tu_subpass_use_attachment() will move it there. */
- att->gmem = false;
-
- VkAttachmentLoadOp loadOp = pCreateInfo->pAttachments[i].loadOp;
- VkAttachmentLoadOp stencilLoadOp = pCreateInfo->pAttachments[i].stencilLoadOp;
-
- attachment_set_ops(device, att, loadOp, stencilLoadOp,
- pCreateInfo->pAttachments[i].storeOp,
- pCreateInfo->pAttachments[i].stencilStoreOp);
+ att->load_op = pCreateInfo->pAttachments[i].loadOp;
+ att->stencil_load_op = pCreateInfo->pAttachments[i].stencilLoadOp;
+ att->initial_layout = pCreateInfo->pAttachments[i].initialLayout;
+ att->final_layout = pCreateInfo->pAttachments[i].finalLayout;
+ // att->store_op = pCreateInfo->pAttachments[i].storeOp;
+ // att->stencil_store_op = pCreateInfo->pAttachments[i].stencilStoreOp;
}
uint32_t subpass_attachment_count = 0;
struct tu_subpass_attachment *p;
for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) {
- const VkSubpassDescription2 *desc = &pCreateInfo->pSubpasses[i];
- const VkSubpassDescriptionDepthStencilResolve *ds_resolve =
- vk_find_struct_const(desc->pNext, SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE);
+ const VkSubpassDescription2KHR *desc = &pCreateInfo->pSubpasses[i];
subpass_attachment_count +=
desc->inputAttachmentCount + desc->colorAttachmentCount +
(desc->pResolveAttachments ? desc->colorAttachmentCount : 0) +
- (is_depth_stencil_resolve_enabled(ds_resolve) ? 1 : 0);
+ (desc->pDepthStencilAttachment != NULL);
}
if (subpass_attachment_count) {
pass->subpass_attachments = vk_alloc2(
- &device->vk.alloc, pAllocator,
+ &device->alloc, pAllocator,
subpass_attachment_count * sizeof(struct tu_subpass_attachment), 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (pass->subpass_attachments == NULL) {
- vk_object_free(&device->vk, pAllocator, pass);
- return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ vk_free2(&device->alloc, pAllocator, pass);
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
}
} else
pass->subpass_attachments = NULL;
p = pass->subpass_attachments;
for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) {
- const VkSubpassDescription2 *desc = &pCreateInfo->pSubpasses[i];
- const VkSubpassDescriptionDepthStencilResolve *ds_resolve =
- vk_find_struct_const(desc->pNext, SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE);
+ const VkSubpassDescription2KHR *desc = &pCreateInfo->pSubpasses[i];
+ uint32_t color_sample_count = 1, depth_sample_count = 1;
struct tu_subpass *subpass = &pass->subpasses[i];
subpass->input_count = desc->inputAttachmentCount;
subpass->color_count = desc->colorAttachmentCount;
- subpass->resolve_count = 0;
- subpass->resolve_depth_stencil = is_depth_stencil_resolve_enabled(ds_resolve);
- subpass->samples = 0;
- subpass->srgb_cntl = 0;
-
- const VkSubpassDescriptionFlagBits raster_order_access_bits =
- VK_SUBPASS_DESCRIPTION_RASTERIZATION_ORDER_ATTACHMENT_COLOR_ACCESS_BIT_EXT |
- VK_SUBPASS_DESCRIPTION_RASTERIZATION_ORDER_ATTACHMENT_DEPTH_ACCESS_BIT_EXT |
- VK_SUBPASS_DESCRIPTION_RASTERIZATION_ORDER_ATTACHMENT_STENCIL_ACCESS_BIT_EXT;
-
- subpass->raster_order_attachment_access = desc->flags & raster_order_access_bits;
-
- subpass->multiview_mask = desc->viewMask;
+ subpass->view_mask = desc->viewMask;
if (desc->inputAttachmentCount > 0) {
subpass->input_attachments = p;
p += desc->inputAttachmentCount;
for (uint32_t j = 0; j < desc->inputAttachmentCount; j++) {
- uint32_t a = desc->pInputAttachments[j].attachment;
- subpass->input_attachments[j].attachment = a;
- /* Note: attachments only used as input attachments will be read
- * directly instead of through gmem, so we don't mark input
- * attachments as needing gmem.
- */
+ subpass->input_attachments[j] = (struct tu_subpass_attachment) {
+ .attachment = desc->pInputAttachments[j].attachment,
+ .layout = desc->pInputAttachments[j].layout,
+ };
+ if (desc->pInputAttachments[j].attachment != VK_ATTACHMENT_UNUSED)
+ pass->attachments[desc->pInputAttachments[j].attachment]
+ .view_mask |= subpass->view_mask;
}
}
@@ -884,313 +309,108 @@ tu_CreateRenderPass2(VkDevice _device,
p += desc->colorAttachmentCount;
for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) {
- uint32_t a = desc->pColorAttachments[j].attachment;
- subpass->color_attachments[j].attachment = a;
-
- if (a != VK_ATTACHMENT_UNUSED) {
- tu_subpass_use_attachment(pass, i, a, pCreateInfo);
-
- if (vk_format_is_srgb(pass->attachments[a].format))
- subpass->srgb_cntl |= 1 << j;
+ subpass->color_attachments[j] = (struct tu_subpass_attachment) {
+ .attachment = desc->pColorAttachments[j].attachment,
+ .layout = desc->pColorAttachments[j].layout,
+ };
+ if (desc->pColorAttachments[j].attachment !=
+ VK_ATTACHMENT_UNUSED) {
+ pass->attachments[desc->pColorAttachments[j].attachment]
+ .view_mask |= subpass->view_mask;
+ color_sample_count =
+ pCreateInfo
+ ->pAttachments[desc->pColorAttachments[j].attachment]
+ .samples;
}
}
}
- subpass->resolve_attachments = (desc->pResolveAttachments || subpass->resolve_depth_stencil) ? p : NULL;
+ subpass->has_resolve = false;
if (desc->pResolveAttachments) {
+ subpass->resolve_attachments = p;
p += desc->colorAttachmentCount;
- subpass->resolve_count += desc->colorAttachmentCount;
- for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) {
- subpass->resolve_attachments[j].attachment =
- desc->pResolveAttachments[j].attachment;
- uint32_t src_a = desc->pColorAttachments[j].attachment;
- if (src_a != VK_ATTACHMENT_UNUSED) {
- pass->attachments[src_a].will_be_resolved =
- desc->pResolveAttachments[j].attachment != VK_ATTACHMENT_UNUSED;
+ for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) {
+ uint32_t a = desc->pResolveAttachments[j].attachment;
+ subpass->resolve_attachments[j] = (struct tu_subpass_attachment) {
+ .attachment = desc->pResolveAttachments[j].attachment,
+ .layout = desc->pResolveAttachments[j].layout,
+ };
+ if (a != VK_ATTACHMENT_UNUSED) {
+ subpass->has_resolve = true;
+ pass->attachments[desc->pResolveAttachments[j].attachment]
+ .view_mask |= subpass->view_mask;
}
}
}
- if (subpass->resolve_depth_stencil) {
- p++;
- subpass->resolve_count++;
- uint32_t a = ds_resolve->pDepthStencilResolveAttachment->attachment;
- subpass->resolve_attachments[subpass->resolve_count - 1].attachment = a;
-
- uint32_t src_a = desc->pDepthStencilAttachment->attachment;
- if (src_a != VK_ATTACHMENT_UNUSED) {
- pass->attachments[src_a].will_be_resolved = a != VK_ATTACHMENT_UNUSED;
+ if (desc->pDepthStencilAttachment) {
+ subpass->depth_stencil_attachment = (struct tu_subpass_attachment) {
+ .attachment = desc->pDepthStencilAttachment->attachment,
+ .layout = desc->pDepthStencilAttachment->layout,
+ };
+ if (desc->pDepthStencilAttachment->attachment !=
+ VK_ATTACHMENT_UNUSED) {
+ pass->attachments[desc->pDepthStencilAttachment->attachment]
+ .view_mask |= subpass->view_mask;
+ depth_sample_count =
+ pCreateInfo
+ ->pAttachments[desc->pDepthStencilAttachment->attachment]
+ .samples;
}
+ } else {
+ subpass->depth_stencil_attachment.attachment = VK_ATTACHMENT_UNUSED;
}
- uint32_t a = desc->pDepthStencilAttachment ?
- desc->pDepthStencilAttachment->attachment : VK_ATTACHMENT_UNUSED;
- subpass->depth_stencil_attachment.attachment = a;
- if (a != VK_ATTACHMENT_UNUSED)
- tu_subpass_use_attachment(pass, i, a, pCreateInfo);
- }
-
- tu_render_pass_patch_input_gmem(pass);
-
- tu_render_pass_check_feedback_loop(pass);
-
- /* disable unused attachments */
- for (uint32_t i = 0; i < pass->attachment_count; i++) {
- struct tu_render_pass_attachment *att = &pass->attachments[i];
- if (!att->gmem) {
- att->clear_mask = 0;
- att->load = false;
- }
+ subpass->max_sample_count =
+ MAX2(color_sample_count, depth_sample_count);
}
- tu_render_pass_cond_config(pass);
- tu_render_pass_gmem_config(pass, device->physical_device);
- tu_render_pass_bandwidth_config(pass);
- tu_render_pass_calc_hash(pass);
-
for (unsigned i = 0; i < pCreateInfo->dependencyCount; ++i) {
- tu_render_pass_add_subpass_dep(pass, &pCreateInfo->pDependencies[i]);
+ uint32_t dst = pCreateInfo->pDependencies[i].dstSubpass;
+ if (dst == VK_SUBPASS_EXTERNAL) {
+ pass->end_barrier.src_stage_mask =
+ pCreateInfo->pDependencies[i].srcStageMask;
+ pass->end_barrier.src_access_mask =
+ pCreateInfo->pDependencies[i].srcAccessMask;
+ pass->end_barrier.dst_access_mask =
+ pCreateInfo->pDependencies[i].dstAccessMask;
+ } else {
+ pass->subpasses[dst].start_barrier.src_stage_mask =
+ pCreateInfo->pDependencies[i].srcStageMask;
+ pass->subpasses[dst].start_barrier.src_access_mask =
+ pCreateInfo->pDependencies[i].srcAccessMask;
+ pass->subpasses[dst].start_barrier.dst_access_mask =
+ pCreateInfo->pDependencies[i].dstAccessMask;
+ }
}
- tu_render_pass_add_implicit_deps(pass, pCreateInfo);
-
*pRenderPass = tu_render_pass_to_handle(pass);
return VK_SUCCESS;
}
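/* Aside (illustrative sketch, not part of the imported diff): the shape of the
 * dependency folding done in the loop above, using a stand-in barrier struct
 * (the real fields live in tu_private.h).  A dependency whose dstSubpass is
 * VK_SUBPASS_EXTERNAL is recorded as the pass-wide end barrier; any other
 * dependency becomes the start barrier of its destination subpass.
 */
struct example_subpass_barrier {
   VkPipelineStageFlags src_stage_mask;
   VkAccessFlags src_access_mask;
   VkAccessFlags dst_access_mask;
};

static inline void
example_record_dependency(struct example_subpass_barrier *start_barriers,
                          struct example_subpass_barrier *end_barrier,
                          const VkSubpassDependency2KHR *dep)
{
   struct example_subpass_barrier *b =
      (dep->dstSubpass == VK_SUBPASS_EXTERNAL)
         ? end_barrier
         : &start_barriers[dep->dstSubpass];
   b->src_stage_mask = dep->srcStageMask;
   b->src_access_mask = dep->srcAccessMask;
   b->dst_access_mask = dep->dstAccessMask;
}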
-VKAPI_ATTR void VKAPI_CALL
+void
tu_DestroyRenderPass(VkDevice _device,
VkRenderPass _pass,
const VkAllocationCallbacks *pAllocator)
{
TU_FROM_HANDLE(tu_device, device, _device);
-
- if (unlikely(device->instance->debug_flags & TU_DEBUG_DYNAMIC)) {
- vk_common_DestroyRenderPass(_device, _pass, pAllocator);
- return;
- }
-
TU_FROM_HANDLE(tu_render_pass, pass, _pass);
if (!_pass)
return;
-
- vk_free2(&device->vk.alloc, pAllocator, pass->subpass_attachments);
- vk_object_free(&device->vk, pAllocator, pass);
-}
-
-static void
-tu_setup_dynamic_attachment(struct tu_render_pass_attachment *att,
- struct tu_image_view *view)
-{
- att->format = view->vk.format;
- att->samples = view->image->layout->nr_samples;
-
- /* for d32s8, cpp is for the depth image, and
- * att->samples will be used as the cpp for the stencil image
- */
- if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT)
- att->cpp = 4 * att->samples;
- else
- att->cpp = vk_format_get_blocksize(att->format) * att->samples;
-}
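/* Aside (illustrative sketch, not part of the imported diff): the
 * bytes-per-pixel rule from the removed helper above, as a standalone
 * function.  D32_SFLOAT_S8_UINT is stored as separate depth and stencil
 * planes, so cpp covers only the 4-byte depth plane; the 1-byte-per-sample
 * stencil plane works out to a per-pixel size equal to the sample count.
 */
static inline uint32_t
example_attachment_cpp(VkFormat format, uint32_t samples)
{
   if (format == VK_FORMAT_D32_SFLOAT_S8_UINT)
      return 4 * samples;                         /* depth plane only */
   return vk_format_get_blocksize(format) * samples;
}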
-
-void
-tu_setup_dynamic_render_pass(struct tu_cmd_buffer *cmd_buffer,
- const VkRenderingInfo *info)
-{
- struct tu_device *device = cmd_buffer->device;
- struct tu_render_pass *pass = &cmd_buffer->dynamic_pass;
- struct tu_subpass *subpass = &cmd_buffer->dynamic_subpass;
-
- pass->subpass_count = 1;
- pass->attachments = cmd_buffer->dynamic_rp_attachments;
-
- subpass->color_count = subpass->resolve_count = info->colorAttachmentCount;
- subpass->resolve_depth_stencil = false;
- subpass->color_attachments = cmd_buffer->dynamic_color_attachments;
- subpass->resolve_attachments = cmd_buffer->dynamic_resolve_attachments;
- subpass->feedback_invalidate = false;
- subpass->feedback_loop_ds = subpass->feedback_loop_color = false;
- subpass->input_count = 0;
- subpass->samples = 0;
- subpass->srgb_cntl = 0;
- subpass->raster_order_attachment_access = false;
- subpass->multiview_mask = info->viewMask;
-
- uint32_t a = 0;
- for (uint32_t i = 0; i < info->colorAttachmentCount; i++) {
- struct tu_render_pass_attachment *att = &pass->attachments[a];
- const VkRenderingAttachmentInfo *att_info = &info->pColorAttachments[i];
-
- if (att_info->imageView == VK_NULL_HANDLE) {
- subpass->color_attachments[i].attachment = VK_ATTACHMENT_UNUSED;
- subpass->resolve_attachments[i].attachment = VK_ATTACHMENT_UNUSED;
- continue;
- }
-
- TU_FROM_HANDLE(tu_image_view, view, att_info->imageView);
- tu_setup_dynamic_attachment(att, view);
- att->gmem = true;
- att->clear_views = info->viewMask;
- attachment_set_ops(device, att, att_info->loadOp, 0,
- att_info->storeOp, 0);
- subpass->color_attachments[i].attachment = a++;
-
- subpass->samples = view->image->layout->nr_samples;
-
- if (vk_format_is_srgb(view->vk.format))
- subpass->srgb_cntl |= 1 << i;
-
- if (att_info->resolveMode != VK_RESOLVE_MODE_NONE) {
- struct tu_render_pass_attachment *resolve_att = &pass->attachments[a];
- TU_FROM_HANDLE(tu_image_view, resolve_view, att_info->resolveImageView);
- tu_setup_dynamic_attachment(resolve_att, resolve_view);
- resolve_att->gmem = false;
- attachment_set_ops(device, resolve_att,
- VK_ATTACHMENT_LOAD_OP_DONT_CARE, 0,
- VK_ATTACHMENT_STORE_OP_STORE, 0);
- subpass->resolve_attachments[i].attachment = a++;
- att->will_be_resolved = true;
- } else {
- subpass->resolve_attachments[i].attachment = VK_ATTACHMENT_UNUSED;
- att->will_be_resolved = false;
- }
- }
-
- if (info->pDepthAttachment || info->pStencilAttachment) {
- const struct VkRenderingAttachmentInfo *common_info =
- (info->pDepthAttachment &&
- info->pDepthAttachment->imageView != VK_NULL_HANDLE) ?
- info->pDepthAttachment :
- info->pStencilAttachment;
-
- if (common_info && common_info->imageView != VK_NULL_HANDLE) {
- TU_FROM_HANDLE(tu_image_view, view, common_info->imageView);
-
- struct tu_render_pass_attachment *att = &pass->attachments[a];
- tu_setup_dynamic_attachment(att, view);
- att->gmem = true;
- att->clear_views = info->viewMask;
- subpass->depth_stencil_attachment.attachment = a++;
-
- attachment_set_ops(device, att,
- info->pDepthAttachment ? info->pDepthAttachment->loadOp : 0,
- info->pStencilAttachment ? info->pStencilAttachment->loadOp : 0,
- info->pDepthAttachment ? info->pDepthAttachment->storeOp : 0,
- info->pStencilAttachment ? info->pStencilAttachment->storeOp : 0);
-
- subpass->samples = view->image->layout->nr_samples;
-
- if (common_info->resolveMode != VK_RESOLVE_MODE_NONE) {
- unsigned i = subpass->resolve_count++;
- struct tu_render_pass_attachment *resolve_att = &pass->attachments[a];
- TU_FROM_HANDLE(tu_image_view, resolve_view,
- common_info->resolveImageView);
- tu_setup_dynamic_attachment(resolve_att, resolve_view);
- resolve_att->gmem = false;
- attachment_set_ops(device, resolve_att,
- VK_ATTACHMENT_LOAD_OP_DONT_CARE,
- VK_ATTACHMENT_LOAD_OP_DONT_CARE,
- VK_ATTACHMENT_STORE_OP_STORE,
- VK_ATTACHMENT_STORE_OP_STORE);
- subpass->resolve_attachments[i].attachment = a++;
- att->will_be_resolved = true;
- subpass->resolve_depth_stencil = true;
- } else {
- att->will_be_resolved = false;
- }
- } else {
- subpass->depth_stencil_attachment.attachment = VK_ATTACHMENT_UNUSED;
- }
- } else {
- subpass->depth_stencil_attachment.attachment = VK_ATTACHMENT_UNUSED;
- }
-
- pass->attachment_count = a;
-
- tu_render_pass_cond_config(pass);
- tu_render_pass_gmem_config(pass, device->physical_device);
- tu_render_pass_bandwidth_config(pass);
- tu_render_pass_calc_hash(pass);
+ vk_free2(&device->alloc, pAllocator, pass->subpass_attachments);
+ vk_free2(&device->alloc, pAllocator, pass);
}
void
-tu_setup_dynamic_inheritance(struct tu_cmd_buffer *cmd_buffer,
- const VkCommandBufferInheritanceRenderingInfo *info)
-{
- struct tu_render_pass *pass = &cmd_buffer->dynamic_pass;
- struct tu_subpass *subpass = &cmd_buffer->dynamic_subpass;
-
- pass->subpass_count = 1;
- pass->attachments = cmd_buffer->dynamic_rp_attachments;
-
- subpass->color_count = info->colorAttachmentCount;
- subpass->resolve_count = 0;
- subpass->resolve_depth_stencil = false;
- subpass->color_attachments = cmd_buffer->dynamic_color_attachments;
- subpass->resolve_attachments = NULL;
- subpass->feedback_invalidate = false;
- subpass->feedback_loop_ds = subpass->feedback_loop_color = false;
- subpass->input_count = 0;
- subpass->samples = 0;
- subpass->srgb_cntl = 0;
- subpass->raster_order_attachment_access = false;
- subpass->multiview_mask = info->viewMask;
- subpass->samples = info->rasterizationSamples;
-
- unsigned a = 0;
- for (unsigned i = 0; i < info->colorAttachmentCount; i++) {
- struct tu_render_pass_attachment *att = &pass->attachments[a];
- VkFormat format = info->pColorAttachmentFormats[i];
-
- if (format == VK_FORMAT_UNDEFINED) {
- subpass->color_attachments[i].attachment = VK_ATTACHMENT_UNUSED;
- continue;
- }
-
- att->format = format;
- att->samples = info->rasterizationSamples;
- subpass->samples = info->rasterizationSamples;
- subpass->color_attachments[i].attachment = a++;
-
- /* conservatively assume that the attachment may be conditionally
- * loaded/stored.
- */
- att->cond_load_allowed = att->cond_store_allowed = true;
- }
-
- if (info->depthAttachmentFormat != VK_FORMAT_UNDEFINED ||
- info->stencilAttachmentFormat != VK_FORMAT_UNDEFINED) {
- struct tu_render_pass_attachment *att = &pass->attachments[a];
- att->format = info->depthAttachmentFormat != VK_FORMAT_UNDEFINED ?
- info->depthAttachmentFormat : info->stencilAttachmentFormat;
- att->samples = info->rasterizationSamples;
- subpass->depth_stencil_attachment.attachment = a++;
- att->cond_load_allowed = att->cond_store_allowed = true;
- } else {
- subpass->depth_stencil_attachment.attachment = VK_ATTACHMENT_UNUSED;
- }
-}
-
-VKAPI_ATTR void VKAPI_CALL
tu_GetRenderAreaGranularity(VkDevice _device,
VkRenderPass renderPass,
VkExtent2D *pGranularity)
{
TU_FROM_HANDLE(tu_device, device, _device);
- pGranularity->width = device->physical_device->info->gmem_align_w;
- pGranularity->height = device->physical_device->info->gmem_align_h;
-}
-
-uint32_t
-tu_subpass_get_attachment_to_resolve(const struct tu_subpass *subpass, uint32_t index)
-{
- if (subpass->resolve_depth_stencil &&
- index == (subpass->resolve_count - 1))
- return subpass->depth_stencil_attachment.attachment;
- return subpass->color_attachments[index].attachment;
+ pGranularity->width = device->physical_device->tile_align_w;
+ pGranularity->height = device->physical_device->tile_align_h;
}
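/* Aside (illustrative sketch, not part of the imported diff): from the
 * application side, vkGetRenderAreaGranularity reports the tile alignment
 * returned above, and snapping the render area to it lets a tiler avoid
 * preserving pixels outside the area.  Core Vulkan calls only; non-negative
 * offsets assumed.
 */
static inline VkRect2D
example_align_render_area(VkDevice dev, VkRenderPass rp, VkRect2D area)
{
   VkExtent2D g;
   vkGetRenderAreaGranularity(dev, rp, &g);

   uint32_t x0 = area.offset.x - area.offset.x % g.width;
   uint32_t y0 = area.offset.y - area.offset.y % g.height;
   uint32_t x1 = area.offset.x + area.extent.width;
   uint32_t y1 = area.offset.y + area.extent.height;

   area.offset.x = x0;
   area.offset.y = y0;
   area.extent.width = ((x1 - x0) + g.width - 1) / g.width * g.width;
   area.extent.height = ((y1 - y0) + g.height - 1) / g.height * g.height;
   return area;
}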
diff --git a/lib/mesa/src/freedreno/vulkan/tu_pipeline.c b/lib/mesa/src/freedreno/vulkan/tu_pipeline.c
index d4d3c9735..9964020a8 100644
--- a/lib/mesa/src/freedreno/vulkan/tu_pipeline.c
+++ b/lib/mesa/src/freedreno/vulkan/tu_pipeline.c
@@ -1,297 +1,140 @@
/*
* Copyright © 2016 Red Hat.
* Copyright © 2016 Bas Nieuwenhuizen
- * SPDX-License-Identifier: MIT
*
* based in part on anv driver which is:
* Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
*/
-#include "tu_pipeline.h"
+#include "tu_private.h"
-#include "common/freedreno_guardband.h"
-
-#include "ir3/ir3_nir.h"
#include "main/menums.h"
#include "nir/nir.h"
#include "nir/nir_builder.h"
-#include "nir/nir_serialize.h"
#include "spirv/nir_spirv.h"
-#include "util/u_debug.h"
+#include "util/debug.h"
#include "util/mesa-sha1.h"
-#include "vk_pipeline.h"
-#include "vk_render_pass.h"
+#include "util/u_atomic.h"
+#include "vk_format.h"
#include "vk_util.h"
-#include "tu_cmd_buffer.h"
#include "tu_cs.h"
-#include "tu_device.h"
-#include "tu_drm.h"
-#include "tu_formats.h"
-#include "tu_lrz.h"
-#include "tu_pass.h"
-
-/* Emit IB that preloads the descriptors that the shader uses */
-
-static void
-emit_load_state(struct tu_cs *cs, unsigned opcode, enum a6xx_state_type st,
- enum a6xx_state_block sb, unsigned base, unsigned offset,
- unsigned count)
-{
- /* Note: just emit one packet, even if count overflows NUM_UNIT. It's not
- * clear if emitting more packets will even help anything. Presumably the
- * descriptor cache is relatively small, and these packets stop doing
- * anything when there are too many descriptors.
- */
- tu_cs_emit_pkt7(cs, opcode, 3);
- tu_cs_emit(cs,
- CP_LOAD_STATE6_0_STATE_TYPE(st) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_BINDLESS) |
- CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
- CP_LOAD_STATE6_0_NUM_UNIT(MIN2(count, 1024-1)));
- tu_cs_emit_qw(cs, offset | (base << 28));
-}
-
-static unsigned
-tu6_load_state_size(struct tu_pipeline *pipeline,
- struct tu_pipeline_layout *layout)
-{
- const unsigned load_state_size = 4;
- unsigned size = 0;
- for (unsigned i = 0; i < layout->num_sets; i++) {
- if (!(pipeline->active_desc_sets & (1u << i)))
- continue;
-
- struct tu_descriptor_set_layout *set_layout = layout->set[i].layout;
- for (unsigned j = 0; j < set_layout->binding_count; j++) {
- struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
- unsigned count = 0;
- /* See comment in tu6_emit_load_state(). */
- VkShaderStageFlags stages = pipeline->active_stages & binding->shader_stages;
- unsigned stage_count = util_bitcount(stages);
-
- if (!binding->array_size)
- continue;
-
- switch (binding->type) {
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
- case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
- case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
- /* IBO-backed resources only need one packet for all graphics stages */
- if (stage_count)
- count += 1;
- break;
- case VK_DESCRIPTOR_TYPE_SAMPLER:
- case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
- case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
- case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
- case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
- case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT:
-         /* Textures and UBOs need a packet for each stage */
- count = stage_count;
- break;
- case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
- /* Because of how we pack combined images and samplers, we
- * currently can't use one packet for the whole array.
- */
- count = stage_count * binding->array_size * 2;
- break;
- case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
- case VK_DESCRIPTOR_TYPE_MUTABLE_EXT:
- break;
- default:
- unreachable("bad descriptor type");
- }
- size += count * load_state_size;
- }
- }
- return size;
-}
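/* Aside (illustrative sketch, not part of the imported diff): the packet-count
 * rule used by the removed tu6_load_state_size() above, restated over a
 * simplified classification of my own rather than the real VkDescriptorType
 * switch.  IBO-backed descriptors share one prefetch packet across all
 * graphics stages, texture/UBO descriptors take one packet per stage, and
 * combined image+samplers take two packets per array element per stage.
 */
enum example_desc_class {
   EXAMPLE_DESC_IBO,                    /* storage buffer/image/texel buffer */
   EXAMPLE_DESC_TEX_OR_UBO,             /* sampler, sampled image, UBO, ... */
   EXAMPLE_DESC_COMBINED_IMAGE_SAMPLER,
};

static inline unsigned
example_load_state_packets(enum example_desc_class cls,
                           unsigned stage_count, unsigned array_size)
{
   switch (cls) {
   case EXAMPLE_DESC_IBO:
      return stage_count ? 1 : 0;
   case EXAMPLE_DESC_TEX_OR_UBO:
      return stage_count;
   case EXAMPLE_DESC_COMBINED_IMAGE_SAMPLER:
      return stage_count * array_size * 2;
   }
   return 0;
}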
-
-static void
-tu6_emit_load_state(struct tu_pipeline *pipeline,
- struct tu_pipeline_layout *layout)
-{
- unsigned size = tu6_load_state_size(pipeline, layout);
- if (size == 0)
- return;
-
- struct tu_cs cs;
- tu_cs_begin_sub_stream(&pipeline->cs, size, &cs);
-
- for (unsigned i = 0; i < layout->num_sets; i++) {
- /* From 13.2.7. Descriptor Set Binding:
- *
- * A compatible descriptor set must be bound for all set numbers that
- * any shaders in a pipeline access, at the time that a draw or
- * dispatch command is recorded to execute using that pipeline.
- * However, if none of the shaders in a pipeline statically use any
- * bindings with a particular set number, then no descriptor set need
- * be bound for that set number, even if the pipeline layout includes
- * a non-trivial descriptor set layout for that set number.
- *
- * This means that descriptor sets unused by the pipeline may have a
- * garbage or 0 BINDLESS_BASE register, which will cause context faults
- * when prefetching descriptors from these sets. Skip prefetching for
- * descriptors from them to avoid this. This is also an optimization,
- * since these prefetches would be useless.
- */
- if (!(pipeline->active_desc_sets & (1u << i)))
- continue;
-
- struct tu_descriptor_set_layout *set_layout = layout->set[i].layout;
- for (unsigned j = 0; j < set_layout->binding_count; j++) {
- struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
- unsigned base = i;
- unsigned offset = binding->offset / 4;
- /* Note: amber sets VK_SHADER_STAGE_ALL for its descriptor layout, and
- * zink has descriptors for each stage in the push layout even if some
- * stages aren't present in a used pipeline. We don't want to emit
- * loads for unused descriptors.
- */
- VkShaderStageFlags stages = pipeline->active_stages & binding->shader_stages;
- unsigned count = binding->array_size;
-
- /* If this is a variable-count descriptor, then the array_size is an
- * upper bound on the size, but we don't know how many descriptors
- * will actually be used. Therefore we can't pre-load them here.
- */
- if (j == set_layout->binding_count - 1 &&
- set_layout->has_variable_descriptors)
- continue;
-
- if (count == 0 || stages == 0)
- continue;
- switch (binding->type) {
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
- base = MAX_SETS;
- offset = (layout->set[i].dynamic_offset_start +
- binding->dynamic_offset_offset) / 4;
- FALLTHROUGH;
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
- case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
- case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: {
- unsigned mul = binding->size / (A6XX_TEX_CONST_DWORDS * 4);
- /* IBO-backed resources only need one packet for all graphics stages */
- if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT) {
- emit_load_state(&cs, CP_LOAD_STATE6, ST6_SHADER, SB6_IBO,
- base, offset, count * mul);
- }
- if (stages & VK_SHADER_STAGE_COMPUTE_BIT) {
- emit_load_state(&cs, CP_LOAD_STATE6_FRAG, ST6_IBO, SB6_CS_SHADER,
- base, offset, count * mul);
- }
- break;
- }
- case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
- case VK_DESCRIPTOR_TYPE_MUTABLE_EXT:
- /* nothing - input attachment doesn't use bindless */
- break;
- case VK_DESCRIPTOR_TYPE_SAMPLER:
- case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
- case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: {
- tu_foreach_stage(stage, stages) {
- emit_load_state(&cs, tu6_stage2opcode(stage),
- binding->type == VK_DESCRIPTOR_TYPE_SAMPLER ?
- ST6_SHADER : ST6_CONSTANTS,
- tu6_stage2texsb(stage), base, offset, count);
- }
- break;
- }
- case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
- base = MAX_SETS;
- offset = (layout->set[i].dynamic_offset_start +
- binding->dynamic_offset_offset) / 4;
- FALLTHROUGH;
- case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
- case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: {
- tu_foreach_stage(stage, stages) {
- emit_load_state(&cs, tu6_stage2opcode(stage), ST6_UBO,
- tu6_stage2shadersb(stage), base, offset, count);
- }
- break;
- }
- case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: {
- tu_foreach_stage(stage, stages) {
- /* TODO: We could emit less CP_LOAD_STATE6 if we used
- * struct-of-arrays instead of array-of-structs.
- */
- for (unsigned i = 0; i < count; i++) {
- unsigned tex_offset = offset + 2 * i * A6XX_TEX_CONST_DWORDS;
- unsigned sam_offset = offset + (2 * i + 1) * A6XX_TEX_CONST_DWORDS;
- emit_load_state(&cs, tu6_stage2opcode(stage),
- ST6_CONSTANTS, tu6_stage2texsb(stage),
- base, tex_offset, 1);
- emit_load_state(&cs, tu6_stage2opcode(stage),
- ST6_SHADER, tu6_stage2texsb(stage),
- base, sam_offset, 1);
- }
- }
- break;
- }
- default:
- unreachable("bad descriptor type");
- }
- }
- }
-
- pipeline->load_state = tu_cs_end_draw_state(&pipeline->cs, &cs);
-}
struct tu_pipeline_builder
{
struct tu_device *device;
- void *mem_ctx;
- struct vk_pipeline_cache *cache;
+ struct tu_pipeline_cache *cache;
const VkAllocationCallbacks *alloc;
const VkGraphicsPipelineCreateInfo *create_info;
- struct tu_pipeline_layout layout;
-
- struct tu_compiled_shaders *compiled_shaders;
-
- struct tu_const_state const_state[MESA_SHADER_FRAGMENT + 1];
- struct ir3_shader_variant *variants[MESA_SHADER_FRAGMENT + 1];
- struct ir3_shader_variant *binning_variant;
- uint64_t shader_iova[MESA_SHADER_FRAGMENT + 1];
- uint64_t binning_vs_iova;
-
- uint32_t additional_cs_reserve_size;
-
- struct tu_pvtmem_config pvtmem;
+ struct tu_shader *shaders[MESA_SHADER_STAGES];
+ uint32_t shader_offsets[MESA_SHADER_STAGES];
+ uint32_t binning_vs_offset;
+ uint32_t shader_total_size;
bool rasterizer_discard;
    /* these states are affected by rasterizer_discard */
+ VkSampleCountFlagBits samples;
+ bool use_depth_stencil_attachment;
bool use_color_attachments;
- bool attachment_state_valid;
+ uint32_t color_attachment_count;
VkFormat color_attachment_formats[MAX_RTS];
- VkFormat depth_attachment_format;
- uint32_t multiview_mask;
-
- bool subpass_raster_order_attachment_access;
- bool subpass_feedback_loop_color;
- bool subpass_feedback_loop_ds;
- bool feedback_loop_may_involve_textures;
-
- /* Each library defines at least one piece of state in
- * VkGraphicsPipelineLibraryFlagsEXT, and libraries cannot overlap, so
- * there can be at most as many libraries as pieces of state, of which
- * there are currently 4.
- */
-#define MAX_LIBRARIES 4
+};
- unsigned num_libraries;
- struct tu_pipeline *libraries[MAX_LIBRARIES];
+static enum tu_dynamic_state_bits
+tu_dynamic_state_bit(VkDynamicState state)
+{
+ switch (state) {
+ case VK_DYNAMIC_STATE_VIEWPORT:
+ return TU_DYNAMIC_VIEWPORT;
+ case VK_DYNAMIC_STATE_SCISSOR:
+ return TU_DYNAMIC_SCISSOR;
+ case VK_DYNAMIC_STATE_LINE_WIDTH:
+ return TU_DYNAMIC_LINE_WIDTH;
+ case VK_DYNAMIC_STATE_DEPTH_BIAS:
+ return TU_DYNAMIC_DEPTH_BIAS;
+ case VK_DYNAMIC_STATE_BLEND_CONSTANTS:
+ return TU_DYNAMIC_BLEND_CONSTANTS;
+ case VK_DYNAMIC_STATE_DEPTH_BOUNDS:
+ return TU_DYNAMIC_DEPTH_BOUNDS;
+ case VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK:
+ return TU_DYNAMIC_STENCIL_COMPARE_MASK;
+ case VK_DYNAMIC_STATE_STENCIL_WRITE_MASK:
+ return TU_DYNAMIC_STENCIL_WRITE_MASK;
+ case VK_DYNAMIC_STATE_STENCIL_REFERENCE:
+ return TU_DYNAMIC_STENCIL_REFERENCE;
+ default:
+ unreachable("invalid dynamic state");
+ return 0;
+ }
+}
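/* Aside (illustrative sketch, not part of the imported diff): collecting the
 * bits returned by tu_dynamic_state_bit() above into a mask from the pipeline
 * create info; a 32-bit mask is assumed to be wide enough for the states
 * handled there.
 */
static inline uint32_t
example_dynamic_state_mask(const VkPipelineDynamicStateCreateInfo *info)
{
   uint32_t mask = 0;
   if (!info)
      return 0;
   for (uint32_t i = 0; i < info->dynamicStateCount; i++)
      mask |= tu_dynamic_state_bit(info->pDynamicStates[i]);
   return mask;
}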
- /* This is just the state that we are compiling now, whereas the final
- * pipeline will include the state from the libraries.
- */
- VkGraphicsPipelineLibraryFlagsEXT state;
+static gl_shader_stage
+tu_shader_stage(VkShaderStageFlagBits stage)
+{
+ switch (stage) {
+ case VK_SHADER_STAGE_VERTEX_BIT:
+ return MESA_SHADER_VERTEX;
+ case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
+ return MESA_SHADER_TESS_CTRL;
+ case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
+ return MESA_SHADER_TESS_EVAL;
+ case VK_SHADER_STAGE_GEOMETRY_BIT:
+ return MESA_SHADER_GEOMETRY;
+ case VK_SHADER_STAGE_FRAGMENT_BIT:
+ return MESA_SHADER_FRAGMENT;
+ case VK_SHADER_STAGE_COMPUTE_BIT:
+ return MESA_SHADER_COMPUTE;
+ default:
+ unreachable("invalid VkShaderStageFlagBits");
+ return MESA_SHADER_NONE;
+ }
+}
- /* The stages we are compiling now. */
- VkShaderStageFlags active_stages;
-};
+static const VkVertexInputAttributeDescription *
+tu_find_vertex_input_attribute(
+ const VkPipelineVertexInputStateCreateInfo *vi_info, uint32_t slot)
+{
+ assert(slot >= VERT_ATTRIB_GENERIC0);
+ slot -= VERT_ATTRIB_GENERIC0;
+ for (uint32_t i = 0; i < vi_info->vertexAttributeDescriptionCount; i++) {
+ if (vi_info->pVertexAttributeDescriptions[i].location == slot)
+ return &vi_info->pVertexAttributeDescriptions[i];
+ }
+ return NULL;
+}
+
+static const VkVertexInputBindingDescription *
+tu_find_vertex_input_binding(
+ const VkPipelineVertexInputStateCreateInfo *vi_info,
+ const VkVertexInputAttributeDescription *vi_attr)
+{
+ assert(vi_attr);
+ for (uint32_t i = 0; i < vi_info->vertexBindingDescriptionCount; i++) {
+ if (vi_info->pVertexBindingDescriptions[i].binding == vi_attr->binding)
+ return &vi_info->pVertexBindingDescriptions[i];
+ }
+ return NULL;
+}
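/* Aside (illustrative sketch, not part of the imported diff): how the two
 * lookup helpers above pair up.  Given a shader slot (VERT_ATTRIB_GENERIC0
 * relative, as the helpers expect), a vertex index, and per-binding buffer
 * base addresses supplied by the caller, the fetch address is base plus
 * stride times vertex plus the attribute offset.  Per-vertex input rate and a
 * valid attribute are assumed.
 */
static inline uint64_t
example_vertex_attr_address(const VkPipelineVertexInputStateCreateInfo *vi_info,
                            uint32_t slot, uint32_t vertex_index,
                            const uint64_t *binding_base_addrs)
{
   const VkVertexInputAttributeDescription *attr =
      tu_find_vertex_input_attribute(vi_info, slot);
   const VkVertexInputBindingDescription *binding =
      tu_find_vertex_input_binding(vi_info, attr);

   return binding_base_addrs[binding->binding] +
          (uint64_t) binding->stride * vertex_index + attr->offset;
}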
static bool
tu_logic_op_reads_dst(VkLogicOp op)
@@ -321,732 +164,418 @@ tu_blend_factor_no_dst_alpha(VkBlendFactor factor)
}
}
-static bool tu_blend_factor_is_dual_src(VkBlendFactor factor)
-{
- switch (factor) {
- case VK_BLEND_FACTOR_SRC1_COLOR:
- case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR:
- case VK_BLEND_FACTOR_SRC1_ALPHA:
- case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA:
- return true;
+static enum pc_di_primtype
+tu6_primtype(VkPrimitiveTopology topology)
+{
+ switch (topology) {
+ case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
+ return DI_PT_POINTLIST;
+ case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
+ return DI_PT_LINELIST;
+ case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
+ return DI_PT_LINESTRIP;
+ case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
+ return DI_PT_TRILIST;
+ case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
+      return DI_PT_TRISTRIP;
+ case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
+ return DI_PT_TRIFAN;
+ case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
+ return DI_PT_LINE_ADJ;
+ case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
+ return DI_PT_LINESTRIP_ADJ;
+ case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
+ return DI_PT_TRI_ADJ;
+ case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
+ return DI_PT_TRISTRIP_ADJ;
+ case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
default:
- return false;
+ unreachable("invalid primitive topology");
+ return DI_PT_NONE;
}
}
-static bool
-tu_blend_state_is_dual_src(const VkPipelineColorBlendStateCreateInfo *info)
+static enum adreno_compare_func
+tu6_compare_func(VkCompareOp op)
{
- if (!info)
- return false;
-
- for (unsigned i = 0; i < info->attachmentCount; i++) {
- const VkPipelineColorBlendAttachmentState *blend = &info->pAttachments[i];
- if (tu_blend_factor_is_dual_src(blend->srcColorBlendFactor) ||
- tu_blend_factor_is_dual_src(blend->dstColorBlendFactor) ||
- tu_blend_factor_is_dual_src(blend->srcAlphaBlendFactor) ||
- tu_blend_factor_is_dual_src(blend->dstAlphaBlendFactor))
- return true;
+ switch (op) {
+ case VK_COMPARE_OP_NEVER:
+ return FUNC_NEVER;
+ case VK_COMPARE_OP_LESS:
+ return FUNC_LESS;
+ case VK_COMPARE_OP_EQUAL:
+ return FUNC_EQUAL;
+ case VK_COMPARE_OP_LESS_OR_EQUAL:
+ return FUNC_LEQUAL;
+ case VK_COMPARE_OP_GREATER:
+ return FUNC_GREATER;
+ case VK_COMPARE_OP_NOT_EQUAL:
+ return FUNC_NOTEQUAL;
+ case VK_COMPARE_OP_GREATER_OR_EQUAL:
+ return FUNC_GEQUAL;
+ case VK_COMPARE_OP_ALWAYS:
+ return FUNC_ALWAYS;
+ default:
+ unreachable("invalid VkCompareOp");
+ return FUNC_NEVER;
}
-
- return false;
}
-static const struct xs_config {
- uint16_t reg_sp_xs_ctrl;
- uint16_t reg_sp_xs_config;
- uint16_t reg_sp_xs_instrlen;
- uint16_t reg_hlsq_xs_ctrl;
- uint16_t reg_sp_xs_first_exec_offset;
- uint16_t reg_sp_xs_pvt_mem_hw_stack_offset;
-} xs_config[] = {
- [MESA_SHADER_VERTEX] = {
- REG_A6XX_SP_VS_CTRL_REG0,
- REG_A6XX_SP_VS_CONFIG,
- REG_A6XX_SP_VS_INSTRLEN,
- REG_A6XX_HLSQ_VS_CNTL,
- REG_A6XX_SP_VS_OBJ_FIRST_EXEC_OFFSET,
- REG_A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET,
- },
- [MESA_SHADER_TESS_CTRL] = {
- REG_A6XX_SP_HS_CTRL_REG0,
- REG_A6XX_SP_HS_CONFIG,
- REG_A6XX_SP_HS_INSTRLEN,
- REG_A6XX_HLSQ_HS_CNTL,
- REG_A6XX_SP_HS_OBJ_FIRST_EXEC_OFFSET,
- REG_A6XX_SP_HS_PVT_MEM_HW_STACK_OFFSET,
- },
- [MESA_SHADER_TESS_EVAL] = {
- REG_A6XX_SP_DS_CTRL_REG0,
- REG_A6XX_SP_DS_CONFIG,
- REG_A6XX_SP_DS_INSTRLEN,
- REG_A6XX_HLSQ_DS_CNTL,
- REG_A6XX_SP_DS_OBJ_FIRST_EXEC_OFFSET,
- REG_A6XX_SP_DS_PVT_MEM_HW_STACK_OFFSET,
- },
- [MESA_SHADER_GEOMETRY] = {
- REG_A6XX_SP_GS_CTRL_REG0,
- REG_A6XX_SP_GS_CONFIG,
- REG_A6XX_SP_GS_INSTRLEN,
- REG_A6XX_HLSQ_GS_CNTL,
- REG_A6XX_SP_GS_OBJ_FIRST_EXEC_OFFSET,
- REG_A6XX_SP_GS_PVT_MEM_HW_STACK_OFFSET,
- },
- [MESA_SHADER_FRAGMENT] = {
- REG_A6XX_SP_FS_CTRL_REG0,
- REG_A6XX_SP_FS_CONFIG,
- REG_A6XX_SP_FS_INSTRLEN,
- REG_A6XX_HLSQ_FS_CNTL,
- REG_A6XX_SP_FS_OBJ_FIRST_EXEC_OFFSET,
- REG_A6XX_SP_FS_PVT_MEM_HW_STACK_OFFSET,
- },
- [MESA_SHADER_COMPUTE] = {
- REG_A6XX_SP_CS_CTRL_REG0,
- REG_A6XX_SP_CS_CONFIG,
- REG_A6XX_SP_CS_INSTRLEN,
- REG_A6XX_HLSQ_CS_CNTL,
- REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET,
- REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET,
- },
-};
-
-static uint32_t
-tu_xs_get_immediates_packet_size_dwords(const struct ir3_shader_variant *xs)
+static enum adreno_stencil_op
+tu6_stencil_op(VkStencilOp op)
{
- const struct ir3_const_state *const_state = ir3_const_state(xs);
- uint32_t base = const_state->offsets.immediate;
- int32_t size = DIV_ROUND_UP(const_state->immediates_count, 4);
-
- /* truncate size to avoid writing constants that shader
- * does not use:
- */
- size = MIN2(size + base, xs->constlen) - base;
-
- return MAX2(size, 0) * 4;
+ switch (op) {
+ case VK_STENCIL_OP_KEEP:
+ return STENCIL_KEEP;
+ case VK_STENCIL_OP_ZERO:
+ return STENCIL_ZERO;
+ case VK_STENCIL_OP_REPLACE:
+ return STENCIL_REPLACE;
+ case VK_STENCIL_OP_INCREMENT_AND_CLAMP:
+ return STENCIL_INCR_CLAMP;
+ case VK_STENCIL_OP_DECREMENT_AND_CLAMP:
+ return STENCIL_DECR_CLAMP;
+ case VK_STENCIL_OP_INVERT:
+ return STENCIL_INVERT;
+ case VK_STENCIL_OP_INCREMENT_AND_WRAP:
+ return STENCIL_INCR_WRAP;
+ case VK_STENCIL_OP_DECREMENT_AND_WRAP:
+ return STENCIL_DECR_WRAP;
+ default:
+ unreachable("invalid VkStencilOp");
+ return STENCIL_KEEP;
+ }
}
-/* We allocate fixed-length substreams for shader state, however some
- * parts of the state may have unbound length. Their additional space
- * requirements should be calculated here.
- */
-static uint32_t
-tu_xs_get_additional_cs_size_dwords(const struct ir3_shader_variant *xs)
+static enum a3xx_rop_code
+tu6_rop(VkLogicOp op)
{
- const struct ir3_const_state *const_state = ir3_const_state(xs);
-
- uint32_t size = tu_xs_get_immediates_packet_size_dwords(xs);
-
- /* Variable number of UBO upload ranges. */
- size += 4 * const_state->ubo_state.num_enabled;
-
- /* Variable number of dwords for the primitive map */
- size += xs->input_size;
-
- size += xs->constant_data_size / 4;
-
- return size;
+ switch (op) {
+ case VK_LOGIC_OP_CLEAR:
+ return ROP_CLEAR;
+ case VK_LOGIC_OP_AND:
+ return ROP_AND;
+ case VK_LOGIC_OP_AND_REVERSE:
+ return ROP_AND_REVERSE;
+ case VK_LOGIC_OP_COPY:
+ return ROP_COPY;
+ case VK_LOGIC_OP_AND_INVERTED:
+ return ROP_AND_INVERTED;
+ case VK_LOGIC_OP_NO_OP:
+ return ROP_NOOP;
+ case VK_LOGIC_OP_XOR:
+ return ROP_XOR;
+ case VK_LOGIC_OP_OR:
+ return ROP_OR;
+ case VK_LOGIC_OP_NOR:
+ return ROP_NOR;
+ case VK_LOGIC_OP_EQUIVALENT:
+ return ROP_EQUIV;
+ case VK_LOGIC_OP_INVERT:
+ return ROP_INVERT;
+ case VK_LOGIC_OP_OR_REVERSE:
+ return ROP_OR_REVERSE;
+ case VK_LOGIC_OP_COPY_INVERTED:
+ return ROP_COPY_INVERTED;
+ case VK_LOGIC_OP_OR_INVERTED:
+ return ROP_OR_INVERTED;
+ case VK_LOGIC_OP_NAND:
+ return ROP_NAND;
+ case VK_LOGIC_OP_SET:
+ return ROP_SET;
+ default:
+ unreachable("invalid VkLogicOp");
+ return ROP_NOOP;
+ }
}
-void
-tu6_emit_xs_config(struct tu_cs *cs,
- gl_shader_stage stage, /* xs->type, but xs may be NULL */
- const struct ir3_shader_variant *xs)
+static enum adreno_rb_blend_factor
+tu6_blend_factor(VkBlendFactor factor)
{
- const struct xs_config *cfg = &xs_config[stage];
-
- if (!xs) {
- /* shader stage disabled */
- tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1);
- tu_cs_emit(cs, 0);
-
- tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
- tu_cs_emit(cs, 0);
- return;
+ switch (factor) {
+ case VK_BLEND_FACTOR_ZERO:
+ return FACTOR_ZERO;
+ case VK_BLEND_FACTOR_ONE:
+ return FACTOR_ONE;
+ case VK_BLEND_FACTOR_SRC_COLOR:
+ return FACTOR_SRC_COLOR;
+ case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
+ return FACTOR_ONE_MINUS_SRC_COLOR;
+ case VK_BLEND_FACTOR_DST_COLOR:
+ return FACTOR_DST_COLOR;
+ case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
+ return FACTOR_ONE_MINUS_DST_COLOR;
+ case VK_BLEND_FACTOR_SRC_ALPHA:
+ return FACTOR_SRC_ALPHA;
+ case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
+ return FACTOR_ONE_MINUS_SRC_ALPHA;
+ case VK_BLEND_FACTOR_DST_ALPHA:
+ return FACTOR_DST_ALPHA;
+ case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
+ return FACTOR_ONE_MINUS_DST_ALPHA;
+ case VK_BLEND_FACTOR_CONSTANT_COLOR:
+ return FACTOR_CONSTANT_COLOR;
+ case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
+ return FACTOR_ONE_MINUS_CONSTANT_COLOR;
+ case VK_BLEND_FACTOR_CONSTANT_ALPHA:
+ return FACTOR_CONSTANT_ALPHA;
+ case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
+ return FACTOR_ONE_MINUS_CONSTANT_ALPHA;
+ case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
+ return FACTOR_SRC_ALPHA_SATURATE;
+ case VK_BLEND_FACTOR_SRC1_COLOR:
+ return FACTOR_SRC1_COLOR;
+ case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR:
+ return FACTOR_ONE_MINUS_SRC1_COLOR;
+ case VK_BLEND_FACTOR_SRC1_ALPHA:
+ return FACTOR_SRC1_ALPHA;
+ case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA:
+ return FACTOR_ONE_MINUS_SRC1_ALPHA;
+ default:
+ unreachable("invalid VkBlendFactor");
+ return FACTOR_ZERO;
}
-
- tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1);
- tu_cs_emit(cs, A6XX_SP_VS_CONFIG_ENABLED |
- COND(xs->bindless_tex, A6XX_SP_VS_CONFIG_BINDLESS_TEX) |
- COND(xs->bindless_samp, A6XX_SP_VS_CONFIG_BINDLESS_SAMP) |
- COND(xs->bindless_ibo, A6XX_SP_VS_CONFIG_BINDLESS_IBO) |
- COND(xs->bindless_ubo, A6XX_SP_VS_CONFIG_BINDLESS_UBO) |
- A6XX_SP_VS_CONFIG_NTEX(xs->num_samp) |
- A6XX_SP_VS_CONFIG_NSAMP(xs->num_samp));
-
- tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
- tu_cs_emit(cs, A6XX_HLSQ_VS_CNTL_CONSTLEN(xs->constlen) |
- A6XX_HLSQ_VS_CNTL_ENABLED);
}
-void
-tu6_emit_xs(struct tu_cs *cs,
- gl_shader_stage stage, /* xs->type, but xs may be NULL */
- const struct ir3_shader_variant *xs,
- const struct tu_pvtmem_config *pvtmem,
- uint64_t binary_iova)
+static enum a3xx_rb_blend_opcode
+tu6_blend_op(VkBlendOp op)
{
- const struct xs_config *cfg = &xs_config[stage];
-
- if (!xs) {
- /* shader stage disabled */
- return;
- }
-
- enum a6xx_threadsize thrsz =
- xs->info.double_threadsize ? THREAD128 : THREAD64;
- switch (stage) {
- case MESA_SHADER_VERTEX:
- tu_cs_emit_regs(cs, A6XX_SP_VS_CTRL_REG0(
- .fullregfootprint = xs->info.max_reg + 1,
- .halfregfootprint = xs->info.max_half_reg + 1,
- .branchstack = ir3_shader_branchstack_hw(xs),
- .mergedregs = xs->mergedregs,
- ));
- break;
- case MESA_SHADER_TESS_CTRL:
- tu_cs_emit_regs(cs, A6XX_SP_HS_CTRL_REG0(
- .fullregfootprint = xs->info.max_reg + 1,
- .halfregfootprint = xs->info.max_half_reg + 1,
- .branchstack = ir3_shader_branchstack_hw(xs),
- ));
- break;
- case MESA_SHADER_TESS_EVAL:
- tu_cs_emit_regs(cs, A6XX_SP_DS_CTRL_REG0(
- .fullregfootprint = xs->info.max_reg + 1,
- .halfregfootprint = xs->info.max_half_reg + 1,
- .branchstack = ir3_shader_branchstack_hw(xs),
- ));
- break;
- case MESA_SHADER_GEOMETRY:
- tu_cs_emit_regs(cs, A6XX_SP_GS_CTRL_REG0(
- .fullregfootprint = xs->info.max_reg + 1,
- .halfregfootprint = xs->info.max_half_reg + 1,
- .branchstack = ir3_shader_branchstack_hw(xs),
- ));
- break;
- case MESA_SHADER_FRAGMENT:
- tu_cs_emit_regs(cs, A6XX_SP_FS_CTRL_REG0(
- .fullregfootprint = xs->info.max_reg + 1,
- .halfregfootprint = xs->info.max_half_reg + 1,
- .branchstack = ir3_shader_branchstack_hw(xs),
- .mergedregs = xs->mergedregs,
- .threadsize = thrsz,
- .pixlodenable = xs->need_pixlod,
- .diff_fine = xs->need_fine_derivatives,
- .varying = xs->total_in != 0,
- /* unknown bit, seems unnecessary */
- .unk24 = true,
- ));
- break;
- case MESA_SHADER_COMPUTE:
- tu_cs_emit_regs(cs, A6XX_SP_CS_CTRL_REG0(
- .fullregfootprint = xs->info.max_reg + 1,
- .halfregfootprint = xs->info.max_half_reg + 1,
- .branchstack = ir3_shader_branchstack_hw(xs),
- .mergedregs = xs->mergedregs,
- .threadsize = thrsz,
- ));
- break;
+ switch (op) {
+ case VK_BLEND_OP_ADD:
+ return BLEND_DST_PLUS_SRC;
+ case VK_BLEND_OP_SUBTRACT:
+ return BLEND_SRC_MINUS_DST;
+ case VK_BLEND_OP_REVERSE_SUBTRACT:
+ return BLEND_DST_MINUS_SRC;
+ case VK_BLEND_OP_MIN:
+ return BLEND_MIN_DST_SRC;
+ case VK_BLEND_OP_MAX:
+ return BLEND_MAX_DST_SRC;
default:
- unreachable("bad shader stage");
+ unreachable("invalid VkBlendOp");
+ return BLEND_DST_PLUS_SRC;
}
+}
- tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_instrlen, 1);
- tu_cs_emit(cs, xs->instrlen);
-
- /* emit program binary & private memory layout
- * binary_iova should be aligned to 1 instrlen unit (128 bytes)
- */
+static void
+tu6_emit_vs_config(struct tu_cs *cs, const struct ir3_shader_variant *vs)
+{
+ uint32_t sp_vs_ctrl =
+ A6XX_SP_VS_CTRL_REG0_THREADSIZE(FOUR_QUADS) |
+ A6XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(vs->info.max_reg + 1) |
+ A6XX_SP_VS_CTRL_REG0_MERGEDREGS |
+ A6XX_SP_VS_CTRL_REG0_BRANCHSTACK(vs->branchstack);
+ if (vs->num_samp)
+ sp_vs_ctrl |= A6XX_SP_VS_CTRL_REG0_PIXLODENABLE;
- assert((binary_iova & 0x7f) == 0);
- assert((pvtmem->iova & 0x1f) == 0);
+ uint32_t sp_vs_config = A6XX_SP_VS_CONFIG_NTEX(vs->num_samp) |
+ A6XX_SP_VS_CONFIG_NSAMP(vs->num_samp);
+ if (vs->instrlen)
+ sp_vs_config |= A6XX_SP_VS_CONFIG_ENABLED;
- tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_first_exec_offset, 7);
- tu_cs_emit(cs, 0);
- tu_cs_emit_qw(cs, binary_iova);
- tu_cs_emit(cs,
- A6XX_SP_VS_PVT_MEM_PARAM_MEMSIZEPERITEM(pvtmem->per_fiber_size));
- tu_cs_emit_qw(cs, pvtmem->iova);
- tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(pvtmem->per_sp_size) |
- COND(pvtmem->per_wave, A6XX_SP_VS_PVT_MEM_SIZE_PERWAVEMEMLAYOUT));
-
- tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_pvt_mem_hw_stack_offset, 1);
- tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET_OFFSET(pvtmem->per_sp_size));
-
- uint32_t shader_preload_size =
- MIN2(xs->instrlen, cs->device->physical_device->info->a6xx.instr_cache_size);
-
- tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
- tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
- CP_LOAD_STATE6_0_NUM_UNIT(shader_preload_size));
- tu_cs_emit_qw(cs, binary_iova);
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_VS_CTRL_REG0, 1);
+ tu_cs_emit(cs, sp_vs_ctrl);
- /* emit immediates */
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_VS_CONFIG, 2);
+ tu_cs_emit(cs, sp_vs_config);
+ tu_cs_emit(cs, vs->instrlen);
- const struct ir3_const_state *const_state = ir3_const_state(xs);
- uint32_t base = const_state->offsets.immediate;
- unsigned immediate_size = tu_xs_get_immediates_packet_size_dwords(xs);
-
- if (immediate_size > 0) {
- tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + immediate_size);
- tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
- CP_LOAD_STATE6_0_NUM_UNIT(immediate_size / 4));
- tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
- tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
+ tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_VS_CNTL, 1);
+ tu_cs_emit(cs, A6XX_HLSQ_VS_CNTL_CONSTLEN(align(vs->constlen, 4)) | 0x100);
+}
- tu_cs_emit_array(cs, const_state->immediates, immediate_size);
- }
+static void
+tu6_emit_hs_config(struct tu_cs *cs, const struct ir3_shader_variant *hs)
+{
+ uint32_t sp_hs_config = 0;
+ if (hs->instrlen)
+ sp_hs_config |= A6XX_SP_HS_CONFIG_ENABLED;
- if (const_state->constant_data_ubo != -1) {
- uint64_t iova = binary_iova + xs->info.constant_data_offset;
-
- /* Upload UBO state for the constant data. */
- tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5);
- tu_cs_emit(cs,
- CP_LOAD_STATE6_0_DST_OFF(const_state->constant_data_ubo) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO)|
- CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
- CP_LOAD_STATE6_0_NUM_UNIT(1));
- tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
- tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
- int size_vec4s = DIV_ROUND_UP(xs->constant_data_size, 16);
- tu_cs_emit_qw(cs,
- iova |
- (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32);
-
- /* Upload the constant data to the const file if needed. */
- const struct ir3_ubo_analysis_state *ubo_state = &const_state->ubo_state;
-
- for (int i = 0; i < ubo_state->num_enabled; i++) {
- if (ubo_state->range[i].ubo.block != const_state->constant_data_ubo ||
- ubo_state->range[i].ubo.bindless) {
- continue;
- }
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_UNKNOWN_A831, 1);
+ tu_cs_emit(cs, 0);
- uint32_t start = ubo_state->range[i].start;
- uint32_t end = ubo_state->range[i].end;
- uint32_t size = MIN2(end - start,
- (16 * xs->constlen) - ubo_state->range[i].offset);
-
- tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
- tu_cs_emit(cs,
- CP_LOAD_STATE6_0_DST_OFF(ubo_state->range[i].offset / 16) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
- CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
- tu_cs_emit_qw(cs, iova + start);
- }
- }
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_CONFIG, 2);
+ tu_cs_emit(cs, sp_hs_config);
+ tu_cs_emit(cs, hs->instrlen);
- /* emit FS driver param */
- if (stage == MESA_SHADER_FRAGMENT && const_state->num_driver_params > 0) {
- uint32_t base = const_state->offsets.driver_param;
- int32_t size = DIV_ROUND_UP(const_state->num_driver_params, 4);
- size = MAX2(MIN2(size + base, xs->constlen) - base, 0);
-
- if (size > 0) {
- tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + size * 4);
- tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
- CP_LOAD_STATE6_0_NUM_UNIT(size));
- tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
- tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
-
- assert(size == 1);
- tu_cs_emit(cs, xs->info.double_threadsize ? 128 : 64);
- tu_cs_emit(cs, 0);
- tu_cs_emit(cs, 0);
- tu_cs_emit(cs, 0);
- }
- }
+ tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_HS_CNTL, 1);
+ tu_cs_emit(cs, A6XX_HLSQ_HS_CNTL_CONSTLEN(align(hs->constlen, 4)));
}
static void
-tu6_emit_dynamic_offset(struct tu_cs *cs,
- const struct ir3_shader_variant *xs,
- struct tu_pipeline_builder *builder)
+tu6_emit_ds_config(struct tu_cs *cs, const struct ir3_shader_variant *ds)
{
- if (!xs || builder->const_state[xs->type].dynamic_offset_loc == UINT32_MAX)
- return;
+ uint32_t sp_ds_config = 0;
+ if (ds->instrlen)
+ sp_ds_config |= A6XX_SP_DS_CONFIG_ENABLED;
- tu_cs_emit_pkt7(cs, tu6_stage2opcode(xs->type), 3 + MAX_SETS);
- tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(builder->const_state[xs->type].dynamic_offset_loc / 4) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(xs->type)) |
- CP_LOAD_STATE6_0_NUM_UNIT(DIV_ROUND_UP(MAX_SETS, 4)));
- tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
- tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
-
- for (unsigned i = 0; i < MAX_SETS; i++) {
- unsigned dynamic_offset_start =
- builder->layout.set[i].dynamic_offset_start / (A6XX_TEX_CONST_DWORDS * 4);
- tu_cs_emit(cs, i < builder->layout.num_sets ? dynamic_offset_start : 0);
- }
-}
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_DS_CONFIG, 2);
+ tu_cs_emit(cs, sp_ds_config);
+ tu_cs_emit(cs, ds->instrlen);
-static void
-tu6_emit_shared_consts_enable(struct tu_cs *cs, bool enable)
-{
- /* Enable/disable shared constants */
- tu_cs_emit_regs(cs, A6XX_HLSQ_SHARED_CONSTS(.enable = enable));
- tu_cs_emit_regs(cs, A6XX_SP_MODE_CONTROL(.constant_demotion_enable = true,
- .isammode = ISAMMODE_GL,
- .shared_consts_enable = enable));
+ tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_DS_CNTL, 1);
+ tu_cs_emit(cs, A6XX_HLSQ_DS_CNTL_CONSTLEN(align(ds->constlen, 4)));
}
static void
-tu6_emit_cs_config(struct tu_cs *cs,
- const struct ir3_shader_variant *v,
- const struct tu_pvtmem_config *pvtmem,
- uint64_t binary_iova)
+tu6_emit_gs_config(struct tu_cs *cs, const struct ir3_shader_variant *gs)
{
- bool shared_consts_enable = ir3_const_state(v)->shared_consts_enable;
- tu6_emit_shared_consts_enable(cs, shared_consts_enable);
-
- tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
- .cs_state = true,
- .cs_ibo = true,
- .cs_shared_const = shared_consts_enable));
-
- tu6_emit_xs_config(cs, MESA_SHADER_COMPUTE, v);
- tu6_emit_xs(cs, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova);
-
- uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1);
- tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
- tu_cs_emit(cs, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) |
- A6XX_SP_CS_UNKNOWN_A9B1_UNK6);
-
- if (cs->device->physical_device->info->a6xx.has_lpac) {
- tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_UNKNOWN_B9D0, 1);
- tu_cs_emit(cs, A6XX_HLSQ_CS_UNKNOWN_B9D0_SHARED_SIZE(shared_size) |
- A6XX_HLSQ_CS_UNKNOWN_B9D0_UNK6);
- }
+ uint32_t sp_gs_config = 0;
+ if (gs->instrlen)
+ sp_gs_config |= A6XX_SP_GS_CONFIG_ENABLED;
- uint32_t local_invocation_id =
- ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID);
- uint32_t work_group_id =
- ir3_find_sysval_regid(v, SYSTEM_VALUE_WORKGROUP_ID);
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_UNKNOWN_A871, 1);
+ tu_cs_emit(cs, 0);
- enum a6xx_threadsize thrsz = v->info.double_threadsize ? THREAD128 : THREAD64;
- tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL_0, 2);
- tu_cs_emit(cs,
- A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
- A6XX_HLSQ_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
- A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
- A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
- tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
- A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz));
-
- if (cs->device->physical_device->info->a6xx.has_lpac) {
- tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 2);
- tu_cs_emit(cs,
- A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) |
- A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
- A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
- A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
- tu_cs_emit(cs, A6XX_SP_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
- A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz));
- }
-}
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_CONFIG, 2);
+ tu_cs_emit(cs, sp_gs_config);
+ tu_cs_emit(cs, gs->instrlen);
-#define TU6_EMIT_VFD_DEST_MAX_DWORDS (MAX_VERTEX_ATTRIBS + 2)
+ tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_GS_CNTL, 1);
+ tu_cs_emit(cs, A6XX_HLSQ_GS_CNTL_CONSTLEN(align(gs->constlen, 4)));
+}
static void
-tu6_emit_vfd_dest(struct tu_cs *cs,
- const struct ir3_shader_variant *vs)
-{
- int32_t input_for_attr[MAX_VERTEX_ATTRIBS];
- uint32_t attr_count = 0;
+tu6_emit_fs_config(struct tu_cs *cs, const struct ir3_shader_variant *fs)
+{
+ uint32_t sp_fs_ctrl =
+ A6XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) | 0x1000000 |
+ A6XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fs->info.max_reg + 1) |
+ A6XX_SP_FS_CTRL_REG0_MERGEDREGS |
+ A6XX_SP_FS_CTRL_REG0_BRANCHSTACK(fs->branchstack);
+ if (fs->total_in > 0 || fs->frag_coord)
+ sp_fs_ctrl |= A6XX_SP_FS_CTRL_REG0_VARYING;
+ if (fs->num_samp > 0)
+ sp_fs_ctrl |= A6XX_SP_FS_CTRL_REG0_PIXLODENABLE;
+
+ uint32_t sp_fs_config = A6XX_SP_FS_CONFIG_NTEX(fs->num_samp) |
+ A6XX_SP_FS_CONFIG_NSAMP(fs->num_samp);
+ if (fs->instrlen)
+ sp_fs_config |= A6XX_SP_FS_CONFIG_ENABLED;
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_UNKNOWN_A99E, 1);
+ tu_cs_emit(cs, 0x7fc0);
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_UNKNOWN_A9A8, 1);
+ tu_cs_emit(cs, 0);
- for (unsigned i = 0; i < MAX_VERTEX_ATTRIBS; i++)
- input_for_attr[i] = -1;
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_UNKNOWN_AB00, 1);
+ tu_cs_emit(cs, 0x5);
- for (unsigned i = 0; i < vs->inputs_count; i++) {
- if (vs->inputs[i].sysval || vs->inputs[i].regid == regid(63, 0))
- continue;
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_CTRL_REG0, 1);
+ tu_cs_emit(cs, sp_fs_ctrl);
- assert(vs->inputs[i].slot >= VERT_ATTRIB_GENERIC0);
- unsigned loc = vs->inputs[i].slot - VERT_ATTRIB_GENERIC0;
- input_for_attr[loc] = i;
- attr_count = MAX2(attr_count, loc + 1);
- }
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_CONFIG, 2);
+ tu_cs_emit(cs, sp_fs_config);
+ tu_cs_emit(cs, fs->instrlen);
- tu_cs_emit_regs(cs,
- A6XX_VFD_CONTROL_0(
- .fetch_cnt = attr_count, /* decode_cnt for binning pass ? */
- .decode_cnt = attr_count));
-
- if (attr_count)
- tu_cs_emit_pkt4(cs, REG_A6XX_VFD_DEST_CNTL_INSTR(0), attr_count);
-
- for (unsigned i = 0; i < attr_count; i++) {
- if (input_for_attr[i] >= 0) {
- unsigned input_idx = input_for_attr[i];
- tu_cs_emit(cs, A6XX_VFD_DEST_CNTL_INSTR(0,
- .writemask = vs->inputs[input_idx].compmask,
- .regid = vs->inputs[input_idx].regid).value);
- } else {
- tu_cs_emit(cs, A6XX_VFD_DEST_CNTL_INSTR(0,
- .writemask = 0,
- .regid = regid(63, 0)).value);
- }
- }
+ tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL, 1);
+ tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_CONSTLEN(align(fs->constlen, 4)) | 0x100);
}
static void
tu6_emit_vs_system_values(struct tu_cs *cs,
- const struct ir3_shader_variant *vs,
- const struct ir3_shader_variant *hs,
- const struct ir3_shader_variant *ds,
- const struct ir3_shader_variant *gs,
- bool primid_passthru)
+ const struct ir3_shader_variant *vs)
{
const uint32_t vertexid_regid =
- ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID);
+ ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
const uint32_t instanceid_regid =
- ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID);
- const uint32_t tess_coord_x_regid = hs ?
- ir3_find_sysval_regid(ds, SYSTEM_VALUE_TESS_COORD) :
- regid(63, 0);
- const uint32_t tess_coord_y_regid = VALIDREG(tess_coord_x_regid) ?
- tess_coord_x_regid + 1 :
- regid(63, 0);
- const uint32_t hs_rel_patch_regid = hs ?
- ir3_find_sysval_regid(hs, SYSTEM_VALUE_REL_PATCH_ID_IR3) :
- regid(63, 0);
- const uint32_t ds_rel_patch_regid = hs ?
- ir3_find_sysval_regid(ds, SYSTEM_VALUE_REL_PATCH_ID_IR3) :
- regid(63, 0);
- const uint32_t hs_invocation_regid = hs ?
- ir3_find_sysval_regid(hs, SYSTEM_VALUE_TCS_HEADER_IR3) :
- regid(63, 0);
- const uint32_t gs_primitiveid_regid = gs ?
- ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID) :
- regid(63, 0);
- const uint32_t vs_primitiveid_regid = hs ?
- ir3_find_sysval_regid(hs, SYSTEM_VALUE_PRIMITIVE_ID) :
- gs_primitiveid_regid;
- const uint32_t ds_primitiveid_regid = ds ?
- ir3_find_sysval_regid(ds, SYSTEM_VALUE_PRIMITIVE_ID) :
- regid(63, 0);
- const uint32_t gsheader_regid = gs ?
- ir3_find_sysval_regid(gs, SYSTEM_VALUE_GS_HEADER_IR3) :
- regid(63, 0);
-
- /* Note: we currently don't support multiview with tess or GS. If we did,
- * and the HW actually works, then we'd have to somehow share this across
- * stages. Note that the blob doesn't support this either.
- */
- const uint32_t viewid_regid =
- ir3_find_sysval_regid(vs, SYSTEM_VALUE_VIEW_INDEX);
+ ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID);
tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_1, 6);
tu_cs_emit(cs, A6XX_VFD_CONTROL_1_REGID4VTX(vertexid_regid) |
- A6XX_VFD_CONTROL_1_REGID4INST(instanceid_regid) |
- A6XX_VFD_CONTROL_1_REGID4PRIMID(vs_primitiveid_regid) |
- A6XX_VFD_CONTROL_1_REGID4VIEWID(viewid_regid));
- tu_cs_emit(cs, A6XX_VFD_CONTROL_2_REGID_HSRELPATCHID(hs_rel_patch_regid) |
- A6XX_VFD_CONTROL_2_REGID_INVOCATIONID(hs_invocation_regid));
- tu_cs_emit(cs, A6XX_VFD_CONTROL_3_REGID_DSRELPATCHID(ds_rel_patch_regid) |
- A6XX_VFD_CONTROL_3_REGID_TESSX(tess_coord_x_regid) |
- A6XX_VFD_CONTROL_3_REGID_TESSY(tess_coord_y_regid) |
- A6XX_VFD_CONTROL_3_REGID_DSPRIMID(ds_primitiveid_regid));
+ A6XX_VFD_CONTROL_1_REGID4INST(instanceid_regid) |
+ 0xfcfc0000);
+ tu_cs_emit(cs, 0x0000fcfc); /* VFD_CONTROL_2 */
+ tu_cs_emit(cs, 0xfcfcfcfc); /* VFD_CONTROL_3 */
tu_cs_emit(cs, 0x000000fc); /* VFD_CONTROL_4 */
- tu_cs_emit(cs, A6XX_VFD_CONTROL_5_REGID_GSHEADER(gsheader_regid) |
- 0xfc00); /* VFD_CONTROL_5 */
- tu_cs_emit(cs, COND(primid_passthru, A6XX_VFD_CONTROL_6_PRIMID_PASSTHRU)); /* VFD_CONTROL_6 */
+ tu_cs_emit(cs, 0x0000fcfc); /* VFD_CONTROL_5 */
+ tu_cs_emit(cs, 0x00000000); /* VFD_CONTROL_6 */
}
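/* Aside (illustrative sketch, not part of the imported diff): the register-id
 * convention behind the 0xfc/0xfcfc values written above.  ir3's regid(r, c)
 * packs a register number and component roughly as (r << 2) | c, so
 * regid(63, 0) == 0xfc acts as the "unused" sentinel, and disabled sysval
 * fields are simply filled with 0xfc bytes.
 */
static inline uint32_t
example_regid(uint32_t reg, uint32_t comp)
{
   return (reg << 2) | (comp & 0x3);    /* example_regid(63, 0) == 0xfc */
}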
static void
-tu6_setup_streamout(struct tu_cs *cs,
- const struct ir3_shader_variant *v,
- struct ir3_shader_linkage *l)
+tu6_emit_vpc(struct tu_cs *cs,
+ const struct ir3_shader_variant *vs,
+ const struct ir3_shader_variant *fs,
+ bool binning_pass)
{
- const struct ir3_stream_output_info *info = &v->stream_output;
- /* Note: 64 here comes from the HW layout of the program RAM. The program
- * for stream N is at DWORD 64 * N.
- */
-#define A6XX_SO_PROG_DWORDS 64
- uint32_t prog[A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS] = {};
- BITSET_DECLARE(valid_dwords, A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) = {0};
-
- /* TODO: streamout state should be in a non-GMEM draw state */
-
- /* no streamout: */
- if (info->num_outputs == 0) {
- unsigned sizedw = 4;
- if (cs->device->physical_device->info->a6xx.tess_use_shared)
- sizedw += 2;
-
- tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, sizedw);
- tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
- tu_cs_emit(cs, 0);
- tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL);
- tu_cs_emit(cs, 0);
+ struct ir3_shader_linkage linkage = { 0 };
+ ir3_link_shaders(&linkage, vs, fs);
- if (cs->device->physical_device->info->a6xx.tess_use_shared) {
- tu_cs_emit(cs, REG_A6XX_PC_SO_STREAM_CNTL);
- tu_cs_emit(cs, 0);
- }
+ if (vs->shader->stream_output.num_outputs && !binning_pass)
+ tu_finishme("stream output");
- return;
+ BITSET_DECLARE(vpc_var_enables, 128) = { 0 };
+ for (uint32_t i = 0; i < linkage.cnt; i++) {
+ const uint32_t comp_count = util_last_bit(linkage.var[i].compmask);
+ for (uint32_t j = 0; j < comp_count; j++)
+ BITSET_SET(vpc_var_enables, linkage.var[i].loc + j);
}
- for (unsigned i = 0; i < info->num_outputs; i++) {
- const struct ir3_stream_output *out = &info->output[i];
- unsigned k = out->register_index;
- unsigned idx;
-
- /* Skip it, if it's an output that was never assigned a register. */
- if (k >= v->outputs_count || v->outputs[k].regid == INVALID_REG)
- continue;
+ tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VAR_DISABLE(0), 4);
+ tu_cs_emit(cs, ~vpc_var_enables[0]);
+ tu_cs_emit(cs, ~vpc_var_enables[1]);
+ tu_cs_emit(cs, ~vpc_var_enables[2]);
+ tu_cs_emit(cs, ~vpc_var_enables[3]);
- /* linkage map sorted by order frag shader wants things, so
- * a bit less ideal here..
- */
- for (idx = 0; idx < l->cnt; idx++)
- if (l->var[idx].slot == v->outputs[k].slot)
- break;
-
- assert(idx < l->cnt);
-
- for (unsigned j = 0; j < out->num_components; j++) {
- unsigned c = j + out->start_component;
- unsigned loc = l->var[idx].loc + c;
- unsigned off = j + out->dst_offset; /* in dwords */
-
- assert(loc < A6XX_SO_PROG_DWORDS * 2);
- unsigned dword = out->stream * A6XX_SO_PROG_DWORDS + loc/2;
- if (loc & 1) {
- prog[dword] |= A6XX_VPC_SO_PROG_B_EN |
- A6XX_VPC_SO_PROG_B_BUF(out->output_buffer) |
- A6XX_VPC_SO_PROG_B_OFF(off * 4);
- } else {
- prog[dword] |= A6XX_VPC_SO_PROG_A_EN |
- A6XX_VPC_SO_PROG_A_BUF(out->output_buffer) |
- A6XX_VPC_SO_PROG_A_OFF(off * 4);
- }
- BITSET_SET(valid_dwords, dword);
- }
+ /* a6xx finds position/pointsize at the end */
+ const uint32_t position_regid =
+ ir3_find_output_regid(vs, VARYING_SLOT_POS);
+ const uint32_t pointsize_regid =
+ ir3_find_output_regid(vs, VARYING_SLOT_PSIZ);
+ uint32_t pointsize_loc = 0xff;
+ if (position_regid != regid(63, 0))
+ ir3_link_add(&linkage, position_regid, 0xf, linkage.max_loc);
+ if (pointsize_regid != regid(63, 0)) {
+ pointsize_loc = linkage.max_loc;
+ ir3_link_add(&linkage, pointsize_regid, 0x1, linkage.max_loc);
}
- unsigned prog_count = 0;
- unsigned start, end;
- BITSET_FOREACH_RANGE(start, end, valid_dwords,
- A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
- prog_count += end - start + 1;
+ /* map vs outputs to VPC */
+ assert(linkage.cnt <= 32);
+ const uint32_t sp_vs_out_count = (linkage.cnt + 1) / 2;
+ const uint32_t sp_vs_vpc_dst_count = (linkage.cnt + 3) / 4;
+ uint32_t sp_vs_out[16];
+ uint32_t sp_vs_vpc_dst[8];
+ sp_vs_out[sp_vs_out_count - 1] = 0;
+ sp_vs_vpc_dst[sp_vs_vpc_dst_count - 1] = 0;
+ for (uint32_t i = 0; i < linkage.cnt; i++) {
+ ((uint16_t *) sp_vs_out)[i] =
+ A6XX_SP_VS_OUT_REG_A_REGID(linkage.var[i].regid) |
+ A6XX_SP_VS_OUT_REG_A_COMPMASK(linkage.var[i].compmask);
+ ((uint8_t *) sp_vs_vpc_dst)[i] =
+ A6XX_SP_VS_VPC_DST_REG_OUTLOC0(linkage.var[i].loc);
}
- const bool emit_pc_so_stream_cntl =
- cs->device->physical_device->info->a6xx.tess_use_shared &&
- v->type == MESA_SHADER_TESS_EVAL;
-
- if (emit_pc_so_stream_cntl)
- prog_count += 1;
-
- tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 10 + 2 * prog_count);
- tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL);
- tu_cs_emit(cs, A6XX_VPC_SO_STREAM_CNTL_STREAM_ENABLE(info->streams_written) |
- COND(info->stride[0] > 0,
- A6XX_VPC_SO_STREAM_CNTL_BUF0_STREAM(1 + info->buffer_to_stream[0])) |
- COND(info->stride[1] > 0,
- A6XX_VPC_SO_STREAM_CNTL_BUF1_STREAM(1 + info->buffer_to_stream[1])) |
- COND(info->stride[2] > 0,
- A6XX_VPC_SO_STREAM_CNTL_BUF2_STREAM(1 + info->buffer_to_stream[2])) |
- COND(info->stride[3] > 0,
- A6XX_VPC_SO_STREAM_CNTL_BUF3_STREAM(1 + info->buffer_to_stream[3])));
- for (uint32_t i = 0; i < 4; i++) {
- tu_cs_emit(cs, REG_A6XX_VPC_SO_BUFFER_STRIDE(i));
- tu_cs_emit(cs, info->stride[i]);
- }
- bool first = true;
- BITSET_FOREACH_RANGE(start, end, valid_dwords,
- A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
- tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
- tu_cs_emit(cs, COND(first, A6XX_VPC_SO_CNTL_RESET) |
- A6XX_VPC_SO_CNTL_ADDR(start));
- for (unsigned i = start; i < end; i++) {
- tu_cs_emit(cs, REG_A6XX_VPC_SO_PROG);
- tu_cs_emit(cs, prog[i]);
- }
- first = false;
- }
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_VS_OUT_REG(0), sp_vs_out_count);
+ tu_cs_emit_array(cs, sp_vs_out, sp_vs_out_count);
- if (emit_pc_so_stream_cntl) {
- /* Possibly not tess_use_shared related, but the combination of
- * tess + xfb fails some tests if we don't emit this.
- */
- tu_cs_emit(cs, REG_A6XX_PC_SO_STREAM_CNTL);
- tu_cs_emit(cs, A6XX_PC_SO_STREAM_CNTL_STREAM_ENABLE(info->streams_written));
- }
-}
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_VS_VPC_DST_REG(0), sp_vs_vpc_dst_count);
+ tu_cs_emit_array(cs, sp_vs_vpc_dst, sp_vs_vpc_dst_count);
-static void
-tu6_emit_const(struct tu_cs *cs, uint32_t opcode, uint32_t base,
- enum a6xx_state_block block, uint32_t offset,
- uint32_t size, const uint32_t *dwords) {
- assert(size % 4 == 0);
-
- tu_cs_emit_pkt7(cs, opcode, 3 + size);
- tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(block) |
- CP_LOAD_STATE6_0_NUM_UNIT(size / 4));
-
- tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
- tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
- dwords = (uint32_t *)&((uint8_t *)dwords)[offset];
-
- tu_cs_emit_array(cs, dwords, size);
-}
+ tu_cs_emit_pkt4(cs, REG_A6XX_VPC_CNTL_0, 1);
+ tu_cs_emit(cs, A6XX_VPC_CNTL_0_NUMNONPOSVAR(fs->total_in) |
+ (fs->total_in > 0 ? A6XX_VPC_CNTL_0_VARYING : 0) |
+ 0xff00ff00);
-static void
-tu6_emit_link_map(struct tu_cs *cs,
- const struct ir3_shader_variant *producer,
- const struct ir3_shader_variant *consumer,
- enum a6xx_state_block sb)
-{
- const struct ir3_const_state *const_state = ir3_const_state(consumer);
- uint32_t base = const_state->offsets.primitive_map;
- int size = DIV_ROUND_UP(consumer->input_size, 4);
+ tu_cs_emit_pkt4(cs, REG_A6XX_VPC_PACK, 1);
+ tu_cs_emit(cs, A6XX_VPC_PACK_NUMNONPOSVAR(fs->total_in) |
+ A6XX_VPC_PACK_PSIZELOC(pointsize_loc) |
+ A6XX_VPC_PACK_STRIDE_IN_VPC(linkage.max_loc));
- size = (MIN2(size + base, consumer->constlen) - base) * 4;
- if (size <= 0)
- return;
+ tu_cs_emit_pkt4(cs, REG_A6XX_VPC_GS_SIV_CNTL, 1);
+ tu_cs_emit(cs, 0x0000ffff); /* XXX */
- tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, base, sb, 0, size,
- producer->output_loc);
-}
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_PRIMITIVE_CNTL, 1);
+ tu_cs_emit(cs, A6XX_SP_PRIMITIVE_CNTL_VSOUT(linkage.cnt));
-static uint16_t
-primitive_to_tess(enum shader_prim primitive) {
- switch (primitive) {
- case SHADER_PRIM_POINTS:
- return TESS_POINTS;
- case SHADER_PRIM_LINE_STRIP:
- return TESS_LINES;
- case SHADER_PRIM_TRIANGLE_STRIP:
- return TESS_CW_TRIS;
- default:
- unreachable("");
- }
+ tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_1, 1);
+ tu_cs_emit(cs, A6XX_PC_PRIMITIVE_CNTL_1_STRIDE_IN_VPC(linkage.max_loc) |
+ (vs->writes_psize ? A6XX_PC_PRIMITIVE_CNTL_1_PSIZE : 0));
}
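/* A quick sanity check of the SP_VS_OUT/SP_VS_VPC_DST packing above (values
 * here are illustrative): with linkage.cnt == 5 varyings, sp_vs_out_count ==
 * (5 + 1) / 2 == 3 because each 32-bit SP_VS_OUT_REG holds two 16-bit
 * entries, and sp_vs_vpc_dst_count == (5 + 3) / 4 == 2 because each 32-bit
 * SP_VS_VPC_DST_REG holds four 8-bit output locations.
 */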
static int
tu6_vpc_varying_mode(const struct ir3_shader_variant *fs,
- const struct ir3_shader_variant *last_shader,
uint32_t index,
uint8_t *interp_mode,
uint8_t *ps_repl_mode)
@@ -1091,18 +620,8 @@ tu6_vpc_varying_mode(const struct ir3_shader_variant *fs,
*interp_mode |= INTERP_ONE << 6;
shift += 2;
}
- } else if (fs->inputs[index].slot == VARYING_SLOT_LAYER ||
- fs->inputs[index].slot == VARYING_SLOT_VIEWPORT) {
- /* If the last geometry shader doesn't statically write these, they're
- * implicitly zero and the FS is supposed to read zero.
- */
- if (ir3_find_output(last_shader, fs->inputs[index].slot) < 0 &&
- (compmask & 0x1)) {
- *interp_mode |= INTERP_ZERO;
- } else {
- *interp_mode |= INTERP_FLAT;
- }
- } else if (fs->inputs[index].flat) {
+ } else if ((fs->inputs[index].interpolate == INTERP_MODE_FLAT) ||
+ fs->inputs[index].rasterflat) {
for (int i = 0; i < 4; i++) {
if (compmask & (1 << i)) {
*interp_mode |= INTERP_FLAT << shift;
@@ -1111,19 +630,18 @@ tu6_vpc_varying_mode(const struct ir3_shader_variant *fs,
}
}
- return util_bitcount(compmask) * 2;
+ return shift;
}
static void
tu6_emit_vpc_varying_modes(struct tu_cs *cs,
const struct ir3_shader_variant *fs,
- const struct ir3_shader_variant *last_shader)
+ bool binning_pass)
{
uint32_t interp_modes[8] = { 0 };
uint32_t ps_repl_modes[8] = { 0 };
- uint32_t interp_regs = 0;
- if (fs) {
+ if (!binning_pass) {
for (int i = -1;
(i = ir3_next_varying(fs, i)) < (int) fs->inputs_count;) {
@@ -1131,7 +649,7 @@ tu6_emit_vpc_varying_modes(struct tu_cs *cs,
uint8_t interp_mode;
uint8_t ps_repl_mode;
const int bits =
- tu6_vpc_varying_mode(fs, last_shader, i, &interp_mode, &ps_repl_mode);
+ tu6_vpc_varying_mode(fs, i, &interp_mode, &ps_repl_mode);
/* OR the mode into the array */
const uint32_t inloc = fs->inputs[i].inloc * 2;
@@ -1146,1043 +664,445 @@ tu6_emit_vpc_varying_modes(struct tu_cs *cs,
interp_modes[n] |= interp_mode >> shift;
ps_repl_modes[n] |= ps_repl_mode >> shift;
}
- interp_regs = MAX2(interp_regs, n + 1);
}
}
- if (interp_regs) {
- tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_INTERP_MODE(0), interp_regs);
- tu_cs_emit_array(cs, interp_modes, interp_regs);
+ tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8);
+ tu_cs_emit_array(cs, interp_modes, 8);
- tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), interp_regs);
- tu_cs_emit_array(cs, ps_repl_modes, interp_regs);
- }
+ tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8);
+ tu_cs_emit_array(cs, ps_repl_modes, 8);
}
-void
-tu6_emit_vpc(struct tu_cs *cs,
- const struct ir3_shader_variant *vs,
- const struct ir3_shader_variant *hs,
- const struct ir3_shader_variant *ds,
- const struct ir3_shader_variant *gs,
- const struct ir3_shader_variant *fs)
-{
- /* note: doesn't compile as static because of the array regs.. */
- const struct reg_config {
- uint16_t reg_sp_xs_out_reg;
- uint16_t reg_sp_xs_vpc_dst_reg;
- uint16_t reg_vpc_xs_pack;
- uint16_t reg_vpc_xs_clip_cntl;
- uint16_t reg_gras_xs_cl_cntl;
- uint16_t reg_pc_xs_out_cntl;
- uint16_t reg_sp_xs_primitive_cntl;
- uint16_t reg_vpc_xs_layer_cntl;
- uint16_t reg_gras_xs_layer_cntl;
- } reg_config[] = {
- [MESA_SHADER_VERTEX] = {
- REG_A6XX_SP_VS_OUT_REG(0),
- REG_A6XX_SP_VS_VPC_DST_REG(0),
- REG_A6XX_VPC_VS_PACK,
- REG_A6XX_VPC_VS_CLIP_CNTL,
- REG_A6XX_GRAS_VS_CL_CNTL,
- REG_A6XX_PC_VS_OUT_CNTL,
- REG_A6XX_SP_VS_PRIMITIVE_CNTL,
- REG_A6XX_VPC_VS_LAYER_CNTL,
- REG_A6XX_GRAS_VS_LAYER_CNTL
- },
- [MESA_SHADER_TESS_CTRL] = {
- 0,
- 0,
- 0,
- 0,
- 0,
- REG_A6XX_PC_HS_OUT_CNTL,
- 0,
- 0,
- 0
- },
- [MESA_SHADER_TESS_EVAL] = {
- REG_A6XX_SP_DS_OUT_REG(0),
- REG_A6XX_SP_DS_VPC_DST_REG(0),
- REG_A6XX_VPC_DS_PACK,
- REG_A6XX_VPC_DS_CLIP_CNTL,
- REG_A6XX_GRAS_DS_CL_CNTL,
- REG_A6XX_PC_DS_OUT_CNTL,
- REG_A6XX_SP_DS_PRIMITIVE_CNTL,
- REG_A6XX_VPC_DS_LAYER_CNTL,
- REG_A6XX_GRAS_DS_LAYER_CNTL
- },
- [MESA_SHADER_GEOMETRY] = {
- REG_A6XX_SP_GS_OUT_REG(0),
- REG_A6XX_SP_GS_VPC_DST_REG(0),
- REG_A6XX_VPC_GS_PACK,
- REG_A6XX_VPC_GS_CLIP_CNTL,
- REG_A6XX_GRAS_GS_CL_CNTL,
- REG_A6XX_PC_GS_OUT_CNTL,
- REG_A6XX_SP_GS_PRIMITIVE_CNTL,
- REG_A6XX_VPC_GS_LAYER_CNTL,
- REG_A6XX_GRAS_GS_LAYER_CNTL
- },
- };
-
- const struct ir3_shader_variant *last_shader;
- if (gs) {
- last_shader = gs;
- } else if (hs) {
- last_shader = ds;
- } else {
- last_shader = vs;
- }
-
- const struct reg_config *cfg = &reg_config[last_shader->type];
-
- struct ir3_shader_linkage linkage = {
- .primid_loc = 0xff,
- .clip0_loc = 0xff,
- .clip1_loc = 0xff,
- };
- if (fs)
- ir3_link_shaders(&linkage, last_shader, fs, true);
-
- if (last_shader->stream_output.num_outputs)
- ir3_link_stream_out(&linkage, last_shader);
-
- /* We do this after linking shaders in order to know whether PrimID
- * passthrough needs to be enabled.
- */
- bool primid_passthru = linkage.primid_loc != 0xff;
- tu6_emit_vs_system_values(cs, vs, hs, ds, gs, primid_passthru);
-
- tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VAR_DISABLE(0), 4);
- tu_cs_emit(cs, ~linkage.varmask[0]);
- tu_cs_emit(cs, ~linkage.varmask[1]);
- tu_cs_emit(cs, ~linkage.varmask[2]);
- tu_cs_emit(cs, ~linkage.varmask[3]);
-
- /* a6xx finds position/pointsize at the end */
- const uint32_t pointsize_regid =
- ir3_find_output_regid(last_shader, VARYING_SLOT_PSIZ);
- const uint32_t layer_regid =
- ir3_find_output_regid(last_shader, VARYING_SLOT_LAYER);
- const uint32_t view_regid =
- ir3_find_output_regid(last_shader, VARYING_SLOT_VIEWPORT);
- const uint32_t clip0_regid =
- ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST0);
- const uint32_t clip1_regid =
- ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST1);
- uint32_t flags_regid = gs ?
- ir3_find_output_regid(gs, VARYING_SLOT_GS_VERTEX_FLAGS_IR3) : 0;
-
- uint32_t pointsize_loc = 0xff, position_loc = 0xff, layer_loc = 0xff, view_loc = 0xff;
-
- if (layer_regid != regid(63, 0)) {
- layer_loc = linkage.max_loc;
- ir3_link_add(&linkage, VARYING_SLOT_LAYER, layer_regid, 0x1, linkage.max_loc);
- }
-
- if (view_regid != regid(63, 0)) {
- view_loc = linkage.max_loc;
- ir3_link_add(&linkage, VARYING_SLOT_VIEWPORT, view_regid, 0x1, linkage.max_loc);
- }
-
- unsigned extra_pos = 0;
-
- for (unsigned i = 0; i < last_shader->outputs_count; i++) {
- if (last_shader->outputs[i].slot != VARYING_SLOT_POS)
- continue;
-
- if (position_loc == 0xff)
- position_loc = linkage.max_loc;
-
- ir3_link_add(&linkage, last_shader->outputs[i].slot,
- last_shader->outputs[i].regid,
- 0xf, position_loc + 4 * last_shader->outputs[i].view);
- extra_pos = MAX2(extra_pos, last_shader->outputs[i].view);
- }
-
- if (pointsize_regid != regid(63, 0)) {
- pointsize_loc = linkage.max_loc;
- ir3_link_add(&linkage, VARYING_SLOT_PSIZ, pointsize_regid, 0x1, linkage.max_loc);
- }
-
- uint8_t clip_cull_mask = last_shader->clip_mask | last_shader->cull_mask;
-
- /* Handle the case where clip/cull distances aren't read by the FS */
- uint32_t clip0_loc = linkage.clip0_loc, clip1_loc = linkage.clip1_loc;
- if (clip0_loc == 0xff && clip0_regid != regid(63, 0)) {
- clip0_loc = linkage.max_loc;
- ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST0, clip0_regid,
- clip_cull_mask & 0xf, linkage.max_loc);
- }
- if (clip1_loc == 0xff && clip1_regid != regid(63, 0)) {
- clip1_loc = linkage.max_loc;
- ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST1, clip1_regid,
- clip_cull_mask >> 4, linkage.max_loc);
- }
-
- tu6_setup_streamout(cs, last_shader, &linkage);
-
- /* The GPU hangs on some models when there are no outputs (xs_pack::CNT),
- * at least when a DS is the last stage, so add a dummy output to keep it
- * happy if there aren't any. We do this late in order to avoid emitting
- * any unused code and make sure that optimizations don't remove it.
- */
- if (linkage.cnt == 0)
- ir3_link_add(&linkage, 0, 0, 0x1, linkage.max_loc);
-
- /* map outputs of the last shader to VPC */
- assert(linkage.cnt <= 32);
- const uint32_t sp_out_count = DIV_ROUND_UP(linkage.cnt, 2);
- const uint32_t sp_vpc_dst_count = DIV_ROUND_UP(linkage.cnt, 4);
- uint32_t sp_out[16] = {0};
- uint32_t sp_vpc_dst[8] = {0};
- for (uint32_t i = 0; i < linkage.cnt; i++) {
- ((uint16_t *) sp_out)[i] =
- A6XX_SP_VS_OUT_REG_A_REGID(linkage.var[i].regid) |
- A6XX_SP_VS_OUT_REG_A_COMPMASK(linkage.var[i].compmask);
- ((uint8_t *) sp_vpc_dst)[i] =
- A6XX_SP_VS_VPC_DST_REG_OUTLOC0(linkage.var[i].loc);
- }
-
- tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_out_reg, sp_out_count);
- tu_cs_emit_array(cs, sp_out, sp_out_count);
-
- tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_vpc_dst_reg, sp_vpc_dst_count);
- tu_cs_emit_array(cs, sp_vpc_dst, sp_vpc_dst_count);
-
- tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_pack, 1);
- tu_cs_emit(cs, A6XX_VPC_VS_PACK_POSITIONLOC(position_loc) |
- A6XX_VPC_VS_PACK_PSIZELOC(pointsize_loc) |
- A6XX_VPC_VS_PACK_STRIDE_IN_VPC(linkage.max_loc) |
- A6XX_VPC_VS_PACK_EXTRAPOS(extra_pos));
-
- tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_clip_cntl, 1);
- tu_cs_emit(cs, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
- A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
- A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));
-
- tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_cl_cntl, 1);
- tu_cs_emit(cs, A6XX_GRAS_VS_CL_CNTL_CLIP_MASK(last_shader->clip_mask) |
- A6XX_GRAS_VS_CL_CNTL_CULL_MASK(last_shader->cull_mask));
-
- const struct ir3_shader_variant *geom_shaders[] = { vs, hs, ds, gs };
-
- for (unsigned i = 0; i < ARRAY_SIZE(geom_shaders); i++) {
- const struct ir3_shader_variant *shader = geom_shaders[i];
- if (!shader)
- continue;
-
- bool primid = shader->type != MESA_SHADER_VERTEX &&
- VALIDREG(ir3_find_sysval_regid(shader, SYSTEM_VALUE_PRIMITIVE_ID));
-
- tu_cs_emit_pkt4(cs, reg_config[shader->type].reg_pc_xs_out_cntl, 1);
- if (shader == last_shader) {
- tu_cs_emit(cs, A6XX_PC_VS_OUT_CNTL_STRIDE_IN_VPC(linkage.max_loc) |
- CONDREG(pointsize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE) |
- CONDREG(layer_regid, A6XX_PC_VS_OUT_CNTL_LAYER) |
- CONDREG(view_regid, A6XX_PC_VS_OUT_CNTL_VIEW) |
- COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID) |
- A6XX_PC_VS_OUT_CNTL_CLIP_MASK(clip_cull_mask));
- } else {
- tu_cs_emit(cs, COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID));
- }
- }
-
-   /* if vertex_flags somehow gets optimized out, you're going to have a bad time: */
- if (gs)
- assert(flags_regid != INVALID_REG);
-
- tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_primitive_cntl, 1);
- tu_cs_emit(cs, A6XX_SP_VS_PRIMITIVE_CNTL_OUT(linkage.cnt) |
- A6XX_SP_GS_PRIMITIVE_CNTL_FLAGS_REGID(flags_regid));
-
- tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_layer_cntl, 1);
- tu_cs_emit(cs, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) |
- A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(view_loc));
-
- tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_layer_cntl, 1);
- tu_cs_emit(cs, CONDREG(layer_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_LAYER) |
- CONDREG(view_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_VIEW));
-
- tu_cs_emit_regs(cs, A6XX_PC_PRIMID_PASSTHRU(primid_passthru));
-
- tu_cs_emit_pkt4(cs, REG_A6XX_VPC_CNTL_0, 1);
- tu_cs_emit(cs, A6XX_VPC_CNTL_0_NUMNONPOSVAR(fs ? fs->total_in : 0) |
- COND(fs && fs->total_in, A6XX_VPC_CNTL_0_VARYING) |
- A6XX_VPC_CNTL_0_PRIMIDLOC(linkage.primid_loc) |
- A6XX_VPC_CNTL_0_VIEWIDLOC(linkage.viewid_loc));
-
- if (hs) {
- tu_cs_emit_pkt4(cs, REG_A6XX_PC_TESS_NUM_VERTEX, 1);
- tu_cs_emit(cs, hs->tess.tcs_vertices_out);
-
-      /* In SPIR-V generated from GLSL, the tessellation primitive params
-       * are specified in the tess eval shader, but in SPIR-V generated from
-       * HLSL, they are specified in the tess control shader. */
- const struct ir3_shader_variant *tess =
- ds->tess.spacing == TESS_SPACING_UNSPECIFIED ? hs : ds;
- tu_cs_emit_pkt4(cs, REG_A6XX_PC_TESS_CNTL, 1);
- uint32_t output;
- if (tess->tess.point_mode)
- output = TESS_POINTS;
- else if (tess->tess.primitive_mode == TESS_PRIMITIVE_ISOLINES)
- output = TESS_LINES;
- else if (tess->tess.ccw)
- output = TESS_CCW_TRIS;
- else
- output = TESS_CW_TRIS;
-
- enum a6xx_tess_spacing spacing;
- switch (tess->tess.spacing) {
- case TESS_SPACING_EQUAL:
- spacing = TESS_EQUAL;
- break;
- case TESS_SPACING_FRACTIONAL_ODD:
- spacing = TESS_FRACTIONAL_ODD;
- break;
- case TESS_SPACING_FRACTIONAL_EVEN:
- spacing = TESS_FRACTIONAL_EVEN;
- break;
- case TESS_SPACING_UNSPECIFIED:
- default:
- unreachable("invalid tess spacing");
- }
- tu_cs_emit(cs, A6XX_PC_TESS_CNTL_SPACING(spacing) |
- A6XX_PC_TESS_CNTL_OUTPUT(output));
-
- tu6_emit_link_map(cs, vs, hs, SB6_HS_SHADER);
- tu6_emit_link_map(cs, hs, ds, SB6_DS_SHADER);
- }
-
-
- if (gs) {
- uint32_t vertices_out, invocations, output, vec4_size;
- uint32_t prev_stage_output_size = ds ? ds->output_size : vs->output_size;
-
- if (hs) {
- tu6_emit_link_map(cs, ds, gs, SB6_GS_SHADER);
- } else {
- tu6_emit_link_map(cs, vs, gs, SB6_GS_SHADER);
- }
- vertices_out = gs->gs.vertices_out - 1;
- output = primitive_to_tess(gs->gs.output_primitive);
- invocations = gs->gs.invocations - 1;
-      /* Size of per-primitive allocation in ldlw memory in vec4s. */
- vec4_size = gs->gs.vertices_in *
- DIV_ROUND_UP(prev_stage_output_size, 4);
-
- tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1);
- tu_cs_emit(cs,
- A6XX_PC_PRIMITIVE_CNTL_5_GS_VERTICES_OUT(vertices_out) |
- A6XX_PC_PRIMITIVE_CNTL_5_GS_OUTPUT(output) |
- A6XX_PC_PRIMITIVE_CNTL_5_GS_INVOCATIONS(invocations));
-
- tu_cs_emit_pkt4(cs, REG_A6XX_VPC_GS_PARAM, 1);
- tu_cs_emit(cs, 0xff);
-
- tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1);
- tu_cs_emit(cs, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size));
-
- uint32_t prim_size = prev_stage_output_size;
- if (prim_size > 64)
- prim_size = 64;
- else if (prim_size == 64)
- prim_size = 63;
- tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_PRIM_SIZE, 1);
- tu_cs_emit(cs, prim_size);
- }
+static void
+tu6_emit_fs_system_values(struct tu_cs *cs,
+ const struct ir3_shader_variant *fs)
+{
+ const uint32_t frontfacing_regid =
+ ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE);
+ const uint32_t sampleid_regid =
+ ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID);
+ const uint32_t samplemaskin_regid =
+ ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN);
+ const uint32_t fragcoord_xy_regid =
+ ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD);
+ const uint32_t fragcoord_zw_regid = (fragcoord_xy_regid != regid(63, 0))
+ ? (fragcoord_xy_regid + 2)
+ : fragcoord_xy_regid;
+ const uint32_t varyingcoord_regid =
+ ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PIXEL);
- tu6_emit_vpc_varying_modes(cs, fs, last_shader);
+ tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CONTROL_1_REG, 5);
+ tu_cs_emit(cs, 0x7);
+ tu_cs_emit(cs, A6XX_HLSQ_CONTROL_2_REG_FACEREGID(frontfacing_regid) |
+ A6XX_HLSQ_CONTROL_2_REG_SAMPLEID(sampleid_regid) |
+ A6XX_HLSQ_CONTROL_2_REG_SAMPLEMASK(samplemaskin_regid) |
+ A6XX_HLSQ_CONTROL_2_REG_SIZE(regid(63, 0)));
+ tu_cs_emit(cs,
+ A6XX_HLSQ_CONTROL_3_REG_BARY_IJ_PIXEL(varyingcoord_regid) |
+ A6XX_HLSQ_CONTROL_3_REG_BARY_IJ_CENTROID(regid(63, 0)) |
+ 0xfc00fc00);
+ tu_cs_emit(cs,
+ A6XX_HLSQ_CONTROL_4_REG_XYCOORDREGID(fragcoord_xy_regid) |
+ A6XX_HLSQ_CONTROL_4_REG_ZWCOORDREGID(fragcoord_zw_regid) |
+ A6XX_HLSQ_CONTROL_4_REG_BARY_IJ_PIXEL_PERSAMP(regid(63, 0)) |
+ 0x0000fc00);
+ tu_cs_emit(cs, 0xfc);
}
-void
+static void
tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs)
{
- uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid;
- uint32_t ij_regid[IJ_COUNT];
- uint32_t smask_in_regid;
-
- bool sample_shading = fs->per_samp | fs->key.sample_shading;
- bool enable_varyings = fs->total_in > 0;
-
- samp_id_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID);
- smask_in_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN);
- face_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE);
- coord_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD);
- zwcoord_regid = VALIDREG(coord_regid) ? coord_regid + 2 : regid(63, 0);
- for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++)
- ij_regid[i] = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i);
-
- if (fs->num_sampler_prefetch > 0) {
- assert(VALIDREG(ij_regid[IJ_PERSP_PIXEL]));
- /* also, it seems like ij_pix is *required* to be r0.x */
- assert(ij_regid[IJ_PERSP_PIXEL] == regid(0, 0));
- }
+ tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_UNKNOWN_B980, 1);
+ tu_cs_emit(cs, fs->total_in > 0 ? 3 : 1);
- tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch);
- tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) |
- A6XX_SP_FS_PREFETCH_CNTL_UNK4(regid(63, 0)) |
- 0x7000); // XXX);
- for (int i = 0; i < fs->num_sampler_prefetch; i++) {
- const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
- tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CMD_SRC(prefetch->src) |
- A6XX_SP_FS_PREFETCH_CMD_SAMP_ID(prefetch->samp_id) |
- A6XX_SP_FS_PREFETCH_CMD_TEX_ID(prefetch->tex_id) |
- A6XX_SP_FS_PREFETCH_CMD_DST(prefetch->dst) |
- A6XX_SP_FS_PREFETCH_CMD_WRMASK(prefetch->wrmask) |
- COND(prefetch->half_precision, A6XX_SP_FS_PREFETCH_CMD_HALF) |
- A6XX_SP_FS_PREFETCH_CMD_CMD(prefetch->cmd));
- }
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_UNKNOWN_A982, 1);
+ tu_cs_emit(cs, 0); /* XXX */
- if (fs->num_sampler_prefetch > 0) {
- tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_BINDLESS_PREFETCH_CMD(0), fs->num_sampler_prefetch);
- for (int i = 0; i < fs->num_sampler_prefetch; i++) {
- const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
- tu_cs_emit(cs,
- A6XX_SP_FS_BINDLESS_PREFETCH_CMD_SAMP_ID(prefetch->samp_bindless_id) |
- A6XX_SP_FS_BINDLESS_PREFETCH_CMD_TEX_ID(prefetch->tex_bindless_id));
- }
- }
+ tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_UPDATE_CNTL, 1);
+ tu_cs_emit(cs, 0xff); /* XXX */
- tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CONTROL_1_REG, 5);
- tu_cs_emit(cs, 0x7);
- tu_cs_emit(cs, A6XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) |
- A6XX_HLSQ_CONTROL_2_REG_SAMPLEID(samp_id_regid) |
- A6XX_HLSQ_CONTROL_2_REG_SAMPLEMASK(smask_in_regid) |
- A6XX_HLSQ_CONTROL_2_REG_CENTERRHW(ij_regid[IJ_PERSP_CENTER_RHW]));
- tu_cs_emit(cs, A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_PIXEL(ij_regid[IJ_PERSP_PIXEL]) |
- A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_PIXEL(ij_regid[IJ_LINEAR_PIXEL]) |
- A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_CENTROID(ij_regid[IJ_PERSP_CENTROID]) |
- A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_CENTROID(ij_regid[IJ_LINEAR_CENTROID]));
- tu_cs_emit(cs, A6XX_HLSQ_CONTROL_4_REG_XYCOORDREGID(coord_regid) |
- A6XX_HLSQ_CONTROL_4_REG_ZWCOORDREGID(zwcoord_regid) |
- A6XX_HLSQ_CONTROL_4_REG_IJ_PERSP_SAMPLE(ij_regid[IJ_PERSP_SAMPLE]) |
- A6XX_HLSQ_CONTROL_4_REG_IJ_LINEAR_SAMPLE(ij_regid[IJ_LINEAR_SAMPLE]));
- tu_cs_emit(cs, 0xfcfc);
-
- enum a6xx_threadsize thrsz = fs->info.double_threadsize ? THREAD128 : THREAD64;
- tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL_0, 1);
- tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(thrsz) |
- COND(enable_varyings, A6XX_HLSQ_FS_CNTL_0_VARYINGS));
-
- bool need_size = fs->frag_face || fs->fragcoord_compmask != 0;
- bool need_size_persamp = false;
- if (VALIDREG(ij_regid[IJ_PERSP_CENTER_RHW])) {
- if (sample_shading)
- need_size_persamp = true;
- else
- need_size = true;
+ uint32_t gras_cntl = 0;
+ if (fs->total_in > 0)
+ gras_cntl |= A6XX_GRAS_CNTL_VARYING;
+ if (fs->frag_coord) {
+ gras_cntl |= A6XX_GRAS_CNTL_SIZE | A6XX_GRAS_CNTL_XCOORD |
+ A6XX_GRAS_CNTL_YCOORD | A6XX_GRAS_CNTL_ZCOORD |
+ A6XX_GRAS_CNTL_WCOORD;
}
tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CNTL, 1);
- tu_cs_emit(cs,
- CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_GRAS_CNTL_IJ_PERSP_PIXEL) |
- CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_GRAS_CNTL_IJ_PERSP_CENTROID) |
- CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_GRAS_CNTL_IJ_PERSP_SAMPLE) |
- CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
- CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_GRAS_CNTL_IJ_LINEAR_CENTROID) |
- CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
- COND(need_size, A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
- COND(need_size_persamp, A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
- COND(fs->fragcoord_compmask != 0, A6XX_GRAS_CNTL_COORD_MASK(fs->fragcoord_compmask)));
+ tu_cs_emit(cs, gras_cntl);
+
+ uint32_t rb_render_control = 0;
+ if (fs->total_in > 0) {
+ rb_render_control =
+ A6XX_RB_RENDER_CONTROL0_VARYING | A6XX_RB_RENDER_CONTROL0_UNK10;
+ }
+ if (fs->frag_coord) {
+ rb_render_control |=
+ A6XX_RB_RENDER_CONTROL0_SIZE | A6XX_RB_RENDER_CONTROL0_XCOORD |
+ A6XX_RB_RENDER_CONTROL0_YCOORD | A6XX_RB_RENDER_CONTROL0_ZCOORD |
+ A6XX_RB_RENDER_CONTROL0_WCOORD;
+ }
tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_CONTROL0, 2);
- tu_cs_emit(cs,
- CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) |
- CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) |
- CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) |
- CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
- CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_CENTROID) |
- CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
- COND(need_size, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
- COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) |
- COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
- COND(fs->fragcoord_compmask != 0,
- A6XX_RB_RENDER_CONTROL0_COORD_MASK(fs->fragcoord_compmask)));
- tu_cs_emit(cs,
- A6XX_RB_RENDER_CONTROL1_FRAGCOORDSAMPLEMODE(
- sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER) |
- CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) |
- CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) |
- CONDREG(ij_regid[IJ_PERSP_CENTER_RHW], A6XX_RB_RENDER_CONTROL1_CENTERRHW) |
- COND(fs->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS));
-
- tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CNTL, 1);
- tu_cs_emit(cs, COND(sample_shading, A6XX_RB_SAMPLE_CNTL_PER_SAMP_MODE));
-
- tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 1);
- tu_cs_emit(cs, CONDREG(samp_id_regid, A6XX_GRAS_LRZ_PS_INPUT_CNTL_SAMPLEID) |
- A6XX_GRAS_LRZ_PS_INPUT_CNTL_FRAGCOORDSAMPLEMODE(
- sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER));
-
- tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CNTL, 1);
- tu_cs_emit(cs, COND(sample_shading, A6XX_GRAS_SAMPLE_CNTL_PER_SAMP_MODE));
+ tu_cs_emit(cs, rb_render_control);
+ tu_cs_emit(cs, (fs->frag_face ? A6XX_RB_RENDER_CONTROL1_FACENESS : 0));
}
static void
tu6_emit_fs_outputs(struct tu_cs *cs,
const struct ir3_shader_variant *fs,
- struct tu_pipeline *pipeline)
+ uint32_t mrt_count)
{
- uint32_t smask_regid, posz_regid, stencilref_regid;
-
- posz_regid = ir3_find_output_regid(fs, FRAG_RESULT_DEPTH);
- smask_regid = ir3_find_output_regid(fs, FRAG_RESULT_SAMPLE_MASK);
- stencilref_regid = ir3_find_output_regid(fs, FRAG_RESULT_STENCIL);
-
- int output_reg_count = 0;
+ const uint32_t fragdepth_regid =
+ ir3_find_output_regid(fs, FRAG_RESULT_DEPTH);
uint32_t fragdata_regid[8];
-
- assert(!fs->color0_mrt);
- for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++) {
- fragdata_regid[i] = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + i);
- if (VALIDREG(fragdata_regid[i]))
- output_reg_count = i + 1;
+ if (fs->color0_mrt) {
+ fragdata_regid[0] = ir3_find_output_regid(fs, FRAG_RESULT_COLOR);
+ for (uint32_t i = 1; i < ARRAY_SIZE(fragdata_regid); i++)
+ fragdata_regid[i] = fragdata_regid[0];
+ } else {
+ for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++)
+ fragdata_regid[i] = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + i);
}
- tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 1);
- tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) |
- A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) |
- A6XX_SP_FS_OUTPUT_CNTL0_STENCILREF_REGID(stencilref_regid) |
- COND(fs->dual_src_blend, A6XX_SP_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
-
-   /* There is no point in having a component enabled that is not written
-    * by the shader. Per the VK spec this is UB, however a few apps depend
-    * on the attachment not being changed if the FS doesn't have a
-    * corresponding output.
-    */
- uint32_t fs_render_components = 0;
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
+ tu_cs_emit(
+ cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(fragdepth_regid) | 0xfcfc0000);
+ tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
- tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), output_reg_count);
- for (uint32_t i = 0; i < output_reg_count; i++) {
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 8);
+ for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++) {
+ // TODO we could have a mix of half and full precision outputs,
+ // we really need to figure out half-precision from IR3_REG_HALF
tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(fragdata_regid[i]) |
- (COND(fragdata_regid[i] & HALF_REG_ID,
- A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION)));
-
- if (VALIDREG(fragdata_regid[i])) {
- fs_render_components |= 0xf << (i * 4);
- }
+ (false ? A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION : 0));
}
- tu_cs_emit_regs(cs,
- A6XX_SP_FS_RENDER_COMPONENTS(.dword = fs_render_components));
-
- tu_cs_emit_pkt4(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 1);
- tu_cs_emit(cs, COND(fs->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) |
- COND(fs->writes_smask, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_SAMPMASK) |
- COND(fs->writes_stencilref, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_STENCILREF) |
- COND(fs->dual_src_blend, A6XX_RB_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 2);
+ tu_cs_emit(cs, fs->writes_pos ? A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z : 0);
+ tu_cs_emit(cs, A6XX_RB_FS_OUTPUT_CNTL1_MRT(mrt_count));
- tu_cs_emit_regs(cs,
- A6XX_RB_RENDER_COMPONENTS(.dword = fs_render_components));
-
- if (pipeline) {
- pipeline->lrz.fs.has_kill = fs->has_kill;
- pipeline->lrz.fs.early_fragment_tests = fs->fs.early_fragment_tests;
-
- if (!fs->fs.early_fragment_tests &&
- (fs->no_earlyz || fs->writes_pos || fs->writes_stencilref || fs->writes_smask)) {
- pipeline->lrz.force_late_z = true;
- }
-
- pipeline->lrz.fs.force_early_z = fs->fs.early_fragment_tests;
+ uint32_t gras_su_depth_plane_cntl = 0;
+ uint32_t rb_depth_plane_cntl = 0;
+ if (fs->no_earlyz | fs->writes_pos) {
+ gras_su_depth_plane_cntl |= A6XX_GRAS_SU_DEPTH_PLANE_CNTL_FRAG_WRITES_Z;
+ rb_depth_plane_cntl |= A6XX_RB_DEPTH_PLANE_CNTL_FRAG_WRITES_Z;
}
-}
-static void
-tu6_emit_vs_params(struct tu_cs *cs,
- const struct ir3_const_state *const_state,
- unsigned constlen,
- unsigned param_stride,
- unsigned num_vertices)
-{
- uint32_t vs_params[4] = {
- param_stride * num_vertices * 4, /* vs primitive stride */
- param_stride * 4, /* vs vertex stride */
- 0,
- 0,
- };
- uint32_t vs_base = const_state->offsets.primitive_param;
- tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, vs_base, SB6_VS_SHADER, 0,
- ARRAY_SIZE(vs_params), vs_params);
+ tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_DEPTH_PLANE_CNTL, 1);
+ tu_cs_emit(cs, gras_su_depth_plane_cntl);
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_PLANE_CNTL, 1);
+ tu_cs_emit(cs, rb_depth_plane_cntl);
}
static void
-tu_get_tess_iova(struct tu_device *dev,
- uint64_t *tess_factor_iova,
- uint64_t *tess_param_iova)
-{
- /* Create the shared tess factor BO the first time tess is used on the device. */
- if (!dev->tess_bo) {
- mtx_lock(&dev->mutex);
- if (!dev->tess_bo)
- tu_bo_init_new(dev, &dev->tess_bo, TU_TESS_BO_SIZE, TU_BO_ALLOC_NO_FLAGS, "tess");
- mtx_unlock(&dev->mutex);
+tu6_emit_shader_object(struct tu_cs *cs,
+ gl_shader_stage stage,
+ const struct ir3_shader_variant *variant,
+ const struct tu_bo *binary_bo,
+ uint32_t binary_offset)
+{
+ uint16_t reg;
+ uint8_t opcode;
+ enum a6xx_state_block sb;
+ switch (stage) {
+ case MESA_SHADER_VERTEX:
+ reg = REG_A6XX_SP_VS_OBJ_START_LO;
+ opcode = CP_LOAD_STATE6_GEOM;
+ sb = SB6_VS_SHADER;
+ break;
+ case MESA_SHADER_TESS_CTRL:
+ reg = REG_A6XX_SP_HS_OBJ_START_LO;
+ opcode = CP_LOAD_STATE6_GEOM;
+ sb = SB6_HS_SHADER;
+ break;
+ case MESA_SHADER_TESS_EVAL:
+ reg = REG_A6XX_SP_DS_OBJ_START_LO;
+ opcode = CP_LOAD_STATE6_GEOM;
+ sb = SB6_DS_SHADER;
+ break;
+ case MESA_SHADER_GEOMETRY:
+ reg = REG_A6XX_SP_GS_OBJ_START_LO;
+ opcode = CP_LOAD_STATE6_GEOM;
+ sb = SB6_GS_SHADER;
+ break;
+ case MESA_SHADER_FRAGMENT:
+ reg = REG_A6XX_SP_FS_OBJ_START_LO;
+ opcode = CP_LOAD_STATE6_FRAG;
+ sb = SB6_FS_SHADER;
+ break;
+ case MESA_SHADER_COMPUTE:
+ reg = REG_A6XX_SP_CS_OBJ_START_LO;
+ opcode = CP_LOAD_STATE6_FRAG;
+ sb = SB6_CS_SHADER;
+ break;
+ default:
+ unreachable("invalid gl_shader_stage");
+ opcode = CP_LOAD_STATE6_GEOM;
+ sb = SB6_VS_SHADER;
+ break;
}
- *tess_factor_iova = dev->tess_bo->iova;
- *tess_param_iova = dev->tess_bo->iova + TU_TESS_FACTOR_SIZE;
-}
-
-void
-tu6_emit_patch_control_points(struct tu_cs *cs,
- const struct tu_pipeline *pipeline,
- unsigned patch_control_points)
-{
- if (!(pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT))
+ if (!variant->instrlen) {
+ tu_cs_emit_pkt4(cs, reg, 2);
+ tu_cs_emit_qw(cs, 0);
return;
-
- struct tu_device *dev = cs->device;
-
- tu6_emit_vs_params(cs,
- &pipeline->program.link[MESA_SHADER_VERTEX].const_state,
- pipeline->program.link[MESA_SHADER_VERTEX].constlen,
- pipeline->program.vs_param_stride,
- patch_control_points);
-
- uint64_t tess_factor_iova, tess_param_iova;
- tu_get_tess_iova(dev, &tess_factor_iova, &tess_param_iova);
-
- uint32_t hs_params[8] = {
- pipeline->program.vs_param_stride * patch_control_points * 4, /* hs primitive stride */
- pipeline->program.vs_param_stride * 4, /* hs vertex stride */
- pipeline->program.hs_param_stride,
- patch_control_points,
- tess_param_iova,
- tess_param_iova >> 32,
- tess_factor_iova,
- tess_factor_iova >> 32,
- };
-
- const struct ir3_const_state *hs_const =
- &pipeline->program.link[MESA_SHADER_TESS_CTRL].const_state;
- uint32_t hs_base = hs_const->offsets.primitive_param;
- tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, hs_base, SB6_HS_SHADER, 0,
- pipeline->program.hs_param_dwords, hs_params);
-
- uint32_t patch_local_mem_size_16b =
- patch_control_points * pipeline->program.vs_param_stride / 4;
-
- /* Total attribute slots in HS incoming patch. */
- tu_cs_emit_pkt4(cs, REG_A6XX_PC_HS_INPUT_SIZE, 1);
- tu_cs_emit(cs, patch_local_mem_size_16b);
-
- const uint32_t wavesize = 64;
- const uint32_t vs_hs_local_mem_size = 16384;
-
- uint32_t max_patches_per_wave;
- if (dev->physical_device->info->a6xx.tess_use_shared) {
- /* HS invocations for a patch are always within the same wave,
- * making barriers less expensive. VS can't have barriers so we
- * don't care about VS invocations being in the same wave.
- */
- max_patches_per_wave = wavesize / pipeline->program.hs_vertices_out;
- } else {
- /* VS is also in the same wave */
- max_patches_per_wave =
- wavesize / MAX2(patch_control_points,
- pipeline->program.hs_vertices_out);
}
- uint32_t patches_per_wave =
- MIN2(vs_hs_local_mem_size / (patch_local_mem_size_16b * 16),
- max_patches_per_wave);
-
- uint32_t wave_input_size = DIV_ROUND_UP(
- patches_per_wave * patch_local_mem_size_16b * 16, 256);
+ assert(variant->type == stage);
- tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
- tu_cs_emit(cs, wave_input_size);
-
- /* maximum number of patches that can fit in tess factor/param buffers */
- uint32_t subdraw_size = MIN2(TU_TESS_FACTOR_SIZE / ir3_tess_factor_stride(pipeline->tess.patch_type),
- TU_TESS_PARAM_SIZE / (pipeline->program.hs_param_stride * 4));
- /* convert from # of patches to draw count */
- subdraw_size *= patch_control_points;
-
- tu_cs_emit_pkt7(cs, CP_SET_SUBDRAW_SIZE, 1);
- tu_cs_emit(cs, subdraw_size);
-}
+ const uint64_t binary_iova = binary_bo->iova + binary_offset;
+ assert((binary_iova & 0x3) == 0);
-static void
-tu6_emit_geom_tess_consts(struct tu_cs *cs,
- const struct ir3_shader_variant *vs,
- const struct ir3_shader_variant *hs,
- const struct ir3_shader_variant *ds,
- const struct ir3_shader_variant *gs)
-{
- struct tu_device *dev = cs->device;
-
- if (gs && !hs) {
- tu6_emit_vs_params(cs, ir3_const_state(vs), vs->constlen,
- vs->output_size, gs->gs.vertices_in);
- }
+ tu_cs_emit_pkt4(cs, reg, 2);
+ tu_cs_emit_qw(cs, binary_iova);
- if (hs) {
- uint64_t tess_factor_iova, tess_param_iova;
- tu_get_tess_iova(dev, &tess_factor_iova, &tess_param_iova);
-
- uint32_t ds_params[8] = {
- gs ? ds->output_size * gs->gs.vertices_in * 4 : 0, /* ds primitive stride */
- ds->output_size * 4, /* ds vertex stride */
- hs->output_size, /* hs vertex stride (dwords) */
- hs->tess.tcs_vertices_out,
- tess_param_iova,
- tess_param_iova >> 32,
- tess_factor_iova,
- tess_factor_iova >> 32,
- };
-
- uint32_t ds_base = ds->const_state->offsets.primitive_param;
- uint32_t ds_param_dwords = MIN2((ds->constlen - ds_base) * 4, ARRAY_SIZE(ds_params));
- tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, ds_base, SB6_DS_SHADER, 0,
- ds_param_dwords, ds_params);
- }
+ /* always indirect */
+ const bool indirect = true;
+ if (indirect) {
+ tu_cs_emit_pkt7(cs, opcode, 3);
+ tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
+ CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
+ CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
+ CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
+ CP_LOAD_STATE6_0_NUM_UNIT(variant->instrlen));
+ tu_cs_emit_qw(cs, binary_iova);
+ } else {
+ const void *binary = binary_bo->map + binary_offset;
- if (gs) {
- const struct ir3_shader_variant *prev = ds ? ds : vs;
- uint32_t gs_params[4] = {
- prev->output_size * gs->gs.vertices_in * 4, /* gs primitive stride */
- prev->output_size * 4, /* gs vertex stride */
- 0,
- 0,
- };
- uint32_t gs_base = gs->const_state->offsets.primitive_param;
- tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, gs_base, SB6_GS_SHADER, 0,
- ARRAY_SIZE(gs_params), gs_params);
+ tu_cs_emit_pkt7(cs, opcode, 3 + variant->info.sizedwords);
+ tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
+ CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
+ CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+ CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
+ CP_LOAD_STATE6_0_NUM_UNIT(variant->instrlen));
+ tu_cs_emit_qw(cs, 0);
+ tu_cs_emit_array(cs, binary, variant->info.sizedwords);
}
}
static void
-tu6_emit_program_config(struct tu_cs *cs,
- struct tu_pipeline_builder *builder)
+tu6_emit_program(struct tu_cs *cs,
+ const struct tu_pipeline_builder *builder,
+ const struct tu_bo *binary_bo,
+ bool binning_pass)
{
- gl_shader_stage stage = MESA_SHADER_VERTEX;
-
- STATIC_ASSERT(MESA_SHADER_VERTEX == 0);
-
- bool shared_consts_enable = tu6_shared_constants_enable(&builder->layout,
- builder->device->compiler);
- tu6_emit_shared_consts_enable(cs, shared_consts_enable);
-
- tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
- .vs_state = true,
- .hs_state = true,
- .ds_state = true,
- .gs_state = true,
- .fs_state = true,
- .gfx_ibo = true,
- .gfx_shared_const = shared_consts_enable));
- for (; stage < ARRAY_SIZE(builder->shader_iova); stage++) {
- tu6_emit_xs_config(cs, stage, builder->variants[stage]);
- }
+ static const struct ir3_shader_variant dummy_variant = {
+ .type = MESA_SHADER_NONE
+ };
+ assert(builder->shaders[MESA_SHADER_VERTEX]);
+ const struct ir3_shader_variant *vs =
+ &builder->shaders[MESA_SHADER_VERTEX]->variants[0];
+ const struct ir3_shader_variant *hs =
+ builder->shaders[MESA_SHADER_TESS_CTRL]
+ ? &builder->shaders[MESA_SHADER_TESS_CTRL]->variants[0]
+ : &dummy_variant;
+ const struct ir3_shader_variant *ds =
+ builder->shaders[MESA_SHADER_TESS_EVAL]
+ ? &builder->shaders[MESA_SHADER_TESS_EVAL]->variants[0]
+ : &dummy_variant;
+ const struct ir3_shader_variant *gs =
+ builder->shaders[MESA_SHADER_GEOMETRY]
+ ? &builder->shaders[MESA_SHADER_GEOMETRY]->variants[0]
+ : &dummy_variant;
+ const struct ir3_shader_variant *fs =
+ builder->shaders[MESA_SHADER_FRAGMENT]
+ ? &builder->shaders[MESA_SHADER_FRAGMENT]->variants[0]
+ : &dummy_variant;
+
+ if (binning_pass) {
+ vs = &builder->shaders[MESA_SHADER_VERTEX]->variants[1];
+ fs = &dummy_variant;
+ }
+
+ tu6_emit_vs_config(cs, vs);
+ tu6_emit_hs_config(cs, hs);
+ tu6_emit_ds_config(cs, ds);
+ tu6_emit_gs_config(cs, gs);
+ tu6_emit_fs_config(cs, fs);
+
+ tu6_emit_vs_system_values(cs, vs);
+ tu6_emit_vpc(cs, vs, fs, binning_pass);
+ tu6_emit_vpc_varying_modes(cs, fs, binning_pass);
+ tu6_emit_fs_system_values(cs, fs);
+ tu6_emit_fs_inputs(cs, fs);
+ tu6_emit_fs_outputs(cs, fs, builder->color_attachment_count);
+
+ tu6_emit_shader_object(cs, MESA_SHADER_VERTEX, vs, binary_bo,
+ builder->shader_offsets[MESA_SHADER_VERTEX]);
+
+ tu6_emit_shader_object(cs, MESA_SHADER_FRAGMENT, fs, binary_bo,
+ builder->shader_offsets[MESA_SHADER_FRAGMENT]);
}
static void
-tu6_emit_program(struct tu_cs *cs,
- struct tu_pipeline_builder *builder,
- bool binning_pass,
- struct tu_pipeline *pipeline)
-{
- const struct ir3_shader_variant *vs = builder->variants[MESA_SHADER_VERTEX];
- const struct ir3_shader_variant *bs = builder->binning_variant;
- const struct ir3_shader_variant *hs = builder->variants[MESA_SHADER_TESS_CTRL];
- const struct ir3_shader_variant *ds = builder->variants[MESA_SHADER_TESS_EVAL];
- const struct ir3_shader_variant *gs = builder->variants[MESA_SHADER_GEOMETRY];
- const struct ir3_shader_variant *fs = builder->variants[MESA_SHADER_FRAGMENT];
- gl_shader_stage stage = MESA_SHADER_VERTEX;
- bool multi_pos_output = vs->multi_pos_output;
-
- /* Don't use the binning pass variant when GS is present because we don't
- * support compiling correct binning pass variants with GS.
- */
- if (binning_pass && !gs) {
- vs = bs;
- tu6_emit_xs(cs, stage, bs, &builder->pvtmem, builder->binning_vs_iova);
- tu6_emit_dynamic_offset(cs, bs, builder);
- stage++;
- }
-
- for (; stage < ARRAY_SIZE(builder->shader_iova); stage++) {
- const struct ir3_shader_variant *xs = builder->variants[stage];
-
- if (stage == MESA_SHADER_FRAGMENT && binning_pass)
- fs = xs = NULL;
+tu6_emit_vertex_input(struct tu_cs *cs,
+ const struct ir3_shader_variant *vs,
+ const VkPipelineVertexInputStateCreateInfo *vi_info,
+ uint8_t bindings[MAX_VERTEX_ATTRIBS],
+ uint16_t strides[MAX_VERTEX_ATTRIBS],
+ uint16_t offsets[MAX_VERTEX_ATTRIBS],
+ uint32_t *count)
+{
+ uint32_t vfd_decode_idx = 0;
+
+ /* why do we go beyond inputs_count? */
+ assert(vs->inputs_count + 1 <= MAX_VERTEX_ATTRIBS);
+ for (uint32_t i = 0; i <= vs->inputs_count; i++) {
+ if (vs->inputs[i].sysval || !vs->inputs[i].compmask)
+ continue;
- tu6_emit_xs(cs, stage, xs, &builder->pvtmem, builder->shader_iova[stage]);
- tu6_emit_dynamic_offset(cs, xs, builder);
- }
+ const VkVertexInputAttributeDescription *vi_attr =
+ tu_find_vertex_input_attribute(vi_info, vs->inputs[i].slot);
+ const VkVertexInputBindingDescription *vi_binding =
+ tu_find_vertex_input_binding(vi_info, vi_attr);
+ assert(vi_attr && vi_binding);
- uint32_t multiview_views = util_logbase2(pipeline->rast.multiview_mask) + 1;
- uint32_t multiview_cntl = pipeline->rast.multiview_mask ?
- A6XX_PC_MULTIVIEW_CNTL_ENABLE |
- A6XX_PC_MULTIVIEW_CNTL_VIEWS(multiview_views) |
- COND(!multi_pos_output, A6XX_PC_MULTIVIEW_CNTL_DISABLEMULTIPOS)
- : 0;
+ const struct tu_native_format *format =
+ tu6_get_native_format(vi_attr->format);
+ assert(format && format->vtx >= 0);
- /* Copy what the blob does here. This will emit an extra 0x3f
- * CP_EVENT_WRITE when multiview is disabled. I'm not exactly sure what
- * this is working around yet.
- */
- if (builder->device->physical_device->info->a6xx.has_cp_reg_write) {
- tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
- tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE));
- tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL);
- } else {
- tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_CNTL, 1);
- }
- tu_cs_emit(cs, multiview_cntl);
+ uint32_t vfd_decode = A6XX_VFD_DECODE_INSTR_IDX(vfd_decode_idx) |
+ A6XX_VFD_DECODE_INSTR_FORMAT(format->vtx) |
+ A6XX_VFD_DECODE_INSTR_SWAP(format->swap) |
+ A6XX_VFD_DECODE_INSTR_UNK30;
+ if (vi_binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE)
+ vfd_decode |= A6XX_VFD_DECODE_INSTR_INSTANCED;
+ if (!vk_format_is_int(vi_attr->format))
+ vfd_decode |= A6XX_VFD_DECODE_INSTR_FLOAT;
- tu_cs_emit_pkt4(cs, REG_A6XX_VFD_MULTIVIEW_CNTL, 1);
- tu_cs_emit(cs, multiview_cntl);
+ const uint32_t vfd_decode_step_rate = 1;
- if (multiview_cntl &&
- builder->device->physical_device->info->a6xx.supports_multiview_mask) {
- tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_MASK, 1);
- tu_cs_emit(cs, pipeline->rast.multiview_mask);
- }
+ const uint32_t vfd_dest_cntl =
+ A6XX_VFD_DEST_CNTL_INSTR_WRITEMASK(vs->inputs[i].compmask) |
+ A6XX_VFD_DEST_CNTL_INSTR_REGID(vs->inputs[i].regid);
- tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
- tu_cs_emit(cs, 0);
+ tu_cs_emit_pkt4(cs, REG_A6XX_VFD_DECODE(vfd_decode_idx), 2);
+ tu_cs_emit(cs, vfd_decode);
+ tu_cs_emit(cs, vfd_decode_step_rate);
- tu6_emit_vfd_dest(cs, vs);
+ tu_cs_emit_pkt4(cs, REG_A6XX_VFD_DEST_CNTL(vfd_decode_idx), 1);
+ tu_cs_emit(cs, vfd_dest_cntl);
- tu6_emit_vpc(cs, vs, hs, ds, gs, fs);
+ bindings[vfd_decode_idx] = vi_binding->binding;
+ strides[vfd_decode_idx] = vi_binding->stride;
+ offsets[vfd_decode_idx] = vi_attr->offset;
- if (fs) {
- tu6_emit_fs_inputs(cs, fs);
- tu6_emit_fs_outputs(cs, fs, pipeline);
- } else {
- /* TODO: check if these can be skipped if fs is disabled */
- struct ir3_shader_variant dummy_variant = {};
- tu6_emit_fs_inputs(cs, &dummy_variant);
- tu6_emit_fs_outputs(cs, &dummy_variant, NULL);
+ vfd_decode_idx++;
}
- if (gs || hs) {
- tu6_emit_geom_tess_consts(cs, vs, hs, ds, gs);
- }
+ tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_0, 1);
+ tu_cs_emit(
+ cs, A6XX_VFD_CONTROL_0_VTXCNT(vfd_decode_idx) | (vfd_decode_idx << 8));
+
+ *count = vfd_decode_idx;
}
-void
-tu6_emit_vertex_input(struct tu_cs *cs,
- uint32_t binding_count,
- const VkVertexInputBindingDescription2EXT *bindings,
- uint32_t unsorted_attr_count,
- const VkVertexInputAttributeDescription2EXT *unsorted_attrs)
+static uint32_t
+tu6_guardband_adj(uint32_t v)
{
- uint32_t binding_instanced = 0; /* bitmask of instanced bindings */
- uint32_t step_rate[MAX_VBS];
-
- for (uint32_t i = 0; i < binding_count; i++) {
- const VkVertexInputBindingDescription2EXT *binding = &bindings[i];
-
- if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE)
- binding_instanced |= 1u << binding->binding;
-
- step_rate[binding->binding] = binding->divisor;
- }
-
- const VkVertexInputAttributeDescription2EXT *attrs[MAX_VERTEX_ATTRIBS] = { };
- unsigned attr_count = 0;
- for (uint32_t i = 0; i < unsorted_attr_count; i++) {
- const VkVertexInputAttributeDescription2EXT *attr = &unsorted_attrs[i];
- attrs[attr->location] = attr;
- attr_count = MAX2(attr_count, attr->location + 1);
- }
-
- if (attr_count != 0)
- tu_cs_emit_pkt4(cs, REG_A6XX_VFD_DECODE_INSTR(0), attr_count * 2);
-
- for (uint32_t loc = 0; loc < attr_count; loc++) {
- const VkVertexInputAttributeDescription2EXT *attr = attrs[loc];
-
- if (attr) {
- const struct tu_native_format format = tu6_format_vtx(attr->format);
- tu_cs_emit(cs, A6XX_VFD_DECODE_INSTR(0,
- .idx = attr->binding,
- .offset = attr->offset,
- .instanced = binding_instanced & (1 << attr->binding),
- .format = format.fmt,
- .swap = format.swap,
- .unk30 = 1,
- ._float = !vk_format_is_int(attr->format)).value);
- tu_cs_emit(cs, A6XX_VFD_DECODE_STEP_RATE(0, step_rate[attr->binding]).value);
- } else {
- tu_cs_emit(cs, 0);
- tu_cs_emit(cs, 0);
- }
- }
+ if (v > 256)
+ return (uint32_t)(511.0 - 65.0 * (log2(v) - 8.0));
+ else
+ return 511;
}
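/* Illustrative values for the guardband adjustment above: extents up to 256
 * pixels keep the maximum 511 guardband, while larger extents shrink it
 * logarithmically, e.g. tu6_guardband_adj(1024) == 511 - 65 * (10 - 8) == 381
 * and tu6_guardband_adj(4096) == 511 - 65 * (12 - 8) == 251.
 */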
void
-tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewports, uint32_t num_viewport,
- bool z_negative_one_to_one)
-{
- VkExtent2D guardband = {511, 511};
-
- tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_VPORT_XOFFSET(0), num_viewport * 6);
- for (uint32_t i = 0; i < num_viewport; i++) {
- const VkViewport *viewport = &viewports[i];
- float offsets[3];
- float scales[3];
- scales[0] = viewport->width / 2.0f;
- scales[1] = viewport->height / 2.0f;
- if (z_negative_one_to_one) {
- scales[2] = 0.5 * (viewport->maxDepth - viewport->minDepth);
- } else {
- scales[2] = viewport->maxDepth - viewport->minDepth;
- }
-
- offsets[0] = viewport->x + scales[0];
- offsets[1] = viewport->y + scales[1];
- if (z_negative_one_to_one) {
- offsets[2] = 0.5 * (viewport->minDepth + viewport->maxDepth);
- } else {
- offsets[2] = viewport->minDepth;
- }
-
- for (uint32_t j = 0; j < 3; j++) {
- tu_cs_emit(cs, fui(offsets[j]));
- tu_cs_emit(cs, fui(scales[j]));
- }
-
- guardband.width =
- MIN2(guardband.width, fd_calc_guardband(offsets[0], scales[0], false));
- guardband.height =
- MIN2(guardband.height, fd_calc_guardband(offsets[1], scales[1], false));
- }
-
- tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0), num_viewport * 2);
- for (uint32_t i = 0; i < num_viewport; i++) {
- const VkViewport *viewport = &viewports[i];
- VkOffset2D min;
- VkOffset2D max;
- min.x = (int32_t) viewport->x;
- max.x = (int32_t) ceilf(viewport->x + viewport->width);
- if (viewport->height >= 0.0f) {
- min.y = (int32_t) viewport->y;
- max.y = (int32_t) ceilf(viewport->y + viewport->height);
- } else {
- min.y = (int32_t)(viewport->y + viewport->height);
- max.y = (int32_t) ceilf(viewport->y);
- }
- /* the spec allows viewport->height to be 0.0f */
- if (min.y == max.y)
- max.y++;
-      /* allow viewport->width = 0.0f for uninitialized viewports: */
- if (min.x == max.x)
- max.x++;
-
- min.x = MAX2(min.x, 0);
- min.y = MAX2(min.y, 0);
- max.x = MAX2(max.x, 1);
- max.y = MAX2(max.y, 1);
-
- assert(min.x < max.x);
- assert(min.y < max.y);
-
- tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_X(min.x) |
- A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_Y(min.y));
- tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_X(max.x - 1) |
- A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_Y(max.y - 1));
- }
+tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewport)
+{
+ float offsets[3];
+ float scales[3];
+ scales[0] = viewport->width / 2.0f;
+ scales[1] = viewport->height / 2.0f;
+ scales[2] = viewport->maxDepth - viewport->minDepth;
+ offsets[0] = viewport->x + scales[0];
+ offsets[1] = viewport->y + scales[1];
+ offsets[2] = viewport->minDepth;
+
+ VkOffset2D min;
+ VkOffset2D max;
+ min.x = (int32_t) viewport->x;
+ max.x = (int32_t) ceilf(viewport->x + viewport->width);
+ if (viewport->height >= 0.0f) {
+ min.y = (int32_t) viewport->y;
+ max.y = (int32_t) ceilf(viewport->y + viewport->height);
+ } else {
+ min.y = (int32_t)(viewport->y + viewport->height);
+ max.y = (int32_t) ceilf(viewport->y);
+ }
+ /* the spec allows viewport->height to be 0.0f */
+ if (min.y == max.y)
+ max.y++;
+ assert(min.x >= 0 && min.x < max.x);
+ assert(min.y >= 0 && min.y < max.y);
+
+ VkExtent2D guardband_adj;
+ guardband_adj.width = tu6_guardband_adj(max.x - min.x);
+ guardband_adj.height = tu6_guardband_adj(max.y - min.y);
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_VPORT_XOFFSET_0, 6);
+ tu_cs_emit(cs, A6XX_GRAS_CL_VPORT_XOFFSET_0(offsets[0]));
+ tu_cs_emit(cs, A6XX_GRAS_CL_VPORT_XSCALE_0(scales[0]));
+ tu_cs_emit(cs, A6XX_GRAS_CL_VPORT_YOFFSET_0(offsets[1]));
+ tu_cs_emit(cs, A6XX_GRAS_CL_VPORT_YSCALE_0(scales[1]));
+ tu_cs_emit(cs, A6XX_GRAS_CL_VPORT_ZOFFSET_0(offsets[2]));
+ tu_cs_emit(cs, A6XX_GRAS_CL_VPORT_ZSCALE_0(scales[2]));
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0, 2);
+ tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0_X(min.x) |
+ A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0_Y(min.y));
+ tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0_X(max.x - 1) |
+ A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0_Y(max.y - 1));
- tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_Z_CLAMP(0), num_viewport * 2);
- for (uint32_t i = 0; i < num_viewport; i++) {
- const VkViewport *viewport = &viewports[i];
- tu_cs_emit(cs, fui(MIN2(viewport->minDepth, viewport->maxDepth)));
- tu_cs_emit(cs, fui(MAX2(viewport->minDepth, viewport->maxDepth)));
- }
tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ, 1);
- tu_cs_emit(cs, A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_HORZ(guardband.width) |
- A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_VERT(guardband.height));
-
-   /* TODO: what to do about this and multiple viewports? */
- float z_clamp_min = num_viewport ? MIN2(viewports[0].minDepth, viewports[0].maxDepth) : 0;
- float z_clamp_max = num_viewport ? MAX2(viewports[0].minDepth, viewports[0].maxDepth) : 0;
-
- tu_cs_emit_regs(cs,
- A6XX_RB_Z_CLAMP_MIN(z_clamp_min),
- A6XX_RB_Z_CLAMP_MAX(z_clamp_max));
+ tu_cs_emit(cs,
+ A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_HORZ(guardband_adj.width) |
+ A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_VERT(guardband_adj.height));
}
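/* Example of the viewport transform above (illustrative values): a 1920x1080
 * viewport at (0, 0) with minDepth 0.0 and maxDepth 1.0 gives XSCALE/XOFFSET
 * == 960/960, YSCALE/YOFFSET == 540/540 and ZSCALE/ZOFFSET == 1.0/0.0, so
 * NDC x in [-1, 1] maps to window x in [0, 1920].
 */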
void
-tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scissors, uint32_t scissor_count)
+tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scissor)
{
- tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0), scissor_count * 2);
-
- for (uint32_t i = 0; i < scissor_count; i++) {
- const VkRect2D *scissor = &scissors[i];
-
- uint32_t min_x = scissor->offset.x;
- uint32_t min_y = scissor->offset.y;
- uint32_t max_x = min_x + scissor->extent.width - 1;
- uint32_t max_y = min_y + scissor->extent.height - 1;
-
- if (!scissor->extent.width || !scissor->extent.height) {
- min_x = min_y = 1;
- max_x = max_y = 0;
- } else {
- /* avoid overflow */
- uint32_t scissor_max = BITFIELD_MASK(15);
- min_x = MIN2(scissor_max, min_x);
- min_y = MIN2(scissor_max, min_y);
- max_x = MIN2(scissor_max, max_x);
- max_y = MIN2(scissor_max, max_y);
- }
+ const VkOffset2D min = scissor->offset;
+ const VkOffset2D max = {
+ scissor->offset.x + scissor->extent.width,
+ scissor->offset.y + scissor->extent.height,
+ };
- tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_TL_X(min_x) |
- A6XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(min_y));
- tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_BR_X(max_x) |
- A6XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(max_y));
- }
+ tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0, 2);
+ tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0_X(min.x) |
+ A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0_Y(min.y));
+ tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0_X(max.x - 1) |
+ A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0_Y(max.y - 1));
}
-void
-tu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT *samp_loc)
+static void
+tu6_emit_gras_unknowns(struct tu_cs *cs)
{
- if (!samp_loc) {
- tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CONFIG, 1);
- tu_cs_emit(cs, 0);
-
- tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CONFIG, 1);
- tu_cs_emit(cs, 0);
-
- tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_CONFIG, 1);
- tu_cs_emit(cs, 0);
- return;
- }
-
- assert(samp_loc->sampleLocationsPerPixel == samp_loc->sampleLocationsCount);
- assert(samp_loc->sampleLocationGridSize.width == 1);
- assert(samp_loc->sampleLocationGridSize.height == 1);
-
- uint32_t sample_config =
- A6XX_RB_SAMPLE_CONFIG_LOCATION_ENABLE;
- uint32_t sample_locations = 0;
- for (uint32_t i = 0; i < samp_loc->sampleLocationsCount; i++) {
- sample_locations |=
- (A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_X(samp_loc->pSampleLocations[i].x) |
- A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_Y(samp_loc->pSampleLocations[i].y)) << i*8;
- }
-
- tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CONFIG, 2);
- tu_cs_emit(cs, sample_config);
- tu_cs_emit(cs, sample_locations);
-
- tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CONFIG, 2);
- tu_cs_emit(cs, sample_config);
- tu_cs_emit(cs, sample_locations);
+ tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_UNKNOWN_8000, 1);
+ tu_cs_emit(cs, 0x80);
+ tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_UNKNOWN_8001, 1);
+ tu_cs_emit(cs, 0x0);
+ tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_UNKNOWN_8004, 1);
+ tu_cs_emit(cs, 0x0);
+}
- tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_CONFIG, 2);
- tu_cs_emit(cs, sample_config);
- tu_cs_emit(cs, sample_locations);
+static void
+tu6_emit_point_size(struct tu_cs *cs)
+{
+ tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_POINT_MINMAX, 2);
+ tu_cs_emit(cs, A6XX_GRAS_SU_POINT_MINMAX_MIN(1.0f / 16.0f) |
+ A6XX_GRAS_SU_POINT_MINMAX_MAX(4092.0f));
+ tu_cs_emit(cs, A6XX_GRAS_SU_POINT_SIZE(1.0f));
}
static uint32_t
tu6_gras_su_cntl(const VkPipelineRasterizationStateCreateInfo *rast_info,
- enum a5xx_line_mode line_mode,
- bool multiview)
+ VkSampleCountFlagBits samples)
{
uint32_t gras_su_cntl = 0;
@@ -2194,33 +1114,117 @@ tu6_gras_su_cntl(const VkPipelineRasterizationStateCreateInfo *rast_info,
if (rast_info->frontFace == VK_FRONT_FACE_CLOCKWISE)
gras_su_cntl |= A6XX_GRAS_SU_CNTL_FRONT_CW;
- gras_su_cntl |=
- A6XX_GRAS_SU_CNTL_LINEHALFWIDTH(rast_info->lineWidth / 2.0f);
+ /* don't set A6XX_GRAS_SU_CNTL_LINEHALFWIDTH */
if (rast_info->depthBiasEnable)
gras_su_cntl |= A6XX_GRAS_SU_CNTL_POLY_OFFSET;
- gras_su_cntl |= A6XX_GRAS_SU_CNTL_LINE_MODE(line_mode);
-
- if (multiview) {
- gras_su_cntl |=
- A6XX_GRAS_SU_CNTL_UNK17 |
- A6XX_GRAS_SU_CNTL_MULTIVIEW_ENABLE;
- }
+ if (samples > VK_SAMPLE_COUNT_1_BIT)
+ gras_su_cntl |= A6XX_GRAS_SU_CNTL_MSAA_ENABLE;
return gras_su_cntl;
}
void
+tu6_emit_gras_su_cntl(struct tu_cs *cs,
+ uint32_t gras_su_cntl,
+ float line_width)
+{
+ assert((gras_su_cntl & A6XX_GRAS_SU_CNTL_LINEHALFWIDTH__MASK) == 0);
+ gras_su_cntl |= A6XX_GRAS_SU_CNTL_LINEHALFWIDTH(line_width / 2.0f);
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_CNTL, 1);
+ tu_cs_emit(cs, gras_su_cntl);
+}
+
+void
tu6_emit_depth_bias(struct tu_cs *cs,
float constant_factor,
float clamp,
float slope_factor)
{
tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_POLY_OFFSET_SCALE, 3);
- tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_SCALE(slope_factor).value);
- tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET(constant_factor).value);
- tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET_CLAMP(clamp).value);
+ tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_SCALE(slope_factor));
+ tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET(constant_factor));
+ tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET_CLAMP(clamp));
+}
+
+static void
+tu6_emit_alpha_control_disable(struct tu_cs *cs)
+{
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_ALPHA_CONTROL, 1);
+ tu_cs_emit(cs, 0);
+}
+
+static void
+tu6_emit_depth_control(struct tu_cs *cs,
+ const VkPipelineDepthStencilStateCreateInfo *ds_info)
+{
+ assert(!ds_info->depthBoundsTestEnable);
+
+ uint32_t rb_depth_cntl = 0;
+ if (ds_info->depthTestEnable) {
+ rb_depth_cntl |=
+ A6XX_RB_DEPTH_CNTL_Z_ENABLE |
+ A6XX_RB_DEPTH_CNTL_ZFUNC(tu6_compare_func(ds_info->depthCompareOp)) |
+ A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE;
+
+ if (ds_info->depthWriteEnable)
+ rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
+ }
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_CNTL, 1);
+ tu_cs_emit(cs, rb_depth_cntl);
+}
+
+static void
+tu6_emit_stencil_control(struct tu_cs *cs,
+ const VkPipelineDepthStencilStateCreateInfo *ds_info)
+{
+ uint32_t rb_stencil_control = 0;
+ if (ds_info->stencilTestEnable) {
+ const VkStencilOpState *front = &ds_info->front;
+ const VkStencilOpState *back = &ds_info->back;
+ rb_stencil_control |=
+ A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE |
+ A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
+ A6XX_RB_STENCIL_CONTROL_STENCIL_READ |
+ A6XX_RB_STENCIL_CONTROL_FUNC(tu6_compare_func(front->compareOp)) |
+ A6XX_RB_STENCIL_CONTROL_FAIL(tu6_stencil_op(front->failOp)) |
+ A6XX_RB_STENCIL_CONTROL_ZPASS(tu6_stencil_op(front->passOp)) |
+ A6XX_RB_STENCIL_CONTROL_ZFAIL(tu6_stencil_op(front->depthFailOp)) |
+ A6XX_RB_STENCIL_CONTROL_FUNC_BF(tu6_compare_func(back->compareOp)) |
+ A6XX_RB_STENCIL_CONTROL_FAIL_BF(tu6_stencil_op(back->failOp)) |
+ A6XX_RB_STENCIL_CONTROL_ZPASS_BF(tu6_stencil_op(back->passOp)) |
+ A6XX_RB_STENCIL_CONTROL_ZFAIL_BF(tu6_stencil_op(back->depthFailOp));
+ }
+
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_STENCIL_CONTROL, 1);
+ tu_cs_emit(cs, rb_stencil_control);
+}
+
+void
+tu6_emit_stencil_compare_mask(struct tu_cs *cs, uint32_t front, uint32_t back)
+{
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_STENCILMASK, 1);
+ tu_cs_emit(
+ cs, A6XX_RB_STENCILMASK_MASK(front) | A6XX_RB_STENCILMASK_BFMASK(back));
+}
+
+void
+tu6_emit_stencil_write_mask(struct tu_cs *cs, uint32_t front, uint32_t back)
+{
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_STENCILWRMASK, 1);
+ tu_cs_emit(cs, A6XX_RB_STENCILWRMASK_WRMASK(front) |
+ A6XX_RB_STENCILWRMASK_BFWRMASK(back));
+}
+
+void
+tu6_emit_stencil_reference(struct tu_cs *cs, uint32_t front, uint32_t back)
+{
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_STENCILREF, 1);
+ tu_cs_emit(cs,
+ A6XX_RB_STENCILREF_REF(front) | A6XX_RB_STENCILREF_BFREF(back));
}
static uint32_t
@@ -2251,11 +1255,18 @@ tu6_rb_mrt_blend_control(const VkPipelineColorBlendAttachmentState *att,
static uint32_t
tu6_rb_mrt_control(const VkPipelineColorBlendAttachmentState *att,
uint32_t rb_mrt_control_rop,
+ bool is_int,
bool has_alpha)
{
uint32_t rb_mrt_control =
A6XX_RB_MRT_CONTROL_COMPONENT_ENABLE(att->colorWriteMask);
+ /* ignore blending and logic op for integer attachments */
+ if (is_int) {
+ rb_mrt_control |= A6XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY);
+ return rb_mrt_control;
+ }
+
rb_mrt_control |= rb_mrt_control_rop;
if (att->blendEnable) {
@@ -2268,44 +1279,23 @@ tu6_rb_mrt_control(const VkPipelineColorBlendAttachmentState *att,
return rb_mrt_control;
}
-uint32_t
-tu6_rb_mrt_control_rop(VkLogicOp op, bool *rop_reads_dst)
-{
- *rop_reads_dst = tu_logic_op_reads_dst(op);
- return A6XX_RB_MRT_CONTROL_ROP_ENABLE |
- A6XX_RB_MRT_CONTROL_ROP_CODE(tu6_rop(op));
-}
-
static void
-tu6_emit_rb_mrt_controls(struct tu_pipeline *pipeline,
+tu6_emit_rb_mrt_controls(struct tu_cs *cs,
const VkPipelineColorBlendStateCreateInfo *blend_info,
const VkFormat attachment_formats[MAX_RTS],
- bool *rop_reads_dst,
- uint32_t *color_bandwidth_per_sample)
+ uint32_t *blend_enable_mask)
{
- const VkPipelineColorWriteCreateInfoEXT *color_info =
- vk_find_struct_const(blend_info->pNext,
- PIPELINE_COLOR_WRITE_CREATE_INFO_EXT);
-
- /* The static state is ignored if it's dynamic. In that case assume
- * everything is enabled and then the appropriate registers will be zero'd
- * dynamically.
- */
- if (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_COLOR_WRITE_ENABLE))
- color_info = NULL;
-
- *rop_reads_dst = false;
- *color_bandwidth_per_sample = 0;
+ *blend_enable_mask = 0;
+ bool rop_reads_dst = false;
uint32_t rb_mrt_control_rop = 0;
if (blend_info->logicOpEnable) {
- pipeline->blend.logic_op_enabled = true;
- rb_mrt_control_rop = tu6_rb_mrt_control_rop(blend_info->logicOp,
- rop_reads_dst);
+ rop_reads_dst = tu_logic_op_reads_dst(blend_info->logicOp);
+ rb_mrt_control_rop =
+ A6XX_RB_MRT_CONTROL_ROP_ENABLE |
+ A6XX_RB_MRT_CONTROL_ROP_CODE(tu6_rop(blend_info->logicOp));
}
- uint32_t total_bpp = 0;
- pipeline->blend.num_rts = blend_info->attachmentCount;
for (uint32_t i = 0; i < blend_info->attachmentCount; i++) {
const VkPipelineColorBlendAttachmentState *att =
&blend_info->pAttachments[i];
@@ -2313,1273 +1303,179 @@ tu6_emit_rb_mrt_controls(struct tu_pipeline *pipeline,
uint32_t rb_mrt_control = 0;
uint32_t rb_mrt_blend_control = 0;
- if (format != VK_FORMAT_UNDEFINED &&
- (!color_info || color_info->pColorWriteEnables[i])) {
+ if (format != VK_FORMAT_UNDEFINED) {
+ const bool is_int = vk_format_is_int(format);
const bool has_alpha = vk_format_has_alpha(format);
rb_mrt_control =
- tu6_rb_mrt_control(att, rb_mrt_control_rop, has_alpha);
+ tu6_rb_mrt_control(att, rb_mrt_control_rop, is_int, has_alpha);
rb_mrt_blend_control = tu6_rb_mrt_blend_control(att, has_alpha);
- /* calculate bpp based on format and write mask */
- uint32_t write_bpp = 0;
- if (att->colorWriteMask == 0xf) {
- write_bpp = vk_format_get_blocksizebits(format);
- } else {
- const enum pipe_format pipe_format = vk_format_to_pipe_format(format);
- for (uint32_t i = 0; i < 4; i++) {
- if (att->colorWriteMask & (1 << i)) {
- write_bpp += util_format_get_component_bits(pipe_format,
- UTIL_FORMAT_COLORSPACE_RGB, i);
- }
- }
- }
- total_bpp += write_bpp;
-
- pipeline->blend.color_write_enable |= BIT(i);
- if (att->blendEnable)
- pipeline->blend.blend_enable |= BIT(i);
-
- if (att->blendEnable || *rop_reads_dst) {
- total_bpp += write_bpp;
- }
+ if (att->blendEnable || rop_reads_dst)
+ *blend_enable_mask |= 1 << i;
}
- pipeline->blend.rb_mrt_control[i] = rb_mrt_control & pipeline->blend.rb_mrt_control_mask;
- pipeline->blend.rb_mrt_blend_control[i] = rb_mrt_blend_control;
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_CONTROL(i), 2);
+ tu_cs_emit(cs, rb_mrt_control);
+ tu_cs_emit(cs, rb_mrt_blend_control);
}
- *color_bandwidth_per_sample = total_bpp / 8;
+ for (uint32_t i = blend_info->attachmentCount; i < MAX_RTS; i++) {
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_CONTROL(i), 2);
+ tu_cs_emit(cs, 0);
+ tu_cs_emit(cs, 0);
+ }
}
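/* A minimal illustrative sketch, not part of the imported diff: how the
 * blend_enable_mask computed above is derived.  An attachment contributes a
 * bit when it has a defined format and either enables blending or the
 * pipeline logic op reads the destination.  The helper name and the
 * boolean-array parameters are assumptions made for illustration only.
 */
#include <stdbool.h>
#include <stdint.h>

static uint32_t
sketch_blend_enable_mask(const bool *has_format, const bool *blend_enable,
                         uint32_t attachment_count, bool rop_reads_dst)
{
   uint32_t mask = 0;
   for (uint32_t i = 0; i < attachment_count; i++) {
      /* mirrors: if (att->blendEnable || rop_reads_dst) *mask |= 1 << i */
      if (has_format[i] && (blend_enable[i] || rop_reads_dst))
         mask |= 1u << i;
   }
   return mask;
}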
static void
-tu6_emit_blend_control(struct tu_pipeline *pipeline,
+tu6_emit_blend_control(struct tu_cs *cs,
uint32_t blend_enable_mask,
- bool dual_src_blend,
const VkPipelineMultisampleStateCreateInfo *msaa_info)
{
- const uint32_t sample_mask =
- msaa_info->pSampleMask ? (*msaa_info->pSampleMask & 0xffff)
- : ((1 << msaa_info->rasterizationSamples) - 1);
+ assert(!msaa_info->sampleShadingEnable);
+ assert(!msaa_info->alphaToOneEnable);
+ uint32_t sp_blend_cntl = A6XX_SP_BLEND_CNTL_UNK8;
+ if (blend_enable_mask)
+ sp_blend_cntl |= A6XX_SP_BLEND_CNTL_ENABLED;
+ if (msaa_info->alphaToCoverageEnable)
+ sp_blend_cntl |= A6XX_SP_BLEND_CNTL_ALPHA_TO_COVERAGE;
- pipeline->blend.sp_blend_cntl =
- A6XX_SP_BLEND_CNTL(.enable_blend = blend_enable_mask,
- .dual_color_in_enable = dual_src_blend,
- .alpha_to_coverage = msaa_info->alphaToCoverageEnable,
- .unk8 = true).value & pipeline->blend.sp_blend_cntl_mask;
+ const uint32_t sample_mask =
+ msaa_info->pSampleMask ? *msaa_info->pSampleMask
+ : ((1 << msaa_info->rasterizationSamples) - 1);
/* set A6XX_RB_BLEND_CNTL_INDEPENDENT_BLEND only when enabled? */
- pipeline->blend.rb_blend_cntl =
- A6XX_RB_BLEND_CNTL(.enable_blend = blend_enable_mask,
- .independent_blend = true,
- .sample_mask = sample_mask,
- .dual_color_in_enable = dual_src_blend,
- .alpha_to_coverage = msaa_info->alphaToCoverageEnable,
- .alpha_to_one = msaa_info->alphaToOneEnable).value &
- pipeline->blend.rb_blend_cntl_mask;
-}
+ uint32_t rb_blend_cntl =
+ A6XX_RB_BLEND_CNTL_ENABLE_BLEND(blend_enable_mask) |
+ A6XX_RB_BLEND_CNTL_INDEPENDENT_BLEND |
+ A6XX_RB_BLEND_CNTL_SAMPLE_MASK(sample_mask);
+ if (msaa_info->alphaToCoverageEnable)
+ rb_blend_cntl |= A6XX_RB_BLEND_CNTL_ALPHA_TO_COVERAGE;
-static void
-tu6_emit_blend(struct tu_cs *cs,
- struct tu_pipeline *pipeline)
-{
- tu_cs_emit_regs(cs, A6XX_SP_FS_OUTPUT_CNTL1(.mrt = pipeline->blend.num_rts));
- tu_cs_emit_regs(cs, A6XX_RB_FS_OUTPUT_CNTL1(.mrt = pipeline->blend.num_rts));
- tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL(.dword = pipeline->blend.sp_blend_cntl));
- tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.dword = pipeline->blend.rb_blend_cntl));
-
- for (unsigned i = 0; i < pipeline->blend.num_rts; i++) {
- tu_cs_emit_regs(cs,
- A6XX_RB_MRT_CONTROL(i, .dword = pipeline->blend.rb_mrt_control[i]),
- A6XX_RB_MRT_BLEND_CONTROL(i, .dword = pipeline->blend.rb_mrt_blend_control[i]));
- }
-}
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_BLEND_CNTL, 1);
+ tu_cs_emit(cs, sp_blend_cntl);
-static VkResult
-tu_setup_pvtmem(struct tu_device *dev,
- struct tu_pipeline *pipeline,
- struct tu_pvtmem_config *config,
- uint32_t pvtmem_bytes,
- bool per_wave)
-{
- if (!pvtmem_bytes) {
- memset(config, 0, sizeof(*config));
- return VK_SUCCESS;
- }
-
- /* There is a substantial memory footprint from private memory BOs being
- * allocated on a per-pipeline basis, and it isn't required: the same BO
- * can be utilized by multiple pipelines as long as they share the same
- * private memory layout (sizes and per-wave/per-fiber). Otherwise a
- * pipeline's data could be overwritten by other active pipelines using
- * the same BO with a differing private memory layout, resulting in
- * memory corruption.
- *
- * To avoid this, we create private memory BOs at the device level with
- * an associated private memory layout, then dynamically grow them when
- * needed and reuse them across pipelines. Growth is done in powers of
- * two so that we can avoid frequent reallocation of the private memory
- * BOs.
- */
-
- struct tu_pvtmem_bo *pvtmem_bo =
- per_wave ? &dev->wave_pvtmem_bo : &dev->fiber_pvtmem_bo;
- mtx_lock(&pvtmem_bo->mtx);
-
- if (pvtmem_bo->per_fiber_size < pvtmem_bytes) {
- if (pvtmem_bo->bo)
- tu_bo_finish(dev, pvtmem_bo->bo);
-
- pvtmem_bo->per_fiber_size =
- util_next_power_of_two(ALIGN(pvtmem_bytes, 512));
- pvtmem_bo->per_sp_size =
- ALIGN(pvtmem_bo->per_fiber_size *
- dev->physical_device->info->a6xx.fibers_per_sp,
- 1 << 12);
- uint32_t total_size =
- dev->physical_device->info->num_sp_cores * pvtmem_bo->per_sp_size;
-
- VkResult result = tu_bo_init_new(dev, &pvtmem_bo->bo, total_size,
- TU_BO_ALLOC_NO_FLAGS, "pvtmem");
- if (result != VK_SUCCESS) {
- mtx_unlock(&pvtmem_bo->mtx);
- return result;
- }
- }
-
- config->per_wave = per_wave;
- config->per_fiber_size = pvtmem_bo->per_fiber_size;
- config->per_sp_size = pvtmem_bo->per_sp_size;
-
- pipeline->pvtmem_bo = tu_bo_get_ref(pvtmem_bo->bo);
- config->iova = pipeline->pvtmem_bo->iova;
-
- mtx_unlock(&pvtmem_bo->mtx);
-
- return VK_SUCCESS;
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLEND_CNTL, 1);
+ tu_cs_emit(cs, rb_blend_cntl);
}
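/* A minimal illustrative sketch, not part of the imported diff: the
 * private-memory sizing policy described in the tu_setup_pvtmem() comment
 * above.  The per-fiber size is rounded up to the next power of two so the
 * device-level BO rarely needs to grow; fibers_per_sp and num_sp are passed
 * in as plain parameters here because the real values come from the device
 * info and are not assumed.
 */
#include <stdint.h>

#define SKETCH_ALIGN(v, a) (((v) + (a) - 1) & ~((uint64_t)(a) - 1))

static uint64_t
sketch_next_pow2(uint64_t v)
{
   uint64_t p = 1;
   while (p < v)
      p <<= 1;
   return p;
}

static uint64_t
sketch_pvtmem_bo_size(uint32_t pvtmem_bytes, uint32_t fibers_per_sp,
                      uint32_t num_sp)
{
   /* e.g. a 1000-byte request becomes a 1024-byte per-fiber allocation */
   uint64_t per_fiber = sketch_next_pow2(SKETCH_ALIGN(pvtmem_bytes, 512));
   uint64_t per_sp    = SKETCH_ALIGN(per_fiber * fibers_per_sp, 1 << 12);
   return (uint64_t)num_sp * per_sp;
}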
-static bool
-contains_all_shader_state(VkGraphicsPipelineLibraryFlagsEXT state)
-{
- return (state &
- (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
- VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) ==
- (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
- VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT);
-}
-
-/* Return true if this pipeline contains all of the GPL stages listed but none
- * of the libraries it uses do, so this is "the first time" that all of them
- * are defined together. This is useful for state that needs to be combined
- * from multiple GPL stages.
- */
-
-static bool
-set_combined_state(struct tu_pipeline_builder *builder,
- struct tu_pipeline *pipeline,
- VkGraphicsPipelineLibraryFlagsEXT state)
+void
+tu6_emit_blend_constants(struct tu_cs *cs, const float constants[4])
{
- if ((pipeline->state & state) != state)
- return false;
-
- for (unsigned i = 0; i < builder->num_libraries; i++) {
- if ((builder->libraries[i]->state & state) == state)
- return false;
- }
-
- return true;
+ tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLEND_RED_F32, 4);
+ tu_cs_emit_array(cs, (const uint32_t *) constants, 4);
}
static VkResult
-tu_pipeline_allocate_cs(struct tu_device *dev,
- struct tu_pipeline *pipeline,
- struct tu_pipeline_layout *layout,
- struct tu_pipeline_builder *builder,
- struct ir3_shader_variant *compute)
+tu_pipeline_builder_create_pipeline(struct tu_pipeline_builder *builder,
+ struct tu_pipeline **out_pipeline)
{
- uint32_t size = 1024;
-
- /* graphics case: */
- if (builder) {
- if (builder->state &
- VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT) {
- size += TU6_EMIT_VERTEX_INPUT_MAX_DWORDS;
- }
-
- if (set_combined_state(builder, pipeline,
- VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
- VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) {
- size += 2 * TU6_EMIT_VFD_DEST_MAX_DWORDS;
- size += tu6_load_state_size(pipeline, layout);
+ struct tu_device *dev = builder->device;
- for (uint32_t i = 0; i < ARRAY_SIZE(builder->variants); i++) {
- if (builder->variants[i]) {
- size += builder->variants[i]->info.size / 4;
- }
- }
-
- size += builder->binning_variant->info.size / 4;
-
- builder->additional_cs_reserve_size = 0;
- for (unsigned i = 0; i < ARRAY_SIZE(builder->variants); i++) {
- struct ir3_shader_variant *variant = builder->variants[i];
- if (variant) {
- builder->additional_cs_reserve_size +=
- tu_xs_get_additional_cs_size_dwords(variant);
-
- if (variant->binning) {
- builder->additional_cs_reserve_size +=
- tu_xs_get_additional_cs_size_dwords(variant->binning);
- }
- }
- }
-
- /* The additional size is used twice, once per tu6_emit_program() call. */
- size += builder->additional_cs_reserve_size * 2;
- }
- } else {
- size += tu6_load_state_size(pipeline, layout);
+ struct tu_pipeline *pipeline =
+ vk_zalloc2(&dev->alloc, builder->alloc, sizeof(*pipeline), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+ if (!pipeline)
+ return VK_ERROR_OUT_OF_HOST_MEMORY;
- size += compute->info.size / 4;
+ tu_cs_init(&pipeline->cs, TU_CS_MODE_SUB_STREAM, 2048);
- size += tu_xs_get_additional_cs_size_dwords(compute);
- }
-
- /* Allocate the space for the pipeline out of the device's RO suballocator.
- *
- * Sub-allocating BOs saves memory and also kernel overhead in refcounting of
- * BOs at exec time.
- *
- * The pipeline cache would seem like a natural place to stick the
- * suballocator, except that it is not guaranteed to outlive the pipelines
- * created from it, so you can't store any long-lived state there, and you
- * can't use its EXTERNALLY_SYNCHRONIZED flag to avoid atomics because
- * pipeline destroy isn't synchronized by the cache.
- */
- pthread_mutex_lock(&dev->pipeline_mutex);
- VkResult result = tu_suballoc_bo_alloc(&pipeline->bo, &dev->pipeline_suballoc,
- size * 4, 128);
- pthread_mutex_unlock(&dev->pipeline_mutex);
- if (result != VK_SUCCESS)
+ /* reserve the space now such that tu_cs_begin_sub_stream never fails */
+ VkResult result = tu_cs_reserve_space(dev, &pipeline->cs, 2048);
+ if (result != VK_SUCCESS) {
+ vk_free2(&dev->alloc, builder->alloc, pipeline);
return result;
-
- tu_cs_init_suballoc(&pipeline->cs, dev, &pipeline->bo);
-
- return VK_SUCCESS;
-}
-
-static void
-tu_pipeline_shader_key_init(struct ir3_shader_key *key,
- const struct tu_pipeline *pipeline,
- struct tu_pipeline_builder *builder,
- nir_shader **nir)
-{
- /* We set this after we compile to NIR because we need the prim mode */
- key->tessellation = IR3_TESS_NONE;
-
- for (unsigned i = 0; i < builder->num_libraries; i++) {
- if (!(builder->libraries[i]->state &
- (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
- VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)))
- continue;
-
- const struct ir3_shader_key *library_key =
- &builder->libraries[i]->ir3_key;
-
- if (library_key->tessellation != IR3_TESS_NONE)
- key->tessellation = library_key->tessellation;
- key->has_gs |= library_key->has_gs;
- key->sample_shading |= library_key->sample_shading;
- }
-
- for (uint32_t i = 0; i < builder->create_info->stageCount; i++) {
- if (builder->create_info->pStages[i].stage == VK_SHADER_STAGE_GEOMETRY_BIT) {
- key->has_gs = true;
- break;
- }
- }
-
- if (!(builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT))
- return;
-
- if (builder->rasterizer_discard)
- return;
-
- const VkPipelineMultisampleStateCreateInfo *msaa_info =
- builder->create_info->pMultisampleState;
-
- /* The 1.3.215 spec says:
- *
- * Sample shading can be used to specify a minimum number of unique
- * samples to process for each fragment. If sample shading is enabled,
- * an implementation must provide a minimum of
- *
- * max(ceil(minSampleShadingFactor * totalSamples), 1)
- *
- * unique associated data for each fragment, where
- * minSampleShadingFactor is the minimum fraction of sample shading.
- *
- * The definition is pretty much the same as OpenGL's GL_SAMPLE_SHADING.
- * They both require unique associated data.
- *
- * There are discussions to change the definition, such that
- * sampleShadingEnable does not imply unique associated data. Before the
- * discussions are settled and before apps (i.e., ANGLE) are fixed to
- * follow the new and incompatible definition, we should stick to the
- * current definition.
- *
- * Note that ir3_shader_key::sample_shading is not actually used by ir3,
- * just checked in tu6_emit_fs_inputs. We will also copy the value to
- * tu_shader_key::force_sample_interp in a bit.
- */
- if (msaa_info && msaa_info->sampleShadingEnable &&
- (msaa_info->minSampleShading * msaa_info->rasterizationSamples) > 1.0f)
- key->sample_shading = true;
-}
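/* A minimal illustrative sketch, not part of the imported diff: the
 * sample-shading rule quoted from the spec above.  Sample shading only has an
 * effect when max(ceil(minSampleShading * totalSamples), 1) exceeds one,
 * which is the same condition tu_pipeline_shader_key_init() checks with
 * minSampleShading * rasterizationSamples > 1.0.
 */
#include <math.h>
#include <stdbool.h>

static bool
sketch_needs_sample_shading(float min_sample_shading, unsigned samples)
{
   unsigned required =
      (unsigned) fmaxf(ceilf(min_sample_shading * (float) samples), 1.0f);
   return required > 1;   /* e.g. 0.5 * 4 samples -> 2 unique samples */
}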
-
-static uint32_t
-tu6_get_tessmode(struct tu_shader* shader)
-{
- enum tess_primitive_mode primitive_mode = shader->ir3_shader->nir->info.tess._primitive_mode;
- switch (primitive_mode) {
- case TESS_PRIMITIVE_ISOLINES:
- return IR3_TESS_ISOLINES;
- case TESS_PRIMITIVE_TRIANGLES:
- return IR3_TESS_TRIANGLES;
- case TESS_PRIMITIVE_QUADS:
- return IR3_TESS_QUADS;
- case TESS_PRIMITIVE_UNSPECIFIED:
- return IR3_TESS_NONE;
- default:
- unreachable("bad tessmode");
- }
-}
-
-static uint64_t
-tu_upload_variant(struct tu_pipeline *pipeline,
- const struct ir3_shader_variant *variant)
-{
- struct tu_cs_memory memory;
-
- if (!variant)
- return 0;
-
- /* This expects to get enough alignment because shaders are allocated
- * first and the total size is always aligned correctly.
- * Note: an assert in tu6_emit_xs_config validates the alignment.
- */
- tu_cs_alloc(&pipeline->cs, variant->info.size / 4, 1, &memory);
-
- memcpy(memory.map, variant->bin, variant->info.size);
- return memory.iova;
-}
-
-static void
-tu_append_executable(struct tu_pipeline *pipeline, struct ir3_shader_variant *variant,
- char *nir_from_spirv)
-{
- struct tu_pipeline_executable exe = {
- .stage = variant->type,
- .nir_from_spirv = nir_from_spirv,
- .nir_final = ralloc_strdup(pipeline->executables_mem_ctx, variant->disasm_info.nir),
- .disasm = ralloc_strdup(pipeline->executables_mem_ctx, variant->disasm_info.disasm),
- .stats = variant->info,
- .is_binning = variant->binning_pass,
- };
-
- util_dynarray_append(&pipeline->executables, struct tu_pipeline_executable, exe);
-}
-
-static bool
-can_remove_out_var(nir_variable *var, void *data)
-{
- return !var->data.explicit_xfb_buffer && !var->data.explicit_xfb_stride;
-}
-
-static void
-tu_link_shaders(struct tu_pipeline_builder *builder,
- nir_shader **shaders, unsigned shaders_count)
-{
- nir_shader *consumer = NULL;
- for (gl_shader_stage stage = shaders_count - 1;
- stage >= MESA_SHADER_VERTEX; stage--) {
- if (!shaders[stage])
- continue;
-
- nir_shader *producer = shaders[stage];
- if (!consumer) {
- consumer = producer;
- continue;
- }
-
- if (nir_link_opt_varyings(producer, consumer)) {
- NIR_PASS_V(consumer, nir_opt_constant_folding);
- NIR_PASS_V(consumer, nir_opt_algebraic);
- NIR_PASS_V(consumer, nir_opt_dce);
- }
-
- const nir_remove_dead_variables_options out_var_opts = {
- .can_remove_var = can_remove_out_var,
- };
- NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_out, &out_var_opts);
-
- NIR_PASS_V(consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);
-
- bool progress = nir_remove_unused_varyings(producer, consumer);
-
- nir_compact_varyings(producer, consumer, true);
- if (progress) {
- if (nir_lower_global_vars_to_local(producer)) {
- /* Remove dead writes, which can remove input loads */
- NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_temp, NULL);
- NIR_PASS_V(producer, nir_opt_dce);
- }
- nir_lower_global_vars_to_local(consumer);
- }
-
- consumer = producer;
- }
-}
-
-static void
-tu_shader_key_init(struct tu_shader_key *key,
- const VkPipelineShaderStageCreateInfo *stage_info,
- struct tu_device *dev)
-{
- enum ir3_wavesize_option api_wavesize, real_wavesize;
-
- if (stage_info) {
- if (stage_info->flags &
- VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT) {
- api_wavesize = real_wavesize = IR3_SINGLE_OR_DOUBLE;
- } else {
- const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *size_info =
- vk_find_struct_const(stage_info->pNext,
- PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO);
-
- if (size_info) {
- if (size_info->requiredSubgroupSize == dev->compiler->threadsize_base) {
- api_wavesize = IR3_SINGLE_ONLY;
- } else {
- assert(size_info->requiredSubgroupSize == dev->compiler->threadsize_base * 2);
- api_wavesize = IR3_DOUBLE_ONLY;
- }
- } else {
- /* Match the exposed subgroupSize. */
- api_wavesize = IR3_DOUBLE_ONLY;
- }
-
- if (stage_info->flags &
- VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT)
- real_wavesize = api_wavesize;
- else if (api_wavesize == IR3_SINGLE_ONLY)
- real_wavesize = IR3_SINGLE_ONLY;
- else
- real_wavesize = IR3_SINGLE_OR_DOUBLE;
- }
- } else {
- api_wavesize = real_wavesize = IR3_SINGLE_OR_DOUBLE;
- }
-
- key->api_wavesize = api_wavesize;
- key->real_wavesize = real_wavesize;
-}
-
-static void
-tu_hash_stage(struct mesa_sha1 *ctx,
- const VkPipelineShaderStageCreateInfo *stage,
- const nir_shader *nir,
- const struct tu_shader_key *key)
-{
-
- if (nir) {
- struct blob blob;
- blob_init(&blob);
- nir_serialize(&blob, nir, true);
- _mesa_sha1_update(ctx, blob.data, blob.size);
- blob_finish(&blob);
- } else {
- unsigned char stage_hash[SHA1_DIGEST_LENGTH];
- vk_pipeline_hash_shader_stage(stage, NULL, stage_hash);
- _mesa_sha1_update(ctx, stage_hash, sizeof(stage_hash));
- }
- _mesa_sha1_update(ctx, key, sizeof(*key));
-}
-
-/* Hash flags that can affect ir3 shader compilation but aren't known
- * until logical device creation.
- */
-static void
-tu_hash_compiler(struct mesa_sha1 *ctx, const struct ir3_compiler *compiler)
-{
- _mesa_sha1_update(ctx, &compiler->robust_buffer_access2,
- sizeof(compiler->robust_buffer_access2));
- _mesa_sha1_update(ctx, &ir3_shader_debug, sizeof(ir3_shader_debug));
-}
-
-static void
-tu_hash_shaders(unsigned char *hash,
- const VkPipelineShaderStageCreateInfo **stages,
- nir_shader *const *nir,
- const struct tu_pipeline_layout *layout,
- const struct tu_shader_key *keys,
- const struct ir3_shader_key *ir3_key,
- VkGraphicsPipelineLibraryFlagsEXT state,
- const struct ir3_compiler *compiler)
-{
- struct mesa_sha1 ctx;
-
- _mesa_sha1_init(&ctx);
-
- if (layout)
- _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
-
- _mesa_sha1_update(&ctx, ir3_key, sizeof(*ir3_key));
-
- for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
- if (stages[i] || nir[i]) {
- tu_hash_stage(&ctx, stages[i], nir[i], &keys[i]);
- }
- }
- _mesa_sha1_update(&ctx, &state, sizeof(state));
- tu_hash_compiler(&ctx, compiler);
- _mesa_sha1_final(&ctx, hash);
-}
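/* A minimal illustrative sketch, not part of the imported diff: how the
 * NIR-cache key is derived from the 20-byte shader hash produced by
 * tu_hash_shaders() above -- the same SHA-1 with a one-byte 'N' discriminator
 * appended (see nir_sha1[20] = 'N' further down), so compiled-shader and
 * retained-NIR entries cannot collide in the pipeline cache.
 */
#include <string.h>

static void
sketch_nir_cache_key(unsigned char nir_key[21], const unsigned char sha1[20])
{
   memcpy(nir_key, sha1, 20);
   nir_key[20] = 'N';
}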
-
-static void
-tu_hash_compute(unsigned char *hash,
- const VkPipelineShaderStageCreateInfo *stage,
- const struct tu_pipeline_layout *layout,
- const struct tu_shader_key *key,
- const struct ir3_compiler *compiler)
-{
- struct mesa_sha1 ctx;
-
- _mesa_sha1_init(&ctx);
-
- if (layout)
- _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
-
- tu_hash_stage(&ctx, stage, NULL, key);
-
- tu_hash_compiler(&ctx, compiler);
- _mesa_sha1_final(&ctx, hash);
-}
-
-static bool
-tu_shaders_serialize(struct vk_pipeline_cache_object *object,
- struct blob *blob);
-
-static struct vk_pipeline_cache_object *
-tu_shaders_deserialize(struct vk_device *device,
- const void *key_data, size_t key_size,
- struct blob_reader *blob);
-
-static void
-tu_shaders_destroy(struct vk_pipeline_cache_object *object)
-{
- struct tu_compiled_shaders *shaders =
- container_of(object, struct tu_compiled_shaders, base);
-
- for (unsigned i = 0; i < ARRAY_SIZE(shaders->variants); i++)
- ralloc_free(shaders->variants[i]);
-
- for (unsigned i = 0; i < ARRAY_SIZE(shaders->safe_const_variants); i++)
- ralloc_free(shaders->safe_const_variants[i]);
-
- vk_pipeline_cache_object_finish(&shaders->base);
- vk_free(&object->device->alloc, shaders);
-}
-
-const struct vk_pipeline_cache_object_ops tu_shaders_ops = {
- .serialize = tu_shaders_serialize,
- .deserialize = tu_shaders_deserialize,
- .destroy = tu_shaders_destroy,
-};
-
-static struct tu_compiled_shaders *
-tu_shaders_init(struct tu_device *dev, const void *key_data, size_t key_size)
-{
- VK_MULTIALLOC(ma);
- VK_MULTIALLOC_DECL(&ma, struct tu_compiled_shaders, shaders, 1);
- VK_MULTIALLOC_DECL_SIZE(&ma, void, obj_key_data, key_size);
-
- if (!vk_multialloc_zalloc(&ma, &dev->vk.alloc,
- VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))
- return NULL;
-
- memcpy(obj_key_data, key_data, key_size);
- vk_pipeline_cache_object_init(&dev->vk, &shaders->base,
- &tu_shaders_ops, obj_key_data, key_size);
-
- return shaders;
-}
-
-static bool
-tu_shaders_serialize(struct vk_pipeline_cache_object *object,
- struct blob *blob)
-{
- struct tu_compiled_shaders *shaders =
- container_of(object, struct tu_compiled_shaders, base);
-
- blob_write_bytes(blob, shaders->const_state, sizeof(shaders->const_state));
- blob_write_uint8(blob, shaders->active_desc_sets);
-
- for (unsigned i = 0; i < ARRAY_SIZE(shaders->variants); i++) {
- if (shaders->variants[i]) {
- blob_write_uint8(blob, 1);
- ir3_store_variant(blob, shaders->variants[i]);
- } else {
- blob_write_uint8(blob, 0);
- }
-
- if (shaders->safe_const_variants[i]) {
- blob_write_uint8(blob, 1);
- ir3_store_variant(blob, shaders->safe_const_variants[i]);
- } else {
- blob_write_uint8(blob, 0);
- }
}
- return true;
-}
-
-static struct vk_pipeline_cache_object *
-tu_shaders_deserialize(struct vk_device *_device,
- const void *key_data, size_t key_size,
- struct blob_reader *blob)
-{
- struct tu_device *dev = container_of(_device, struct tu_device, vk);
- struct tu_compiled_shaders *shaders =
- tu_shaders_init(dev, key_data, key_size);
-
- if (!shaders)
- return NULL;
-
- blob_copy_bytes(blob, shaders->const_state, sizeof(shaders->const_state));
- shaders->active_desc_sets = blob_read_uint8(blob);
-
- for (unsigned i = 0; i < ARRAY_SIZE(shaders->variants); i++) {
- if (blob_read_uint8(blob)) {
- shaders->variants[i] = ir3_retrieve_variant(blob, dev->compiler, NULL);
- }
-
- if (blob_read_uint8(blob)) {
- shaders->safe_const_variants[i] = ir3_retrieve_variant(blob, dev->compiler, NULL);
- }
- }
-
- return &shaders->base;
-}
-
-static struct tu_compiled_shaders *
-tu_pipeline_cache_lookup(struct vk_pipeline_cache *cache,
- const void *key_data, size_t key_size,
- bool *application_cache_hit)
-{
- struct vk_pipeline_cache_object *object =
- vk_pipeline_cache_lookup_object(cache, key_data, key_size,
- &tu_shaders_ops, application_cache_hit);
- if (object)
- return container_of(object, struct tu_compiled_shaders, base);
- else
- return NULL;
-}
-
-static struct tu_compiled_shaders *
-tu_pipeline_cache_insert(struct vk_pipeline_cache *cache,
- struct tu_compiled_shaders *shaders)
-{
- struct vk_pipeline_cache_object *object =
- vk_pipeline_cache_add_object(cache, &shaders->base);
- return container_of(object, struct tu_compiled_shaders, base);
-}
-
-static bool
-tu_nir_shaders_serialize(struct vk_pipeline_cache_object *object,
- struct blob *blob);
-
-static struct vk_pipeline_cache_object *
-tu_nir_shaders_deserialize(struct vk_device *device,
- const void *key_data, size_t key_size,
- struct blob_reader *blob);
-
-static void
-tu_nir_shaders_destroy(struct vk_pipeline_cache_object *object)
-{
- struct tu_nir_shaders *shaders =
- container_of(object, struct tu_nir_shaders, base);
-
- for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++)
- ralloc_free(shaders->nir[i]);
-
- vk_pipeline_cache_object_finish(&shaders->base);
- vk_free(&object->device->alloc, shaders);
-}
-
-const struct vk_pipeline_cache_object_ops tu_nir_shaders_ops = {
- .serialize = tu_nir_shaders_serialize,
- .deserialize = tu_nir_shaders_deserialize,
- .destroy = tu_nir_shaders_destroy,
-};
-
-static struct tu_nir_shaders *
-tu_nir_shaders_init(struct tu_device *dev, const void *key_data, size_t key_size)
-{
- VK_MULTIALLOC(ma);
- VK_MULTIALLOC_DECL(&ma, struct tu_nir_shaders, shaders, 1);
- VK_MULTIALLOC_DECL_SIZE(&ma, void, obj_key_data, key_size);
-
- if (!vk_multialloc_zalloc(&ma, &dev->vk.alloc,
- VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))
- return NULL;
-
- memcpy(obj_key_data, key_data, key_size);
- vk_pipeline_cache_object_init(&dev->vk, &shaders->base,
- &tu_nir_shaders_ops, obj_key_data, key_size);
-
- return shaders;
-}
-
-static bool
-tu_nir_shaders_serialize(struct vk_pipeline_cache_object *object,
- struct blob *blob)
-{
- struct tu_nir_shaders *shaders =
- container_of(object, struct tu_nir_shaders, base);
-
- for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++) {
- if (shaders->nir[i]) {
- blob_write_uint8(blob, 1);
- nir_serialize(blob, shaders->nir[i], true);
- } else {
- blob_write_uint8(blob, 0);
- }
- }
-
- return true;
-}
-
-static struct vk_pipeline_cache_object *
-tu_nir_shaders_deserialize(struct vk_device *_device,
- const void *key_data, size_t key_size,
- struct blob_reader *blob)
-{
- struct tu_device *dev = container_of(_device, struct tu_device, vk);
- struct tu_nir_shaders *shaders =
- tu_nir_shaders_init(dev, key_data, key_size);
-
- if (!shaders)
- return NULL;
-
- for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++) {
- if (blob_read_uint8(blob)) {
- shaders->nir[i] =
- nir_deserialize(NULL, ir3_get_compiler_options(dev->compiler), blob);
- }
- }
-
- return &shaders->base;
-}
+ *out_pipeline = pipeline;
-static struct tu_nir_shaders *
-tu_nir_cache_lookup(struct vk_pipeline_cache *cache,
- const void *key_data, size_t key_size,
- bool *application_cache_hit)
-{
- struct vk_pipeline_cache_object *object =
- vk_pipeline_cache_lookup_object(cache, key_data, key_size,
- &tu_nir_shaders_ops, application_cache_hit);
- if (object)
- return container_of(object, struct tu_nir_shaders, base);
- else
- return NULL;
-}
-
-static struct tu_nir_shaders *
-tu_nir_cache_insert(struct vk_pipeline_cache *cache,
- struct tu_nir_shaders *shaders)
-{
- struct vk_pipeline_cache_object *object =
- vk_pipeline_cache_add_object(cache, &shaders->base);
- return container_of(object, struct tu_nir_shaders, base);
+ return VK_SUCCESS;
}
-
static VkResult
-tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder,
- struct tu_pipeline *pipeline)
+tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder)
{
- VkResult result = VK_SUCCESS;
- const struct ir3_compiler *compiler = builder->device->compiler;
const VkPipelineShaderStageCreateInfo *stage_infos[MESA_SHADER_STAGES] = {
NULL
};
- VkPipelineCreationFeedback pipeline_feedback = {
- .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
- };
- VkPipelineCreationFeedback stage_feedbacks[MESA_SHADER_STAGES] = { 0 };
-
- int64_t pipeline_start = os_time_get_nano();
-
- const VkPipelineCreationFeedbackCreateInfo *creation_feedback =
- vk_find_struct_const(builder->create_info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
-
- bool must_compile =
- builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT;
for (uint32_t i = 0; i < builder->create_info->stageCount; i++) {
gl_shader_stage stage =
- vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage);
+ tu_shader_stage(builder->create_info->pStages[i].stage);
stage_infos[stage] = &builder->create_info->pStages[i];
- must_compile = true;
- }
-
- if (tu6_shared_constants_enable(&builder->layout, builder->device->compiler)) {
- pipeline->shared_consts = (struct tu_push_constant_range) {
- .lo = 0,
- .dwords = builder->layout.push_constant_size / 4,
- };
- }
-
- nir_shader *nir[ARRAY_SIZE(stage_infos)] = { NULL };
-
- struct tu_shader_key keys[ARRAY_SIZE(stage_infos)] = { };
- for (gl_shader_stage stage = MESA_SHADER_VERTEX;
- stage < ARRAY_SIZE(keys); stage++) {
- tu_shader_key_init(&keys[stage], stage_infos[stage], builder->device);
- }
-
- if (builder->create_info->flags &
- VK_PIPELINE_CREATE_LINK_TIME_OPTIMIZATION_BIT_EXT) {
- for (unsigned i = 0; i < builder->num_libraries; i++) {
- struct tu_pipeline *library = builder->libraries[i];
-
- for (unsigned j = 0; j < ARRAY_SIZE(library->shaders); j++) {
- if (library->shaders[j].nir) {
- assert(!nir[j]);
- nir[j] = nir_shader_clone(builder->mem_ctx,
- library->shaders[j].nir);
- keys[j] = library->shaders[j].key;
- must_compile = true;
- }
- }
- }
}
- struct ir3_shader_key ir3_key = {};
- tu_pipeline_shader_key_init(&ir3_key, pipeline, builder, nir);
+ struct tu_shader_compile_options options;
+ tu_shader_compile_options_init(&options, builder->create_info);
- struct tu_compiled_shaders *compiled_shaders = NULL;
- struct tu_nir_shaders *nir_shaders = NULL;
- if (!must_compile)
- goto done;
-
- if (builder->state &
- VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) {
- keys[MESA_SHADER_VERTEX].multiview_mask = builder->multiview_mask;
- }
-
- if (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
- keys[MESA_SHADER_FRAGMENT].multiview_mask = builder->multiview_mask;
- keys[MESA_SHADER_FRAGMENT].force_sample_interp = ir3_key.sample_shading;
- }
-
- unsigned char pipeline_sha1[20];
- tu_hash_shaders(pipeline_sha1, stage_infos, nir, &builder->layout, keys,
- &ir3_key, builder->state, compiler);
-
- unsigned char nir_sha1[21];
- memcpy(nir_sha1, pipeline_sha1, sizeof(pipeline_sha1));
- nir_sha1[20] = 'N';
-
- const bool executable_info = builder->create_info->flags &
- VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
-
- char *nir_initial_disasm[ARRAY_SIZE(stage_infos)] = { NULL };
-
- if (!executable_info) {
- bool cache_hit = false;
- bool application_cache_hit = false;
-
- compiled_shaders =
- tu_pipeline_cache_lookup(builder->cache, &pipeline_sha1,
- sizeof(pipeline_sha1),
- &application_cache_hit);
-
- cache_hit = !!compiled_shaders;
-
- /* If the user asks us to keep the NIR around, we need to have it for a
- * successful cache hit. If we only have a "partial" cache hit, then we
- * still need to recompile in order to get the NIR.
- */
- if (compiled_shaders &&
- (builder->create_info->flags &
- VK_PIPELINE_CREATE_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT)) {
- bool nir_application_cache_hit = false;
- nir_shaders =
- tu_nir_cache_lookup(builder->cache, &nir_sha1,
- sizeof(nir_sha1),
- &nir_application_cache_hit);
-
- application_cache_hit &= nir_application_cache_hit;
- cache_hit &= !!nir_shaders;
- }
-
- if (application_cache_hit && builder->cache != builder->device->mem_cache) {
- pipeline_feedback.flags |=
- VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
- }
-
- if (cache_hit)
- goto done;
- }
-
- if (builder->create_info->flags &
- VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT) {
- return VK_PIPELINE_COMPILE_REQUIRED;
- }
-
- struct tu_shader *shaders[ARRAY_SIZE(nir)] = { NULL };
-
- for (gl_shader_stage stage = MESA_SHADER_VERTEX;
- stage < ARRAY_SIZE(nir); stage++) {
+ /* compile shaders in reverse order */
+ struct tu_shader *next_stage_shader = NULL;
+ for (gl_shader_stage stage = MESA_SHADER_STAGES - 1;
+ stage > MESA_SHADER_NONE; stage--) {
const VkPipelineShaderStageCreateInfo *stage_info = stage_infos[stage];
if (!stage_info)
continue;
- int64_t stage_start = os_time_get_nano();
-
- nir[stage] = tu_spirv_to_nir(builder->device, builder->mem_ctx, stage_info, stage);
- if (!nir[stage]) {
- result = VK_ERROR_OUT_OF_HOST_MEMORY;
- goto fail;
- }
-
- stage_feedbacks[stage].flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
- stage_feedbacks[stage].duration += os_time_get_nano() - stage_start;
- }
-
- if (!nir[MESA_SHADER_FRAGMENT] &&
- (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) {
- const nir_shader_compiler_options *nir_options =
- ir3_get_compiler_options(builder->device->compiler);
- nir_builder fs_b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT,
- nir_options,
- "noop_fs");
- nir[MESA_SHADER_FRAGMENT] = fs_b.shader;
- }
-
- if (executable_info) {
- for (gl_shader_stage stage = MESA_SHADER_VERTEX;
- stage < ARRAY_SIZE(nir); stage++) {
- if (!nir[stage])
- continue;
-
- nir_initial_disasm[stage] =
- nir_shader_as_str(nir[stage], pipeline->executables_mem_ctx);
- }
- }
-
- tu_link_shaders(builder, nir, ARRAY_SIZE(nir));
-
- if (builder->create_info->flags &
- VK_PIPELINE_CREATE_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT) {
- nir_shaders =
- tu_nir_shaders_init(builder->device, &nir_sha1, sizeof(nir_sha1));
- for (gl_shader_stage stage = MESA_SHADER_VERTEX;
- stage < ARRAY_SIZE(nir); stage++) {
- if (!nir[stage])
- continue;
-
- nir_shaders->nir[stage] = nir_shader_clone(NULL, nir[stage]);
- }
-
- nir_shaders = tu_nir_cache_insert(builder->cache, nir_shaders);
-
- if (compiled_shaders)
- goto done;
- }
-
- compiled_shaders =
- tu_shaders_init(builder->device, &pipeline_sha1, sizeof(pipeline_sha1));
-
- if (!compiled_shaders) {
- result = VK_ERROR_OUT_OF_HOST_MEMORY;
- goto fail;
- }
-
- uint32_t desc_sets = 0;
- for (gl_shader_stage stage = MESA_SHADER_VERTEX;
- stage < ARRAY_SIZE(nir); stage++) {
- if (!nir[stage])
- continue;
-
- int64_t stage_start = os_time_get_nano();
-
struct tu_shader *shader =
- tu_shader_create(builder->device, nir[stage], &keys[stage],
- &builder->layout, builder->alloc);
- if (!shader) {
- result = VK_ERROR_OUT_OF_HOST_MEMORY;
- goto fail;
- }
-
- /* In SPIR-V generated from GLSL, the primitive mode is specified in the
- * tessellation evaluation shader, but in SPIR-V generated from HLSL,
- * the mode is specified in the tessellation control shader. */
- if ((stage == MESA_SHADER_TESS_EVAL || stage == MESA_SHADER_TESS_CTRL) &&
- ir3_key.tessellation == IR3_TESS_NONE) {
- ir3_key.tessellation = tu6_get_tessmode(shader);
- }
-
- if (stage > MESA_SHADER_TESS_CTRL) {
- if (stage == MESA_SHADER_FRAGMENT) {
- ir3_key.tcs_store_primid = ir3_key.tcs_store_primid ||
- (nir[stage]->info.inputs_read & (1ull << VARYING_SLOT_PRIMITIVE_ID));
- } else {
- ir3_key.tcs_store_primid = ir3_key.tcs_store_primid ||
- BITSET_TEST(nir[stage]->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
- }
- }
-
- /* Keep track of the status of each shader's active descriptor sets,
- * which is set in tu_lower_io. */
- desc_sets |= shader->active_desc_sets;
-
- shaders[stage] = shader;
-
- stage_feedbacks[stage].duration += os_time_get_nano() - stage_start;
- }
-
- /* In the tess-but-not-FS case we don't know whether the FS will read
- * PrimID, so we need to unconditionally store it.
- */
- if (nir[MESA_SHADER_TESS_CTRL] && !nir[MESA_SHADER_FRAGMENT])
- ir3_key.tcs_store_primid = true;
-
- struct tu_shader *last_shader = shaders[MESA_SHADER_GEOMETRY];
- if (!last_shader)
- last_shader = shaders[MESA_SHADER_TESS_EVAL];
- if (!last_shader)
- last_shader = shaders[MESA_SHADER_VERTEX];
-
- compiled_shaders->active_desc_sets = desc_sets;
-
- for (gl_shader_stage stage = MESA_SHADER_VERTEX;
- stage < ARRAY_SIZE(shaders); stage++) {
- if (!shaders[stage])
- continue;
-
- int64_t stage_start = os_time_get_nano();
-
- compiled_shaders->variants[stage] =
- ir3_shader_create_variant(shaders[stage]->ir3_shader, &ir3_key,
- executable_info);
- if (!compiled_shaders->variants[stage])
+ tu_shader_create(builder->device, stage, stage_info, builder->alloc);
+ if (!shader)
return VK_ERROR_OUT_OF_HOST_MEMORY;
- compiled_shaders->const_state[stage] = shaders[stage]->const_state;
-
- stage_feedbacks[stage].duration += os_time_get_nano() - stage_start;
- }
-
- uint32_t safe_constlens = ir3_trim_constlen(compiled_shaders->variants, compiler);
-
- ir3_key.safe_constlen = true;
-
- for (gl_shader_stage stage = MESA_SHADER_VERTEX;
- stage < ARRAY_SIZE(shaders); stage++) {
- if (!shaders[stage])
- continue;
-
- if (safe_constlens & (1 << stage)) {
- int64_t stage_start = os_time_get_nano();
-
- ralloc_free(compiled_shaders->variants[stage]);
- compiled_shaders->variants[stage] =
- ir3_shader_create_variant(shaders[stage]->ir3_shader, &ir3_key,
- executable_info);
- if (!compiled_shaders->variants[stage]) {
- result = VK_ERROR_OUT_OF_HOST_MEMORY;
- goto fail;
- }
-
- stage_feedbacks[stage].duration += os_time_get_nano() - stage_start;
- } else if (contains_all_shader_state(builder->state)) {
- compiled_shaders->safe_const_variants[stage] =
- ir3_shader_create_variant(shaders[stage]->ir3_shader, &ir3_key,
- executable_info);
- if (!compiled_shaders->safe_const_variants[stage]) {
- result = VK_ERROR_OUT_OF_HOST_MEMORY;
- goto fail;
- }
- }
- }
-
- ir3_key.safe_constlen = false;
-
- for (gl_shader_stage stage = MESA_SHADER_VERTEX;
- stage < ARRAY_SIZE(nir); stage++) {
- if (shaders[stage]) {
- tu_shader_destroy(builder->device, shaders[stage], builder->alloc);
- }
- }
-
- compiled_shaders =
- tu_pipeline_cache_insert(builder->cache, compiled_shaders);
-
-done:;
-
- struct ir3_shader_variant *safe_const_variants[ARRAY_SIZE(nir)] = { NULL };
- nir_shader *post_link_nir[ARRAY_SIZE(nir)] = { NULL };
-
- if (compiled_shaders) {
- for (gl_shader_stage stage = MESA_SHADER_VERTEX;
- stage < ARRAY_SIZE(nir); stage++) {
- if (compiled_shaders->variants[stage]) {
- tu_append_executable(pipeline, compiled_shaders->variants[stage],
- nir_initial_disasm[stage]);
- builder->variants[stage] = compiled_shaders->variants[stage];
- safe_const_variants[stage] =
- compiled_shaders->safe_const_variants[stage];
- builder->const_state[stage] =
- compiled_shaders->const_state[stage];
- }
- }
- }
-
- if (nir_shaders) {
- for (gl_shader_stage stage = MESA_SHADER_VERTEX;
- stage < ARRAY_SIZE(nir); stage++) {
- if (nir_shaders->nir[stage]) {
- post_link_nir[stage] = nir_shaders->nir[stage];
- }
- }
- }
-
- /* In the case where we're building a library without link-time
- * optimization but with sub-libraries that retain LTO info, we should
- * retain it ourselves in case another pipeline includes us with LTO.
- */
- for (unsigned i = 0; i < builder->num_libraries; i++) {
- struct tu_pipeline *library = builder->libraries[i];
- for (gl_shader_stage stage = MESA_SHADER_VERTEX;
- stage < ARRAY_SIZE(library->shaders); stage++) {
- if (!post_link_nir[stage] && library->shaders[stage].nir) {
- post_link_nir[stage] = library->shaders[stage].nir;
- keys[stage] = library->shaders[stage].key;
- }
- }
- }
+ VkResult result =
+ tu_shader_compile(builder->device, shader, next_stage_shader,
+ &options, builder->alloc);
+ if (result != VK_SUCCESS)
+ return result;
- if (!(builder->create_info->flags &
- VK_PIPELINE_CREATE_LINK_TIME_OPTIMIZATION_BIT_EXT)) {
- for (unsigned i = 0; i < builder->num_libraries; i++) {
- struct tu_pipeline *library = builder->libraries[i];
- for (gl_shader_stage stage = MESA_SHADER_VERTEX;
- stage < ARRAY_SIZE(library->shaders); stage++) {
- if (library->shaders[stage].variant) {
- assert(!builder->variants[stage]);
- builder->variants[stage] = library->shaders[stage].variant;
- safe_const_variants[stage] =
- library->shaders[stage].safe_const_variant;
- builder->const_state[stage] =
- library->shaders[stage].const_state;
- post_link_nir[stage] = library->shaders[stage].nir;
- }
- }
- }
+ builder->shaders[stage] = shader;
+ builder->shader_offsets[stage] = builder->shader_total_size;
+ builder->shader_total_size +=
+ sizeof(uint32_t) * shader->variants[0].info.sizedwords;
- /* Because we added more variants, we need to trim constlen again.
- */
- if (builder->num_libraries > 0) {
- uint32_t safe_constlens = ir3_trim_constlen(builder->variants, compiler);
- for (gl_shader_stage stage = MESA_SHADER_VERTEX;
- stage < ARRAY_SIZE(builder->variants); stage++) {
- if (safe_constlens & (1u << stage))
- builder->variants[stage] = safe_const_variants[stage];
- }
- }
+ next_stage_shader = shader;
}
- if (compiled_shaders)
- pipeline->active_desc_sets = compiled_shaders->active_desc_sets;
-
- for (unsigned i = 0; i < builder->num_libraries; i++) {
- struct tu_pipeline *library = builder->libraries[i];
- pipeline->active_desc_sets |= library->active_desc_sets;
+ if (builder->shaders[MESA_SHADER_VERTEX]->has_binning_pass) {
+ const struct tu_shader *vs = builder->shaders[MESA_SHADER_VERTEX];
+ builder->binning_vs_offset = builder->shader_total_size;
+ builder->shader_total_size +=
+ sizeof(uint32_t) * vs->variants[1].info.sizedwords;
}
- if (compiled_shaders && compiled_shaders->variants[MESA_SHADER_TESS_CTRL]) {
- pipeline->tess.patch_type =
- compiled_shaders->variants[MESA_SHADER_TESS_CTRL]->key.tessellation;
- }
+ return VK_SUCCESS;
+}
- if (contains_all_shader_state(pipeline->state)) {
- struct ir3_shader_variant *vs =
- builder->variants[MESA_SHADER_VERTEX];
+static VkResult
+tu_pipeline_builder_upload_shaders(struct tu_pipeline_builder *builder,
+ struct tu_pipeline *pipeline)
+{
+ struct tu_bo *bo = &pipeline->program.binary_bo;
- struct ir3_shader_variant *variant;
- if (!vs->stream_output.num_outputs && ir3_has_binning_vs(&vs->key)) {
- tu_append_executable(pipeline, vs->binning, NULL);
- variant = vs->binning;
- } else {
- variant = vs;
- }
+ VkResult result =
+ tu_bo_init_new(builder->device, bo, builder->shader_total_size);
+ if (result != VK_SUCCESS)
+ return result;
- builder->binning_variant = variant;
+ result = tu_bo_map(builder->device, bo);
+ if (result != VK_SUCCESS)
+ return result;
- builder->compiled_shaders = compiled_shaders;
+ for (uint32_t i = 0; i < MESA_SHADER_STAGES; i++) {
+ const struct tu_shader *shader = builder->shaders[i];
+ if (!shader)
+ continue;
- /* It doesn't make much sense to use RETAIN_LINK_TIME_OPTIMIZATION_INFO
- * when compiling all stages, but make sure we don't leak.
- */
- if (nir_shaders)
- vk_pipeline_cache_object_unref(&nir_shaders->base);
- } else {
- pipeline->compiled_shaders = compiled_shaders;
- pipeline->nir_shaders = nir_shaders;
- pipeline->ir3_key = ir3_key;
- for (gl_shader_stage stage = MESA_SHADER_VERTEX;
- stage < ARRAY_SIZE(pipeline->shaders); stage++) {
- pipeline->shaders[stage].nir = post_link_nir[stage];
- pipeline->shaders[stage].key = keys[stage];
- pipeline->shaders[stage].const_state = builder->const_state[stage];
- pipeline->shaders[stage].variant = builder->variants[stage];
- pipeline->shaders[stage].safe_const_variant =
- safe_const_variants[stage];
- }
+ memcpy(bo->map + builder->shader_offsets[i], shader->binary,
+ sizeof(uint32_t) * shader->variants[0].info.sizedwords);
}
- pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
- if (creation_feedback) {
- *creation_feedback->pPipelineCreationFeedback = pipeline_feedback;
-
- for (uint32_t i = 0; i < builder->create_info->stageCount; i++) {
- gl_shader_stage s =
- vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage);
- creation_feedback->pPipelineStageCreationFeedbacks[i] = stage_feedbacks[s];
- }
+ if (builder->shaders[MESA_SHADER_VERTEX]->has_binning_pass) {
+ const struct tu_shader *vs = builder->shaders[MESA_SHADER_VERTEX];
+ memcpy(bo->map + builder->binning_vs_offset, vs->binning_binary,
+ sizeof(uint32_t) * vs->variants[1].info.sizedwords);
}
return VK_SUCCESS;
-
-fail:
- for (gl_shader_stage stage = MESA_SHADER_VERTEX;
- stage < ARRAY_SIZE(nir); stage++) {
- if (shaders[stage]) {
- tu_shader_destroy(builder->device, shaders[stage], builder->alloc);
- }
- }
-
- if (compiled_shaders)
- vk_pipeline_cache_object_unref(&compiled_shaders->base);
-
- if (nir_shaders)
- vk_pipeline_cache_object_unref(&nir_shaders->base);
-
- return result;
}
static void
@@ -3589,449 +1485,56 @@ tu_pipeline_builder_parse_dynamic(struct tu_pipeline_builder *builder,
const VkPipelineDynamicStateCreateInfo *dynamic_info =
builder->create_info->pDynamicState;
- pipeline->rast.gras_su_cntl_mask = ~0u;
- pipeline->rast.pc_raster_cntl_mask = ~0u;
- pipeline->rast.vpc_unknown_9107_mask = ~0u;
- pipeline->ds.rb_depth_cntl_mask = ~0u;
- pipeline->ds.rb_stencil_cntl_mask = ~0u;
- pipeline->blend.sp_blend_cntl_mask = ~0u;
- pipeline->blend.rb_blend_cntl_mask = ~0u;
- pipeline->blend.rb_mrt_control_mask = ~0u;
-
if (!dynamic_info)
return;
for (uint32_t i = 0; i < dynamic_info->dynamicStateCount; i++) {
- VkDynamicState state = dynamic_info->pDynamicStates[i];
- switch (state) {
- case VK_DYNAMIC_STATE_VIEWPORT ... VK_DYNAMIC_STATE_STENCIL_REFERENCE:
- if (state == VK_DYNAMIC_STATE_LINE_WIDTH)
- pipeline->rast.gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_LINEHALFWIDTH__MASK;
- pipeline->dynamic_state_mask |= BIT(state);
- break;
- case VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT:
- pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_SAMPLE_LOCATIONS);
- break;
- case VK_DYNAMIC_STATE_CULL_MODE:
- pipeline->rast.gras_su_cntl_mask &=
- ~(A6XX_GRAS_SU_CNTL_CULL_BACK | A6XX_GRAS_SU_CNTL_CULL_FRONT);
- pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL);
- break;
- case VK_DYNAMIC_STATE_FRONT_FACE:
- pipeline->rast.gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_FRONT_CW;
- pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL);
- break;
- case VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY:
- pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY);
- break;
- case VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE:
- pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_VB_STRIDE);
- break;
- case VK_DYNAMIC_STATE_VIEWPORT_WITH_COUNT:
- pipeline->dynamic_state_mask |= BIT(VK_DYNAMIC_STATE_VIEWPORT);
- break;
- case VK_DYNAMIC_STATE_SCISSOR_WITH_COUNT:
- pipeline->dynamic_state_mask |= BIT(VK_DYNAMIC_STATE_SCISSOR);
- break;
- case VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE:
- pipeline->ds.rb_depth_cntl_mask &=
- ~(A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE);
- pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
- break;
- case VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE:
- pipeline->ds.rb_depth_cntl_mask &= ~A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
- pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
- break;
- case VK_DYNAMIC_STATE_DEPTH_COMPARE_OP:
- pipeline->ds.rb_depth_cntl_mask &= ~A6XX_RB_DEPTH_CNTL_ZFUNC__MASK;
- pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
- break;
- case VK_DYNAMIC_STATE_DEPTH_BOUNDS_TEST_ENABLE:
- pipeline->ds.rb_depth_cntl_mask &=
- ~(A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE);
- pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
- break;
- case VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE:
- pipeline->ds.rb_stencil_cntl_mask &= ~(A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE |
- A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
- A6XX_RB_STENCIL_CONTROL_STENCIL_READ);
- pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_STENCIL_CNTL);
- break;
- case VK_DYNAMIC_STATE_STENCIL_OP:
- pipeline->ds.rb_stencil_cntl_mask &= ~(A6XX_RB_STENCIL_CONTROL_FUNC__MASK |
- A6XX_RB_STENCIL_CONTROL_FAIL__MASK |
- A6XX_RB_STENCIL_CONTROL_ZPASS__MASK |
- A6XX_RB_STENCIL_CONTROL_ZFAIL__MASK |
- A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK |
- A6XX_RB_STENCIL_CONTROL_FAIL_BF__MASK |
- A6XX_RB_STENCIL_CONTROL_ZPASS_BF__MASK |
- A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__MASK);
- pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_STENCIL_CNTL);
- break;
- case VK_DYNAMIC_STATE_DEPTH_BIAS_ENABLE:
- pipeline->rast.gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_POLY_OFFSET;
- pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL);
- break;
- case VK_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE:
- pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE);
- break;
- case VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE:
- pipeline->rast.pc_raster_cntl_mask &= ~A6XX_PC_RASTER_CNTL_DISCARD;
- pipeline->rast.vpc_unknown_9107_mask &= ~A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD;
- pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RASTERIZER_DISCARD);
- break;
- case VK_DYNAMIC_STATE_LOGIC_OP_EXT:
- pipeline->blend.sp_blend_cntl_mask &= ~A6XX_SP_BLEND_CNTL_ENABLE_BLEND__MASK;
- pipeline->blend.rb_blend_cntl_mask &= ~A6XX_RB_BLEND_CNTL_ENABLE_BLEND__MASK;
- pipeline->blend.rb_mrt_control_mask &= ~A6XX_RB_MRT_CONTROL_ROP_CODE__MASK;
- pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_BLEND);
- pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_LOGIC_OP);
- break;
- case VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT:
- pipeline->blend.sp_blend_cntl_mask &= ~A6XX_SP_BLEND_CNTL_ENABLE_BLEND__MASK;
- pipeline->blend.rb_blend_cntl_mask &= ~A6XX_RB_BLEND_CNTL_ENABLE_BLEND__MASK;
- pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_BLEND);
-
- /* Dynamic color write enable doesn't directly change any of the
- * registers, but it causes us to make some of the registers 0, so we
- * set this dynamic state instead of making the register dynamic.
- */
- pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_COLOR_WRITE_ENABLE);
- break;
- case VK_DYNAMIC_STATE_VERTEX_INPUT_EXT:
- pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_VERTEX_INPUT) |
- BIT(TU_DYNAMIC_STATE_VB_STRIDE);
- break;
- case VK_DYNAMIC_STATE_PATCH_CONTROL_POINTS_EXT:
- pipeline->dynamic_state_mask |=
- BIT(TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS);
- break;
- default:
- assert(!"unsupported dynamic state");
- break;
- }
- }
-}
-
-static void
-tu_pipeline_builder_parse_libraries(struct tu_pipeline_builder *builder,
- struct tu_pipeline *pipeline)
-{
- const VkPipelineLibraryCreateInfoKHR *library_info =
- vk_find_struct_const(builder->create_info->pNext,
- PIPELINE_LIBRARY_CREATE_INFO_KHR);
-
- if (library_info) {
- assert(library_info->libraryCount <= MAX_LIBRARIES);
- builder->num_libraries = library_info->libraryCount;
- for (unsigned i = 0; i < library_info->libraryCount; i++) {
- TU_FROM_HANDLE(tu_pipeline, library, library_info->pLibraries[i]);
- builder->libraries[i] = library;
- }
- }
-
- /* Merge in the state from libraries. The program state is a bit special
- * and is handled separately.
- */
- pipeline->state = builder->state;
- for (unsigned i = 0; i < builder->num_libraries; i++) {
- struct tu_pipeline *library = builder->libraries[i];
- pipeline->state |= library->state;
-
- uint32_t library_dynamic_state = 0;
- if (library->state &
- VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT) {
- pipeline->vi = library->vi;
- pipeline->ia = library->ia;
- library_dynamic_state |=
- BIT(TU_DYNAMIC_STATE_VERTEX_INPUT) |
- BIT(TU_DYNAMIC_STATE_VB_STRIDE) |
- BIT(TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY) |
- BIT(TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE);
- pipeline->shared_consts = library->shared_consts;
- }
-
- if (library->state &
- VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) {
- pipeline->tess = library->tess;
- pipeline->rast = library->rast;
- pipeline->viewport = library->viewport;
- library_dynamic_state |=
- BIT(VK_DYNAMIC_STATE_VIEWPORT) |
- BIT(VK_DYNAMIC_STATE_SCISSOR) |
- BIT(VK_DYNAMIC_STATE_LINE_WIDTH) |
- BIT(VK_DYNAMIC_STATE_DEPTH_BIAS) |
- BIT(TU_DYNAMIC_STATE_RASTERIZER_DISCARD) |
- BIT(TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS);
- }
-
- if (library->state &
- VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
- pipeline->ds = library->ds;
- pipeline->lrz.fs = library->lrz.fs;
- pipeline->lrz.force_disable_mask |= library->lrz.force_disable_mask;
- pipeline->lrz.force_late_z |= library->lrz.force_late_z;
- library_dynamic_state |=
- BIT(VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK) |
- BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK) |
- BIT(VK_DYNAMIC_STATE_STENCIL_REFERENCE) |
- BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL) |
- BIT(TU_DYNAMIC_STATE_RB_STENCIL_CNTL) |
- BIT(VK_DYNAMIC_STATE_DEPTH_BOUNDS);
- pipeline->shared_consts = library->shared_consts;
- }
-
- if (library->state &
- VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT) {
- pipeline->blend = library->blend;
- pipeline->output = library->output;
- pipeline->lrz.force_disable_mask |= library->lrz.force_disable_mask;
- pipeline->lrz.force_late_z |= library->lrz.force_late_z;
- pipeline->prim_order = library->prim_order;
- library_dynamic_state |=
- BIT(VK_DYNAMIC_STATE_BLEND_CONSTANTS) |
- BIT(TU_DYNAMIC_STATE_SAMPLE_LOCATIONS) |
- BIT(TU_DYNAMIC_STATE_BLEND) |
- BIT(TU_DYNAMIC_STATE_LOGIC_OP) |
- BIT(TU_DYNAMIC_STATE_COLOR_WRITE_ENABLE);
- }
-
- if ((library->state &
- VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) &&
- (library->state &
- VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) {
- pipeline->prim_order = library->prim_order;
- }
-
- if ((library->state &
- VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) &&
- (library->state &
- VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT) &&
- (library->state &
- VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT)) {
- pipeline->rast_ds = library->rast_ds;
- }
-
- pipeline->dynamic_state_mask =
- (pipeline->dynamic_state_mask & ~library_dynamic_state) |
- (library->dynamic_state_mask & library_dynamic_state);
-
- u_foreach_bit (i, library_dynamic_state & ~library->dynamic_state_mask) {
- if (i >= TU_DYNAMIC_STATE_COUNT)
- break;
-
- pipeline->dynamic_state[i] = library->dynamic_state[i];
- }
-
- if (contains_all_shader_state(library->state)) {
- pipeline->program = library->program;
- pipeline->load_state = library->load_state;
- }
+ pipeline->dynamic_state.mask |=
+ tu_dynamic_state_bit(dynamic_info->pDynamicStates[i]);
}
}
static void
-tu_pipeline_builder_parse_layout(struct tu_pipeline_builder *builder,
- struct tu_pipeline *pipeline)
-{
- TU_FROM_HANDLE(tu_pipeline_layout, layout, builder->create_info->layout);
-
- if (layout) {
- /* Note: it's still valid to have a layout even if there are libraries.
- * This allows the app to e.g. overwrite an INDEPENDENT_SET layout with
-    * a non-INDEPENDENT_SET layout, which may make us use a faster path;
- * currently this just affects dynamic offset descriptors.
- */
- builder->layout = *layout;
- } else {
- for (unsigned i = 0; i < builder->num_libraries; i++) {
- struct tu_pipeline *library = builder->libraries[i];
- builder->layout.num_sets = MAX2(builder->layout.num_sets,
- library->num_sets);
- for (unsigned j = 0; j < library->num_sets; j++) {
-         if (library->layouts[j])
-            builder->layout.set[j].layout = library->layouts[j];
- }
-
- builder->layout.push_constant_size = pipeline->push_constant_size;
- builder->layout.independent_sets |= pipeline->independent_sets;
- }
-
- tu_pipeline_layout_init(&builder->layout);
- }
-
- if (builder->create_info->flags & VK_PIPELINE_CREATE_LIBRARY_BIT_KHR) {
- pipeline->num_sets = builder->layout.num_sets;
- for (unsigned i = 0; i < pipeline->num_sets; i++) {
- pipeline->layouts[i] = builder->layout.set[i].layout;
- if (pipeline->layouts[i])
- vk_descriptor_set_layout_ref(&pipeline->layouts[i]->vk);
- }
- pipeline->push_constant_size = builder->layout.push_constant_size;
- pipeline->independent_sets = builder->layout.independent_sets;
- }
-}
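/*
 * For reference, a hedged sketch (not part of this import) of the app-side
 * linking step the layout-merging path above supports: a complete pipeline
 * built from libraries may still pass its own, possibly non-INDEPENDENT_SET,
 * layout.  All names prefixed "example_" are hypothetical.
 */
static VkResult
example_link_pipeline(VkDevice device, VkPipelineLayout example_full_layout,
                      VkPipeline example_libs[2], VkPipeline *out)
{
   const VkPipelineLibraryCreateInfoKHR link = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_LIBRARY_CREATE_INFO_KHR,
      .libraryCount = 2,
      .pLibraries = example_libs,
   };
   const VkGraphicsPipelineCreateInfo info = {
      .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
      .pNext = &link,
      .layout = example_full_layout, /* may replace an INDEPENDENT_SET layout */
   };
   return vkCreateGraphicsPipelines(device, VK_NULL_HANDLE, 1, &info, NULL, out);
}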
-
-static void
-tu_pipeline_set_linkage(struct tu_program_descriptor_linkage *link,
- struct tu_const_state *const_state,
- struct ir3_shader_variant *v)
-{
- link->const_state = *ir3_const_state(v);
- link->tu_const_state = *const_state;
- link->constlen = v->constlen;
-}
-
-static bool
-tu_pipeline_static_state(struct tu_pipeline *pipeline, struct tu_cs *cs,
- uint32_t id, uint32_t size)
-{
- assert(id < ARRAY_SIZE(pipeline->dynamic_state));
-
- if (pipeline->dynamic_state_mask & BIT(id))
- return false;
-
- pipeline->dynamic_state[id] = tu_cs_draw_state(&pipeline->cs, cs, size);
- return true;
-}
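/*
 * A minimal usage sketch of the helper above, mirroring the caller pattern
 * used elsewhere in this file (the caller name and register value are
 * hypothetical): the draw state is only baked into the pipeline when the
 * corresponding dynamic-state bit was not requested; otherwise the command
 * buffer emits it at draw time instead.
 */
static void
example_emit_static_gras_su_cntl(struct tu_pipeline *pipeline, uint32_t gras_su_cntl)
{
   struct tu_cs cs;
   /* 2 dwords: packet header + one register write */
   if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_GRAS_SU_CNTL, 2))
      tu_cs_emit_regs(&cs, A6XX_GRAS_SU_CNTL(.dword = gras_su_cntl));
}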
-
-static void
tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder,
struct tu_pipeline *pipeline)
{
struct tu_cs prog_cs;
+ tu_cs_begin_sub_stream(builder->device, &pipeline->cs, 512, &prog_cs);
+ tu6_emit_program(&prog_cs, builder, &pipeline->program.binary_bo, false);
+ pipeline->program.state_ib = tu_cs_end_sub_stream(&pipeline->cs, &prog_cs);
- /* Emit HLSQ_xS_CNTL/HLSQ_SP_xS_CONFIG *first*, before emitting anything
- * else that could depend on that state (like push constants)
- *
- * Note also that this always uses the full VS even in binning pass. The
- * binning pass variant has the same const layout as the full VS, and
- * the constlen for the VS will be the same or greater than the constlen
- * for the binning pass variant. It is required that the constlen state
- * matches between binning and draw passes, as some parts of the push
- * consts are emitted in state groups that are shared between the binning
- * and draw passes.
- */
- tu_cs_begin_sub_stream(&pipeline->cs, 512, &prog_cs);
- tu6_emit_program_config(&prog_cs, builder);
- pipeline->program.config_state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);
-
- tu_cs_begin_sub_stream(&pipeline->cs, 512 + builder->additional_cs_reserve_size, &prog_cs);
- tu6_emit_program(&prog_cs, builder, false, pipeline);
- pipeline->program.state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);
-
- tu_cs_begin_sub_stream(&pipeline->cs, 512 + builder->additional_cs_reserve_size, &prog_cs);
- tu6_emit_program(&prog_cs, builder, true, pipeline);
- pipeline->program.binning_state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);
-
- for (unsigned i = 0; i < ARRAY_SIZE(builder->variants); i++) {
- if (!builder->variants[i])
- continue;
-
- tu_pipeline_set_linkage(&pipeline->program.link[i],
- &builder->const_state[i],
- builder->variants[i]);
- }
-
- struct ir3_shader_variant *vs = builder->variants[MESA_SHADER_VERTEX];
- struct ir3_shader_variant *hs = builder->variants[MESA_SHADER_TESS_CTRL];
- if (hs) {
- pipeline->program.vs_param_stride = vs->output_size;
- pipeline->program.hs_param_stride = hs->output_size;
- pipeline->program.hs_vertices_out = hs->tess.tcs_vertices_out;
-
- const struct ir3_const_state *hs_const =
- &pipeline->program.link[MESA_SHADER_TESS_CTRL].const_state;
- unsigned hs_constlen =
- pipeline->program.link[MESA_SHADER_TESS_CTRL].constlen;
- uint32_t hs_base = hs_const->offsets.primitive_param;
- pipeline->program.hs_param_dwords =
- MIN2((hs_constlen - hs_base) * 4, 8);
-
- uint32_t state_size = TU6_EMIT_PATCH_CONTROL_POINTS_DWORDS(
- pipeline->program.hs_param_dwords);
-
- struct tu_cs cs;
- if (tu_pipeline_static_state(pipeline, &cs,
- TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS,
- state_size)) {
- tu6_emit_patch_control_points(&cs, pipeline,
- pipeline->tess.patch_control_points);
- }
- }
+ tu_cs_begin_sub_stream(builder->device, &pipeline->cs, 512, &prog_cs);
+ tu6_emit_program(&prog_cs, builder, &pipeline->program.binary_bo, true);
+ pipeline->program.binning_state_ib =
+ tu_cs_end_sub_stream(&pipeline->cs, &prog_cs);
}
static void
tu_pipeline_builder_parse_vertex_input(struct tu_pipeline_builder *builder,
struct tu_pipeline *pipeline)
{
- if (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_VERTEX_INPUT))
- return;
-
const VkPipelineVertexInputStateCreateInfo *vi_info =
builder->create_info->pVertexInputState;
+ const struct tu_shader *vs = builder->shaders[MESA_SHADER_VERTEX];
- struct tu_cs cs;
- if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_VB_STRIDE,
- 2 * vi_info->vertexBindingDescriptionCount)) {
- for (uint32_t i = 0; i < vi_info->vertexBindingDescriptionCount; i++) {
- const VkVertexInputBindingDescription *binding =
- &vi_info->pVertexBindingDescriptions[i];
+ struct tu_cs vi_cs;
+ tu_cs_begin_sub_stream(builder->device, &pipeline->cs,
+ MAX_VERTEX_ATTRIBS * 5 + 2, &vi_cs);
+ tu6_emit_vertex_input(&vi_cs, &vs->variants[0], vi_info,
+ pipeline->vi.bindings, pipeline->vi.strides,
+ pipeline->vi.offsets, &pipeline->vi.count);
+ pipeline->vi.state_ib = tu_cs_end_sub_stream(&pipeline->cs, &vi_cs);
- tu_cs_emit_regs(&cs,
- A6XX_VFD_FETCH_STRIDE(binding->binding, binding->stride));
- }
- }
-
- VkVertexInputBindingDescription2EXT bindings[MAX_VBS];
- VkVertexInputAttributeDescription2EXT attrs[MAX_VERTEX_ATTRIBS];
-
- for (unsigned i = 0; i < vi_info->vertexBindingDescriptionCount; i++) {
- const VkVertexInputBindingDescription *binding =
- &vi_info->pVertexBindingDescriptions[i];
- bindings[i] = (VkVertexInputBindingDescription2EXT) {
- .sType = VK_STRUCTURE_TYPE_VERTEX_INPUT_BINDING_DESCRIPTION_2_EXT,
- .pNext = NULL,
- .binding = binding->binding,
- .inputRate = binding->inputRate,
- .stride = binding->stride,
- .divisor = 1,
- };
-
- /* Bindings may contain holes */
- pipeline->vi.num_vbs = MAX2(pipeline->vi.num_vbs, binding->binding + 1);
- }
-
- const VkPipelineVertexInputDivisorStateCreateInfoEXT *div_state =
- vk_find_struct_const(vi_info->pNext, PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT);
- if (div_state) {
- for (uint32_t i = 0; i < div_state->vertexBindingDivisorCount; i++) {
- const VkVertexInputBindingDivisorDescriptionEXT *desc =
- &div_state->pVertexBindingDivisors[i];
- bindings[desc->binding].divisor = desc->divisor;
- }
+ if (vs->has_binning_pass) {
+ tu_cs_begin_sub_stream(builder->device, &pipeline->cs,
+ MAX_VERTEX_ATTRIBS * 5 + 2, &vi_cs);
+ tu6_emit_vertex_input(
+ &vi_cs, &vs->variants[1], vi_info, pipeline->vi.binning_bindings,
+ pipeline->vi.binning_strides, pipeline->vi.binning_offsets,
+ &pipeline->vi.binning_count);
+ pipeline->vi.binning_state_ib =
+ tu_cs_end_sub_stream(&pipeline->cs, &vi_cs);
}
-
- for (unsigned i = 0; i < vi_info->vertexAttributeDescriptionCount; i++) {
- const VkVertexInputAttributeDescription *attr =
- &vi_info->pVertexAttributeDescriptions[i];
- attrs[i] = (VkVertexInputAttributeDescription2EXT) {
- .sType = VK_STRUCTURE_TYPE_VERTEX_INPUT_ATTRIBUTE_DESCRIPTION_2_EXT,
- .pNext = NULL,
- .binding = attr->binding,
- .location = attr->location,
- .offset = attr->offset,
- .format = attr->format,
- };
- }
-
- tu_cs_begin_sub_stream(&pipeline->cs,
- TU6_EMIT_VERTEX_INPUT_MAX_DWORDS, &cs);
- tu6_emit_vertex_input(&cs,
- vi_info->vertexBindingDescriptionCount, bindings,
- vi_info->vertexAttributeDescriptionCount, attrs);
- pipeline->dynamic_state[TU_DYNAMIC_STATE_VERTEX_INPUT] =
- tu_cs_end_draw_state(&pipeline->cs, &cs);
}
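/*
 * For reference, a sketch of the app-side divisor state consumed by the
 * binding loop above (VK_EXT_vertex_attribute_divisor); the binding and
 * divisor values are hypothetical:
 */
static const VkVertexInputBindingDivisorDescriptionEXT example_divisor = {
   .binding = 1,
   .divisor = 4, /* advance per-instance data every 4 instances */
};
static const VkPipelineVertexInputDivisorStateCreateInfoEXT example_divisor_info = {
   .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT,
   .vertexBindingDivisorCount = 1,
   .pVertexBindingDivisors = &example_divisor, /* chained into pVertexInputState->pNext */
};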
static void
@@ -4046,29 +1549,6 @@ tu_pipeline_builder_parse_input_assembly(struct tu_pipeline_builder *builder,
}
static void
-tu_pipeline_builder_parse_tessellation(struct tu_pipeline_builder *builder,
- struct tu_pipeline *pipeline)
-{
- if (!(builder->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) ||
- !(builder->active_stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT))
- return;
-
- const VkPipelineTessellationStateCreateInfo *tess_info =
- builder->create_info->pTessellationState;
-
- if (!(pipeline->dynamic_state_mask &
- BIT(TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS))) {
- assert(tess_info->patchControlPoints <= 32);
- pipeline->tess.patch_control_points = tess_info->patchControlPoints;
- }
-
- const VkPipelineTessellationDomainOriginStateCreateInfo *domain_info =
- vk_find_struct_const(tess_info->pNext, PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO);
- pipeline->tess.upper_left_domain_origin = !domain_info ||
- domain_info->domainOrigin == VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT;
-}
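/*
 * For reference, a sketch of the app-side chain the removed parser reads:
 * the domain origin defaults to upper-left unless the app chains
 * VkPipelineTessellationDomainOriginStateCreateInfo (core since Vulkan 1.1).
 * The patchControlPoints value below is hypothetical.
 */
static const VkPipelineTessellationDomainOriginStateCreateInfo example_domain_origin = {
   .sType = VK_STRUCTURE_TYPE_PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO,
   .domainOrigin = VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT,
};
static const VkPipelineTessellationStateCreateInfo example_tess_info = {
   .sType = VK_STRUCTURE_TYPE_PIPELINE_TESSELLATION_STATE_CREATE_INFO,
   .pNext = &example_domain_origin,
   .patchControlPoints = 3,
};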
-
-static void
tu_pipeline_builder_parse_viewport(struct tu_pipeline_builder *builder,
struct tu_pipeline *pipeline)
{
@@ -4085,17 +1565,21 @@ tu_pipeline_builder_parse_viewport(struct tu_pipeline_builder *builder,
const VkPipelineViewportStateCreateInfo *vp_info =
builder->create_info->pViewportState;
- const VkPipelineViewportDepthClipControlCreateInfoEXT *depth_clip_info =
- vk_find_struct_const(vp_info->pNext, PIPELINE_VIEWPORT_DEPTH_CLIP_CONTROL_CREATE_INFO_EXT);
- pipeline->viewport.z_negative_one_to_one = depth_clip_info ? depth_clip_info->negativeOneToOne : false;
- struct tu_cs cs;
+ struct tu_cs vp_cs;
+ tu_cs_begin_sub_stream(builder->device, &pipeline->cs, 15, &vp_cs);
- if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_VIEWPORT, 8 + 10 * vp_info->viewportCount))
- tu6_emit_viewport(&cs, vp_info->pViewports, vp_info->viewportCount, pipeline->viewport.z_negative_one_to_one);
+ if (!(pipeline->dynamic_state.mask & TU_DYNAMIC_VIEWPORT)) {
+ assert(vp_info->viewportCount == 1);
+ tu6_emit_viewport(&vp_cs, vp_info->pViewports);
+ }
- if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_SCISSOR, 1 + 2 * vp_info->scissorCount))
- tu6_emit_scissor(&cs, vp_info->pScissors, vp_info->scissorCount);
+ if (!(pipeline->dynamic_state.mask & TU_DYNAMIC_SCISSOR)) {
+ assert(vp_info->scissorCount == 1);
+ tu6_emit_scissor(&vp_cs, vp_info->pScissors);
+ }
+
+ pipeline->vp.state_ib = tu_cs_end_sub_stream(&pipeline->cs, &vp_cs);
}
static void
@@ -4105,95 +1589,31 @@ tu_pipeline_builder_parse_rasterization(struct tu_pipeline_builder *builder,
const VkPipelineRasterizationStateCreateInfo *rast_info =
builder->create_info->pRasterizationState;
- enum a6xx_polygon_mode mode = tu6_polygon_mode(rast_info->polygonMode);
-
- bool depth_clip_disable = rast_info->depthClampEnable;
-
- const VkPipelineRasterizationDepthClipStateCreateInfoEXT *depth_clip_state =
- vk_find_struct_const(rast_info, PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT);
- if (depth_clip_state)
- depth_clip_disable = !depth_clip_state->depthClipEnable;
-
- pipeline->rast.rb_depth_cntl =
- COND(rast_info->depthClampEnable, A6XX_RB_DEPTH_CNTL_Z_CLAMP_ENABLE);
-
- pipeline->rast.line_mode = RECTANGULAR;
-
- const VkPipelineRasterizationLineStateCreateInfoEXT *rast_line_state =
- vk_find_struct_const(rast_info->pNext,
- PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);
-
- if (rast_line_state &&
- rast_line_state->lineRasterizationMode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT) {
- pipeline->rast.line_mode = BRESENHAM;
- }
-
- struct tu_cs cs;
- uint32_t cs_size = 9 +
- (builder->device->physical_device->info->a6xx.has_shading_rate ? 8 : 0);
- pipeline->rast.state = tu_cs_draw_state(&pipeline->cs, &cs, cs_size);
-
- tu_cs_emit_regs(&cs,
- A6XX_GRAS_CL_CNTL(
- .znear_clip_disable = depth_clip_disable,
- .zfar_clip_disable = depth_clip_disable,
- .z_clamp_enable = rast_info->depthClampEnable,
- .zero_gb_scale_z = pipeline->viewport.z_negative_one_to_one ? 0 : 1,
- .vp_clip_code_ignore = 1));
+ assert(!rast_info->depthClampEnable);
+ assert(rast_info->polygonMode == VK_POLYGON_MODE_FILL);
- tu_cs_emit_regs(&cs,
- A6XX_VPC_POLYGON_MODE(mode));
-
- tu_cs_emit_regs(&cs,
- A6XX_PC_POLYGON_MODE(mode));
+ struct tu_cs rast_cs;
+ tu_cs_begin_sub_stream(builder->device, &pipeline->cs, 20, &rast_cs);
/* move to hw ctx init? */
- tu_cs_emit_regs(&cs,
- A6XX_GRAS_SU_POINT_MINMAX(.min = 1.0f / 16.0f, .max = 4092.0f),
- A6XX_GRAS_SU_POINT_SIZE(1.0f));
-
- if (builder->device->physical_device->info->a6xx.has_shading_rate) {
- tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A00());
- tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A10());
- tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A20());
- tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A30());
- }
-
- const VkPipelineRasterizationStateStreamCreateInfoEXT *stream_info =
- vk_find_struct_const(rast_info->pNext,
- PIPELINE_RASTERIZATION_STATE_STREAM_CREATE_INFO_EXT);
- unsigned stream = stream_info ? stream_info->rasterizationStream : 0;
-
- pipeline->rast.pc_raster_cntl = A6XX_PC_RASTER_CNTL_STREAM(stream);
- pipeline->rast.vpc_unknown_9107 = 0;
- if (rast_info->rasterizerDiscardEnable) {
- pipeline->rast.pc_raster_cntl |= A6XX_PC_RASTER_CNTL_DISCARD;
- pipeline->rast.vpc_unknown_9107 |= A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD;
- }
+ tu6_emit_gras_unknowns(&rast_cs);
+ tu6_emit_point_size(&rast_cs);
- if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RASTERIZER_DISCARD, 4)) {
- tu_cs_emit_regs(&cs, A6XX_PC_RASTER_CNTL(.dword = pipeline->rast.pc_raster_cntl));
- tu_cs_emit_regs(&cs, A6XX_VPC_UNKNOWN_9107(.dword = pipeline->rast.vpc_unknown_9107));
- }
-
- pipeline->rast.gras_su_cntl =
- tu6_gras_su_cntl(rast_info, pipeline->rast.line_mode, builder->multiview_mask != 0);
+ const uint32_t gras_su_cntl =
+ tu6_gras_su_cntl(rast_info, builder->samples);
- if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_GRAS_SU_CNTL, 2))
- tu_cs_emit_regs(&cs, A6XX_GRAS_SU_CNTL(.dword = pipeline->rast.gras_su_cntl));
+ if (!(pipeline->dynamic_state.mask & TU_DYNAMIC_LINE_WIDTH))
+ tu6_emit_gras_su_cntl(&rast_cs, gras_su_cntl, rast_info->lineWidth);
- if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_DEPTH_BIAS, 4)) {
- tu6_emit_depth_bias(&cs, rast_info->depthBiasConstantFactor,
+ if (!(pipeline->dynamic_state.mask & TU_DYNAMIC_DEPTH_BIAS)) {
+ tu6_emit_depth_bias(&rast_cs, rast_info->depthBiasConstantFactor,
rast_info->depthBiasClamp,
rast_info->depthBiasSlopeFactor);
}
- const struct VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *provoking_vtx_state =
- vk_find_struct_const(rast_info->pNext, PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT);
- pipeline->rast.provoking_vertex_last = provoking_vtx_state &&
- provoking_vtx_state->provokingVertexMode == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT;
+ pipeline->rast.state_ib = tu_cs_end_sub_stream(&pipeline->cs, &rast_cs);
- pipeline->rast.multiview_mask = builder->multiview_mask;
+ pipeline->rast.gras_su_cntl = gras_su_cntl;
}
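/*
 * For reference, a sketch of the VK_EXT_line_rasterization state that the
 * removed Bresenham check above consumes (chained into
 * pRasterizationState->pNext); values shown are just an example:
 */
static const VkPipelineRasterizationLineStateCreateInfoEXT example_line_state = {
   .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT,
   .lineRasterizationMode = VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT,
   .stippledLineEnable = VK_FALSE,
};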
static void
@@ -4207,128 +1627,38 @@ tu_pipeline_builder_parse_depth_stencil(struct tu_pipeline_builder *builder,
* the pipeline has rasterization disabled or if the subpass of the
* render pass the pipeline is created against does not use a
* depth/stencil attachment.
+ *
+    * We disable both depth and stencil tests in those cases.
*/
+ static const VkPipelineDepthStencilStateCreateInfo dummy_ds_info;
const VkPipelineDepthStencilStateCreateInfo *ds_info =
- builder->create_info->pDepthStencilState;
- uint32_t rb_depth_cntl = 0, rb_stencil_cntl = 0;
- struct tu_cs cs;
-
- if (!builder->attachment_state_valid ||
- (builder->depth_attachment_format != VK_FORMAT_UNDEFINED &&
- builder->depth_attachment_format != VK_FORMAT_S8_UINT)) {
- if (ds_info->depthTestEnable) {
- rb_depth_cntl |=
- A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE |
- A6XX_RB_DEPTH_CNTL_ZFUNC(tu6_compare_func(ds_info->depthCompareOp)) |
- A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE; /* TODO: don't set for ALWAYS/NEVER */
-
- if (ds_info->depthWriteEnable)
- rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
- }
-
- if (ds_info->depthBoundsTestEnable)
- rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE;
+ builder->use_depth_stencil_attachment
+ ? builder->create_info->pDepthStencilState
+ : &dummy_ds_info;
- if (ds_info->depthBoundsTestEnable && !ds_info->depthTestEnable)
- tu6_apply_depth_bounds_workaround(builder->device, &rb_depth_cntl);
- }
-
- if (!builder->attachment_state_valid ||
- builder->depth_attachment_format != VK_FORMAT_UNDEFINED) {
- const VkStencilOpState *front = &ds_info->front;
- const VkStencilOpState *back = &ds_info->back;
-
- rb_stencil_cntl |=
- A6XX_RB_STENCIL_CONTROL_FUNC(tu6_compare_func(front->compareOp)) |
- A6XX_RB_STENCIL_CONTROL_FAIL(tu6_stencil_op(front->failOp)) |
- A6XX_RB_STENCIL_CONTROL_ZPASS(tu6_stencil_op(front->passOp)) |
- A6XX_RB_STENCIL_CONTROL_ZFAIL(tu6_stencil_op(front->depthFailOp)) |
- A6XX_RB_STENCIL_CONTROL_FUNC_BF(tu6_compare_func(back->compareOp)) |
- A6XX_RB_STENCIL_CONTROL_FAIL_BF(tu6_stencil_op(back->failOp)) |
- A6XX_RB_STENCIL_CONTROL_ZPASS_BF(tu6_stencil_op(back->passOp)) |
- A6XX_RB_STENCIL_CONTROL_ZFAIL_BF(tu6_stencil_op(back->depthFailOp));
-
- if (ds_info->stencilTestEnable) {
- rb_stencil_cntl |=
- A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE |
- A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
- A6XX_RB_STENCIL_CONTROL_STENCIL_READ;
- }
-
- pipeline->ds.raster_order_attachment_access =
- ds_info->flags &
- (VK_PIPELINE_DEPTH_STENCIL_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_DEPTH_ACCESS_BIT_ARM |
- VK_PIPELINE_DEPTH_STENCIL_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_STENCIL_ACCESS_BIT_ARM);
-
- pipeline->ds.write_enable =
- ds_info->depthWriteEnable || ds_info->stencilTestEnable;
- }
-
- pipeline->ds.rb_depth_cntl = rb_depth_cntl;
-
- if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RB_STENCIL_CNTL, 2)) {
- tu_cs_emit_pkt4(&cs, REG_A6XX_RB_STENCIL_CONTROL, 1);
- tu_cs_emit(&cs, rb_stencil_cntl);
- }
- pipeline->ds.rb_stencil_cntl = rb_stencil_cntl;
+ struct tu_cs ds_cs;
+ tu_cs_begin_sub_stream(builder->device, &pipeline->cs, 12, &ds_cs);
-   /* the remaining draw states aren't used if there is no d/s, leave them empty */
- if (builder->depth_attachment_format == VK_FORMAT_UNDEFINED &&
- builder->attachment_state_valid)
- return;
+ /* move to hw ctx init? */
+ tu6_emit_alpha_control_disable(&ds_cs);
- if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_DEPTH_BOUNDS, 3)) {
- tu_cs_emit_regs(&cs,
- A6XX_RB_Z_BOUNDS_MIN(ds_info->minDepthBounds),
- A6XX_RB_Z_BOUNDS_MAX(ds_info->maxDepthBounds));
- }
+ tu6_emit_depth_control(&ds_cs, ds_info);
+ tu6_emit_stencil_control(&ds_cs, ds_info);
- if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, 2)) {
- tu_cs_emit_regs(&cs, A6XX_RB_STENCILMASK(.mask = ds_info->front.compareMask & 0xff,
- .bfmask = ds_info->back.compareMask & 0xff));
+ if (!(pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_COMPARE_MASK)) {
+ tu6_emit_stencil_compare_mask(&ds_cs, ds_info->front.compareMask,
+ ds_info->back.compareMask);
}
-
- if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, 2)) {
- update_stencil_mask(&pipeline->ds.stencil_wrmask, VK_STENCIL_FACE_FRONT_BIT, ds_info->front.writeMask);
- update_stencil_mask(&pipeline->ds.stencil_wrmask, VK_STENCIL_FACE_BACK_BIT, ds_info->back.writeMask);
- tu_cs_emit_regs(&cs, A6XX_RB_STENCILWRMASK(.dword = pipeline->ds.stencil_wrmask));
+ if (!(pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_WRITE_MASK)) {
+ tu6_emit_stencil_write_mask(&ds_cs, ds_info->front.writeMask,
+ ds_info->back.writeMask);
}
-
- if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_REFERENCE, 2)) {
- tu_cs_emit_regs(&cs, A6XX_RB_STENCILREF(.ref = ds_info->front.reference & 0xff,
- .bfref = ds_info->back.reference & 0xff));
+ if (!(pipeline->dynamic_state.mask & TU_DYNAMIC_STENCIL_REFERENCE)) {
+ tu6_emit_stencil_reference(&ds_cs, ds_info->front.reference,
+ ds_info->back.reference);
}
- if (builder->variants[MESA_SHADER_FRAGMENT]) {
- const struct ir3_shader_variant *fs = builder->variants[MESA_SHADER_FRAGMENT];
- if (fs->has_kill) {
- pipeline->lrz.force_disable_mask |= TU_LRZ_FORCE_DISABLE_WRITE;
- }
- if (fs->no_earlyz || fs->writes_pos) {
- pipeline->lrz.force_disable_mask = TU_LRZ_FORCE_DISABLE_LRZ;
- }
- }
-}
-
-static void
-tu_pipeline_builder_parse_rast_ds(struct tu_pipeline_builder *builder,
- struct tu_pipeline *pipeline)
-{
- if (builder->rasterizer_discard)
- return;
-
- pipeline->rast_ds.rb_depth_cntl =
- pipeline->rast.rb_depth_cntl | pipeline->ds.rb_depth_cntl;
- pipeline->rast_ds.rb_depth_cntl_mask = pipeline->ds.rb_depth_cntl_mask;
-
- struct tu_cs cs;
- if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RB_DEPTH_CNTL, 2)) {
- tu_cs_emit_pkt4(&cs, REG_A6XX_RB_DEPTH_CNTL, 1);
- if (pipeline->output.rb_depth_cntl_disable)
- tu_cs_emit(&cs, 0);
- else
- tu_cs_emit(&cs, pipeline->rast_ds.rb_depth_cntl);
- }
+ pipeline->ds.state_ib = tu_cs_end_sub_stream(&pipeline->cs, &ds_cs);
}
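/*
 * For reference, a sketch of the app-side depth/stencil state whose front and
 * back VkStencilOpState the register packing above consumes; all values here
 * are hypothetical:
 */
static const VkPipelineDepthStencilStateCreateInfo example_ds_info = {
   .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
   .depthTestEnable = VK_TRUE,
   .depthWriteEnable = VK_TRUE,
   .depthCompareOp = VK_COMPARE_OP_LESS_OR_EQUAL,
   .stencilTestEnable = VK_TRUE,
   .front = { .failOp = VK_STENCIL_OP_KEEP, .passOp = VK_STENCIL_OP_REPLACE,
              .depthFailOp = VK_STENCIL_OP_KEEP, .compareOp = VK_COMPARE_OP_ALWAYS,
              .compareMask = 0xff, .writeMask = 0xff, .reference = 1 },
   .back  = { .failOp = VK_STENCIL_OP_KEEP, .passOp = VK_STENCIL_OP_REPLACE,
              .depthFailOp = VK_STENCIL_OP_KEEP, .compareOp = VK_COMPARE_OP_ALWAYS,
              .compareMask = 0xff, .writeMask = 0xff, .reference = 1 },
};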
static void
@@ -4351,189 +1681,31 @@ tu_pipeline_builder_parse_multisample_and_color_blend(
*
* We leave the relevant registers stale when rasterization is disabled.
*/
- if (builder->rasterizer_discard) {
- pipeline->output.samples = VK_SAMPLE_COUNT_1_BIT;
+ if (builder->rasterizer_discard)
return;
- }
-
- pipeline->output.feedback_loop_may_involve_textures =
- builder->feedback_loop_may_involve_textures;
static const VkPipelineColorBlendStateCreateInfo dummy_blend_info;
const VkPipelineMultisampleStateCreateInfo *msaa_info =
builder->create_info->pMultisampleState;
- pipeline->output.samples = msaa_info->rasterizationSamples;
-
const VkPipelineColorBlendStateCreateInfo *blend_info =
builder->use_color_attachments ? builder->create_info->pColorBlendState
: &dummy_blend_info;
- bool no_earlyz = builder->depth_attachment_format == VK_FORMAT_S8_UINT ||
- /* alpha to coverage can behave like a discard */
- msaa_info->alphaToCoverageEnable;
- pipeline->lrz.force_late_z |= no_earlyz;
-
- pipeline->output.subpass_feedback_loop_color =
- builder->subpass_feedback_loop_color;
- pipeline->output.subpass_feedback_loop_ds =
- builder->subpass_feedback_loop_ds;
-
- if (builder->use_color_attachments) {
- pipeline->blend.raster_order_attachment_access =
- blend_info->flags &
- VK_PIPELINE_COLOR_BLEND_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_ACCESS_BIT_ARM;
- }
-
- const enum pipe_format ds_pipe_format =
- vk_format_to_pipe_format(builder->depth_attachment_format);
+ struct tu_cs blend_cs;
+ tu_cs_begin_sub_stream(builder->device, &pipeline->cs, MAX_RTS * 3 + 9,
+ &blend_cs);
- if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED &&
- builder->depth_attachment_format != VK_FORMAT_S8_UINT) {
- pipeline->output.depth_cpp_per_sample = util_format_get_component_bits(
- ds_pipe_format, UTIL_FORMAT_COLORSPACE_ZS, 0) / 8;
- } else {
- /* We need to make sure RB_DEPTH_CNTL is set to 0 when this pipeline is
- * used, regardless of whether it's linked with a fragment shader
- * pipeline that has an enabled depth test or if RB_DEPTH_CNTL is set
- * dynamically.
- */
- pipeline->output.rb_depth_cntl_disable = true;
- }
-
- if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED) {
- pipeline->output.stencil_cpp_per_sample = util_format_get_component_bits(
- ds_pipe_format, UTIL_FORMAT_COLORSPACE_ZS, 1) / 8;
- }
-
- struct tu_cs cs;
- tu6_emit_rb_mrt_controls(pipeline, blend_info,
+ uint32_t blend_enable_mask;
+ tu6_emit_rb_mrt_controls(&blend_cs, blend_info,
builder->color_attachment_formats,
- &pipeline->blend.rop_reads_dst,
- &pipeline->output.color_bandwidth_per_sample);
-
- if (msaa_info->alphaToCoverageEnable && pipeline->blend.num_rts == 0) {
- /* In addition to changing the *_OUTPUT_CNTL1 registers, this will also
- * make sure we disable memory writes for MRT0 rather than using
- * whatever setting was leftover.
- */
- pipeline->blend.num_rts = 1;
- }
-
- uint32_t blend_enable_mask =
- pipeline->blend.rop_reads_dst ?
- pipeline->blend.color_write_enable :
- pipeline->blend.blend_enable;
- tu6_emit_blend_control(pipeline, blend_enable_mask,
- tu_blend_state_is_dual_src(blend_info), msaa_info);
-
- if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_BLEND,
- pipeline->blend.num_rts * 3 + 8)) {
- tu6_emit_blend(&cs, pipeline);
- assert(cs.cur == cs.end); /* validate draw state size */
- }
-
- /* Disable LRZ writes when blend or logic op that reads the destination is
- * enabled, since the resulting pixel value from the blend-draw depends on
- * an earlier draw, which LRZ in the draw pass could early-reject if the
- * previous blend-enabled draw wrote LRZ.
- *
- * TODO: We need to disable LRZ writes only for the binning pass.
- * Therefore, we need to emit it in a separate draw state. We keep
- * it disabled for sysmem path as well for the moment.
- */
- if (blend_enable_mask)
- pipeline->lrz.force_disable_mask |= TU_LRZ_FORCE_DISABLE_WRITE;
-
- for (int i = 0; i < blend_info->attachmentCount; i++) {
- VkPipelineColorBlendAttachmentState blendAttachment = blend_info->pAttachments[i];
- /* From the PoV of LRZ, having masked color channels is
- * the same as having blend enabled, in that the draw will
- * care about the fragments from an earlier draw.
- */
- VkFormat format = builder->color_attachment_formats[i];
- unsigned mask = MASK(vk_format_get_nr_components(format));
- if (format != VK_FORMAT_UNDEFINED &&
- ((blendAttachment.colorWriteMask & mask) != mask ||
- !(pipeline->blend.color_write_enable & BIT(i)))) {
- pipeline->lrz.force_disable_mask |= TU_LRZ_FORCE_DISABLE_WRITE;
- }
- }
-
- if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_BLEND_CONSTANTS, 5)) {
- tu_cs_emit_pkt4(&cs, REG_A6XX_RB_BLEND_RED_F32, 4);
- tu_cs_emit_array(&cs, (const uint32_t *) blend_info->blendConstants, 4);
- }
+ &blend_enable_mask);
- const struct VkPipelineSampleLocationsStateCreateInfoEXT *sample_locations =
- vk_find_struct_const(msaa_info->pNext, PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT);
- const VkSampleLocationsInfoEXT *samp_loc = NULL;
+ if (!(pipeline->dynamic_state.mask & TU_DYNAMIC_BLEND_CONSTANTS))
+ tu6_emit_blend_constants(&blend_cs, blend_info->blendConstants);
- if (sample_locations && sample_locations->sampleLocationsEnable)
- samp_loc = &sample_locations->sampleLocationsInfo;
+ tu6_emit_blend_control(&blend_cs, blend_enable_mask, msaa_info);
- if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_SAMPLE_LOCATIONS,
- samp_loc ? 9 : 6)) {
- tu6_emit_sample_locations(&cs, samp_loc);
- }
-}
-
-static void
-tu_pipeline_builder_parse_rasterization_order(
- struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
-{
- if (builder->rasterizer_discard)
- return;
-
- bool raster_order_attachment_access =
- pipeline->blend.raster_order_attachment_access ||
- pipeline->ds.raster_order_attachment_access ||
- unlikely(builder->device->physical_device->instance->debug_flags & TU_DEBUG_RAST_ORDER);
-
- /* VK_EXT_blend_operation_advanced would also require ordered access
- * when implemented in the future.
- */
-
- uint32_t sysmem_prim_mode = NO_FLUSH;
- uint32_t gmem_prim_mode = NO_FLUSH;
-
- if (raster_order_attachment_access) {
- /* VK_EXT_rasterization_order_attachment_access:
- *
-       * This extension allows access to framebuffer attachments when used as
- * both input and color attachments from one fragment to the next,
- * in rasterization order, without explicit synchronization.
- */
- sysmem_prim_mode = FLUSH_PER_OVERLAP_AND_OVERWRITE;
- gmem_prim_mode = FLUSH_PER_OVERLAP;
- pipeline->prim_order.sysmem_single_prim_mode = true;
- } else {
- /* If there is a feedback loop, then the shader can read the previous value
- * of a pixel being written out. It can also write some components and then
- * read different components without a barrier in between. This is a
- * problem in sysmem mode with UBWC, because the main buffer and flags
- * buffer can get out-of-sync if only one is flushed. We fix this by
- * setting the SINGLE_PRIM_MODE field to the same value that the blob does
- * for advanced_blend in sysmem mode if a feedback loop is detected.
- */
- if (pipeline->output.subpass_feedback_loop_color ||
- (pipeline->output.subpass_feedback_loop_ds &&
- pipeline->ds.write_enable)) {
- sysmem_prim_mode = FLUSH_PER_OVERLAP_AND_OVERWRITE;
- pipeline->prim_order.sysmem_single_prim_mode = true;
- }
- }
-
- struct tu_cs cs;
-
- pipeline->prim_order.state_gmem = tu_cs_draw_state(&pipeline->cs, &cs, 2);
- tu_cs_emit_write_reg(&cs, REG_A6XX_GRAS_SC_CNTL,
- A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2) |
- A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE(gmem_prim_mode));
-
- pipeline->prim_order.state_sysmem = tu_cs_draw_state(&pipeline->cs, &cs, 2);
- tu_cs_emit_write_reg(&cs, REG_A6XX_GRAS_SC_CNTL,
- A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2) |
- A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE(sysmem_prim_mode));
+ pipeline->blend.state_ib = tu_cs_end_sub_stream(&pipeline->cs, &blend_cs);
}
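/*
 * For reference, a sketch of the VK_EXT_sample_locations state parsed above
 * (chained into VkPipelineMultisampleStateCreateInfo::pNext); the four sample
 * positions are hypothetical:
 */
static const VkSampleLocationEXT example_locations[4] = {
   { 0.25f, 0.25f }, { 0.75f, 0.25f }, { 0.25f, 0.75f }, { 0.75f, 0.75f },
};
static const VkPipelineSampleLocationsStateCreateInfoEXT example_sample_locations = {
   .sType = VK_STRUCTURE_TYPE_PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT,
   .sampleLocationsEnable = VK_TRUE,
   .sampleLocationsInfo = {
      .sType = VK_STRUCTURE_TYPE_SAMPLE_LOCATIONS_INFO_EXT,
      .sampleLocationsPerPixel = VK_SAMPLE_COUNT_4_BIT,
      .sampleLocationGridSize = { 1, 1 },
      .sampleLocationsCount = 4,
      .pSampleLocations = example_locations,
   },
};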
static void
@@ -4541,161 +1713,45 @@ tu_pipeline_finish(struct tu_pipeline *pipeline,
struct tu_device *dev,
const VkAllocationCallbacks *alloc)
{
- tu_cs_finish(&pipeline->cs);
- pthread_mutex_lock(&dev->pipeline_mutex);
- tu_suballoc_bo_free(&dev->pipeline_suballoc, &pipeline->bo);
- pthread_mutex_unlock(&dev->pipeline_mutex);
-
- if (pipeline->pvtmem_bo)
- tu_bo_finish(dev, pipeline->pvtmem_bo);
-
- if (pipeline->compiled_shaders)
- vk_pipeline_cache_object_unref(&pipeline->compiled_shaders->base);
-
- if (pipeline->nir_shaders)
- vk_pipeline_cache_object_unref(&pipeline->nir_shaders->base);
+ tu_cs_finish(dev, &pipeline->cs);
- for (unsigned i = 0; i < pipeline->num_sets; i++) {
- if (pipeline->layouts[i])
- vk_descriptor_set_layout_unref(&dev->vk, &pipeline->layouts[i]->vk);
- }
-
- ralloc_free(pipeline->executables_mem_ctx);
+ if (pipeline->program.binary_bo.gem_handle)
+ tu_bo_finish(dev, &pipeline->program.binary_bo);
}
-
static VkResult
tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
struct tu_pipeline **pipeline)
{
- VkResult result;
-
- *pipeline = vk_object_zalloc(&builder->device->vk, builder->alloc,
- sizeof(**pipeline), VK_OBJECT_TYPE_PIPELINE);
- if (!*pipeline)
- return VK_ERROR_OUT_OF_HOST_MEMORY;
-
- (*pipeline)->executables_mem_ctx = ralloc_context(NULL);
- util_dynarray_init(&(*pipeline)->executables, (*pipeline)->executables_mem_ctx);
-
- tu_pipeline_builder_parse_dynamic(builder, *pipeline);
- tu_pipeline_builder_parse_libraries(builder, *pipeline);
-
- VkShaderStageFlags stages = 0;
- for (unsigned i = 0; i < builder->create_info->stageCount; i++) {
- stages |= builder->create_info->pStages[i].stage;
- }
- builder->active_stages = stages;
-
- (*pipeline)->active_stages = stages;
- for (unsigned i = 0; i < builder->num_libraries; i++)
- (*pipeline)->active_stages |= builder->libraries[i]->active_stages;
+ VkResult result = tu_pipeline_builder_create_pipeline(builder, pipeline);
+ if (result != VK_SUCCESS)
+ return result;
- /* Compile and upload shaders unless a library has already done that. */
- if ((*pipeline)->program.state.size == 0) {
- tu_pipeline_builder_parse_layout(builder, *pipeline);
+ /* compile and upload shaders */
+ result = tu_pipeline_builder_compile_shaders(builder);
+ if (result == VK_SUCCESS)
+ result = tu_pipeline_builder_upload_shaders(builder, *pipeline);
+ if (result != VK_SUCCESS) {
+ tu_pipeline_finish(*pipeline, builder->device, builder->alloc);
+ vk_free2(&builder->device->alloc, builder->alloc, *pipeline);
+ *pipeline = VK_NULL_HANDLE;
- result = tu_pipeline_builder_compile_shaders(builder, *pipeline);
- if (result != VK_SUCCESS) {
- vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
- return result;
- }
+ return result;
}
- result = tu_pipeline_allocate_cs(builder->device, *pipeline,
- &builder->layout, builder, NULL);
-
-
- /* This has to come before emitting the program so that
- * pipeline->tess.patch_control_points and pipeline->rast.multiview_mask
- * are always set.
+ tu_pipeline_builder_parse_dynamic(builder, *pipeline);
+ tu_pipeline_builder_parse_shader_stages(builder, *pipeline);
+ tu_pipeline_builder_parse_vertex_input(builder, *pipeline);
+ tu_pipeline_builder_parse_input_assembly(builder, *pipeline);
+ tu_pipeline_builder_parse_viewport(builder, *pipeline);
+ tu_pipeline_builder_parse_rasterization(builder, *pipeline);
+ tu_pipeline_builder_parse_depth_stencil(builder, *pipeline);
+ tu_pipeline_builder_parse_multisample_and_color_blend(builder, *pipeline);
+
+ /* we should have reserved enough space upfront such that the CS never
+ * grows
*/
- if (builder->state &
- VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) {
- tu_pipeline_builder_parse_tessellation(builder, *pipeline);
- (*pipeline)->rast.multiview_mask = builder->multiview_mask;
- }
-
- if (set_combined_state(builder, *pipeline,
- VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
- VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) {
- if (result != VK_SUCCESS) {
- vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
- return result;
- }
-
- for (uint32_t i = 0; i < ARRAY_SIZE(builder->shader_iova); i++)
- builder->shader_iova[i] =
- tu_upload_variant(*pipeline, builder->variants[i]);
-
- builder->binning_vs_iova =
- tu_upload_variant(*pipeline, builder->binning_variant);
-
- /* Setup private memory. Note that because we're sharing the same private
- * memory for all stages, all stages must use the same config, or else
- * fibers from one stage might overwrite fibers in another.
- */
-
- uint32_t pvtmem_size = 0;
- bool per_wave = true;
- for (uint32_t i = 0; i < ARRAY_SIZE(builder->variants); i++) {
- if (builder->variants[i]) {
- pvtmem_size = MAX2(pvtmem_size, builder->variants[i]->pvtmem_size);
- if (!builder->variants[i]->pvtmem_per_wave)
- per_wave = false;
- }
- }
-
- if (builder->binning_variant) {
- pvtmem_size = MAX2(pvtmem_size, builder->binning_variant->pvtmem_size);
- if (!builder->binning_variant->pvtmem_per_wave)
- per_wave = false;
- }
-
- result = tu_setup_pvtmem(builder->device, *pipeline, &builder->pvtmem,
- pvtmem_size, per_wave);
- if (result != VK_SUCCESS) {
- vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
- return result;
- }
-
- tu_pipeline_builder_parse_shader_stages(builder, *pipeline);
- tu6_emit_load_state(*pipeline, &builder->layout);
- }
-
- if (builder->state &
- VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT) {
- tu_pipeline_builder_parse_vertex_input(builder, *pipeline);
- tu_pipeline_builder_parse_input_assembly(builder, *pipeline);
- }
-
- if (builder->state &
- VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) {
- tu_pipeline_builder_parse_viewport(builder, *pipeline);
- tu_pipeline_builder_parse_rasterization(builder, *pipeline);
- }
-
- if (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
- tu_pipeline_builder_parse_depth_stencil(builder, *pipeline);
- }
-
- if (builder->state &
- VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT) {
- tu_pipeline_builder_parse_multisample_and_color_blend(builder, *pipeline);
- }
-
- if (set_combined_state(builder, *pipeline,
- VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT |
- VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) {
- tu_pipeline_builder_parse_rasterization_order(builder, *pipeline);
- }
-
- if (set_combined_state(builder, *pipeline,
- VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
- VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT |
- VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) {
- tu_pipeline_builder_parse_rast_ds(builder, *pipeline);
- }
+ assert((*pipeline)->cs.bo_count == 1);
return VK_SUCCESS;
}
@@ -4703,231 +1759,59 @@ tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
static void
tu_pipeline_builder_finish(struct tu_pipeline_builder *builder)
{
- if (builder->compiled_shaders)
- vk_pipeline_cache_object_unref(&builder->compiled_shaders->base);
- ralloc_free(builder->mem_ctx);
+ for (uint32_t i = 0; i < MESA_SHADER_STAGES; i++) {
+ if (!builder->shaders[i])
+ continue;
+ tu_shader_destroy(builder->device, builder->shaders[i], builder->alloc);
+ }
}
static void
tu_pipeline_builder_init_graphics(
struct tu_pipeline_builder *builder,
struct tu_device *dev,
- struct vk_pipeline_cache *cache,
+ struct tu_pipeline_cache *cache,
const VkGraphicsPipelineCreateInfo *create_info,
const VkAllocationCallbacks *alloc)
{
*builder = (struct tu_pipeline_builder) {
.device = dev,
- .mem_ctx = ralloc_context(NULL),
.cache = cache,
.create_info = create_info,
.alloc = alloc,
};
- const VkGraphicsPipelineLibraryCreateInfoEXT *gpl_info =
- vk_find_struct_const(builder->create_info->pNext,
- GRAPHICS_PIPELINE_LIBRARY_CREATE_INFO_EXT);
-
- const VkPipelineLibraryCreateInfoKHR *library_info =
- vk_find_struct_const(builder->create_info->pNext,
- PIPELINE_LIBRARY_CREATE_INFO_KHR);
-
- if (gpl_info) {
- builder->state = gpl_info->flags;
- } else {
- /* Implement this bit of spec text:
- *
- * If this structure is omitted, and either
- * VkGraphicsPipelineCreateInfo::flags includes
- * VK_PIPELINE_CREATE_LIBRARY_BIT_KHR or the
- * VkGraphicsPipelineCreateInfo::pNext chain includes a
- * VkPipelineLibraryCreateInfoKHR structure with a libraryCount
- * greater than 0, it is as if flags is 0. Otherwise if this
- * structure is omitted, it is as if flags includes all possible
- * subsets of the graphics pipeline (i.e. a complete graphics
- * pipeline).
- */
- if ((library_info && library_info->libraryCount > 0) ||
- (builder->create_info->flags & VK_PIPELINE_CREATE_LIBRARY_BIT_KHR)) {
- builder->state = 0;
- } else {
- builder->state =
- VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT |
- VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
- VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT |
- VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT;
- }
- }
-
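/*
 * For reference, a sketch of the app-side structure read above when building
 * a partial pipeline (VK_EXT_graphics_pipeline_library); the subset requested
 * here is just an example:
 */
static const VkGraphicsPipelineLibraryCreateInfoEXT example_gpl_info = {
   .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_LIBRARY_CREATE_INFO_EXT,
   .flags = VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT,
   /* chained into VkGraphicsPipelineCreateInfo::pNext, together with
    * VK_PIPELINE_CREATE_LIBRARY_BIT_KHR in the create flags */
};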
- bool rasterizer_discard_dynamic = false;
- if (create_info->pDynamicState) {
- for (uint32_t i = 0; i < create_info->pDynamicState->dynamicStateCount; i++) {
- if (create_info->pDynamicState->pDynamicStates[i] ==
- VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE) {
- rasterizer_discard_dynamic = true;
- break;
- }
- }
- }
-
builder->rasterizer_discard =
- (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) &&
- builder->create_info->pRasterizationState->rasterizerDiscardEnable &&
- !rasterizer_discard_dynamic;
-
- if (builder->state &
- (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
- VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT |
- VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) {
- const VkPipelineRenderingCreateInfo *rendering_info =
- vk_find_struct_const(create_info->pNext, PIPELINE_RENDERING_CREATE_INFO);
-
- if (unlikely(dev->instance->debug_flags & TU_DEBUG_DYNAMIC) && !rendering_info)
- rendering_info = vk_get_pipeline_rendering_create_info(create_info);
-
- /* Get multiview_mask, which is only used for shaders */
- if (builder->state &
- (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
- VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) {
- if (rendering_info) {
- builder->multiview_mask = rendering_info->viewMask;
- } else {
- const struct tu_render_pass *pass =
- tu_render_pass_from_handle(create_info->renderPass);
- const struct tu_subpass *subpass =
- &pass->subpasses[create_info->subpass];
- builder->multiview_mask = subpass->multiview_mask;
- }
- }
-
- /* Get the attachment state. This is valid:
- *
- * - With classic renderpasses, when either fragment shader or fragment
- * output interface state is being compiled. This includes when we
- * emulate classic renderpasses with dynamic rendering with the debug
- * flag.
- * - With dynamic rendering (renderPass is NULL) only when compiling the
- * output interface state.
- *
- * We only actually need this for the fragment output interface state,
- * but the spec also requires us to skip parsing depth/stencil state
- * when the attachment state is defined *and* no depth/stencil
-    * attachment is used, so we have to parse it for fragment shader
- * state when possible. Life is pain.
- */
- if (((builder->state &
- VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT) ||
- ((builder->state &
- VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) &&
- builder->create_info->renderPass)) &&
- rendering_info) {
- builder->subpass_raster_order_attachment_access = false;
- builder->subpass_feedback_loop_ds = false;
- builder->subpass_feedback_loop_color = false;
-
- const VkRenderingSelfDependencyInfoMESA *self_dependency =
- vk_find_struct_const(rendering_info->pNext, RENDERING_SELF_DEPENDENCY_INFO_MESA);
-
- if (self_dependency) {
- builder->subpass_feedback_loop_ds =
- self_dependency->depthSelfDependency ||
- self_dependency->stencilSelfDependency;
- builder->subpass_feedback_loop_color =
- self_dependency->colorSelfDependencies;
- }
-
- if (!builder->rasterizer_discard) {
- builder->depth_attachment_format =
- rendering_info->depthAttachmentFormat == VK_FORMAT_UNDEFINED ?
- rendering_info->stencilAttachmentFormat :
- rendering_info->depthAttachmentFormat;
-
- for (unsigned i = 0; i < rendering_info->colorAttachmentCount; i++) {
- builder->color_attachment_formats[i] =
- rendering_info->pColorAttachmentFormats[i];
- if (builder->color_attachment_formats[i] != VK_FORMAT_UNDEFINED) {
- builder->use_color_attachments = true;
- }
- }
- }
+ create_info->pRasterizationState->rasterizerDiscardEnable;
- builder->attachment_state_valid = true;
- } else if ((builder->state &
- (VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT |
- VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) &&
- create_info->renderPass != VK_NULL_HANDLE) {
- const struct tu_render_pass *pass =
- tu_render_pass_from_handle(create_info->renderPass);
- const struct tu_subpass *subpass =
- &pass->subpasses[create_info->subpass];
-
- builder->subpass_raster_order_attachment_access =
- subpass->raster_order_attachment_access;
- builder->subpass_feedback_loop_color = subpass->feedback_loop_color;
- builder->subpass_feedback_loop_ds = subpass->feedback_loop_ds;
-
- if (!builder->rasterizer_discard) {
- const uint32_t a = subpass->depth_stencil_attachment.attachment;
- builder->depth_attachment_format = (a != VK_ATTACHMENT_UNUSED) ?
- pass->attachments[a].format : VK_FORMAT_UNDEFINED;
-
- assert(subpass->color_count == 0 ||
- !create_info->pColorBlendState ||
- subpass->color_count == create_info->pColorBlendState->attachmentCount);
- for (uint32_t i = 0; i < subpass->color_count; i++) {
- const uint32_t a = subpass->color_attachments[i].attachment;
- if (a == VK_ATTACHMENT_UNUSED)
- continue;
-
- builder->color_attachment_formats[i] = pass->attachments[a].format;
- builder->use_color_attachments = true;
- }
- }
+ if (builder->rasterizer_discard) {
+ builder->samples = VK_SAMPLE_COUNT_1_BIT;
+ } else {
+ builder->samples = create_info->pMultisampleState->rasterizationSamples;
+
+ const struct tu_render_pass *pass =
+ tu_render_pass_from_handle(create_info->renderPass);
+ const struct tu_subpass *subpass =
+ &pass->subpasses[create_info->subpass];
+
+ builder->use_depth_stencil_attachment =
+ subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED;
+
+ assert(subpass->color_count ==
+ create_info->pColorBlendState->attachmentCount);
+ builder->color_attachment_count = subpass->color_count;
+ for (uint32_t i = 0; i < subpass->color_count; i++) {
+ const uint32_t a = subpass->color_attachments[i].attachment;
+ if (a == VK_ATTACHMENT_UNUSED)
+ continue;
- builder->attachment_state_valid = true;
+ builder->color_attachment_formats[i] = pass->attachments[a].format;
+ builder->use_color_attachments = true;
}
}
-
- if (builder->create_info->flags & VK_PIPELINE_CREATE_COLOR_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT) {
- builder->subpass_feedback_loop_color = true;
- builder->feedback_loop_may_involve_textures = true;
- }
-
- if (builder->create_info->flags & VK_PIPELINE_CREATE_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT) {
- builder->subpass_feedback_loop_ds = true;
- builder->feedback_loop_may_involve_textures = true;
- }
}
-static VkResult
-tu_graphics_pipeline_create(VkDevice device,
- VkPipelineCache pipelineCache,
- const VkGraphicsPipelineCreateInfo *pCreateInfo,
- const VkAllocationCallbacks *pAllocator,
- VkPipeline *pPipeline)
-{
- TU_FROM_HANDLE(tu_device, dev, device);
- TU_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
-
- cache = cache ? cache : dev->mem_cache;
-
- struct tu_pipeline_builder builder;
- tu_pipeline_builder_init_graphics(&builder, dev, cache,
- pCreateInfo, pAllocator);
-
- struct tu_pipeline *pipeline = NULL;
- VkResult result = tu_pipeline_builder_build(&builder, &pipeline);
- tu_pipeline_builder_finish(&builder);
-
- if (result == VK_SUCCESS)
- *pPipeline = tu_pipeline_to_handle(pipeline);
- else
- *pPipeline = VK_NULL_HANDLE;
-
- return result;
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_CreateGraphicsPipelines(VkDevice device,
VkPipelineCache pipelineCache,
uint32_t count,
@@ -4935,242 +1819,68 @@ tu_CreateGraphicsPipelines(VkDevice device,
const VkAllocationCallbacks *pAllocator,
VkPipeline *pPipelines)
{
- MESA_TRACE_FUNC();
- VkResult final_result = VK_SUCCESS;
- uint32_t i = 0;
+ TU_FROM_HANDLE(tu_device, dev, device);
+ TU_FROM_HANDLE(tu_pipeline_cache, cache, pipelineCache);
- for (; i < count; i++) {
- VkResult result = tu_graphics_pipeline_create(device, pipelineCache,
- &pCreateInfos[i], pAllocator,
- &pPipelines[i]);
+ for (uint32_t i = 0; i < count; i++) {
+ struct tu_pipeline_builder builder;
+ tu_pipeline_builder_init_graphics(&builder, dev, cache,
+ &pCreateInfos[i], pAllocator);
+
+ struct tu_pipeline *pipeline;
+ VkResult result = tu_pipeline_builder_build(&builder, &pipeline);
+ tu_pipeline_builder_finish(&builder);
if (result != VK_SUCCESS) {
- final_result = result;
- pPipelines[i] = VK_NULL_HANDLE;
+ for (uint32_t j = 0; j < i; j++) {
+ tu_DestroyPipeline(device, pPipelines[j], pAllocator);
+ pPipelines[j] = VK_NULL_HANDLE;
+ }
- if (pCreateInfos[i].flags &
- VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT)
- break;
+ return result;
}
- }
- for (; i < count; i++)
- pPipelines[i] = VK_NULL_HANDLE;
+ pPipelines[i] = tu_pipeline_to_handle(pipeline);
+ }
- return final_result;
+ return VK_SUCCESS;
}
static VkResult
-tu_compute_pipeline_create(VkDevice device,
- VkPipelineCache pipelineCache,
+tu_compute_pipeline_create(VkDevice _device,
+ VkPipelineCache _cache,
const VkComputePipelineCreateInfo *pCreateInfo,
const VkAllocationCallbacks *pAllocator,
VkPipeline *pPipeline)
{
- TU_FROM_HANDLE(tu_device, dev, device);
- TU_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
- TU_FROM_HANDLE(tu_pipeline_layout, layout, pCreateInfo->layout);
- const VkPipelineShaderStageCreateInfo *stage_info = &pCreateInfo->stage;
- VkResult result;
-
- cache = cache ? cache : dev->mem_cache;
-
- struct tu_pipeline *pipeline;
-
- *pPipeline = VK_NULL_HANDLE;
-
- VkPipelineCreationFeedback pipeline_feedback = {
- .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
- };
-
- const VkPipelineCreationFeedbackCreateInfo *creation_feedback =
- vk_find_struct_const(pCreateInfo->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
-
- int64_t pipeline_start = os_time_get_nano();
-
- pipeline = vk_object_zalloc(&dev->vk, pAllocator, sizeof(*pipeline),
- VK_OBJECT_TYPE_PIPELINE);
- if (!pipeline)
- return VK_ERROR_OUT_OF_HOST_MEMORY;
-
- pipeline->executables_mem_ctx = ralloc_context(NULL);
- util_dynarray_init(&pipeline->executables, pipeline->executables_mem_ctx);
- pipeline->active_stages = VK_SHADER_STAGE_COMPUTE_BIT;
-
- struct tu_shader_key key = { };
- tu_shader_key_init(&key, stage_info, dev);
-
- void *pipeline_mem_ctx = ralloc_context(NULL);
-
- unsigned char pipeline_sha1[20];
- tu_hash_compute(pipeline_sha1, stage_info, layout, &key, dev->compiler);
-
- struct tu_compiled_shaders *compiled = NULL;
-
- const bool executable_info = pCreateInfo->flags &
- VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
-
- bool application_cache_hit = false;
-
- if (!executable_info) {
- compiled =
- tu_pipeline_cache_lookup(cache, pipeline_sha1, sizeof(pipeline_sha1),
- &application_cache_hit);
- }
-
- if (application_cache_hit && cache != dev->mem_cache) {
- pipeline_feedback.flags |=
- VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
- }
-
- if (tu6_shared_constants_enable(layout, dev->compiler)) {
- pipeline->shared_consts = (struct tu_push_constant_range) {
- .lo = 0,
- .dwords = layout->push_constant_size / 4,
- };
- }
-
- char *nir_initial_disasm = NULL;
-
- if (!compiled) {
- if (pCreateInfo->flags &
- VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT) {
- result = VK_PIPELINE_COMPILE_REQUIRED;
- goto fail;
- }
-
- struct ir3_shader_key ir3_key = {};
-
- nir_shader *nir = tu_spirv_to_nir(dev, pipeline_mem_ctx, stage_info,
- MESA_SHADER_COMPUTE);
-
- nir_initial_disasm = executable_info ?
- nir_shader_as_str(nir, pipeline->executables_mem_ctx) : NULL;
-
- struct tu_shader *shader =
- tu_shader_create(dev, nir, &key, layout, pAllocator);
- if (!shader) {
- result = VK_ERROR_OUT_OF_HOST_MEMORY;
- goto fail;
- }
-
- compiled = tu_shaders_init(dev, &pipeline_sha1, sizeof(pipeline_sha1));
- if (!compiled) {
- tu_shader_destroy(dev, shader, pAllocator);
- result = VK_ERROR_OUT_OF_HOST_MEMORY;
- goto fail;
- }
-
- compiled->active_desc_sets = shader->active_desc_sets;
- compiled->const_state[MESA_SHADER_COMPUTE] = shader->const_state;
-
- struct ir3_shader_variant *v =
- ir3_shader_create_variant(shader->ir3_shader, &ir3_key, executable_info);
-
- tu_shader_destroy(dev, shader, pAllocator);
-
- if (!v) {
- result = VK_ERROR_OUT_OF_HOST_MEMORY;
- goto fail;
- }
-
- compiled->variants[MESA_SHADER_COMPUTE] = v;
-
- compiled = tu_pipeline_cache_insert(cache, compiled);
- }
-
- pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
-
- if (creation_feedback) {
- *creation_feedback->pPipelineCreationFeedback = pipeline_feedback;
- assert(creation_feedback->pipelineStageCreationFeedbackCount == 1);
- creation_feedback->pPipelineStageCreationFeedbacks[0] = pipeline_feedback;
- }
-
- pipeline->active_desc_sets = compiled->active_desc_sets;
-
- struct ir3_shader_variant *v = compiled->variants[MESA_SHADER_COMPUTE];
-
- tu_pipeline_set_linkage(&pipeline->program.link[MESA_SHADER_COMPUTE],
- &compiled->const_state[MESA_SHADER_COMPUTE], v);
-
- result = tu_pipeline_allocate_cs(dev, pipeline, layout, NULL, v);
- if (result != VK_SUCCESS)
- goto fail;
-
- uint64_t shader_iova = tu_upload_variant(pipeline, v);
-
- struct tu_pvtmem_config pvtmem;
- tu_setup_pvtmem(dev, pipeline, &pvtmem, v->pvtmem_size, v->pvtmem_per_wave);
-
- for (int i = 0; i < 3; i++)
- pipeline->compute.local_size[i] = v->local_size[i];
-
- pipeline->compute.subgroup_size = v->info.double_threadsize ? 128 : 64;
-
- struct tu_cs prog_cs;
- uint32_t additional_reserve_size = tu_xs_get_additional_cs_size_dwords(v);
- tu_cs_begin_sub_stream(&pipeline->cs, 64 + additional_reserve_size, &prog_cs);
- tu6_emit_cs_config(&prog_cs, v, &pvtmem, shader_iova);
- pipeline->program.state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);
-
- tu6_emit_load_state(pipeline, layout);
-
- tu_append_executable(pipeline, v, nir_initial_disasm);
-
- pipeline->program.cs_instrlen = v->instrlen;
-
- vk_pipeline_cache_object_unref(&compiled->base);
- ralloc_free(pipeline_mem_ctx);
-
- *pPipeline = tu_pipeline_to_handle(pipeline);
-
return VK_SUCCESS;
-
-fail:
- if (compiled)
- vk_pipeline_cache_object_unref(&compiled->base);
-
- ralloc_free(pipeline_mem_ctx);
-
- vk_object_free(&dev->vk, pAllocator, pipeline);
-
- return result;
}
-VKAPI_ATTR VkResult VKAPI_CALL
-tu_CreateComputePipelines(VkDevice device,
+VkResult
+tu_CreateComputePipelines(VkDevice _device,
VkPipelineCache pipelineCache,
uint32_t count,
const VkComputePipelineCreateInfo *pCreateInfos,
const VkAllocationCallbacks *pAllocator,
VkPipeline *pPipelines)
{
- MESA_TRACE_FUNC();
- VkResult final_result = VK_SUCCESS;
- uint32_t i = 0;
+ VkResult result = VK_SUCCESS;
+ unsigned i = 0;
for (; i < count; i++) {
- VkResult result = tu_compute_pipeline_create(device, pipelineCache,
- &pCreateInfos[i],
- pAllocator, &pPipelines[i]);
- if (result != VK_SUCCESS) {
- final_result = result;
+ VkResult r;
+ r = tu_compute_pipeline_create(_device, pipelineCache, &pCreateInfos[i],
+ pAllocator, &pPipelines[i]);
+ if (r != VK_SUCCESS) {
+ result = r;
pPipelines[i] = VK_NULL_HANDLE;
-
- if (pCreateInfos[i].flags &
- VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT)
- break;
}
}
- for (; i < count; i++)
- pPipelines[i] = VK_NULL_HANDLE;
-
- return final_result;
+ return result;
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_DestroyPipeline(VkDevice _device,
VkPipeline _pipeline,
const VkAllocationCallbacks *pAllocator)
@@ -5182,274 +1892,5 @@ tu_DestroyPipeline(VkDevice _device,
return;
tu_pipeline_finish(pipeline, dev, pAllocator);
- vk_object_free(&dev->vk, pAllocator, pipeline);
-}
-
-#define WRITE_STR(field, ...) ({ \
- memset(field, 0, sizeof(field)); \
- UNUSED int _i = snprintf(field, sizeof(field), __VA_ARGS__); \
- assert(_i > 0 && _i < sizeof(field)); \
-})
-
-static const struct tu_pipeline_executable *
-tu_pipeline_get_executable(struct tu_pipeline *pipeline, uint32_t index)
-{
- assert(index < util_dynarray_num_elements(&pipeline->executables,
- struct tu_pipeline_executable));
- return util_dynarray_element(
- &pipeline->executables, struct tu_pipeline_executable, index);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-tu_GetPipelineExecutablePropertiesKHR(
- VkDevice _device,
- const VkPipelineInfoKHR* pPipelineInfo,
- uint32_t* pExecutableCount,
- VkPipelineExecutablePropertiesKHR* pProperties)
-{
- TU_FROM_HANDLE(tu_device, dev, _device);
- TU_FROM_HANDLE(tu_pipeline, pipeline, pPipelineInfo->pipeline);
- VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out,
- pProperties, pExecutableCount);
-
- util_dynarray_foreach (&pipeline->executables, struct tu_pipeline_executable, exe) {
- vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) {
- gl_shader_stage stage = exe->stage;
- props->stages = mesa_to_vk_shader_stage(stage);
-
- if (!exe->is_binning)
- WRITE_STR(props->name, "%s", _mesa_shader_stage_to_abbrev(stage));
- else
- WRITE_STR(props->name, "Binning VS");
-
- WRITE_STR(props->description, "%s", _mesa_shader_stage_to_string(stage));
-
- props->subgroupSize =
- dev->compiler->threadsize_base * (exe->stats.double_threadsize ? 2 : 1);
- }
- }
-
- return vk_outarray_status(&out);
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-tu_GetPipelineExecutableStatisticsKHR(
- VkDevice _device,
- const VkPipelineExecutableInfoKHR* pExecutableInfo,
- uint32_t* pStatisticCount,
- VkPipelineExecutableStatisticKHR* pStatistics)
-{
- TU_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline);
- VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out,
- pStatistics, pStatisticCount);
-
- const struct tu_pipeline_executable *exe =
- tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
-
- vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
- WRITE_STR(stat->name, "Max Waves Per Core");
- WRITE_STR(stat->description,
- "Maximum number of simultaneous waves per core.");
- stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
- stat->value.u64 = exe->stats.max_waves;
- }
-
- vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
- WRITE_STR(stat->name, "Instruction Count");
- WRITE_STR(stat->description,
- "Total number of IR3 instructions in the final generated "
- "shader executable.");
- stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
- stat->value.u64 = exe->stats.instrs_count;
- }
-
- vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
- WRITE_STR(stat->name, "Code size");
- WRITE_STR(stat->description,
- "Total number of dwords in the final generated "
- "shader executable.");
- stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
- stat->value.u64 = exe->stats.sizedwords;
- }
-
- vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
- WRITE_STR(stat->name, "NOPs Count");
- WRITE_STR(stat->description,
- "Number of NOP instructions in the final generated "
- "shader executable.");
- stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
- stat->value.u64 = exe->stats.nops_count;
- }
-
- vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
- WRITE_STR(stat->name, "MOV Count");
- WRITE_STR(stat->description,
- "Number of MOV instructions in the final generated "
- "shader executable.");
- stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
- stat->value.u64 = exe->stats.mov_count;
- }
-
- vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
- WRITE_STR(stat->name, "COV Count");
- WRITE_STR(stat->description,
- "Number of COV instructions in the final generated "
- "shader executable.");
- stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
- stat->value.u64 = exe->stats.cov_count;
- }
-
- vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
- WRITE_STR(stat->name, "Registers used");
- WRITE_STR(stat->description,
- "Number of registers used in the final generated "
- "shader executable.");
- stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
- stat->value.u64 = exe->stats.max_reg + 1;
- }
-
- vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
- WRITE_STR(stat->name, "Half-registers used");
- WRITE_STR(stat->description,
- "Number of half-registers used in the final generated "
- "shader executable.");
- stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
- stat->value.u64 = exe->stats.max_half_reg + 1;
- }
-
- vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
- WRITE_STR(stat->name, "Instructions with SS sync bit");
- WRITE_STR(stat->description,
- "SS bit is set for instructions which depend on a result "
- "of \"long\" instructions to prevent RAW hazard.");
- stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
- stat->value.u64 = exe->stats.ss;
- }
-
- vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
- WRITE_STR(stat->name, "Instructions with SY sync bit");
- WRITE_STR(stat->description,
- "SY bit is set for instructions which depend on a result "
- "of loads from global memory to prevent RAW hazard.");
- stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
- stat->value.u64 = exe->stats.sy;
- }
-
- vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
- WRITE_STR(stat->name, "Estimated cycles stalled on SS");
- WRITE_STR(stat->description,
- "A better metric to estimate the impact of SS syncs.");
- stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
- stat->value.u64 = exe->stats.sstall;
- }
-
- vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
- WRITE_STR(stat->name, "Estimated cycles stalled on SY");
- WRITE_STR(stat->description,
- "A better metric to estimate the impact of SY syncs.");
- stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
- stat->value.u64 = exe->stats.systall;
- }
-
- for (int i = 0; i < ARRAY_SIZE(exe->stats.instrs_per_cat); i++) {
- vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
- WRITE_STR(stat->name, "cat%d instructions", i);
- WRITE_STR(stat->description,
- "Number of cat%d instructions.", i);
- stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
- stat->value.u64 = exe->stats.instrs_per_cat[i];
- }
- }
-
- vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
- WRITE_STR(stat->name, "STP Count");
- WRITE_STR(stat->description,
- "Number of STore Private instructions in the final generated "
- "shader executable.");
- stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
- stat->value.u64 = exe->stats.stp_count;
- }
-
- vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
- WRITE_STR(stat->name, "LDP Count");
- WRITE_STR(stat->description,
- "Number of LoaD Private instructions in the final generated "
- "shader executable.");
- stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
- stat->value.u64 = exe->stats.ldp_count;
- }
-
- return vk_outarray_status(&out);
-}
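
The statistics above follow the standard VK_KHR_pipeline_executable_properties enumeration pattern. Purely as an illustration (not part of this patch), a hypothetical application-side sketch that lists them could look like the following; it assumes the extension is enabled and its entry points have been resolved (e.g. via vkGetDeviceProcAddr), and it only prints UINT64-formatted values.

#include <stdio.h>
#include <stdlib.h>
#include <vulkan/vulkan.h>

/* Hypothetical app-side helper, not driver code: enumerate the executable
 * statistics exposed by VK_KHR_pipeline_executable_properties. */
static void
dump_executable_statistics(VkDevice device, VkPipeline pipeline,
                           uint32_t executable_index)
{
   VkPipelineExecutableInfoKHR info = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_INFO_KHR,
      .pipeline = pipeline,
      .executableIndex = executable_index,
   };

   uint32_t count = 0;
   vkGetPipelineExecutableStatisticsKHR(device, &info, &count, NULL);

   VkPipelineExecutableStatisticKHR *stats = calloc(count, sizeof(*stats));
   for (uint32_t i = 0; i < count; i++)
      stats[i].sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_STATISTIC_KHR;

   vkGetPipelineExecutableStatisticsKHR(device, &info, &count, stats);

   for (uint32_t i = 0; i < count; i++) {
      if (stats[i].format == VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR)
         printf("%s: %llu\n", stats[i].name,
                (unsigned long long)stats[i].value.u64);
   }
   free(stats);
}
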
-
-static bool
-write_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir,
- const char *data)
-{
- ir->isText = VK_TRUE;
-
- size_t data_len = strlen(data) + 1;
-
- if (ir->pData == NULL) {
- ir->dataSize = data_len;
- return true;
- }
-
- strncpy(ir->pData, data, ir->dataSize);
- if (ir->dataSize < data_len)
- return false;
-
- ir->dataSize = data_len;
- return true;
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
-tu_GetPipelineExecutableInternalRepresentationsKHR(
- VkDevice _device,
- const VkPipelineExecutableInfoKHR* pExecutableInfo,
- uint32_t* pInternalRepresentationCount,
- VkPipelineExecutableInternalRepresentationKHR* pInternalRepresentations)
-{
- TU_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline);
- VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out,
- pInternalRepresentations, pInternalRepresentationCount);
- bool incomplete_text = false;
-
- const struct tu_pipeline_executable *exe =
- tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
-
- if (exe->nir_from_spirv) {
- vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
- WRITE_STR(ir->name, "NIR from SPIRV");
- WRITE_STR(ir->description,
- "Initial NIR before any optimizations");
-
- if (!write_ir_text(ir, exe->nir_from_spirv))
- incomplete_text = true;
- }
- }
-
- if (exe->nir_final) {
- vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
- WRITE_STR(ir->name, "Final NIR");
- WRITE_STR(ir->description,
- "Final NIR before going into the back-end compiler");
-
- if (!write_ir_text(ir, exe->nir_final))
- incomplete_text = true;
- }
- }
-
- if (exe->disasm) {
- vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
- WRITE_STR(ir->name, "IR3 Assembly");
- WRITE_STR(ir->description,
- "Final IR3 assembly for the generated shader binary");
-
- if (!write_ir_text(ir, exe->disasm))
- incomplete_text = true;
- }
- }
-
- return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out);
+ vk_free2(&dev->alloc, pAllocator, pipeline);
}
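
write_ir_text() above serves the usual two-call size query for VK_KHR_pipeline_executable_properties internal representations: the first call with pData == NULL reports dataSize, the second copies the text. For context only (not part of the patch), a hypothetical application-side caller might look like this; it assumes the extension entry point has been resolved, e.g. via vkGetDeviceProcAddr, and the pipeline was created with CAPTURE_INTERNAL_REPRESENTATIONS.

#include <stdio.h>
#include <stdlib.h>
#include <vulkan/vulkan.h>

/* Hypothetical app-side helper, not driver code: fetch and print the text
 * internal representations (NIR, IR3 assembly) of one pipeline executable. */
static void
dump_internal_representations(VkDevice device, VkPipeline pipeline,
                              uint32_t executable_index)
{
   VkPipelineExecutableInfoKHR info = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_INFO_KHR,
      .pipeline = pipeline,
      .executableIndex = executable_index,
   };

   uint32_t count = 0;
   vkGetPipelineExecutableInternalRepresentationsKHR(device, &info, &count, NULL);

   VkPipelineExecutableInternalRepresentationKHR *irs = calloc(count, sizeof(*irs));
   for (uint32_t i = 0; i < count; i++)
      irs[i].sType =
         VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_INTERNAL_REPRESENTATION_KHR;

   /* First call fills in dataSize for each IR; second call fetches the text. */
   vkGetPipelineExecutableInternalRepresentationsKHR(device, &info, &count, irs);
   for (uint32_t i = 0; i < count; i++)
      irs[i].pData = malloc(irs[i].dataSize);
   vkGetPipelineExecutableInternalRepresentationsKHR(device, &info, &count, irs);

   for (uint32_t i = 0; i < count; i++) {
      if (irs[i].isText)
         printf("=== %s ===\n%s\n", irs[i].name, (const char *)irs[i].pData);
      free(irs[i].pData);
   }
   free(irs);
}
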
diff --git a/lib/mesa/src/freedreno/vulkan/tu_query.c b/lib/mesa/src/freedreno/vulkan/tu_query.c
index 6da5102cc..2cb710fb1 100644
--- a/lib/mesa/src/freedreno/vulkan/tu_query.c
+++ b/lib/mesa/src/freedreno/vulkan/tu_query.c
@@ -1,339 +1,57 @@
/*
  * Copyright 2016 Red Hat Inc.
- * SPDX-License-Identifier: MIT
- *
* Based on anv:
* Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
*/
-#include "tu_query.h"
+#include "tu_private.h"
+#include <assert.h>
#include <fcntl.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
#include "nir/nir_builder.h"
-#include "util/os_time.h"
-
-#include "vk_util.h"
-
-#include "tu_cmd_buffer.h"
-#include "tu_cs.h"
-#include "tu_device.h"
-
-#define NSEC_PER_SEC 1000000000ull
-#define WAIT_TIMEOUT 5
-#define STAT_COUNT ((REG_A6XX_RBBM_PRIMCTR_10_LO - REG_A6XX_RBBM_PRIMCTR_0_LO) / 2 + 1)
-
-struct PACKED query_slot {
- uint64_t available;
-};
-
-struct PACKED occlusion_slot_value {
-   /* Sample counters seem to be placed 16-byte aligned,
-    * even though this query only needs an 8-byte slot. */
- uint64_t value;
- uint64_t _padding;
-};
-
-struct PACKED occlusion_query_slot {
- struct query_slot common;
- uint64_t result;
-
- struct occlusion_slot_value begin;
- struct occlusion_slot_value end;
-};
-
-struct PACKED timestamp_query_slot {
- struct query_slot common;
- uint64_t result;
-};
-
-struct PACKED primitive_slot_value {
- uint64_t values[2];
-};
-
-struct PACKED pipeline_stat_query_slot {
- struct query_slot common;
- uint64_t results[STAT_COUNT];
-
- uint64_t begin[STAT_COUNT];
- uint64_t end[STAT_COUNT];
-};
-
-struct PACKED primitive_query_slot {
- struct query_slot common;
- /* The result of transform feedback queries is two integer values:
- * results[0] is the count of primitives written,
- * results[1] is the count of primitives generated.
- * Also a result for each stream is stored at 4 slots respectively.
- */
- uint64_t results[2];
-
- /* Primitive counters also need to be 16-byte aligned. */
- uint64_t _padding;
-
- struct primitive_slot_value begin[4];
- struct primitive_slot_value end[4];
-};
-
-struct PACKED perfcntr_query_slot {
- uint64_t result;
- uint64_t begin;
- uint64_t end;
-};
-
-struct PACKED perf_query_slot {
- struct query_slot common;
- struct perfcntr_query_slot perfcntr;
-};
-
-struct PACKED primitives_generated_query_slot {
- struct query_slot common;
- uint64_t result;
- uint64_t begin;
- uint64_t end;
-};
-
-/* Returns the IOVA of a given uint64_t field in a given slot of a query
- * pool. */
-#define query_iova(type, pool, query, field) \
- pool->bo->iova + pool->stride * (query) + offsetof(type, field)
-
-#define occlusion_query_iova(pool, query, field) \
- query_iova(struct occlusion_query_slot, pool, query, field)
-
-#define pipeline_stat_query_iova(pool, query, field) \
- pool->bo->iova + pool->stride * (query) + \
- offsetof(struct pipeline_stat_query_slot, field)
-
-#define primitive_query_iova(pool, query, field, i) \
- query_iova(struct primitive_query_slot, pool, query, field) + \
- offsetof(struct primitive_slot_value, values[i])
-
-#define perf_query_iova(pool, query, field, i) \
- pool->bo->iova + pool->stride * (query) + \
- sizeof(struct query_slot) + \
- sizeof(struct perfcntr_query_slot) * (i) + \
- offsetof(struct perfcntr_query_slot, field)
-
-#define primitives_generated_query_iova(pool, query, field) \
- query_iova(struct primitives_generated_query_slot, pool, query, field)
-
-#define query_available_iova(pool, query) \
- query_iova(struct query_slot, pool, query, available)
-
-#define query_result_iova(pool, query, type, i) \
- pool->bo->iova + pool->stride * (query) + \
- sizeof(struct query_slot) + sizeof(type) * (i)
-
-#define query_result_addr(pool, query, type, i) \
- pool->bo->map + pool->stride * (query) + \
- sizeof(struct query_slot) + sizeof(type) * (i)
-
-#define query_is_available(slot) slot->available
-
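
The macros above all reduce to the same arithmetic: the iova of a field is the BO base address plus stride * query plus the field's offset inside the slot struct. A standalone sketch of that arithmetic, using simplified stand-in types rather than the driver's real slot layouts:

#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the pool/slot types, only to show how a
 * query_iova()-style macro turns (pool, query, field) into an address. */
struct demo_slot {
   uint64_t available;
   uint64_t result;
   uint64_t begin;
   uint64_t end;
};

struct demo_pool {
   uint64_t iova;    /* GPU address of the backing BO */
   uint32_t stride;  /* size of one slot */
};

#define demo_field_iova(pool, query, field) \
   ((pool)->iova + (uint64_t)(pool)->stride * (query) + \
    offsetof(struct demo_slot, field))

int
main(void)
{
   struct demo_pool pool = {
      .iova = 0x100000,
      .stride = sizeof(struct demo_slot),
   };

   printf("query 3 'available' at 0x%" PRIx64 "\n",
          demo_field_iova(&pool, 3, available));
   printf("query 3 'end'       at 0x%" PRIx64 "\n",
          demo_field_iova(&pool, 3, end));
   return 0;
}
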
-static const VkPerformanceCounterUnitKHR
-fd_perfcntr_type_to_vk_unit[] = {
- [FD_PERFCNTR_TYPE_UINT] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
- [FD_PERFCNTR_TYPE_UINT64] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
- [FD_PERFCNTR_TYPE_FLOAT] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
- [FD_PERFCNTR_TYPE_PERCENTAGE] = VK_PERFORMANCE_COUNTER_UNIT_PERCENTAGE_KHR,
- [FD_PERFCNTR_TYPE_BYTES] = VK_PERFORMANCE_COUNTER_UNIT_BYTES_KHR,
-   /* TODO: could be UNIT_NANOSECONDS_KHR with logic to convert the value */
- [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
- [FD_PERFCNTR_TYPE_HZ] = VK_PERFORMANCE_COUNTER_UNIT_HERTZ_KHR,
- [FD_PERFCNTR_TYPE_DBM] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
- [FD_PERFCNTR_TYPE_TEMPERATURE] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
- [FD_PERFCNTR_TYPE_VOLTS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
- [FD_PERFCNTR_TYPE_AMPS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
- [FD_PERFCNTR_TYPE_WATTS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
-};
-
-/* TODO: This basically comes from the freedreno implementation, where
- * only UINT64 is used. We should confirm this against the blob vulkan
- * driver once it starts supporting perf queries.
- */
-static const VkPerformanceCounterStorageKHR
-fd_perfcntr_type_to_vk_storage[] = {
- [FD_PERFCNTR_TYPE_UINT] = VK_PERFORMANCE_COUNTER_STORAGE_UINT32_KHR,
- [FD_PERFCNTR_TYPE_UINT64] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
- [FD_PERFCNTR_TYPE_FLOAT] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
- [FD_PERFCNTR_TYPE_PERCENTAGE] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
- [FD_PERFCNTR_TYPE_BYTES] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
- [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
- [FD_PERFCNTR_TYPE_HZ] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
- [FD_PERFCNTR_TYPE_DBM] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
- [FD_PERFCNTR_TYPE_TEMPERATURE] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
- [FD_PERFCNTR_TYPE_VOLTS] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
- [FD_PERFCNTR_TYPE_AMPS] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
- [FD_PERFCNTR_TYPE_WATTS] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
-};
-
-/*
- * Returns a pointer to a given slot in a query pool.
- */
-static void* slot_address(struct tu_query_pool *pool, uint32_t query)
-{
- return (char*)pool->bo->map + query * pool->stride;
-}
-
-static void
-perfcntr_index(const struct fd_perfcntr_group *group, uint32_t group_count,
- uint32_t index, uint32_t *gid, uint32_t *cid)
-
-{
- uint32_t i;
-
- for (i = 0; i < group_count; i++) {
- if (group[i].num_countables > index) {
- *gid = i;
- *cid = index;
- break;
- }
- index -= group[i].num_countables;
- }
-
- assert(i < group_count);
-}
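
perfcntr_index() above maps the flat counter index an application passes in to a (group, countable) pair by walking the per-group countable counts. A minimal standalone sketch of the same lookup, with a made-up table instead of the real fd_perfcntr_group data:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Walk the per-group countable counts until the flat index falls inside a
 * group, then report that group id and the index within it. */
static void
flat_index_to_group(const uint32_t *num_countables, uint32_t group_count,
                    uint32_t index, uint32_t *gid, uint32_t *cid)
{
   for (uint32_t i = 0; i < group_count; i++) {
      if (index < num_countables[i]) {
         *gid = i;
         *cid = index;
         return;
      }
      index -= num_countables[i];
   }
   assert(!"index out of range");
}

int
main(void)
{
   const uint32_t counts[] = { 4, 8, 2 };   /* made-up countables per group */
   uint32_t gid, cid;

   flat_index_to_group(counts, 3, 9, &gid, &cid);
   printf("index 9 -> group %u, countable %u\n", gid, cid);  /* 1, 5 */
   return 0;
}
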
-static int
-compare_perfcntr_pass(const void *a, const void *b)
-{
- return ((struct tu_perf_query_data *)a)->pass -
- ((struct tu_perf_query_data *)b)->pass;
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_CreateQueryPool(VkDevice _device,
const VkQueryPoolCreateInfo *pCreateInfo,
const VkAllocationCallbacks *pAllocator,
VkQueryPool *pQueryPool)
{
TU_FROM_HANDLE(tu_device, device, _device);
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
- assert(pCreateInfo->queryCount > 0);
-
- uint32_t pool_size, slot_size;
- const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL;
-
- pool_size = sizeof(struct tu_query_pool);
-
- switch (pCreateInfo->queryType) {
- case VK_QUERY_TYPE_OCCLUSION:
- slot_size = sizeof(struct occlusion_query_slot);
- break;
- case VK_QUERY_TYPE_TIMESTAMP:
- slot_size = sizeof(struct timestamp_query_slot);
- break;
- case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
- slot_size = sizeof(struct primitive_query_slot);
- break;
- case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
- slot_size = sizeof(struct primitives_generated_query_slot);
- break;
- case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
- perf_query_info =
- vk_find_struct_const(pCreateInfo->pNext,
- QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
- assert(perf_query_info);
-
- slot_size = sizeof(struct perf_query_slot) +
- sizeof(struct perfcntr_query_slot) *
- (perf_query_info->counterIndexCount - 1);
-
- /* Size of the array pool->tu_perf_query_data */
- pool_size += sizeof(struct tu_perf_query_data) *
- perf_query_info->counterIndexCount;
- break;
- }
- case VK_QUERY_TYPE_PIPELINE_STATISTICS:
- slot_size = sizeof(struct pipeline_stat_query_slot);
- break;
- default:
- unreachable("Invalid query type");
- }
-
struct tu_query_pool *pool =
- vk_object_alloc(&device->vk, pAllocator, pool_size,
- VK_OBJECT_TYPE_QUERY_POOL);
- if (!pool)
- return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
-
- if (pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
- pool->perf_group = fd_perfcntrs(&device->physical_device->dev_id,
- &pool->perf_group_count);
-
- pool->counter_index_count = perf_query_info->counterIndexCount;
-
-      /* Build data for every requested perf counter, so that with only the
-       * counter index an application provides at each command submit we can
-       * recover the correct group id, countable id, counter register and
-       * pass index.
-       *
-       * Also, since this data will be sorted by pass index later, keep the
-       * original indices and store perfcntr results according to them so
-       * apps can read back correct results at their own indices.
-       */
- uint32_t regs[pool->perf_group_count], pass[pool->perf_group_count];
- memset(regs, 0x00, pool->perf_group_count * sizeof(regs[0]));
- memset(pass, 0x00, pool->perf_group_count * sizeof(pass[0]));
-
- for (uint32_t i = 0; i < pool->counter_index_count; i++) {
- uint32_t gid = 0, cid = 0;
-
- perfcntr_index(pool->perf_group, pool->perf_group_count,
- perf_query_info->pCounterIndices[i], &gid, &cid);
-
- pool->perf_query_data[i].gid = gid;
- pool->perf_query_data[i].cid = cid;
- pool->perf_query_data[i].app_idx = i;
-
-         /* When a group runs out of counter registers (num_counters),
-          * wrap around and assign the counter to the next pass.
-          */
- if (regs[gid] < pool->perf_group[gid].num_counters) {
- pool->perf_query_data[i].cntr_reg = regs[gid]++;
- pool->perf_query_data[i].pass = pass[gid];
- } else {
- pool->perf_query_data[i].pass = ++pass[gid];
- pool->perf_query_data[i].cntr_reg = regs[gid] = 0;
- regs[gid]++;
- }
- }
+ vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- /* Sort by pass index so we could easily prepare a command stream
- * with the ascending order of pass index.
- */
- qsort(pool->perf_query_data, pool->counter_index_count,
- sizeof(pool->perf_query_data[0]),
- compare_perfcntr_pass);
- }
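
The assignment loop above hands out one HW counter register per requested countable and spills to a new pass once a group's registers are exhausted. A standalone sketch of that round-robin assignment, with hypothetical register counts rather than the real per-group limits:

#include <stdint.h>
#include <stdio.h>

#define NUM_GROUPS 2

int
main(void)
{
   const uint32_t num_counters[NUM_GROUPS] = { 2, 4 };  /* made-up HW regs per group */
   const uint32_t gids[] = { 0, 0, 0, 1, 0 };           /* group of each request */
   uint32_t regs[NUM_GROUPS] = { 0 };
   uint32_t pass[NUM_GROUPS] = { 0 };

   for (uint32_t i = 0; i < 5; i++) {
      uint32_t gid = gids[i], cntr_reg, p;

      if (regs[gid] < num_counters[gid]) {
         /* a free register remains in this group for the current pass */
         cntr_reg = regs[gid]++;
         p = pass[gid];
      } else {
         /* group exhausted: start a new pass and reuse register 0 */
         p = ++pass[gid];
         cntr_reg = 0;
         regs[gid] = 1;
      }
      printf("request %u: group %u -> pass %u, counter reg %u\n",
             i, gid, p, cntr_reg);
   }
   return 0;
}
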
-
- VkResult result = tu_bo_init_new(device, &pool->bo,
- pCreateInfo->queryCount * slot_size, TU_BO_ALLOC_NO_FLAGS, "query pool");
- if (result != VK_SUCCESS) {
- vk_object_free(&device->vk, pAllocator, pool);
- return result;
- }
-
- result = tu_bo_map(device, pool->bo);
- if (result != VK_SUCCESS) {
- tu_bo_finish(device, pool->bo);
- vk_object_free(&device->vk, pAllocator, pool);
- return result;
- }
-
- /* Initialize all query statuses to unavailable */
- memset(pool->bo->map, 0, pool->bo->size);
+ if (!pool)
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
- pool->type = pCreateInfo->queryType;
- pool->stride = slot_size;
- pool->size = pCreateInfo->queryCount;
- pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
*pQueryPool = tu_query_pool_to_handle(pool);
-
return VK_SUCCESS;
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_DestroyQueryPool(VkDevice _device,
VkQueryPool _pool,
const VkAllocationCallbacks *pAllocator)
@@ -344,211 +62,10 @@ tu_DestroyQueryPool(VkDevice _device,
if (!pool)
return;
- tu_bo_finish(device, pool->bo);
- vk_object_free(&device->vk, pAllocator, pool);
-}
-
-static uint32_t
-get_result_count(struct tu_query_pool *pool)
-{
- switch (pool->type) {
-   /* Occlusion and timestamp queries write one integer value */
- case VK_QUERY_TYPE_OCCLUSION:
- case VK_QUERY_TYPE_TIMESTAMP:
- case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
- return 1;
- /* Transform feedback queries write two integer values */
- case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
- return 2;
- case VK_QUERY_TYPE_PIPELINE_STATISTICS:
- return util_bitcount(pool->pipeline_statistics);
- case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
- return pool->counter_index_count;
- default:
- assert(!"Invalid query type");
- return 0;
- }
-}
-
-static uint32_t
-statistics_index(uint32_t *statistics)
-{
- uint32_t stat;
- stat = u_bit_scan(statistics);
-
- switch (1 << stat) {
- case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT:
- case VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT:
- return 0;
- case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT:
- return 1;
- case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT:
- return 2;
- case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT:
- return 4;
- case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT:
- return 5;
- case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT:
- return 6;
- case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT:
- return 7;
- case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT:
- return 8;
- case VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT:
- return 9;
- case VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT:
- return 10;
- default:
- return 0;
- }
-}
-
-static bool
-is_pipeline_query_with_vertex_stage(uint32_t pipeline_statistics)
-{
- return pipeline_statistics &
- (VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT |
- VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT |
- VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT |
- VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT |
- VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT |
- VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT |
- VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT |
- VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT |
- VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT);
-}
-
-static bool
-is_pipeline_query_with_fragment_stage(uint32_t pipeline_statistics)
-{
- return pipeline_statistics &
- VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT;
-}
-
-static bool
-is_pipeline_query_with_compute_stage(uint32_t pipeline_statistics)
-{
- return pipeline_statistics &
- VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT;
-}
-
-/* Wait on the availability status of a query up until a timeout. */
-static VkResult
-wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
- uint32_t query)
-{
-   /* TODO: Once the patch has landed upstream, use the MSM_IOVA_WAIT ioctl
-    * to wait on the available bit in a scheduler-friendly way instead of
-    * busy polling. */
- struct query_slot *slot = slot_address(pool, query);
- uint64_t abs_timeout = os_time_get_absolute_timeout(
- WAIT_TIMEOUT * NSEC_PER_SEC);
- while(os_time_get_nano() < abs_timeout) {
- if (query_is_available(slot))
- return VK_SUCCESS;
- }
- return vk_error(device, VK_TIMEOUT);
+ vk_free2(&device->alloc, pAllocator, pool);
}
-/* Writes a query value to a buffer from the CPU. */
-static void
-write_query_value_cpu(char* base,
- uint32_t offset,
- uint64_t value,
- VkQueryResultFlags flags)
-{
- if (flags & VK_QUERY_RESULT_64_BIT) {
- *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value;
- } else {
- *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value;
- }
-}
-
-static VkResult
-get_query_pool_results(struct tu_device *device,
- struct tu_query_pool *pool,
- uint32_t firstQuery,
- uint32_t queryCount,
- size_t dataSize,
- void *pData,
- VkDeviceSize stride,
- VkQueryResultFlags flags)
-{
- assert(dataSize >= stride * queryCount);
-
- char *result_base = pData;
- VkResult result = VK_SUCCESS;
- for (uint32_t i = 0; i < queryCount; i++) {
- uint32_t query = firstQuery + i;
- struct query_slot *slot = slot_address(pool, query);
- bool available = query_is_available(slot);
- uint32_t result_count = get_result_count(pool);
- uint32_t statistics = pool->pipeline_statistics;
-
- if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
- VkResult wait_result = wait_for_available(device, pool, query);
- if (wait_result != VK_SUCCESS)
- return wait_result;
- available = true;
- } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
- /* From the Vulkan 1.1.130 spec:
- *
- * If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
- * both not set then no result values are written to pData for
- * queries that are in the unavailable state at the time of the
- * call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
- * availability state is still written to pData for those queries
- * if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
- */
- result = VK_NOT_READY;
- if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
- result_base += stride;
- continue;
- }
- }
-
- for (uint32_t k = 0; k < result_count; k++) {
- if (available) {
- uint64_t *result;
-
- if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
- uint32_t stat_idx = statistics_index(&statistics);
- result = query_result_addr(pool, query, uint64_t, stat_idx);
- } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
- result = query_result_addr(pool, query, struct perfcntr_query_slot, k);
- } else {
- result = query_result_addr(pool, query, uint64_t, k);
- }
-
- write_query_value_cpu(result_base, k, *result, flags);
- } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
- /* From the Vulkan 1.1.130 spec:
- *
- * If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
- * is not set, and the query’s status is unavailable, an
- * intermediate result value between zero and the final result
- * value is written to pData for that query.
- *
- * Just return 0 here for simplicity since it's a valid result.
- */
- write_query_value_cpu(result_base, k, 0, flags);
- }
-
- if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
- /* From the Vulkan 1.1.130 spec:
- *
- * If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
- * integer value written for each query is non-zero if the query’s
- * status was available or zero if the status was unavailable.
- */
- write_query_value_cpu(result_base, result_count, available, flags);
-
- result_base += stride;
- }
- return result;
-}
-
-VKAPI_ATTR VkResult VKAPI_CALL
+VkResult
tu_GetQueryPoolResults(VkDevice _device,
VkQueryPool queryPool,
uint32_t firstQuery,
@@ -558,140 +75,10 @@ tu_GetQueryPoolResults(VkDevice _device,
VkDeviceSize stride,
VkQueryResultFlags flags)
{
- TU_FROM_HANDLE(tu_device, device, _device);
- TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
- assert(firstQuery + queryCount <= pool->size);
-
- if (vk_device_is_lost(&device->vk))
- return VK_ERROR_DEVICE_LOST;
-
- switch (pool->type) {
- case VK_QUERY_TYPE_OCCLUSION:
- case VK_QUERY_TYPE_TIMESTAMP:
- case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
- case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
- case VK_QUERY_TYPE_PIPELINE_STATISTICS:
- case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
- return get_query_pool_results(device, pool, firstQuery, queryCount,
- dataSize, pData, stride, flags);
- default:
- assert(!"Invalid query type");
- }
return VK_SUCCESS;
}
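
On the application side, the layout produced by get_query_pool_results() is the standard Vulkan one: the per-query result values, optionally followed by one availability value. A hypothetical caller (not driver code) reading 64-bit occlusion results plus availability might look like this; the device and query pool are assumed to exist already.

#include <stdint.h>
#include <stdio.h>
#include <vulkan/vulkan.h>

/* Hypothetical app-side helper: each occlusion query yields one 64-bit
 * result followed by one 64-bit availability word, so stride is 16 bytes. */
static void
print_occlusion_results(VkDevice device, VkQueryPool pool,
                        uint32_t first, uint32_t count)
{
   uint64_t data[2 * 16];
   const VkDeviceSize stride = 2 * sizeof(uint64_t);

   if (count > 16)
      count = 16;

   VkResult res = vkGetQueryPoolResults(device, pool, first, count,
                                        count * stride, data, stride,
                                        VK_QUERY_RESULT_64_BIT |
                                        VK_QUERY_RESULT_WITH_AVAILABILITY_BIT);
   if (res != VK_SUCCESS && res != VK_NOT_READY)
      return;

   for (uint32_t i = 0; i < count; i++) {
      printf("query %u: samples=%llu available=%llu\n", first + i,
             (unsigned long long)data[2 * i],
             (unsigned long long)data[2 * i + 1]);
   }
}
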
-/* Copies a query value from one buffer to another from the GPU. */
-static void
-copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
- struct tu_cs *cs,
- uint64_t src_iova,
- uint64_t base_write_iova,
- uint32_t offset,
- VkQueryResultFlags flags) {
- uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
- sizeof(uint64_t) : sizeof(uint32_t);
- uint64_t write_iova = base_write_iova + (offset * element_size);
-
- tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
- uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
- CP_MEM_TO_MEM_0_DOUBLE : 0;
- tu_cs_emit(cs, mem_to_mem_flags);
- tu_cs_emit_qw(cs, write_iova);
- tu_cs_emit_qw(cs, src_iova);
-}
-
-static void
-emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
- struct tu_cs *cs,
- struct tu_query_pool *pool,
- uint32_t firstQuery,
- uint32_t queryCount,
- struct tu_buffer *buffer,
- VkDeviceSize dstOffset,
- VkDeviceSize stride,
- VkQueryResultFlags flags)
-{
- /* From the Vulkan 1.1.130 spec:
- *
- * vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous
- * uses of vkCmdResetQueryPool in the same queue, without any additional
- * synchronization.
- *
- * To ensure that previous writes to the available bit are coherent, first
- * wait for all writes to complete.
- */
- tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
-
- for (uint32_t i = 0; i < queryCount; i++) {
- uint32_t query = firstQuery + i;
- uint64_t available_iova = query_available_iova(pool, query);
- uint64_t buffer_iova = buffer->iova + dstOffset + i * stride;
- uint32_t result_count = get_result_count(pool);
- uint32_t statistics = pool->pipeline_statistics;
-
- /* Wait for the available bit to be set if executed with the
- * VK_QUERY_RESULT_WAIT_BIT flag. */
- if (flags & VK_QUERY_RESULT_WAIT_BIT) {
- tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
- tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
- CP_WAIT_REG_MEM_0_POLL_MEMORY);
- tu_cs_emit_qw(cs, available_iova);
- tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
- tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
- tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
- }
-
- for (uint32_t k = 0; k < result_count; k++) {
- uint64_t result_iova;
-
- if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
- uint32_t stat_idx = statistics_index(&statistics);
- result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
- } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
- result_iova = query_result_iova(pool, query,
- struct perfcntr_query_slot, k);
- } else {
- result_iova = query_result_iova(pool, query, uint64_t, k);
- }
-
- if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
- /* Unconditionally copying the bo->result into the buffer here is
- * valid because we only set bo->result on vkCmdEndQuery. Thus, even
- * if the query is unavailable, this will copy the correct partial
- * value of 0.
- */
- copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
- k /* offset */, flags);
- } else {
- /* Conditionally copy bo->result into the buffer based on whether the
- * query is available.
- *
- * NOTE: For the conditional packets to be executed, CP_COND_EXEC
- * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
- * that 0 < available < 2, aka available == 1.
- */
- tu_cs_reserve(cs, 7 + 6);
- tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
- tu_cs_emit_qw(cs, available_iova);
- tu_cs_emit_qw(cs, available_iova);
- tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
- tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */
-
- /* Start of conditional execution */
- copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
- k /* offset */, flags);
- /* End of conditional execution */
- }
- }
-
- if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
- copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
- result_count /* offset */, flags);
- }
- }
-}
-
-VKAPI_ATTR void VKAPI_CALL
+void
tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
VkQueryPool queryPool,
uint32_t firstQuery,
@@ -701,1032 +88,35 @@ tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
VkDeviceSize stride,
VkQueryResultFlags flags)
{
- TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
- TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
- TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
- struct tu_cs *cs = &cmdbuf->cs;
- assert(firstQuery + queryCount <= pool->size);
-
- switch (pool->type) {
- case VK_QUERY_TYPE_OCCLUSION:
- case VK_QUERY_TYPE_TIMESTAMP:
- case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
- case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
- case VK_QUERY_TYPE_PIPELINE_STATISTICS:
- return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery,
- queryCount, buffer, dstOffset, stride, flags);
- case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
- unreachable("allowCommandBufferQueryCopies is false");
- default:
- assert(!"Invalid query type");
- }
-}
-
-static void
-emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
- struct tu_query_pool *pool,
- uint32_t firstQuery,
- uint32_t queryCount)
-{
- struct tu_cs *cs = &cmdbuf->cs;
-
- for (uint32_t i = 0; i < queryCount; i++) {
- uint32_t query = firstQuery + i;
- uint32_t statistics = pool->pipeline_statistics;
-
- tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
- tu_cs_emit_qw(cs, query_available_iova(pool, query));
- tu_cs_emit_qw(cs, 0x0);
-
- for (uint32_t k = 0; k < get_result_count(pool); k++) {
- uint64_t result_iova;
-
- if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
- uint32_t stat_idx = statistics_index(&statistics);
- result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
- } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
- result_iova = query_result_iova(pool, query,
- struct perfcntr_query_slot, k);
- } else {
- result_iova = query_result_iova(pool, query, uint64_t, k);
- }
-
- tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
- tu_cs_emit_qw(cs, result_iova);
- tu_cs_emit_qw(cs, 0x0);
- }
- }
-
}
-VKAPI_ATTR void VKAPI_CALL
+void
tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
VkQueryPool queryPool,
uint32_t firstQuery,
uint32_t queryCount)
{
- TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
- TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
-
- switch (pool->type) {
- case VK_QUERY_TYPE_TIMESTAMP:
- case VK_QUERY_TYPE_OCCLUSION:
- case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
- case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
- case VK_QUERY_TYPE_PIPELINE_STATISTICS:
- case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
- emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
- break;
- default:
- assert(!"Invalid query type");
- }
}
-VKAPI_ATTR void VKAPI_CALL
-tu_ResetQueryPool(VkDevice device,
- VkQueryPool queryPool,
- uint32_t firstQuery,
- uint32_t queryCount)
-{
- TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
-
- for (uint32_t i = 0; i < queryCount; i++) {
- struct query_slot *slot = slot_address(pool, i + firstQuery);
- slot->available = 0;
-
- for (uint32_t k = 0; k < get_result_count(pool); k++) {
- uint64_t *res;
-
- if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
- res = query_result_addr(pool, i + firstQuery,
- struct perfcntr_query_slot, k);
- } else {
- res = query_result_addr(pool, i + firstQuery, uint64_t, k);
- }
-
- *res = 0;
- }
- }
-}
-
-static void
-emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
- struct tu_query_pool *pool,
- uint32_t query)
-{
- /* From the Vulkan 1.1.130 spec:
- *
- * A query must begin and end inside the same subpass of a render pass
- * instance, or must both begin and end outside of a render pass
- * instance.
- *
- * Unlike on an immediate-mode renderer, Turnip renders all tiles on
- * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
- * query begins/ends inside the same subpass of a render pass, we need to
- * record the packets on the secondary draw command stream. cmdbuf->draw_cs
- * is then run on every tile during render, so we just need to accumulate
- * sample counts in slot->result to compute the query result.
- */
- struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
-
- uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
-
- tu_cs_emit_regs(cs,
- A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
-
- tu_cs_emit_regs(cs,
- A6XX_RB_SAMPLE_COUNT_ADDR(.qword = begin_iova));
-
- tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
- tu_cs_emit(cs, ZPASS_DONE);
-}
-
-static void
-emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
- struct tu_query_pool *pool,
- uint32_t query)
-{
- struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
- uint64_t begin_iova = pipeline_stat_query_iova(pool, query, begin);
-
- if (is_pipeline_query_with_vertex_stage(pool->pipeline_statistics)) {
- bool need_cond_exec = cmdbuf->state.pass && cmdbuf->state.prim_counters_running;
- cmdbuf->state.prim_counters_running++;
-
-      /* Prevent starting the primitive counters when they are supposed to
-       * be stopped for an outer VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT query.
-       */
- if (need_cond_exec) {
- tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
- CP_COND_REG_EXEC_0_SYSMEM |
- CP_COND_REG_EXEC_0_BINNING);
- }
-
- tu6_emit_event_write(cmdbuf, cs, START_PRIMITIVE_CTRS);
-
- tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
- tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
- tu_cs_emit(cs, 0);
-
- if (need_cond_exec) {
- tu_cond_exec_end(cs);
- }
- }
-
- if (is_pipeline_query_with_fragment_stage(pool->pipeline_statistics)) {
- tu6_emit_event_write(cmdbuf, cs, START_FRAGMENT_CTRS);
- }
-
- if (is_pipeline_query_with_compute_stage(pool->pipeline_statistics)) {
- tu6_emit_event_write(cmdbuf, cs, START_COMPUTE_CTRS);
- }
-
- tu_cs_emit_wfi(cs);
-
- tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
- tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) |
- CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
- CP_REG_TO_MEM_0_64B);
- tu_cs_emit_qw(cs, begin_iova);
-}
-
-static void
-emit_perfcntrs_pass_start(struct tu_cs *cs, uint32_t pass)
-{
- tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
- tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(
- REG_A6XX_CP_SCRATCH_REG(PERF_CNTRS_REG)) |
- A6XX_CP_REG_TEST_0_BIT(pass) |
- A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
- tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
-}
-
-static void
-emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf,
- struct tu_query_pool *pool,
- uint32_t query)
-{
- struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
- uint32_t last_pass = ~0;
-
- if (cmdbuf->state.pass) {
- cmdbuf->state.rp.draw_cs_writes_to_cond_pred = true;
- }
-
- /* Querying perf counters happens in these steps:
- *
- * 0) There's a scratch reg to set a pass index for perf counters query.
- * Prepare cmd streams to set each pass index to the reg at device
- * creation time. See tu_CreateDevice in tu_device.c
- * 1) Emit command streams to read all requested perf counters at all
- * passes in begin/end query with CP_REG_TEST/CP_COND_REG_EXEC, which
- * reads the scratch reg where pass index is set.
- * See emit_perfcntrs_pass_start.
- * 2) Pick the right cs setting proper pass index to the reg and prepend
- * it to the command buffer at each submit time.
- * See tu_QueueSubmit in tu_drm.c
- *    3) If the bit for the pass index is set in the reg, the command
- *       stream below CP_COND_REG_EXEC is executed.
- */
-
- tu_cs_emit_wfi(cs);
-
- for (uint32_t i = 0; i < pool->counter_index_count; i++) {
- struct tu_perf_query_data *data = &pool->perf_query_data[i];
-
- if (last_pass != data->pass) {
- last_pass = data->pass;
-
- if (data->pass != 0)
- tu_cond_exec_end(cs);
- emit_perfcntrs_pass_start(cs, data->pass);
- }
-
- const struct fd_perfcntr_counter *counter =
- &pool->perf_group[data->gid].counters[data->cntr_reg];
- const struct fd_perfcntr_countable *countable =
- &pool->perf_group[data->gid].countables[data->cid];
-
- tu_cs_emit_pkt4(cs, counter->select_reg, 1);
- tu_cs_emit(cs, countable->selector);
- }
- tu_cond_exec_end(cs);
-
- last_pass = ~0;
- tu_cs_emit_wfi(cs);
-
- for (uint32_t i = 0; i < pool->counter_index_count; i++) {
- struct tu_perf_query_data *data = &pool->perf_query_data[i];
-
- if (last_pass != data->pass) {
- last_pass = data->pass;
-
- if (data->pass != 0)
- tu_cond_exec_end(cs);
- emit_perfcntrs_pass_start(cs, data->pass);
- }
-
- const struct fd_perfcntr_counter *counter =
- &pool->perf_group[data->gid].counters[data->cntr_reg];
-
- uint64_t begin_iova = perf_query_iova(pool, 0, begin, data->app_idx);
-
- tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
- tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
- CP_REG_TO_MEM_0_64B);
- tu_cs_emit_qw(cs, begin_iova);
- }
- tu_cond_exec_end(cs);
-}
-
-static void
-emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf,
- struct tu_query_pool *pool,
- uint32_t query,
- uint32_t stream_id)
-{
- struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
- uint64_t begin_iova = primitive_query_iova(pool, query, begin[0], 0);
-
- tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = begin_iova));
- tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);
-}
-
-static void
-emit_begin_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
- struct tu_query_pool *pool,
- uint32_t query)
-{
- struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
- uint64_t begin_iova = primitives_generated_query_iova(pool, query, begin);
-
- if (cmdbuf->state.pass) {
- cmdbuf->state.rp.has_prim_generated_query_in_rp = true;
- } else {
- cmdbuf->state.prim_generated_query_running_before_rp = true;
- }
-
- cmdbuf->state.prim_counters_running++;
-
- if (cmdbuf->state.pass) {
-      /* Primitives that passed all tests are still counted in each
-       * tile even with HW binning beforehand. Do not permit that.
-       */
- tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
- CP_COND_REG_EXEC_0_SYSMEM |
- CP_COND_REG_EXEC_0_BINNING);
- }
-
- tu6_emit_event_write(cmdbuf, cs, START_PRIMITIVE_CTRS);
-
- tu_cs_emit_wfi(cs);
-
- tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
- tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_7_LO) |
- CP_REG_TO_MEM_0_CNT(2) |
- CP_REG_TO_MEM_0_64B);
- tu_cs_emit_qw(cs, begin_iova);
-
- if (cmdbuf->state.pass) {
- tu_cond_exec_end(cs);
- }
-}
-
-VKAPI_ATTR void VKAPI_CALL
+void
tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
VkQueryPool queryPool,
uint32_t query,
VkQueryControlFlags flags)
{
- TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
- TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
- assert(query < pool->size);
-
- switch (pool->type) {
- case VK_QUERY_TYPE_OCCLUSION:
- /* In freedreno, there is no implementation difference between
- * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
- * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
- */
- emit_begin_occlusion_query(cmdbuf, pool, query);
- break;
- case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
- emit_begin_xfb_query(cmdbuf, pool, query, 0);
- break;
- case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
- emit_begin_prim_generated_query(cmdbuf, pool, query);
- break;
- case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
- emit_begin_perf_query(cmdbuf, pool, query);
- break;
- case VK_QUERY_TYPE_PIPELINE_STATISTICS:
- emit_begin_stat_query(cmdbuf, pool, query);
- break;
- case VK_QUERY_TYPE_TIMESTAMP:
- unreachable("Unimplemented query type");
- default:
- assert(!"Invalid query type");
- }
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
- VkQueryPool queryPool,
- uint32_t query,
- VkQueryControlFlags flags,
- uint32_t index)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
- TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
- assert(query < pool->size);
-
- switch (pool->type) {
- case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
- emit_begin_xfb_query(cmdbuf, pool, query, index);
- break;
- case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
- emit_begin_prim_generated_query(cmdbuf, pool, query);
- break;
- default:
- assert(!"Invalid query type");
- }
-}
-
-static void
-emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
- struct tu_query_pool *pool,
- uint32_t query)
-{
- /* Ending an occlusion query happens in a few steps:
- * 1) Set the slot->end to UINT64_MAX.
- * 2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
- * write the current sample count value into slot->end.
- * 3) Since (2) is asynchronous, wait until slot->end is not equal to
- * UINT64_MAX before continuing via CP_WAIT_REG_MEM.
- * 4) Accumulate the results of the query (slot->end - slot->begin) into
- * slot->result.
- * 5) If vkCmdEndQuery is *not* called from within the scope of a render
- * pass, set the slot's available bit since the query is now done.
- * 6) If vkCmdEndQuery *is* called from within the scope of a render
- * pass, we cannot mark as available yet since the commands in
- * draw_cs are not run until vkCmdEndRenderPass.
- */
- const struct tu_render_pass *pass = cmdbuf->state.pass;
- struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
-
- uint64_t available_iova = query_available_iova(pool, query);
- uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
- uint64_t end_iova = occlusion_query_iova(pool, query, end);
- uint64_t result_iova = query_result_iova(pool, query, uint64_t, 0);
- tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
- tu_cs_emit_qw(cs, end_iova);
- tu_cs_emit_qw(cs, 0xffffffffffffffffull);
-
- tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
-
- tu_cs_emit_regs(cs,
- A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
-
- tu_cs_emit_regs(cs,
- A6XX_RB_SAMPLE_COUNT_ADDR(.qword = end_iova));
-
- tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
- tu_cs_emit(cs, ZPASS_DONE);
-
- tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
- tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
- CP_WAIT_REG_MEM_0_POLL_MEMORY);
- tu_cs_emit_qw(cs, end_iova);
- tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
- tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
- tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
-
- /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
- tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
- tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
- tu_cs_emit_qw(cs, result_iova);
- tu_cs_emit_qw(cs, result_iova);
- tu_cs_emit_qw(cs, end_iova);
- tu_cs_emit_qw(cs, begin_iova);
-
- tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
-
- if (pass)
-      /* Technically, queries should be tracked per-subpass, but here we track
-       * at the render pass level to simplify the code a bit. This is safe
- * because the only commands that use the available bit are
- * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
- * cannot be invoked from inside a render pass scope.
- */
- cs = &cmdbuf->draw_epilogue_cs;
-
- tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
- tu_cs_emit_qw(cs, available_iova);
- tu_cs_emit_qw(cs, 0x1);
-}
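
Steps 1-5 of the comment in emit_end_occlusion_query() are all carried out by the GPU via the packets above; purely as an illustration (not part of the patch), the same bookkeeping expressed as CPU code over a simplified slot layout would be:

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for the occlusion slot, only to show the arithmetic:
 * result accumulates (end - begin) once per tile, then the slot is marked
 * available. */
struct demo_occlusion_slot {
   uint64_t available;
   uint64_t result;
   uint64_t begin;
   uint64_t end;
};

static void
end_occlusion_cpu(struct demo_occlusion_slot *slot, uint64_t sample_count)
{
   slot->end = UINT64_MAX;                   /* 1) mark end as "not written yet" */
   slot->end = sample_count;                 /* 2) GPU writes the current count */
   /* 3) the GPU waits for end != UINT64_MAX; nothing to do on the CPU */
   slot->result += slot->end - slot->begin;  /* 4) accumulate per tile */
   slot->available = 1;                      /* 5) outside a render pass: done */
}

int
main(void)
{
   struct demo_occlusion_slot slot = { .begin = 100 };

   end_occlusion_cpu(&slot, 164);
   printf("samples passed: %llu\n", (unsigned long long)slot.result);  /* 64 */
   return 0;
}
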
-
-/* PRIMITIVE_CTRS is used for two distinct queries:
- * - VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT
- * - VK_QUERY_TYPE_PIPELINE_STATISTICS
- * If one is nested inside other - STOP_PRIMITIVE_CTRS should be emitted
- * only for outer query.
- *
- * Also, a pipeline stats query could run outside of a renderpass while a
- * prim gen query runs inside a secondary cmd buffer - for such a case we
- * ought to track the status of the pipeline stats query.
- */
-static void
-emit_stop_primitive_ctrs(struct tu_cmd_buffer *cmdbuf,
- struct tu_cs *cs,
- enum VkQueryType query_type)
-{
- bool is_secondary = cmdbuf->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY;
- cmdbuf->state.prim_counters_running--;
- if (cmdbuf->state.prim_counters_running == 0) {
- bool need_cond_exec =
- is_secondary &&
- query_type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT &&
- is_pipeline_query_with_vertex_stage(cmdbuf->inherited_pipeline_statistics);
-
- if (!need_cond_exec) {
- tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS);
- } else {
- tu_cs_reserve(cs, 7 + 2);
-         /* Check that the pipeline stats query is not running; only then
-          * do we stop the counter.
-          */
- tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
- tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
- tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
- tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
- tu_cs_emit(cs, 2); /* Cond execute the next 2 DWORDS */
-
- tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS);
- }
- }
-
- if (query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
- tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
- tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
- tu_cs_emit(cs, 1);
- }
}
-static void
-emit_end_stat_query(struct tu_cmd_buffer *cmdbuf,
- struct tu_query_pool *pool,
- uint32_t query)
-{
- struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
- uint64_t end_iova = pipeline_stat_query_iova(pool, query, end);
- uint64_t available_iova = query_available_iova(pool, query);
- uint64_t result_iova;
- uint64_t stat_start_iova;
- uint64_t stat_stop_iova;
-
- if (is_pipeline_query_with_vertex_stage(pool->pipeline_statistics)) {
- /* No need to conditionally execute STOP_PRIMITIVE_CTRS when
- * we are inside VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT inside of a
- * renderpass, because it is already stopped.
- */
- emit_stop_primitive_ctrs(cmdbuf, cs, VK_QUERY_TYPE_PIPELINE_STATISTICS);
- }
-
- if (is_pipeline_query_with_fragment_stage(pool->pipeline_statistics)) {
- tu6_emit_event_write(cmdbuf, cs, STOP_FRAGMENT_CTRS);
- }
-
- if (is_pipeline_query_with_compute_stage(pool->pipeline_statistics)) {
- tu6_emit_event_write(cmdbuf, cs, STOP_COMPUTE_CTRS);
- }
-
- tu_cs_emit_wfi(cs);
-
- tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
- tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) |
- CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
- CP_REG_TO_MEM_0_64B);
- tu_cs_emit_qw(cs, end_iova);
-
- for (int i = 0; i < STAT_COUNT; i++) {
- result_iova = query_result_iova(pool, query, uint64_t, i);
- stat_start_iova = pipeline_stat_query_iova(pool, query, begin[i]);
- stat_stop_iova = pipeline_stat_query_iova(pool, query, end[i]);
-
- tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
- tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
- CP_MEM_TO_MEM_0_DOUBLE |
- CP_MEM_TO_MEM_0_NEG_C);
-
- tu_cs_emit_qw(cs, result_iova);
- tu_cs_emit_qw(cs, result_iova);
- tu_cs_emit_qw(cs, stat_stop_iova);
- tu_cs_emit_qw(cs, stat_start_iova);
- }
-
- tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
-
- if (cmdbuf->state.pass)
- cs = &cmdbuf->draw_epilogue_cs;
-
- /* Set the availability to 1 */
- tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
- tu_cs_emit_qw(cs, available_iova);
- tu_cs_emit_qw(cs, 0x1);
-}
-
-static void
-emit_end_perf_query(struct tu_cmd_buffer *cmdbuf,
- struct tu_query_pool *pool,
- uint32_t query)
-{
- struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
- uint64_t available_iova = query_available_iova(pool, query);
- uint64_t end_iova;
- uint64_t begin_iova;
- uint64_t result_iova;
- uint32_t last_pass = ~0;
-
- for (uint32_t i = 0; i < pool->counter_index_count; i++) {
- struct tu_perf_query_data *data = &pool->perf_query_data[i];
-
- if (last_pass != data->pass) {
- last_pass = data->pass;
-
- if (data->pass != 0)
- tu_cond_exec_end(cs);
- emit_perfcntrs_pass_start(cs, data->pass);
- }
-
- const struct fd_perfcntr_counter *counter =
- &pool->perf_group[data->gid].counters[data->cntr_reg];
-
- end_iova = perf_query_iova(pool, 0, end, data->app_idx);
-
- tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
- tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
- CP_REG_TO_MEM_0_64B);
- tu_cs_emit_qw(cs, end_iova);
- }
- tu_cond_exec_end(cs);
-
- last_pass = ~0;
- tu_cs_emit_wfi(cs);
-
- for (uint32_t i = 0; i < pool->counter_index_count; i++) {
- struct tu_perf_query_data *data = &pool->perf_query_data[i];
-
- if (last_pass != data->pass) {
- last_pass = data->pass;
-
-
- if (data->pass != 0)
- tu_cond_exec_end(cs);
- emit_perfcntrs_pass_start(cs, data->pass);
- }
-
- result_iova = query_result_iova(pool, 0, struct perfcntr_query_slot,
- data->app_idx);
- begin_iova = perf_query_iova(pool, 0, begin, data->app_idx);
- end_iova = perf_query_iova(pool, 0, end, data->app_idx);
-
- /* result += end - begin */
- tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
- tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
- CP_MEM_TO_MEM_0_DOUBLE |
- CP_MEM_TO_MEM_0_NEG_C);
-
- tu_cs_emit_qw(cs, result_iova);
- tu_cs_emit_qw(cs, result_iova);
- tu_cs_emit_qw(cs, end_iova);
- tu_cs_emit_qw(cs, begin_iova);
- }
- tu_cond_exec_end(cs);
-
- tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
-
- if (cmdbuf->state.pass)
- cs = &cmdbuf->draw_epilogue_cs;
-
- /* Set the availability to 1 */
- tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
- tu_cs_emit_qw(cs, available_iova);
- tu_cs_emit_qw(cs, 0x1);
-}
-
-static void
-emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
- struct tu_query_pool *pool,
- uint32_t query,
- uint32_t stream_id)
-{
- struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
-
- uint64_t end_iova = primitive_query_iova(pool, query, end[0], 0);
- uint64_t result_written_iova = query_result_iova(pool, query, uint64_t, 0);
- uint64_t result_generated_iova = query_result_iova(pool, query, uint64_t, 1);
- uint64_t begin_written_iova = primitive_query_iova(pool, query, begin[stream_id], 0);
- uint64_t begin_generated_iova = primitive_query_iova(pool, query, begin[stream_id], 1);
- uint64_t end_written_iova = primitive_query_iova(pool, query, end[stream_id], 0);
- uint64_t end_generated_iova = primitive_query_iova(pool, query, end[stream_id], 1);
- uint64_t available_iova = query_available_iova(pool, query);
-
- tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = end_iova));
- tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);
-
- tu_cs_emit_wfi(cs);
- tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);
-
- /* Set the count of written primitives */
- tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
- tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
- CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
- tu_cs_emit_qw(cs, result_written_iova);
- tu_cs_emit_qw(cs, result_written_iova);
- tu_cs_emit_qw(cs, end_written_iova);
- tu_cs_emit_qw(cs, begin_written_iova);
-
- tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);
-
- /* Set the count of generated primitives */
- tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
- tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
- CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
- tu_cs_emit_qw(cs, result_generated_iova);
- tu_cs_emit_qw(cs, result_generated_iova);
- tu_cs_emit_qw(cs, end_generated_iova);
- tu_cs_emit_qw(cs, begin_generated_iova);
-
- /* Set the availability to 1 */
- tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
- tu_cs_emit_qw(cs, available_iova);
- tu_cs_emit_qw(cs, 0x1);
-}
-
-static void
-emit_end_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
- struct tu_query_pool *pool,
- uint32_t query)
-{
- struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
-
- if (!cmdbuf->state.pass) {
- cmdbuf->state.prim_generated_query_running_before_rp = false;
- }
-
- uint64_t begin_iova = primitives_generated_query_iova(pool, query, begin);
- uint64_t end_iova = primitives_generated_query_iova(pool, query, end);
- uint64_t result_iova = primitives_generated_query_iova(pool, query, result);
- uint64_t available_iova = query_available_iova(pool, query);
-
- if (cmdbuf->state.pass) {
- tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
- CP_COND_REG_EXEC_0_SYSMEM |
- CP_COND_REG_EXEC_0_BINNING);
- }
-
- tu_cs_emit_wfi(cs);
-
- tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
- tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_7_LO) |
- CP_REG_TO_MEM_0_CNT(2) |
- CP_REG_TO_MEM_0_64B);
- tu_cs_emit_qw(cs, end_iova);
-
- tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
- tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
- CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES);
- tu_cs_emit_qw(cs, result_iova);
- tu_cs_emit_qw(cs, result_iova);
- tu_cs_emit_qw(cs, end_iova);
- tu_cs_emit_qw(cs, begin_iova);
-
- tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
-
-   /* This should come after waiting for mem writes so we have up-to-date
-    * info about which query is running.
-    */
- emit_stop_primitive_ctrs(cmdbuf, cs, VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT);
-
- if (cmdbuf->state.pass) {
- tu_cond_exec_end(cs);
- }
-
- if (cmdbuf->state.pass)
- cs = &cmdbuf->draw_epilogue_cs;
-
- /* Set the availability to 1 */
- tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
- tu_cs_emit_qw(cs, available_iova);
- tu_cs_emit_qw(cs, 0x1);
-}
-
-/* Implement this bit of spec text from section 17.2 "Query Operation":
- *
- * If queries are used while executing a render pass instance that has
- * multiview enabled, the query uses N consecutive query indices in the
- * query pool (starting at query) where N is the number of bits set in the
- * view mask in the subpass the query is used in. How the numerical
- * results of the query are distributed among the queries is
- * implementation-dependent. For example, some implementations may write
- * each view’s results to a distinct query, while other implementations
- * may write the total result to the first query and write zero to the
- * other queries. However, the sum of the results in all the queries must
- * accurately reflect the total result of the query summed over all views.
- * Applications can sum the results from all the queries to compute the
- * total result.
- *
- * Since we execute all views at once, we write zero to the other queries.
- * Furthermore, because queries must be reset before use, and we set the
- * result to 0 in vkCmdResetQueryPool(), we just need to mark it as available.
- */
-
-static void
-handle_multiview_queries(struct tu_cmd_buffer *cmd,
- struct tu_query_pool *pool,
- uint32_t query)
-{
- if (!cmd->state.pass || !cmd->state.subpass->multiview_mask)
- return;
-
- unsigned views = util_bitcount(cmd->state.subpass->multiview_mask);
- struct tu_cs *cs = &cmd->draw_epilogue_cs;
-
- for (uint32_t i = 1; i < views; i++) {
- tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
- tu_cs_emit_qw(cs, query_available_iova(pool, query + i));
- tu_cs_emit_qw(cs, 0x1);
- }
-}
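/* Illustrative sketch of the application-side consequence of the rule quoted
 * above: because the total is written to the first query and zero to the
 * rest, summing the N consecutive query results always yields the total.
 * read_result() is a hypothetical helper returning one 64-bit query value.
 */
#include <stdint.h>

static uint64_t
sum_multiview_results(uint32_t first_query, uint32_t view_mask,
                      uint64_t (*read_result)(uint32_t query))
{
   uint64_t total = 0;
   uint32_t views = (uint32_t) __builtin_popcount(view_mask);
   for (uint32_t i = 0; i < views; i++)
      total += read_result(first_query + i);
   return total;
}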
-
-VKAPI_ATTR void VKAPI_CALL
+void
tu_CmdEndQuery(VkCommandBuffer commandBuffer,
VkQueryPool queryPool,
uint32_t query)
{
- TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
- TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
- assert(query < pool->size);
-
- switch (pool->type) {
- case VK_QUERY_TYPE_OCCLUSION:
- emit_end_occlusion_query(cmdbuf, pool, query);
- break;
- case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
- emit_end_xfb_query(cmdbuf, pool, query, 0);
- break;
- case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
- emit_end_prim_generated_query(cmdbuf, pool, query);
- break;
- case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
- emit_end_perf_query(cmdbuf, pool, query);
- break;
- case VK_QUERY_TYPE_PIPELINE_STATISTICS:
- emit_end_stat_query(cmdbuf, pool, query);
- break;
- case VK_QUERY_TYPE_TIMESTAMP:
- unreachable("Unimplemented query type");
- default:
- assert(!"Invalid query type");
- }
-
- handle_multiview_queries(cmdbuf, pool, query);
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
- VkQueryPool queryPool,
- uint32_t query,
- uint32_t index)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
- TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
- assert(query < pool->size);
-
- switch (pool->type) {
- case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
-      assert(index < 4);
- emit_end_xfb_query(cmdbuf, pool, query, index);
- break;
- case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
- emit_end_prim_generated_query(cmdbuf, pool, query);
- break;
- default:
- assert(!"Invalid query type");
- }
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
- VkPipelineStageFlagBits2 pipelineStage,
- VkQueryPool queryPool,
- uint32_t query)
-{
- TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
- TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
-
- /* Inside a render pass, just write the timestamp multiple times so that
- * the user gets the last one if we use GMEM. There isn't really much
- * better we can do, and this seems to be what the blob does too.
- */
- struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
-
- /* Stages that will already have been executed by the time the CP executes
- * the REG_TO_MEM. DrawIndirect parameters are read by the CP, so the draw
- * indirect stage counts as top-of-pipe too.
- */
- VkPipelineStageFlags2 top_of_pipe_flags =
- VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
- VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT;
-
- if (pipelineStage & ~top_of_pipe_flags) {
- /* Execute a WFI so that all commands complete. Note that CP_REG_TO_MEM
- * does CP_WAIT_FOR_ME internally, which will wait for the WFI to
- * complete.
- *
- * Stalling the CP like this is really unfortunate, but I don't think
- * there's a better solution that allows all 48 bits of precision
- * because CP_EVENT_WRITE doesn't support 64-bit timestamps.
- */
- tu_cs_emit_wfi(cs);
- }
-
- tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
- tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER) |
- CP_REG_TO_MEM_0_CNT(2) |
- CP_REG_TO_MEM_0_64B);
- tu_cs_emit_qw(cs, query_result_iova(pool, query, uint64_t, 0));
-
- /* Only flag availability once the entire renderpass is done, similar to
- * the begin/end path.
- */
- cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;
-
- tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
- tu_cs_emit_qw(cs, query_available_iova(pool, query));
- tu_cs_emit_qw(cs, 0x1);
-
- /* From the spec for vkCmdWriteTimestamp:
- *
- * If vkCmdWriteTimestamp is called while executing a render pass
- * instance that has multiview enabled, the timestamp uses N consecutive
- * query indices in the query pool (starting at query) where N is the
- * number of bits set in the view mask of the subpass the command is
- * executed in. The resulting query values are determined by an
- * implementation-dependent choice of one of the following behaviors:
- *
- * - The first query is a timestamp value and (if more than one bit is
- * set in the view mask) zero is written to the remaining queries.
- * If two timestamps are written in the same subpass, the sum of the
- * execution time of all views between those commands is the
- * difference between the first query written by each command.
- *
- * - All N queries are timestamp values. If two timestamps are written
- * in the same subpass, the sum of the execution time of all views
- * between those commands is the sum of the difference between
- * corresponding queries written by each command. The difference
- * between corresponding queries may be the execution time of a
- * single view.
- *
- * We execute all views in the same draw call, so we implement the first
- * option, the same as regular queries.
- */
- handle_multiview_queries(cmd, pool, query);
-}
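/* Illustrative sketch: turning two raw timestamp query results into elapsed
 * nanoseconds on the application side. timestamp_period is
 * VkPhysicalDeviceLimits::timestampPeriod (ns per tick) and valid_bits is
 * VkQueueFamilyProperties::timestampValidBits (48 here, per the comment
 * above about the always-on counter).
 */
#include <stdint.h>

static double
timestamp_delta_ns(uint64_t begin, uint64_t end,
                   uint32_t valid_bits, double timestamp_period)
{
   uint64_t mask = (valid_bits >= 64) ? ~0ull : ((1ull << valid_bits) - 1);
   /* unsigned wrap-around within the valid bit range is handled by masking */
   uint64_t delta = (end - begin) & mask;
   return (double) delta * timestamp_period;
}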
-
-VKAPI_ATTR VkResult VKAPI_CALL
-tu_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
- VkPhysicalDevice physicalDevice,
- uint32_t queueFamilyIndex,
- uint32_t* pCounterCount,
- VkPerformanceCounterKHR* pCounters,
- VkPerformanceCounterDescriptionKHR* pCounterDescriptions)
-{
- TU_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);
-
- uint32_t desc_count = *pCounterCount;
- uint32_t group_count;
- const struct fd_perfcntr_group *group =
- fd_perfcntrs(&phydev->dev_id, &group_count);
-
- VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, out, pCounters, pCounterCount);
- VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, out_desc,
- pCounterDescriptions, &desc_count);
-
- for (int i = 0; i < group_count; i++) {
- for (int j = 0; j < group[i].num_countables; j++) {
-
- vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
- counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_BUFFER_KHR;
- counter->unit =
- fd_perfcntr_type_to_vk_unit[group[i].countables[j].query_type];
- counter->storage =
- fd_perfcntr_type_to_vk_storage[group[i].countables[j].query_type];
-
- unsigned char sha1_result[20];
- _mesa_sha1_compute(group[i].countables[j].name,
- strlen(group[i].countables[j].name),
- sha1_result);
- memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
- }
-
- vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, &out_desc, desc) {
- desc->flags = 0;
-
- snprintf(desc->name, sizeof(desc->name),
- "%s", group[i].countables[j].name);
- snprintf(desc->category, sizeof(desc->category), "%s", group[i].name);
- snprintf(desc->description, sizeof(desc->description),
- "%s: %s performance counter",
- group[i].name, group[i].countables[j].name);
- }
- }
- }
-
- return vk_outarray_status(&out);
-}
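/* Illustrative usage sketch of the two-call enumeration idiom implemented
 * above: query the count first with NULL arrays, then allocate and fill.
 * The helper calls the driver entry point directly purely for illustration;
 * a real application would go through the loader. Types come from the
 * Vulkan headers this file already includes.
 */
#include <stdlib.h>

static VkResult
enumerate_perf_counters(VkPhysicalDevice pdev, uint32_t queue_family,
                        uint32_t *count,
                        VkPerformanceCounterKHR **counters,
                        VkPerformanceCounterDescriptionKHR **descs)
{
   *count = 0;
   VkResult result = tu_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
      pdev, queue_family, count, NULL, NULL);
   if (result != VK_SUCCESS)
      return result;

   *counters = calloc(*count, sizeof(**counters));
   *descs = calloc(*count, sizeof(**descs));
   if (!*counters || !*descs)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   return tu_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
      pdev, queue_family, count, *counters, *descs);
}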
-
-VKAPI_ATTR void VKAPI_CALL
-tu_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
- VkPhysicalDevice physicalDevice,
- const VkQueryPoolPerformanceCreateInfoKHR* pPerformanceQueryCreateInfo,
- uint32_t* pNumPasses)
-{
- TU_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);
- uint32_t group_count = 0;
- uint32_t gid = 0, cid = 0, n_passes;
- const struct fd_perfcntr_group *group =
- fd_perfcntrs(&phydev->dev_id, &group_count);
-
- uint32_t counters_requested[group_count];
- memset(counters_requested, 0x0, sizeof(counters_requested));
- *pNumPasses = 1;
-
- for (unsigned i = 0; i < pPerformanceQueryCreateInfo->counterIndexCount; i++) {
- perfcntr_index(group, group_count,
- pPerformanceQueryCreateInfo->pCounterIndices[i],
- &gid, &cid);
-
- counters_requested[gid]++;
- }
-
- for (uint32_t i = 0; i < group_count; i++) {
- n_passes = DIV_ROUND_UP(counters_requested[i], group[i].num_counters);
- *pNumPasses = MAX2(*pNumPasses, n_passes);
- }
}
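/* Illustrative sketch of the pass calculation above: each group exposes a
 * fixed number of hardware counter registers, so the countables requested
 * from a group are spread over ceil(requested / num_counters) replayed
 * passes, and the pool needs the maximum over all groups (at least 1).
 * Assumes every group has num_counters > 0.
 */
#include <stdint.h>

static uint32_t
perf_query_num_passes(const uint32_t *requested, const uint32_t *num_counters,
                      uint32_t group_count)
{
   uint32_t passes = 1;
   for (uint32_t i = 0; i < group_count; i++) {
      uint32_t n = (requested[i] + num_counters[i] - 1) / num_counters[i];
      if (n > passes)
         passes = n;
   }
   return passes;
}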
-VKAPI_ATTR VkResult VKAPI_CALL
-tu_AcquireProfilingLockKHR(VkDevice device,
- const VkAcquireProfilingLockInfoKHR* pInfo)
-{
- /* TODO. Probably there's something to do for kgsl. */
- return VK_SUCCESS;
-}
-
-VKAPI_ATTR void VKAPI_CALL
-tu_ReleaseProfilingLockKHR(VkDevice device)
+void
+tu_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
+ VkPipelineStageFlagBits pipelineStage,
+ VkQueryPool queryPool,
+ uint32_t query)
{
- /* TODO. Probably there's something to do for kgsl. */
- return;
}
diff --git a/lib/mesa/src/freedreno/vulkan/tu_shader.c b/lib/mesa/src/freedreno/vulkan/tu_shader.c
index e485f8f5c..f6e13d7c4 100644
--- a/lib/mesa/src/freedreno/vulkan/tu_shader.c
+++ b/lib/mesa/src/freedreno/vulkan/tu_shader.c
@@ -1,894 +1,336 @@
/*
* Copyright © 2019 Google LLC
- * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
*/
-#include "tu_shader.h"
+#include "tu_private.h"
#include "spirv/nir_spirv.h"
#include "util/mesa-sha1.h"
-#include "nir/nir_xfb_info.h"
-#include "nir/nir_vulkan.h"
-#include "vk_pipeline.h"
-#include "vk_util.h"
#include "ir3/ir3_nir.h"
-#include "tu_device.h"
-#include "tu_descriptor_set.h"
-#include "tu_pipeline.h"
-
-nir_shader *
-tu_spirv_to_nir(struct tu_device *dev,
- void *mem_ctx,
- const VkPipelineShaderStageCreateInfo *stage_info,
- gl_shader_stage stage)
+static nir_shader *
+tu_spirv_to_nir(struct ir3_compiler *compiler,
+ const uint32_t *words,
+ size_t word_count,
+ gl_shader_stage stage,
+ const char *entry_point_name,
+ const VkSpecializationInfo *spec_info)
{
/* TODO these are made-up */
const struct spirv_to_nir_options spirv_options = {
- .ubo_addr_format = nir_address_format_vec2_index_32bit_offset,
- .ssbo_addr_format = nir_address_format_vec2_index_32bit_offset,
-
- /* Accessed via stg/ldg */
- .phys_ssbo_addr_format = nir_address_format_64bit_global,
-
- /* Accessed via the const register file */
- .push_const_addr_format = nir_address_format_logical,
-
- /* Accessed via ldl/stl */
- .shared_addr_format = nir_address_format_32bit_offset,
-
- /* Accessed via stg/ldg (not used with Vulkan?) */
- .global_addr_format = nir_address_format_64bit_global,
-
- /* Use 16-bit math for RelaxedPrecision ALU ops */
- .mediump_16bit_alu = true,
-
- /* ViewID is a sysval in geometry stages and an input in the FS */
- .view_index_is_input = stage == MESA_SHADER_FRAGMENT,
- .caps = {
- .transform_feedback = true,
- .tessellation = true,
- .draw_parameters = true,
- .image_read_without_format = true,
- .image_write_without_format = true,
- .variable_pointers = true,
- .stencil_export = true,
- .multiview = true,
- .shader_viewport_index_layer = true,
- .geometry_streams = true,
- .device_group = true,
- .descriptor_indexing = true,
- .descriptor_array_dynamic_indexing = true,
- .descriptor_array_non_uniform_indexing = true,
- .runtime_descriptor_array = true,
- .float_controls = true,
- .float16 = true,
- .int16 = true,
- .storage_16bit = dev->physical_device->info->a6xx.storage_16bit,
- .demote_to_helper_invocation = true,
- .vk_memory_model = true,
- .vk_memory_model_device_scope = true,
- .subgroup_basic = true,
- .subgroup_ballot = true,
- .subgroup_vote = true,
- .subgroup_quad = true,
- .subgroup_shuffle = true,
- .subgroup_arithmetic = true,
- .physical_storage_buffer_address = true,
- },
+ .lower_ubo_ssbo_access_to_offsets = true,
+ .caps = { false },
};
-
const nir_shader_compiler_options *nir_options =
- ir3_get_compiler_options(dev->compiler);
-
- nir_shader *nir;
- VkResult result =
- vk_pipeline_shader_stage_to_nir(&dev->vk, stage_info, &spirv_options,
- nir_options, mem_ctx, &nir);
- if (result != VK_SUCCESS)
- return NULL;
-
- if (unlikely(dev->physical_device->instance->debug_flags & TU_DEBUG_NIR)) {
- fprintf(stderr, "translated nir:\n");
- nir_print_shader(nir, stderr);
- }
-
- const struct nir_lower_sysvals_to_varyings_options sysvals_to_varyings = {
- .point_coord = true,
- };
- NIR_PASS_V(nir, nir_lower_sysvals_to_varyings, &sysvals_to_varyings);
-
- NIR_PASS_V(nir, nir_lower_global_vars_to_local);
-
- /* Older glslang missing bf6efd0316d8 ("SPV: Fix #2293: keep relaxed
- * precision on arg passed to relaxed param") will pass function args through
- * a highp temporary, so we need the nir_opt_find_array_copies() and a copy
- * prop before we lower mediump vars, or you'll be unable to optimize out
- * array copies after lowering. We do this before splitting copies, since
- * that works against nir_opt_find_array_copies().
- * */
- NIR_PASS_V(nir, nir_opt_find_array_copies);
- NIR_PASS_V(nir, nir_opt_copy_prop_vars);
- NIR_PASS_V(nir, nir_opt_dce);
-
- NIR_PASS_V(nir, nir_split_var_copies);
- NIR_PASS_V(nir, nir_lower_var_copies);
-
- NIR_PASS_V(nir, nir_lower_mediump_vars, nir_var_function_temp | nir_var_shader_temp | nir_var_mem_shared);
- NIR_PASS_V(nir, nir_opt_copy_prop_vars);
- NIR_PASS_V(nir, nir_opt_combine_stores, nir_var_all);
-
- NIR_PASS_V(nir, nir_lower_is_helper_invocation);
-
- NIR_PASS_V(nir, nir_lower_system_values);
-
- NIR_PASS_V(nir, nir_lower_frexp);
-
- ir3_optimize_loop(dev->compiler, nir);
-
- NIR_PASS_V(nir, nir_opt_conditional_discard);
-
- return nir;
-}
-
-static void
-lower_load_push_constant(struct tu_device *dev,
- nir_builder *b,
- nir_intrinsic_instr *instr,
- struct tu_shader *shader,
- const struct tu_pipeline_layout *layout)
-{
- uint32_t base = nir_intrinsic_base(instr);
- assert(base % 4 == 0);
-
- if (tu6_shared_constants_enable(layout, dev->compiler)) {
- /* All stages share the same range. We could potentially add
- * push_constant_offset to layout and apply it, but this is good for
- * now.
- */
- base += dev->compiler->shared_consts_base_offset * 4;
- } else {
- assert(base >= shader->const_state.push_consts.lo * 4);
- base -= shader->const_state.push_consts.lo * 4;
- }
-
- nir_ssa_def *load =
- nir_load_uniform(b, instr->num_components,
- instr->dest.ssa.bit_size,
- nir_ushr(b, instr->src[0].ssa, nir_imm_int(b, 2)),
- .base = base);
-
- nir_ssa_def_rewrite_uses(&instr->dest.ssa, load);
-
- nir_instr_remove(&instr->instr);
-}
-
-static void
-lower_vulkan_resource_index(nir_builder *b, nir_intrinsic_instr *instr,
- struct tu_shader *shader,
- const struct tu_pipeline_layout *layout)
-{
- nir_ssa_def *vulkan_idx = instr->src[0].ssa;
-
- unsigned set = nir_intrinsic_desc_set(instr);
- unsigned binding = nir_intrinsic_binding(instr);
- struct tu_descriptor_set_layout *set_layout = layout->set[set].layout;
- struct tu_descriptor_set_binding_layout *binding_layout =
- &set_layout->binding[binding];
- nir_ssa_def *base;
-
- shader->active_desc_sets |= 1u << set;
-
- switch (binding_layout->type) {
- case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
- if (layout->independent_sets) {
- /* With independent sets, we don't know
- * layout->set[set].dynamic_offset_start until after link time which
- * with fast linking means after the shader is compiled. We have to
- * get it from the const file instead.
- */
- base = nir_imm_int(b, binding_layout->dynamic_offset_offset / (4 * A6XX_TEX_CONST_DWORDS));
- nir_ssa_def *dynamic_offset_start =
- nir_load_uniform(b, 1, 32, nir_imm_int(b, 0),
- .base = shader->const_state.dynamic_offset_loc + set);
- base = nir_iadd(b, base, dynamic_offset_start);
- } else {
- base = nir_imm_int(b, (layout->set[set].dynamic_offset_start +
- binding_layout->dynamic_offset_offset) / (4 * A6XX_TEX_CONST_DWORDS));
+ ir3_get_compiler_options(compiler);
+
+ /* convert VkSpecializationInfo */
+ struct nir_spirv_specialization *spec = NULL;
+ uint32_t num_spec = 0;
+ if (spec_info && spec_info->mapEntryCount) {
+ spec = malloc(sizeof(*spec) * spec_info->mapEntryCount);
+ if (!spec)
+ return NULL;
+
+ for (uint32_t i = 0; i < spec_info->mapEntryCount; i++) {
+ const VkSpecializationMapEntry *entry = &spec_info->pMapEntries[i];
+ const void *data = spec_info->pData + entry->offset;
+ assert(data + entry->size <= spec_info->pData + spec_info->dataSize);
+ spec[i].id = entry->constantID;
+ if (entry->size == 8)
+ spec[i].data64 = *(const uint64_t *) data;
+ else
+ spec[i].data32 = *(const uint32_t *) data;
+ spec[i].defined_on_module = false;
}
- set = MAX_SETS;
- break;
- default:
- base = nir_imm_int(b, binding_layout->offset / (4 * A6XX_TEX_CONST_DWORDS));
- break;
- }
-
- nir_ssa_def *shift;
- if (binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
- /* Inline uniform blocks cannot have arrays so the stride is unused */
- shift = nir_imm_int(b, 0);
- } else {
- unsigned stride = binding_layout->size / (4 * A6XX_TEX_CONST_DWORDS);
- assert(util_is_power_of_two_nonzero(stride));
- shift = nir_imm_int(b, util_logbase2(stride));
+ num_spec = spec_info->mapEntryCount;
}
- nir_ssa_def *def = nir_vec3(b, nir_imm_int(b, set),
- nir_iadd(b, base,
- nir_ishl(b, vulkan_idx, shift)),
- shift);
+ nir_shader *nir =
+ spirv_to_nir(words, word_count, spec, num_spec, stage, entry_point_name,
+ &spirv_options, nir_options);
- nir_ssa_def_rewrite_uses(&instr->dest.ssa, def);
- nir_instr_remove(&instr->instr);
-}
+ free(spec);
-static void
-lower_vulkan_resource_reindex(nir_builder *b, nir_intrinsic_instr *instr)
-{
- nir_ssa_def *old_index = instr->src[0].ssa;
- nir_ssa_def *delta = instr->src[1].ssa;
- nir_ssa_def *shift = nir_channel(b, old_index, 2);
-
- nir_ssa_def *new_index =
- nir_vec3(b, nir_channel(b, old_index, 0),
- nir_iadd(b, nir_channel(b, old_index, 1),
- nir_ishl(b, delta, shift)),
- shift);
-
- nir_ssa_def_rewrite_uses(&instr->dest.ssa, new_index);
- nir_instr_remove(&instr->instr);
-}
+ assert(nir->info.stage == stage);
+ nir_validate_shader(nir, "after spirv_to_nir");
-static void
-lower_load_vulkan_descriptor(nir_builder *b, nir_intrinsic_instr *intrin)
-{
- nir_ssa_def *old_index = intrin->src[0].ssa;
- /* Loading the descriptor happens as part of the load/store instruction so
- * this is a no-op. We just need to turn the shift into an offset of 0.
- */
- nir_ssa_def *new_index =
- nir_vec3(b, nir_channel(b, old_index, 0),
- nir_channel(b, old_index, 1),
- nir_imm_int(b, 0));
- nir_ssa_def_rewrite_uses(&intrin->dest.ssa, new_index);
- nir_instr_remove(&intrin->instr);
+ return nir;
}
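/* Illustrative sketch of the application-side layout consumed by the
 * VkSpecializationInfo conversion loop above: two 32-bit constants packed
 * into one data blob. The constant IDs and values are made up for the
 * example; pointers stay valid because the arrays have static storage.
 */
#include <stdint.h>
#include <vulkan/vulkan.h>

static VkSpecializationInfo
make_spec_info(void)
{
   static const uint32_t data[2] = { 16 /* id 0 */, 1 /* id 1 */ };
   static const VkSpecializationMapEntry entries[2] = {
      { .constantID = 0, .offset = 0, .size = sizeof(uint32_t) },
      { .constantID = 1, .offset = 4, .size = sizeof(uint32_t) },
   };
   return (VkSpecializationInfo) {
      .mapEntryCount = 2,
      .pMapEntries = entries,
      .dataSize = sizeof(data),
      .pData = data,
   };
}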
static void
-lower_ssbo_ubo_intrinsic(struct tu_device *dev,
- nir_builder *b, nir_intrinsic_instr *intrin)
+tu_sort_variables_by_location(struct exec_list *variables)
{
- const nir_intrinsic_info *info = &nir_intrinsic_infos[intrin->intrinsic];
-
- /* The bindless base is part of the instruction, which means that part of
- * the "pointer" has to be constant. We solve this in the same way the blob
- * does, by generating a bunch of if-statements. In the usual case where
-    * the descriptor set is constant we can skip that, though.
- */
-
- unsigned buffer_src;
- if (intrin->intrinsic == nir_intrinsic_store_ssbo) {
- /* This has the value first */
- buffer_src = 1;
- } else {
- buffer_src = 0;
- }
-
- nir_ssa_scalar scalar_idx = nir_ssa_scalar_resolved(intrin->src[buffer_src].ssa, 0);
- nir_ssa_def *descriptor_idx = nir_channel(b, intrin->src[buffer_src].ssa, 1);
-
- /* For isam, we need to use the appropriate descriptor if 16-bit storage is
- * enabled. Descriptor 0 is the 16-bit one, descriptor 1 is the 32-bit one.
- */
- if (dev->physical_device->info->a6xx.storage_16bit &&
- intrin->intrinsic == nir_intrinsic_load_ssbo &&
- (nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER) &&
- intrin->dest.ssa.bit_size > 16) {
- descriptor_idx = nir_iadd(b, descriptor_idx, nir_imm_int(b, 1));
- }
-
- nir_ssa_def *results[MAX_SETS + 1] = { NULL };
-
- if (nir_ssa_scalar_is_const(scalar_idx)) {
- nir_ssa_def *bindless =
- nir_bindless_resource_ir3(b, 32, descriptor_idx, .desc_set = nir_ssa_scalar_as_uint(scalar_idx));
- nir_instr_rewrite_src_ssa(&intrin->instr, &intrin->src[buffer_src], bindless);
- return;
- }
-
- nir_ssa_def *base_idx = nir_channel(b, scalar_idx.def, scalar_idx.comp);
- for (unsigned i = 0; i < MAX_SETS + 1; i++) {
- /* if (base_idx == i) { ... */
- nir_if *nif = nir_push_if(b, nir_ieq_imm(b, base_idx, i));
-
- nir_ssa_def *bindless =
- nir_bindless_resource_ir3(b, 32, descriptor_idx, .desc_set = i);
-
- nir_intrinsic_instr *copy =
- nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
-
- copy->num_components = intrin->num_components;
-
- for (unsigned src = 0; src < info->num_srcs; src++) {
- if (src == buffer_src)
- copy->src[src] = nir_src_for_ssa(bindless);
- else
- copy->src[src] = nir_src_for_ssa(intrin->src[src].ssa);
- }
-
- for (unsigned idx = 0; idx < info->num_indices; idx++) {
- copy->const_index[idx] = intrin->const_index[idx];
- }
-
- if (info->has_dest) {
- nir_ssa_dest_init(&copy->instr, &copy->dest,
- intrin->dest.ssa.num_components,
- intrin->dest.ssa.bit_size,
- NULL);
- results[i] = &copy->dest.ssa;
+ struct exec_list sorted;
+ exec_list_make_empty(&sorted);
+
+ nir_foreach_variable_safe(var, variables)
+ {
+ exec_node_remove(&var->node);
+
+ /* insert the variable into the sorted list */
+ nir_variable *next = NULL;
+ nir_foreach_variable(tmp, &sorted)
+ {
+ if (var->data.location < tmp->data.location) {
+ next = tmp;
+ break;
+ }
}
-
- nir_builder_instr_insert(b, &copy->instr);
-
- /* } else { ... */
- nir_push_else(b, nif);
+ if (next)
+ exec_node_insert_node_before(&next->node, &var->node);
+ else
+ exec_list_push_tail(&sorted, &var->node);
}
- nir_ssa_def *result =
- nir_ssa_undef(b, intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size);
- for (int i = MAX_SETS; i >= 0; i--) {
- nir_pop_if(b, NULL);
- if (info->has_dest)
- result = nir_if_phi(b, results[i], result);
- }
-
- if (info->has_dest)
- nir_ssa_def_rewrite_uses(&intrin->dest.ssa, result);
- nir_instr_remove(&intrin->instr);
+ exec_list_move_nodes_to(&sorted, variables);
}
-static nir_ssa_def *
-build_bindless(struct tu_device *dev, nir_builder *b,
- nir_deref_instr *deref, bool is_sampler,
- struct tu_shader *shader,
- const struct tu_pipeline_layout *layout)
-{
- nir_variable *var = nir_deref_instr_get_variable(deref);
-
- unsigned set = var->data.descriptor_set;
- unsigned binding = var->data.binding;
- const struct tu_descriptor_set_binding_layout *bind_layout =
- &layout->set[set].layout->binding[binding];
-
- /* input attachments use non bindless workaround */
- if (bind_layout->type == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT &&
- likely(!(dev->instance->debug_flags & TU_DEBUG_DYNAMIC))) {
- const struct glsl_type *glsl_type = glsl_without_array(var->type);
- uint32_t idx = var->data.index * 2;
-
- BITSET_SET_RANGE_INSIDE_WORD(b->shader->info.textures_used, idx, (idx + bind_layout->array_size * 2) - 1);
-
- /* D24S8 workaround: stencil of D24S8 will be sampled as uint */
- if (glsl_get_sampler_result_type(glsl_type) == GLSL_TYPE_UINT)
- idx += 1;
-
- if (deref->deref_type == nir_deref_type_var)
- return nir_imm_int(b, idx);
-
- nir_ssa_def *arr_index = nir_ssa_for_src(b, deref->arr.index, 1);
- return nir_iadd(b, nir_imm_int(b, idx),
- nir_imul_imm(b, arr_index, 2));
- }
-
- shader->active_desc_sets |= 1u << set;
-
- nir_ssa_def *desc_offset;
- unsigned descriptor_stride;
- unsigned offset = 0;
- /* Samplers come second in combined image/sampler descriptors, see
- * write_combined_image_sampler_descriptor().
- */
- if (is_sampler && bind_layout->type ==
- VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) {
- offset = 1;
- }
- desc_offset =
- nir_imm_int(b, (bind_layout->offset / (4 * A6XX_TEX_CONST_DWORDS)) +
- offset);
- descriptor_stride = bind_layout->size / (4 * A6XX_TEX_CONST_DWORDS);
-
- if (deref->deref_type != nir_deref_type_var) {
- assert(deref->deref_type == nir_deref_type_array);
-
- nir_ssa_def *arr_index = nir_ssa_for_src(b, deref->arr.index, 1);
- desc_offset = nir_iadd(b, desc_offset,
- nir_imul_imm(b, arr_index, descriptor_stride));
- }
-
- return nir_bindless_resource_ir3(b, 32, desc_offset, .desc_set = set);
-}
-
-static void
-lower_image_deref(struct tu_device *dev, nir_builder *b,
- nir_intrinsic_instr *instr, struct tu_shader *shader,
- const struct tu_pipeline_layout *layout)
+struct tu_shader *
+tu_shader_create(struct tu_device *dev,
+ gl_shader_stage stage,
+ const VkPipelineShaderStageCreateInfo *stage_info,
+ const VkAllocationCallbacks *alloc)
{
- nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
- nir_ssa_def *bindless = build_bindless(dev, b, deref, false, shader, layout);
- nir_rewrite_image_intrinsic(instr, bindless, true);
-}
+ const struct tu_shader_module *module =
+ tu_shader_module_from_handle(stage_info->module);
+ struct tu_shader *shader;
-static bool
-lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr,
- struct tu_device *dev,
- struct tu_shader *shader,
- const struct tu_pipeline_layout *layout)
-{
- switch (instr->intrinsic) {
- case nir_intrinsic_load_push_constant:
- lower_load_push_constant(dev, b, instr, shader, layout);
- return true;
-
- case nir_intrinsic_load_vulkan_descriptor:
- lower_load_vulkan_descriptor(b, instr);
- return true;
-
- case nir_intrinsic_vulkan_resource_index:
- lower_vulkan_resource_index(b, instr, shader, layout);
- return true;
- case nir_intrinsic_vulkan_resource_reindex:
- lower_vulkan_resource_reindex(b, instr);
- return true;
-
- case nir_intrinsic_load_ubo:
- case nir_intrinsic_load_ssbo:
- case nir_intrinsic_store_ssbo:
- case nir_intrinsic_ssbo_atomic_add:
- case nir_intrinsic_ssbo_atomic_imin:
- case nir_intrinsic_ssbo_atomic_umin:
- case nir_intrinsic_ssbo_atomic_imax:
- case nir_intrinsic_ssbo_atomic_umax:
- case nir_intrinsic_ssbo_atomic_and:
- case nir_intrinsic_ssbo_atomic_or:
- case nir_intrinsic_ssbo_atomic_xor:
- case nir_intrinsic_ssbo_atomic_exchange:
- case nir_intrinsic_ssbo_atomic_comp_swap:
- case nir_intrinsic_ssbo_atomic_fadd:
- case nir_intrinsic_ssbo_atomic_fmin:
- case nir_intrinsic_ssbo_atomic_fmax:
- case nir_intrinsic_ssbo_atomic_fcomp_swap:
- case nir_intrinsic_get_ssbo_size:
- lower_ssbo_ubo_intrinsic(dev, b, instr);
- return true;
-
- case nir_intrinsic_image_deref_load:
- case nir_intrinsic_image_deref_store:
- case nir_intrinsic_image_deref_atomic_add:
- case nir_intrinsic_image_deref_atomic_imin:
- case nir_intrinsic_image_deref_atomic_umin:
- case nir_intrinsic_image_deref_atomic_imax:
- case nir_intrinsic_image_deref_atomic_umax:
- case nir_intrinsic_image_deref_atomic_and:
- case nir_intrinsic_image_deref_atomic_or:
- case nir_intrinsic_image_deref_atomic_xor:
- case nir_intrinsic_image_deref_atomic_exchange:
- case nir_intrinsic_image_deref_atomic_comp_swap:
- case nir_intrinsic_image_deref_size:
- case nir_intrinsic_image_deref_samples:
- lower_image_deref(dev, b, instr, shader, layout);
- return true;
+ const uint32_t max_variant_count = (stage == MESA_SHADER_VERTEX) ? 2 : 1;
+ shader = vk_zalloc2(
+ &dev->alloc, alloc,
+ sizeof(*shader) + sizeof(struct ir3_shader_variant) * max_variant_count,
+ 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
+ if (!shader)
+ return NULL;
- default:
- return false;
+ /* translate SPIR-V to NIR */
+ assert(module->code_size % 4 == 0);
+ nir_shader *nir = tu_spirv_to_nir(
+ dev->compiler, (const uint32_t *) module->code, module->code_size / 4,
+ stage, stage_info->pName, stage_info->pSpecializationInfo);
+ if (!nir) {
+ vk_free2(&dev->alloc, alloc, shader);
+ return NULL;
}
-}
-static void
-lower_tex_ycbcr(const struct tu_pipeline_layout *layout,
- nir_builder *builder,
- nir_tex_instr *tex)
-{
- int deref_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref);
- assert(deref_src_idx >= 0);
- nir_deref_instr *deref = nir_src_as_deref(tex->src[deref_src_idx].src);
-
- nir_variable *var = nir_deref_instr_get_variable(deref);
- const struct tu_descriptor_set_layout *set_layout =
- layout->set[var->data.descriptor_set].layout;
- const struct tu_descriptor_set_binding_layout *binding =
- &set_layout->binding[var->data.binding];
- const struct tu_sampler_ycbcr_conversion *ycbcr_samplers =
- tu_immutable_ycbcr_samplers(set_layout, binding);
-
- if (!ycbcr_samplers)
- return;
-
- /* For the following instructions, we don't apply any change */
- if (tex->op == nir_texop_txs ||
- tex->op == nir_texop_query_levels ||
- tex->op == nir_texop_lod)
- return;
-
- assert(tex->texture_index == 0);
- unsigned array_index = 0;
- if (deref->deref_type != nir_deref_type_var) {
- assert(deref->deref_type == nir_deref_type_array);
- if (!nir_src_is_const(deref->arr.index))
- return;
- array_index = nir_src_as_uint(deref->arr.index);
- array_index = MIN2(array_index, binding->array_size - 1);
+ if (unlikely(dev->physical_device->instance->debug_flags & TU_DEBUG_NIR)) {
+ fprintf(stderr, "translated nir:\n");
+ nir_print_shader(nir, stderr);
}
- const struct tu_sampler_ycbcr_conversion *ycbcr_sampler = ycbcr_samplers + array_index;
-
- if (ycbcr_sampler->ycbcr_model == VK_SAMPLER_YCBCR_MODEL_CONVERSION_RGB_IDENTITY)
- return;
- builder->cursor = nir_after_instr(&tex->instr);
+ /* TODO what needs to happen? */
- uint8_t bits = vk_format_get_component_bits(ycbcr_sampler->format,
- UTIL_FORMAT_COLORSPACE_RGB,
- PIPE_SWIZZLE_X);
-
- switch (ycbcr_sampler->format) {
- case VK_FORMAT_G8B8G8R8_422_UNORM:
- case VK_FORMAT_B8G8R8G8_422_UNORM:
- case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
- case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
- /* util_format_get_component_bits doesn't return what we want */
- bits = 8;
+ switch (stage) {
+ case MESA_SHADER_VERTEX:
+ tu_sort_variables_by_location(&nir->outputs);
+ break;
+ case MESA_SHADER_TESS_CTRL:
+ case MESA_SHADER_TESS_EVAL:
+ case MESA_SHADER_GEOMETRY:
+ tu_sort_variables_by_location(&nir->inputs);
+ tu_sort_variables_by_location(&nir->outputs);
+ break;
+ case MESA_SHADER_FRAGMENT:
+ tu_sort_variables_by_location(&nir->inputs);
+ break;
+ case MESA_SHADER_COMPUTE:
break;
default:
+ unreachable("invalid gl_shader_stage");
break;
}
- uint32_t bpcs[3] = {bits, bits, bits}; /* TODO: use right bpc for each channel ? */
- nir_ssa_def *result = nir_convert_ycbcr_to_rgb(builder,
- ycbcr_sampler->ycbcr_model,
- ycbcr_sampler->ycbcr_range,
- &tex->dest.ssa,
- bpcs);
- nir_ssa_def_rewrite_uses_after(&tex->dest.ssa, result,
- result->parent_instr);
+ nir_assign_var_locations(&nir->inputs, &nir->num_inputs,
+ ir3_glsl_type_size);
+ nir_assign_var_locations(&nir->outputs, &nir->num_outputs,
+ ir3_glsl_type_size);
+ nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms,
+ ir3_glsl_type_size);
- builder->cursor = nir_before_instr(&tex->instr);
-}
+ NIR_PASS_V(nir, nir_lower_system_values);
+ NIR_PASS_V(nir, nir_lower_frexp);
+ NIR_PASS_V(nir, nir_lower_io, nir_var_all, ir3_glsl_type_size, 0);
-static bool
-lower_tex(nir_builder *b, nir_tex_instr *tex, struct tu_device *dev,
- struct tu_shader *shader, const struct tu_pipeline_layout *layout)
-{
- lower_tex_ycbcr(layout, b, tex);
-
- int sampler_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref);
- if (sampler_src_idx >= 0) {
- nir_deref_instr *deref = nir_src_as_deref(tex->src[sampler_src_idx].src);
- nir_ssa_def *bindless = build_bindless(dev, b, deref, true, shader, layout);
- nir_instr_rewrite_src(&tex->instr, &tex->src[sampler_src_idx].src,
- nir_src_for_ssa(bindless));
- tex->src[sampler_src_idx].src_type = nir_tex_src_sampler_handle;
- }
+ nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
- int tex_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref);
- if (tex_src_idx >= 0) {
- nir_deref_instr *deref = nir_src_as_deref(tex->src[tex_src_idx].src);
- nir_ssa_def *bindless = build_bindless(dev, b, deref, false, shader, layout);
- nir_instr_rewrite_src(&tex->instr, &tex->src[tex_src_idx].src,
- nir_src_for_ssa(bindless));
- tex->src[tex_src_idx].src_type = nir_tex_src_texture_handle;
-
- /* for the input attachment case: */
- if (bindless->parent_instr->type != nir_instr_type_intrinsic)
- tex->src[tex_src_idx].src_type = nir_tex_src_texture_offset;
- }
+ shader->ir3_shader.compiler = dev->compiler;
+ shader->ir3_shader.type = stage;
+ shader->ir3_shader.nir = nir;
- return true;
+ return shader;
}
-struct lower_instr_params {
- struct tu_device *dev;
- struct tu_shader *shader;
- const struct tu_pipeline_layout *layout;
-};
-
-static bool
-lower_instr(nir_builder *b, nir_instr *instr, void *cb_data)
+void
+tu_shader_destroy(struct tu_device *dev,
+ struct tu_shader *shader,
+ const VkAllocationCallbacks *alloc)
{
- struct lower_instr_params *params = cb_data;
- b->cursor = nir_before_instr(instr);
- switch (instr->type) {
- case nir_instr_type_tex:
- return lower_tex(b, nir_instr_as_tex(instr), params->dev, params->shader, params->layout);
- case nir_instr_type_intrinsic:
- return lower_intrinsic(b, nir_instr_as_intrinsic(instr), params->dev, params->shader, params->layout);
- default:
- return false;
- }
-}
+ if (shader->ir3_shader.nir)
+ ralloc_free(shader->ir3_shader.nir);
-/* Figure out the range of push constants that we're actually going to push to
- * the shader, and tell the backend to reserve this range when pushing UBO
- * constants.
- */
-
-static void
-gather_push_constants(nir_shader *shader, struct tu_shader *tu_shader)
-{
- uint32_t min = UINT32_MAX, max = 0;
- nir_foreach_function(function, shader) {
- if (!function->impl)
- continue;
-
- nir_foreach_block(block, function->impl) {
- nir_foreach_instr_safe(instr, block) {
- if (instr->type != nir_instr_type_intrinsic)
- continue;
-
- nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
- if (intrin->intrinsic != nir_intrinsic_load_push_constant)
- continue;
-
- uint32_t base = nir_intrinsic_base(intrin);
- uint32_t range = nir_intrinsic_range(intrin);
- min = MIN2(min, base);
- max = MAX2(max, base + range);
- break;
- }
- }
+ for (uint32_t i = 0; i < 1 + shader->has_binning_pass; i++) {
+ if (shader->variants[i].ir)
+ ir3_destroy(shader->variants[i].ir);
}
- if (min >= max) {
- tu_shader->const_state.push_consts.lo = 0;
- tu_shader->const_state.push_consts.dwords = 0;
- return;
- }
+ if (shader->ir3_shader.const_state.immediates)
+ free(shader->ir3_shader.const_state.immediates);
+ if (shader->binary)
+ free(shader->binary);
+ if (shader->binning_binary)
+ free(shader->binning_binary);
- /* CP_LOAD_STATE OFFSET and NUM_UNIT for SHARED_CONSTS are in units of
- * dwords while loading regular consts is in units of vec4's.
-    * So we unify the unit here as dwords for tu_push_constant_range, and
-    * must use the correct unit again when emitting.
- *
- * Note there's an alignment requirement of 16 dwords on OFFSET. Expand
- * the range and change units accordingly.
- */
- tu_shader->const_state.push_consts.lo = (min / 4) / 4 * 4;
- tu_shader->const_state.push_consts.dwords =
- align(max, 16) / 4 - tu_shader->const_state.push_consts.lo;
+ vk_free2(&dev->alloc, alloc, shader);
}
-static bool
-tu_lower_io(nir_shader *shader, struct tu_device *dev,
- struct tu_shader *tu_shader,
- const struct tu_pipeline_layout *layout)
+void
+tu_shader_compile_options_init(
+ struct tu_shader_compile_options *options,
+ const VkGraphicsPipelineCreateInfo *pipeline_info)
{
- if (!tu6_shared_constants_enable(layout, dev->compiler))
- gather_push_constants(shader, tu_shader);
-
- struct tu_const_state *const_state = &tu_shader->const_state;
- unsigned reserved_consts_vec4 =
- align(DIV_ROUND_UP(const_state->push_consts.dwords, 4),
- dev->compiler->const_upload_unit);
-
- if (layout->independent_sets) {
- const_state->dynamic_offset_loc = reserved_consts_vec4 * 4;
- reserved_consts_vec4 += DIV_ROUND_UP(MAX_SETS, 4);
- } else {
- const_state->dynamic_offset_loc = UINT32_MAX;
- }
-
- tu_shader->reserved_user_consts_vec4 = reserved_consts_vec4;
+ *options = (struct tu_shader_compile_options) {
+ /* TODO ir3_key */
- struct lower_instr_params params = {
- .dev = dev,
- .shader = tu_shader,
- .layout = layout,
+ .optimize = !(pipeline_info->flags &
+ VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT),
+ .include_binning_pass = true,
};
-
- bool progress = nir_shader_instructions_pass(shader,
- lower_instr,
- nir_metadata_none,
- &params);
-
- /* Remove now-unused variables so that when we gather the shader info later
- * they won't be counted.
- */
-
- if (progress)
- nir_opt_dce(shader);
-
- progress |=
- nir_remove_dead_variables(shader,
- nir_var_uniform | nir_var_mem_ubo | nir_var_mem_ssbo,
- NULL);
-
- return progress;
}
-static void
-shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
+static uint32_t *
+tu_compile_shader_variant(struct ir3_shader *shader,
+ const struct ir3_shader_key *key,
+ bool binning_pass,
+ struct ir3_shader_variant *variant)
{
- assert(glsl_type_is_vector_or_scalar(type));
-
- unsigned comp_size =
- glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8;
- unsigned length = glsl_get_vector_elements(type);
- *size = comp_size * length;
- *align = comp_size;
-}
-
-static void
-tu_gather_xfb_info(nir_shader *nir, struct ir3_stream_output_info *info)
-{
- nir_shader_gather_xfb_info(nir);
-
- if (!nir->xfb_info)
- return;
-
- nir_xfb_info *xfb = nir->xfb_info;
+ variant->shader = shader;
+ variant->type = shader->type;
+ variant->key = *key;
+ variant->binning_pass = binning_pass;
- uint8_t output_map[VARYING_SLOT_TESS_MAX];
- memset(output_map, 0, sizeof(output_map));
-
- nir_foreach_shader_out_variable(var, nir) {
- unsigned slots =
- var->data.compact ? DIV_ROUND_UP(glsl_get_length(var->type), 4)
- : glsl_count_attribute_slots(var->type, false);
- for (unsigned i = 0; i < slots; i++)
- output_map[var->data.location + i] = var->data.driver_location + i;
- }
-
- assert(xfb->output_count <= IR3_MAX_SO_OUTPUTS);
- info->num_outputs = xfb->output_count;
-
- for (int i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
- info->stride[i] = xfb->buffers[i].stride / 4;
- info->buffer_to_stream[i] = xfb->buffer_to_stream[i];
- }
-
- info->streams_written = xfb->streams_written;
+ int ret = ir3_compile_shader_nir(shader->compiler, variant);
+ if (ret)
+ return NULL;
- for (int i = 0; i < xfb->output_count; i++) {
- info->output[i].register_index = output_map[xfb->outputs[i].location];
- info->output[i].start_component = xfb->outputs[i].component_offset;
- info->output[i].num_components =
- util_bitcount(xfb->outputs[i].component_mask);
- info->output[i].output_buffer = xfb->outputs[i].buffer;
- info->output[i].dst_offset = xfb->outputs[i].offset / 4;
- info->output[i].stream = xfb->buffer_to_stream[xfb->outputs[i].buffer];
- }
+ /* when assemble fails, we rely on tu_shader_destroy to clean up the
+ * variant
+ */
+ return ir3_shader_assemble(variant, shader->compiler->gpu_id);
}
-struct tu_shader *
-tu_shader_create(struct tu_device *dev,
- nir_shader *nir,
- const struct tu_shader_key *key,
- struct tu_pipeline_layout *layout,
- const VkAllocationCallbacks *alloc)
+VkResult
+tu_shader_compile(struct tu_device *dev,
+ struct tu_shader *shader,
+ const struct tu_shader *next_stage,
+ const struct tu_shader_compile_options *options,
+ const VkAllocationCallbacks *alloc)
{
- struct tu_shader *shader;
-
- shader = vk_zalloc2(
- &dev->vk.alloc, alloc,
- sizeof(*shader),
- 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
- if (!shader)
- return NULL;
-
- NIR_PASS_V(nir, nir_opt_access, &(nir_opt_access_options) {
- .is_vulkan = true,
- });
-
- if (nir->info.stage == MESA_SHADER_FRAGMENT) {
- NIR_PASS_V(nir, nir_lower_input_attachments,
- &(nir_input_attachment_options) {
- .use_fragcoord_sysval = true,
- .use_layer_id_sysval = false,
- /* When using multiview rendering, we must use
- * gl_ViewIndex as the layer id to pass to the texture
- * sampling function. gl_Layer doesn't work when
- * multiview is enabled.
- */
- .use_view_id_for_layer = key->multiview_mask != 0,
- });
+ if (options->optimize) {
+ /* ignore the key for the first pass of optimization */
+ ir3_optimize_nir(&shader->ir3_shader, shader->ir3_shader.nir, NULL);
+
+ if (unlikely(dev->physical_device->instance->debug_flags &
+ TU_DEBUG_NIR)) {
+ fprintf(stderr, "optimized nir:\n");
+ nir_print_shader(shader->ir3_shader.nir, stderr);
+ }
}
- /* This needs to happen before multiview lowering which rewrites store
- * instructions of the position variable, so that we can just rewrite one
- * store at the end instead of having to rewrite every store specified by
- * the user.
- */
- ir3_nir_lower_io_to_temporaries(nir);
+ shader->binary = tu_compile_shader_variant(
+ &shader->ir3_shader, &options->key, false, &shader->variants[0]);
+ if (!shader->binary)
+ return VK_ERROR_OUT_OF_HOST_MEMORY;
- if (nir->info.stage == MESA_SHADER_VERTEX && key->multiview_mask) {
- tu_nir_lower_multiview(nir, key->multiview_mask, dev);
- }
+ /* compile another variant for the binning pass */
+ if (options->include_binning_pass &&
+ shader->ir3_shader.type == MESA_SHADER_VERTEX) {
+ shader->binning_binary = tu_compile_shader_variant(
+ &shader->ir3_shader, &options->key, true, &shader->variants[1]);
+ if (!shader->binning_binary)
+ return VK_ERROR_OUT_OF_HOST_MEMORY;
- if (nir->info.stage == MESA_SHADER_FRAGMENT && key->force_sample_interp) {
- nir_foreach_shader_in_variable(var, nir) {
- if (!var->data.centroid)
- var->data.sample = true;
- }
+ shader->has_binning_pass = true;
}
- NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_push_const,
- nir_address_format_32bit_offset);
-
- NIR_PASS_V(nir, nir_lower_explicit_io,
- nir_var_mem_ubo | nir_var_mem_ssbo,
- nir_address_format_vec2_index_32bit_offset);
-
- NIR_PASS_V(nir, nir_lower_explicit_io,
- nir_var_mem_global,
- nir_address_format_64bit_global);
-
- if (nir->info.stage == MESA_SHADER_COMPUTE) {
- NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
- nir_var_mem_shared, shared_type_info);
- NIR_PASS_V(nir, nir_lower_explicit_io,
- nir_var_mem_shared,
- nir_address_format_32bit_offset);
-
- if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) {
- const unsigned chunk_size = 16; /* max single store size */
- /* Shared memory is allocated in 1024b chunks in HW, but the zero-init
- * extension only requires us to initialize the memory that the shader
- * is allocated at the API level, and it's up to the user to ensure
- * that accesses are limited to those bounds.
- */
- const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size);
- NIR_PASS_V(nir, nir_zero_initialize_shared_memory, shared_size, chunk_size);
+ if (unlikely(dev->physical_device->instance->debug_flags & TU_DEBUG_IR3)) {
+ fprintf(stderr, "disassembled ir3:\n");
+ fprintf(stderr, "shader: %s\n",
+ gl_shader_stage_name(shader->ir3_shader.type));
+ ir3_shader_disasm(&shader->variants[0], shader->binary, stderr);
+
+ if (shader->has_binning_pass) {
+ fprintf(stderr, "disassembled ir3:\n");
+ fprintf(stderr, "shader: %s (binning)\n",
+ gl_shader_stage_name(shader->ir3_shader.type));
+ ir3_shader_disasm(&shader->variants[1], shader->binning_binary,
+ stderr);
}
-
- const struct nir_lower_compute_system_values_options compute_sysval_options = {
- .has_base_workgroup_id = true,
- };
- NIR_PASS_V(nir, nir_lower_compute_system_values, &compute_sysval_options);
}
- nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, nir->info.stage);
- nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, nir->info.stage);
+ return VK_SUCCESS;
+}
- /* Gather information for transform feedback. This should be called after:
- * - nir_split_per_member_structs.
- * - nir_remove_dead_variables with varyings, so that we could align
- * stream outputs correctly.
- * - nir_assign_io_var_locations - to have valid driver_location
- */
- struct ir3_stream_output_info so_info = {};
- if (nir->info.stage == MESA_SHADER_VERTEX ||
- nir->info.stage == MESA_SHADER_TESS_EVAL ||
- nir->info.stage == MESA_SHADER_GEOMETRY)
- tu_gather_xfb_info(nir, &so_info);
+VkResult
+tu_CreateShaderModule(VkDevice _device,
+ const VkShaderModuleCreateInfo *pCreateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ VkShaderModule *pShaderModule)
+{
+ TU_FROM_HANDLE(tu_device, device, _device);
+ struct tu_shader_module *module;
- NIR_PASS_V(nir, tu_lower_io, dev, shader, layout);
+ assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO);
+ assert(pCreateInfo->flags == 0);
+ assert(pCreateInfo->codeSize % 4 == 0);
- nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
+ module = vk_alloc2(&device->alloc, pAllocator,
+ sizeof(*module) + pCreateInfo->codeSize, 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+ if (module == NULL)
+ return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
- ir3_finalize_nir(dev->compiler, nir);
+ module->code_size = pCreateInfo->codeSize;
+ memcpy(module->code, pCreateInfo->pCode, pCreateInfo->codeSize);
- bool shared_consts_enable = tu6_shared_constants_enable(layout, dev->compiler);
- if (shared_consts_enable)
- assert(!shader->const_state.push_consts.dwords);
+ _mesa_sha1_compute(module->code, module->code_size, module->sha1);
- shader->ir3_shader =
- ir3_shader_from_nir(dev->compiler, nir, &(struct ir3_shader_options) {
- .reserved_user_consts = shader->reserved_user_consts_vec4,
- .shared_consts_enable = shared_consts_enable,
- .api_wavesize = key->api_wavesize,
- .real_wavesize = key->real_wavesize,
- }, &so_info);
+ *pShaderModule = tu_shader_module_to_handle(module);
- return shader;
+ return VK_SUCCESS;
}
void
-tu_shader_destroy(struct tu_device *dev,
- struct tu_shader *shader,
- const VkAllocationCallbacks *alloc)
+tu_DestroyShaderModule(VkDevice _device,
+ VkShaderModule _module,
+ const VkAllocationCallbacks *pAllocator)
{
- ir3_shader_destroy(shader->ir3_shader);
+ TU_FROM_HANDLE(tu_device, device, _device);
+ TU_FROM_HANDLE(tu_shader_module, module, _module);
+
+ if (!module)
+ return;
- vk_free2(&dev->vk.alloc, alloc, shader);
+ vk_free2(&device->alloc, pAllocator, module);
}
diff --git a/lib/mesa/src/freedreno/vulkan/tu_util.c b/lib/mesa/src/freedreno/vulkan/tu_util.c
index 9b0b9a420..e630460fb 100644
--- a/lib/mesa/src/freedreno/vulkan/tu_util.c
+++ b/lib/mesa/src/freedreno/vulkan/tu_util.c
@@ -1,21 +1,79 @@
/*
* Copyright © 2015 Intel Corporation
- * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
*/
-#include "tu_util.h"
+#include "tu_private.h"
+#include <assert.h>
#include <errno.h>
#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
#include "util/u_math.h"
-#include "util/timespec.h"
#include "vk_enum_to_str.h"
-#include "tu_device.h"
-#include "tu_pass.h"
+/* TODO: Add Android support to tu_log funcs */
-void PRINTFLIKE(3, 4)
+/** Log an error message. */
+void tu_printflike(1, 2) tu_loge(const char *format, ...)
+{
+ va_list va;
+
+ va_start(va, format);
+ tu_loge_v(format, va);
+ va_end(va);
+}
+
+/** \see tu_loge() */
+void
+tu_loge_v(const char *format, va_list va)
+{
+ fprintf(stderr, "vk: error: ");
+ vfprintf(stderr, format, va);
+ fprintf(stderr, "\n");
+}
+
+/** Log an info message. */
+void tu_printflike(1, 2) tu_logi(const char *format, ...)
+{
+ va_list va;
+
+ va_start(va, format);
+ tu_logi_v(format, va);
+ va_end(va);
+}
+
+/** \see tu_logi() */
+void
+tu_logi_v(const char *format, va_list va)
+{
+ fprintf(stderr, "tu: info: ");
+ vfprintf(stderr, format, va);
+ fprintf(stderr, "\n");
+}
+
+void tu_printflike(3, 4)
__tu_finishme(const char *file, int line, const char *format, ...)
{
va_list ap;
@@ -25,17 +83,16 @@ void PRINTFLIKE(3, 4)
vsnprintf(buffer, sizeof(buffer), format, ap);
va_end(ap);
- mesa_loge("%s:%d: FINISHME: %s\n", file, line, buffer);
+ fprintf(stderr, "%s:%d: FINISHME: %s\n", file, line, buffer);
}
VkResult
-__vk_startup_errorf(struct tu_instance *instance,
- VkResult error,
- bool always_print,
- const char *file,
- int line,
- const char *format,
- ...)
+__vk_errorf(struct tu_instance *instance,
+ VkResult error,
+ const char *file,
+ int line,
+ const char *format,
+ ...)
{
va_list ap;
char buffer[256];
@@ -43,8 +100,7 @@ __vk_startup_errorf(struct tu_instance *instance,
const char *error_str = vk_Result_to_str(error);
#ifndef DEBUG
- if (!always_print)
- return error;
+ return error;
#endif
if (format) {
@@ -52,236 +108,10 @@ __vk_startup_errorf(struct tu_instance *instance,
vsnprintf(buffer, sizeof(buffer), format, ap);
va_end(ap);
- mesa_loge("%s:%d: %s (%s)\n", file, line, buffer, error_str);
+ fprintf(stderr, "%s:%d: %s (%s)\n", file, line, buffer, error_str);
} else {
- mesa_loge("%s:%d: %s\n", file, line, error_str);
+ fprintf(stderr, "%s:%d: %s\n", file, line, error_str);
}
return error;
}
-
-static void
-tu_tiling_config_update_tile_layout(struct tu_framebuffer *fb,
- const struct tu_device *dev,
- const struct tu_render_pass *pass,
- enum tu_gmem_layout gmem_layout)
-{
- const uint32_t tile_align_w = pass->tile_align_w;
- const uint32_t tile_align_h = dev->physical_device->info->tile_align_h;
- const uint32_t max_tile_width = dev->physical_device->info->tile_max_w;
- const uint32_t max_tile_height = dev->physical_device->info->tile_max_h;
- struct tu_tiling_config *tiling = &fb->tiling[gmem_layout];
-
- /* start from 1 tile */
- tiling->tile_count = (VkExtent2D) {
- .width = 1,
- .height = 1,
- };
- tiling->tile0 = (VkExtent2D) {
- .width = util_align_npot(fb->width, tile_align_w),
- .height = align(fb->height, tile_align_h),
- };
-
- /* will force to sysmem, don't bother trying to have a valid tile config
- * TODO: just skip all GMEM stuff when sysmem is forced?
- */
- if (!pass->gmem_pixels[gmem_layout])
- return;
-
- if (unlikely(dev->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN)) {
- /* start with 2x2 tiles */
- tiling->tile_count.width = 2;
- tiling->tile_count.height = 2;
- tiling->tile0.width = util_align_npot(DIV_ROUND_UP(fb->width, 2), tile_align_w);
- tiling->tile0.height = align(DIV_ROUND_UP(fb->height, 2), tile_align_h);
- }
-
- /* do not exceed max tile width */
- while (tiling->tile0.width > max_tile_width) {
- tiling->tile_count.width++;
- tiling->tile0.width =
- util_align_npot(DIV_ROUND_UP(fb->width, tiling->tile_count.width), tile_align_w);
- }
-
- /* do not exceed max tile height */
- while (tiling->tile0.height > max_tile_height) {
- tiling->tile_count.height++;
- tiling->tile0.height =
- util_align_npot(DIV_ROUND_UP(fb->height, tiling->tile_count.height), tile_align_h);
- }
-
- /* do not exceed gmem size */
- while (tiling->tile0.width * tiling->tile0.height > pass->gmem_pixels[gmem_layout]) {
- if (tiling->tile0.width > MAX2(tile_align_w, tiling->tile0.height)) {
- tiling->tile_count.width++;
- tiling->tile0.width =
- util_align_npot(DIV_ROUND_UP(fb->width, tiling->tile_count.width), tile_align_w);
- } else {
- /* if this assert fails then layout is impossible.. */
- assert(tiling->tile0.height > tile_align_h);
- tiling->tile_count.height++;
- tiling->tile0.height =
- align(DIV_ROUND_UP(fb->height, tiling->tile_count.height), tile_align_h);
- }
- }
-}
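/* Illustrative sketch of the "grow the tile grid until it fits" strategy
 * above, reduced to one dimension: keep adding tiles until each tile's
 * width is within the hardware maximum. align_npot() stands in for
 * util_align_npot() (round up to a multiple of a possibly non-power-of-two
 * alignment); termination assumes max_tile_w >= tile_align_w.
 */
#include <stdint.h>

static uint32_t
align_npot(uint32_t v, uint32_t a)
{
   return ((v + a - 1) / a) * a;
}

static void
fit_tiles_1d(uint32_t fb_width, uint32_t tile_align_w, uint32_t max_tile_w,
             uint32_t *tile_count, uint32_t *tile_width)
{
   *tile_count = 1;
   *tile_width = align_npot(fb_width, tile_align_w);

   while (*tile_width > max_tile_w) {
      (*tile_count)++;
      *tile_width = align_npot((fb_width + *tile_count - 1) / *tile_count,
                               tile_align_w);
   }
}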
-
-static void
-tu_tiling_config_update_pipe_layout(struct tu_tiling_config *tiling,
- const struct tu_device *dev)
-{
- const uint32_t max_pipe_count = 32; /* A6xx */
-
- /* start from 1 tile per pipe */
- tiling->pipe0 = (VkExtent2D) {
- .width = 1,
- .height = 1,
- };
- tiling->pipe_count = tiling->tile_count;
-
- while (tiling->pipe_count.width * tiling->pipe_count.height > max_pipe_count) {
- if (tiling->pipe0.width < tiling->pipe0.height) {
- tiling->pipe0.width += 1;
- tiling->pipe_count.width =
- DIV_ROUND_UP(tiling->tile_count.width, tiling->pipe0.width);
- } else {
- tiling->pipe0.height += 1;
- tiling->pipe_count.height =
- DIV_ROUND_UP(tiling->tile_count.height, tiling->pipe0.height);
- }
- }
-}
-
-static void
-tu_tiling_config_update_pipes(struct tu_tiling_config *tiling,
- const struct tu_device *dev)
-{
- const uint32_t max_pipe_count = 32; /* A6xx */
- const uint32_t used_pipe_count =
- tiling->pipe_count.width * tiling->pipe_count.height;
- const VkExtent2D last_pipe = {
- .width = (tiling->tile_count.width - 1) % tiling->pipe0.width + 1,
- .height = (tiling->tile_count.height - 1) % tiling->pipe0.height + 1,
- };
-
- assert(used_pipe_count <= max_pipe_count);
- assert(max_pipe_count <= ARRAY_SIZE(tiling->pipe_config));
-
- for (uint32_t y = 0; y < tiling->pipe_count.height; y++) {
- for (uint32_t x = 0; x < tiling->pipe_count.width; x++) {
- const uint32_t pipe_x = tiling->pipe0.width * x;
- const uint32_t pipe_y = tiling->pipe0.height * y;
- const uint32_t pipe_w = (x == tiling->pipe_count.width - 1)
- ? last_pipe.width
- : tiling->pipe0.width;
- const uint32_t pipe_h = (y == tiling->pipe_count.height - 1)
- ? last_pipe.height
- : tiling->pipe0.height;
- const uint32_t n = tiling->pipe_count.width * y + x;
-
- tiling->pipe_config[n] = A6XX_VSC_PIPE_CONFIG_REG_X(pipe_x) |
- A6XX_VSC_PIPE_CONFIG_REG_Y(pipe_y) |
- A6XX_VSC_PIPE_CONFIG_REG_W(pipe_w) |
- A6XX_VSC_PIPE_CONFIG_REG_H(pipe_h);
- tiling->pipe_sizes[n] = CP_SET_BIN_DATA5_0_VSC_SIZE(pipe_w * pipe_h);
- }
- }
-
- memset(tiling->pipe_config + used_pipe_count, 0,
- sizeof(uint32_t) * (max_pipe_count - used_pipe_count));
-}
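
The last_pipe math in tu_tiling_config_update_pipes() above uses the "(n - 1) % k + 1" form so that an exact multiple yields a full pipe instead of zero. A tiny self-check with assumed counts (not driver code):

#include <assert.h>
#include <stdint.h>

static uint32_t last_pipe_extent(uint32_t tile_count, uint32_t tiles_per_pipe)
{
   /* size of the final pipe along one dimension; a zero remainder means full */
   return (tile_count - 1) % tiles_per_pipe + 1;
}

int main(void)
{
   assert(last_pipe_extent(12, 4) == 4); /* three full pipes of 4 tiles  */
   assert(last_pipe_extent(13, 4) == 1); /* a fourth pipe holding 1 tile */
   assert(last_pipe_extent(14, 4) == 2); /* a fourth pipe holding 2 tiles */
   return 0;
}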
-
-static bool
-is_hw_binning_possible(const struct tu_tiling_config *tiling)
-{
- /* Similar to older gens, # of tiles per pipe cannot be more than 32.
- * But there are no hangs with 16 or more tiles per pipe in either
- * X or Y direction, so that limit does not seem to apply.
- */
- uint32_t tiles_per_pipe = tiling->pipe0.width * tiling->pipe0.height;
- return tiles_per_pipe <= 32;
-}
-
-static void
-tu_tiling_config_update_binning(struct tu_tiling_config *tiling, const struct tu_device *device)
-{
- tiling->binning_possible = is_hw_binning_possible(tiling);
-
- if (tiling->binning_possible) {
- tiling->binning = (tiling->tile_count.width * tiling->tile_count.height) > 2;
-
- if (unlikely(device->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN))
- tiling->binning = true;
- if (unlikely(device->physical_device->instance->debug_flags &
- TU_DEBUG_NOBIN))
- tiling->binning = false;
- } else {
- tiling->binning = false;
- }
-}
-
-void
-tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
- const struct tu_device *device,
- const struct tu_render_pass *pass)
-{
- for (int gmem_layout = 0; gmem_layout < TU_GMEM_LAYOUT_COUNT; gmem_layout++) {
- struct tu_tiling_config *tiling = &fb->tiling[gmem_layout];
- tu_tiling_config_update_tile_layout(fb, device, pass, gmem_layout);
- tu_tiling_config_update_pipe_layout(tiling, device);
- tu_tiling_config_update_pipes(tiling, device);
- tu_tiling_config_update_binning(tiling, device);
- }
-}
-
-void
-tu_dbg_log_gmem_load_store_skips(struct tu_device *device)
-{
- static uint32_t last_skipped_loads = 0;
- static uint32_t last_skipped_stores = 0;
- static uint32_t last_total_loads = 0;
- static uint32_t last_total_stores = 0;
- static struct timespec last_time = {};
-
- pthread_mutex_lock(&device->submit_mutex);
-
- struct timespec current_time;
- clock_gettime(CLOCK_MONOTONIC, &current_time);
-
- if (timespec_sub_to_nsec(&current_time, &last_time) > 1000 * 1000 * 1000) {
- last_time = current_time;
- } else {
- pthread_mutex_unlock(&device->submit_mutex);
- return;
- }
-
- struct tu6_global *global = device->global_bo->map;
-
- uint32_t current_taken_loads = global->dbg_gmem_taken_loads;
- uint32_t current_taken_stores = global->dbg_gmem_taken_stores;
- uint32_t current_total_loads = global->dbg_gmem_total_loads;
- uint32_t current_total_stores = global->dbg_gmem_total_stores;
-
- uint32_t skipped_loads = current_total_loads - current_taken_loads;
- uint32_t skipped_stores = current_total_stores - current_taken_stores;
-
- uint32_t current_time_frame_skipped_loads = skipped_loads - last_skipped_loads;
- uint32_t current_time_frame_skipped_stores = skipped_stores - last_skipped_stores;
-
- uint32_t current_time_frame_total_loads = current_total_loads - last_total_loads;
- uint32_t current_time_frame_total_stores = current_total_stores - last_total_stores;
-
- mesa_logi("[GMEM] loads total: %u skipped: %.1f%%\n",
- current_time_frame_total_loads,
- current_time_frame_skipped_loads / (float) current_time_frame_total_loads * 100.f);
- mesa_logi("[GMEM] stores total: %u skipped: %.1f%%\n",
- current_time_frame_total_stores,
- current_time_frame_skipped_stores / (float) current_time_frame_total_stores * 100.f);
-
- last_skipped_loads = skipped_loads;
- last_skipped_stores = skipped_stores;
- last_total_loads = current_total_loads;
- last_total_stores = current_total_stores;
-
- pthread_mutex_unlock(&device->submit_mutex);
-}
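
tu_dbg_log_gmem_load_store_skips() above throttles itself to one log line per second by comparing CLOCK_MONOTONIC timestamps before reading the debug counters. A minimal sketch of that throttle using plain timespec arithmetic in place of Mesa's timespec_sub_to_nsec() (a simplification, not the driver helper):

#include <stdbool.h>
#include <stdint.h>
#include <time.h>

/* return true at most once per second */
static bool log_due(void)
{
   static struct timespec last;
   struct timespec now;

   clock_gettime(CLOCK_MONOTONIC, &now);

   int64_t elapsed_ns = (int64_t)(now.tv_sec - last.tv_sec) * 1000000000ll +
                        (now.tv_nsec - last.tv_nsec);
   if (elapsed_ns <= 1000ll * 1000 * 1000)
      return false;

   last = now;
   return true;
}

int main(void)
{
   /* first call succeeds: far more than a second past the zeroed timestamp */
   return log_due() ? 0 : 1;
}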
diff --git a/lib/mesa/src/freedreno/vulkan/tu_wsi.c b/lib/mesa/src/freedreno/vulkan/tu_wsi.c
index cf09cf9b6..21466108b 100644
--- a/lib/mesa/src/freedreno/vulkan/tu_wsi.c
+++ b/lib/mesa/src/freedreno/vulkan/tu_wsi.c
@@ -1,62 +1,272 @@
/*
* Copyright © 2016 Red Hat
- * SPDX-License-Identifier: MIT
- *
* based on intel anv code:
* Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
*/
-#include "tu_wsi.h"
+#include "tu_private.h"
#include "vk_util.h"
-#include "wsi_common_drm.h"
-#include "drm-uapi/drm_fourcc.h"
-
-#include "tu_device.h"
+#include "wsi_common.h"
-static VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL
+static PFN_vkVoidFunction
tu_wsi_proc_addr(VkPhysicalDevice physicalDevice, const char *pName)
{
- TU_FROM_HANDLE(tu_physical_device, pdevice, physicalDevice);
- return vk_instance_get_proc_addr_unchecked(&pdevice->instance->vk, pName);
+ return tu_lookup_entrypoint_unchecked(pName);
}
-static bool
-tu_wsi_can_present_on_device(VkPhysicalDevice physicalDevice, int fd)
+VkResult
+tu_wsi_init(struct tu_physical_device *physical_device)
{
- TU_FROM_HANDLE(tu_physical_device, pdevice, physicalDevice);
+ return wsi_device_init(&physical_device->wsi_device,
+ tu_physical_device_to_handle(physical_device),
+ tu_wsi_proc_addr, &physical_device->instance->alloc,
+ physical_device->master_fd, NULL);
+}
- return wsi_common_drm_devices_equal(fd, pdevice->local_fd);
+void
+tu_wsi_finish(struct tu_physical_device *physical_device)
+{
+ wsi_device_finish(&physical_device->wsi_device,
+ &physical_device->instance->alloc);
+}
+
+void
+tu_DestroySurfaceKHR(VkInstance _instance,
+ VkSurfaceKHR _surface,
+ const VkAllocationCallbacks *pAllocator)
+{
+ TU_FROM_HANDLE(tu_instance, instance, _instance);
+ ICD_FROM_HANDLE(VkIcdSurfaceBase, surface, _surface);
+
+ vk_free2(&instance->alloc, pAllocator, surface);
}
VkResult
-tu_wsi_init(struct tu_physical_device *physical_device)
+tu_GetPhysicalDeviceSurfaceSupportKHR(VkPhysicalDevice physicalDevice,
+ uint32_t queueFamilyIndex,
+ VkSurfaceKHR surface,
+ VkBool32 *pSupported)
{
- VkResult result;
+ TU_FROM_HANDLE(tu_physical_device, device, physicalDevice);
- result = wsi_device_init(&physical_device->wsi_device,
- tu_physical_device_to_handle(physical_device),
- tu_wsi_proc_addr,
- &physical_device->instance->vk.alloc,
- physical_device->master_fd,
- &physical_device->instance->dri_options,
- false);
- if (result != VK_SUCCESS)
- return result;
+ return wsi_common_get_surface_support(
+ &device->wsi_device, queueFamilyIndex, surface, pSupported);
+}
- physical_device->wsi_device.supports_modifiers = true;
- physical_device->wsi_device.can_present_on_device =
- tu_wsi_can_present_on_device;
+VkResult
+tu_GetPhysicalDeviceSurfaceCapabilitiesKHR(
+ VkPhysicalDevice physicalDevice,
+ VkSurfaceKHR surface,
+ VkSurfaceCapabilitiesKHR *pSurfaceCapabilities)
+{
+ TU_FROM_HANDLE(tu_physical_device, device, physicalDevice);
- physical_device->vk.wsi_device = &physical_device->wsi_device;
+ return wsi_common_get_surface_capabilities(&device->wsi_device, surface,
+ pSurfaceCapabilities);
+}
- return VK_SUCCESS;
+VkResult
+tu_GetPhysicalDeviceSurfaceCapabilities2KHR(
+ VkPhysicalDevice physicalDevice,
+ const VkPhysicalDeviceSurfaceInfo2KHR *pSurfaceInfo,
+ VkSurfaceCapabilities2KHR *pSurfaceCapabilities)
+{
+ TU_FROM_HANDLE(tu_physical_device, device, physicalDevice);
+
+ return wsi_common_get_surface_capabilities2(
+ &device->wsi_device, pSurfaceInfo, pSurfaceCapabilities);
+}
+
+VkResult
+tu_GetPhysicalDeviceSurfaceCapabilities2EXT(
+ VkPhysicalDevice physicalDevice,
+ VkSurfaceKHR surface,
+ VkSurfaceCapabilities2EXT *pSurfaceCapabilities)
+{
+ TU_FROM_HANDLE(tu_physical_device, device, physicalDevice);
+
+ return wsi_common_get_surface_capabilities2ext(
+ &device->wsi_device, surface, pSurfaceCapabilities);
+}
+
+VkResult
+tu_GetPhysicalDeviceSurfaceFormatsKHR(VkPhysicalDevice physicalDevice,
+ VkSurfaceKHR surface,
+ uint32_t *pSurfaceFormatCount,
+ VkSurfaceFormatKHR *pSurfaceFormats)
+{
+ TU_FROM_HANDLE(tu_physical_device, device, physicalDevice);
+
+ return wsi_common_get_surface_formats(
+ &device->wsi_device, surface, pSurfaceFormatCount, pSurfaceFormats);
+}
+
+VkResult
+tu_GetPhysicalDeviceSurfaceFormats2KHR(
+ VkPhysicalDevice physicalDevice,
+ const VkPhysicalDeviceSurfaceInfo2KHR *pSurfaceInfo,
+ uint32_t *pSurfaceFormatCount,
+ VkSurfaceFormat2KHR *pSurfaceFormats)
+{
+ TU_FROM_HANDLE(tu_physical_device, device, physicalDevice);
+
+ return wsi_common_get_surface_formats2(&device->wsi_device, pSurfaceInfo,
+ pSurfaceFormatCount,
+ pSurfaceFormats);
+}
+
+VkResult
+tu_GetPhysicalDeviceSurfacePresentModesKHR(VkPhysicalDevice physicalDevice,
+ VkSurfaceKHR surface,
+ uint32_t *pPresentModeCount,
+ VkPresentModeKHR *pPresentModes)
+{
+ TU_FROM_HANDLE(tu_physical_device, device, physicalDevice);
+
+ return wsi_common_get_surface_present_modes(
+ &device->wsi_device, surface, pPresentModeCount, pPresentModes);
+}
+
+VkResult
+tu_CreateSwapchainKHR(VkDevice _device,
+ const VkSwapchainCreateInfoKHR *pCreateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ VkSwapchainKHR *pSwapchain)
+{
+ TU_FROM_HANDLE(tu_device, device, _device);
+ const VkAllocationCallbacks *alloc;
+ if (pAllocator)
+ alloc = pAllocator;
+ else
+ alloc = &device->alloc;
+
+ return wsi_common_create_swapchain(&device->physical_device->wsi_device,
+ tu_device_to_handle(device),
+ pCreateInfo, alloc, pSwapchain);
}
void
-tu_wsi_finish(struct tu_physical_device *physical_device)
+tu_DestroySwapchainKHR(VkDevice _device,
+ VkSwapchainKHR swapchain,
+ const VkAllocationCallbacks *pAllocator)
{
- physical_device->vk.wsi_device = NULL;
- wsi_device_finish(&physical_device->wsi_device,
- &physical_device->instance->vk.alloc);
+ TU_FROM_HANDLE(tu_device, device, _device);
+ const VkAllocationCallbacks *alloc;
+
+ if (pAllocator)
+ alloc = pAllocator;
+ else
+ alloc = &device->alloc;
+
+ wsi_common_destroy_swapchain(_device, swapchain, alloc);
+}
+
+VkResult
+tu_GetSwapchainImagesKHR(VkDevice device,
+ VkSwapchainKHR swapchain,
+ uint32_t *pSwapchainImageCount,
+ VkImage *pSwapchainImages)
+{
+ return wsi_common_get_images(swapchain, pSwapchainImageCount,
+ pSwapchainImages);
+}
+
+VkResult
+tu_AcquireNextImageKHR(VkDevice device,
+ VkSwapchainKHR swapchain,
+ uint64_t timeout,
+ VkSemaphore semaphore,
+ VkFence fence,
+ uint32_t *pImageIndex)
+{
+ VkAcquireNextImageInfoKHR acquire_info = {
+ .sType = VK_STRUCTURE_TYPE_ACQUIRE_NEXT_IMAGE_INFO_KHR,
+ .swapchain = swapchain,
+ .timeout = timeout,
+ .semaphore = semaphore,
+ .fence = fence,
+ .deviceMask = 0,
+ };
+
+ return tu_AcquireNextImage2KHR(device, &acquire_info, pImageIndex);
+}
+
+VkResult
+tu_AcquireNextImage2KHR(VkDevice _device,
+ const VkAcquireNextImageInfoKHR *pAcquireInfo,
+ uint32_t *pImageIndex)
+{
+ TU_FROM_HANDLE(tu_device, device, _device);
+ struct tu_physical_device *pdevice = device->physical_device;
+
+ VkResult result = wsi_common_acquire_next_image2(
+ &pdevice->wsi_device, _device, pAcquireInfo, pImageIndex);
+
+ /* TODO signal fence and semaphore */
+
+ return result;
+}
+
+VkResult
+tu_QueuePresentKHR(VkQueue _queue, const VkPresentInfoKHR *pPresentInfo)
+{
+ TU_FROM_HANDLE(tu_queue, queue, _queue);
+ return wsi_common_queue_present(
+ &queue->device->physical_device->wsi_device,
+ tu_device_to_handle(queue->device), _queue, queue->queue_family_index,
+ pPresentInfo);
+}
+
+VkResult
+tu_GetDeviceGroupPresentCapabilitiesKHR(
+ VkDevice device, VkDeviceGroupPresentCapabilitiesKHR *pCapabilities)
+{
+ memset(pCapabilities->presentMask, 0, sizeof(pCapabilities->presentMask));
+ pCapabilities->presentMask[0] = 0x1;
+ pCapabilities->modes = VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_BIT_KHR;
+
+ return VK_SUCCESS;
+}
+
+VkResult
+tu_GetDeviceGroupSurfacePresentModesKHR(
+ VkDevice device,
+ VkSurfaceKHR surface,
+ VkDeviceGroupPresentModeFlagsKHR *pModes)
+{
+ *pModes = VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_BIT_KHR;
+
+ return VK_SUCCESS;
+}
+
+VkResult
+tu_GetPhysicalDevicePresentRectanglesKHR(VkPhysicalDevice physicalDevice,
+ VkSurfaceKHR surface,
+ uint32_t *pRectCount,
+ VkRect2D *pRects)
+{
+ TU_FROM_HANDLE(tu_physical_device, device, physicalDevice);
+
+ return wsi_common_get_present_rectangles(&device->wsi_device, surface,
+ pRectCount, pRects);
}
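
Nearly every entry point added to tu_wsi.c above is a one-line forward to the shared wsi_common layer; the only local decision is the allocator fallback in the swapchain create/destroy paths, which prefers the caller's VkAllocationCallbacks and otherwise uses the device's. A minimal sketch of that idiom (pick_alloc is a hypothetical helper, not part of the driver):

#include <stddef.h>
#include <vulkan/vulkan.h>

static const VkAllocationCallbacks *
pick_alloc(const VkAllocationCallbacks *user,
           const VkAllocationCallbacks *device_alloc)
{
   /* the caller-supplied allocator wins, the device allocator is the fallback */
   return user ? user : device_alloc;
}

int main(void)
{
   VkAllocationCallbacks device_alloc = { 0 };
   return pick_alloc(NULL, &device_alloc) == &device_alloc ? 0 : 1;
}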
diff --git a/lib/mesa/src/gallium/drivers/freedreno/a6xx/fd6_compute.c b/lib/mesa/src/gallium/drivers/freedreno/a6xx/fd6_compute.c
index 774e1603c..2e6a7fd21 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/a6xx/fd6_compute.c
+++ b/lib/mesa/src/gallium/drivers/freedreno/a6xx/fd6_compute.c
@@ -24,190 +24,190 @@
* Rob Clark <robclark@freedesktop.org>
*/
-#define FD_BO_NO_HARDPIN 1
-
#include "pipe/p_state.h"
-#include "util/u_dump.h"
-#include "u_tracepoints.h"
#include "freedreno_resource.h"
-#include "freedreno_tracepoints.h"
#include "fd6_compute.h"
-#include "fd6_const.h"
#include "fd6_context.h"
#include "fd6_emit.h"
-#include "fd6_pack.h"
+
+struct fd6_compute_stateobj {
+ struct ir3_shader *shader;
+};
+
+
+static void *
+fd6_create_compute_state(struct pipe_context *pctx,
+ const struct pipe_compute_state *cso)
+{
+ struct fd_context *ctx = fd_context(pctx);
+
+ /* req_input_mem will only be non-zero for cl kernels (ie. clover).
+ * This isn't a perfect test because I guess it is possible (but
+ * uncommon) for none of the kernel parameters to be a global,
+ * but ctx->set_global_bindings() can't fail, so this is the next
+ * best place to fail if we need a newer version of the kernel driver:
+ */
+ if ((cso->req_input_mem > 0) &&
+ fd_device_version(ctx->dev) < FD_VERSION_BO_IOVA) {
+ return NULL;
+ }
+
+ struct ir3_compiler *compiler = ctx->screen->compiler;
+ struct fd6_compute_stateobj *so = CALLOC_STRUCT(fd6_compute_stateobj);
+ so->shader = ir3_shader_create_compute(compiler, cso, &ctx->debug, pctx->screen);
+ return so;
+}
+
+static void
+fd6_delete_compute_state(struct pipe_context *pctx, void *hwcso)
+{
+ struct fd6_compute_stateobj *so = hwcso;
+ ir3_shader_destroy(so->shader);
+ free(so);
+}
/* maybe move to fd6_program? */
static void
-cs_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
- struct ir3_shader_variant *v) assert_dt
+cs_program_emit(struct fd_ringbuffer *ring, struct ir3_shader_variant *v,
+ const struct pipe_grid_info *info)
{
- const struct ir3_info *i = &v->info;
- enum a6xx_threadsize thrsz = i->double_threadsize ? THREAD128 : THREAD64;
-
- OUT_REG(ring, A6XX_HLSQ_INVALIDATE_CMD(.vs_state = true, .hs_state = true,
- .ds_state = true, .gs_state = true,
- .fs_state = true, .cs_state = true,
- .gfx_ibo = true, .cs_ibo = true, ));
-
- OUT_PKT4(ring, REG_A6XX_HLSQ_CS_CNTL, 1);
- OUT_RING(ring, A6XX_HLSQ_CS_CNTL_CONSTLEN(v->constlen) |
- A6XX_HLSQ_CS_CNTL_ENABLED);
-
- OUT_PKT4(ring, REG_A6XX_SP_CS_CONFIG, 2);
- OUT_RING(ring, A6XX_SP_CS_CONFIG_ENABLED |
- A6XX_SP_CS_CONFIG_NIBO(ir3_shader_nibo(v)) |
- A6XX_SP_CS_CONFIG_NTEX(v->num_samp) |
- A6XX_SP_CS_CONFIG_NSAMP(v->num_samp)); /* SP_VS_CONFIG */
- OUT_RING(ring, v->instrlen); /* SP_VS_INSTRLEN */
-
- OUT_PKT4(ring, REG_A6XX_SP_CS_CTRL_REG0, 1);
- OUT_RING(ring,
- A6XX_SP_CS_CTRL_REG0_THREADSIZE(thrsz) |
- A6XX_SP_CS_CTRL_REG0_FULLREGFOOTPRINT(i->max_reg + 1) |
- A6XX_SP_CS_CTRL_REG0_HALFREGFOOTPRINT(i->max_half_reg + 1) |
- COND(v->mergedregs, A6XX_SP_CS_CTRL_REG0_MERGEDREGS) |
- A6XX_SP_CS_CTRL_REG0_BRANCHSTACK(ir3_shader_branchstack_hw(v)));
-
- uint32_t shared_size = MAX2(((int)v->cs.req_local_mem - 1) / 1024, 1);
- OUT_PKT4(ring, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
- OUT_RING(ring, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) |
- A6XX_SP_CS_UNKNOWN_A9B1_UNK6);
-
- if (ctx->screen->info->a6xx.has_lpac) {
- OUT_PKT4(ring, REG_A6XX_HLSQ_CS_UNKNOWN_B9D0, 1);
- OUT_RING(ring, A6XX_HLSQ_CS_UNKNOWN_B9D0_SHARED_SIZE(shared_size) |
- A6XX_HLSQ_CS_UNKNOWN_B9D0_UNK6);
- }
-
- uint32_t local_invocation_id, work_group_id;
- local_invocation_id =
- ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID);
- work_group_id = ir3_find_sysval_regid(v, SYSTEM_VALUE_WORKGROUP_ID);
-
- OUT_PKT4(ring, REG_A6XX_HLSQ_CS_CNTL_0, 2);
- OUT_RING(ring, A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
- A6XX_HLSQ_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
- A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
- A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
- OUT_RING(ring, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
- A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz));
-
- if (ctx->screen->info->a6xx.has_lpac) {
- OUT_PKT4(ring, REG_A6XX_SP_CS_CNTL_0, 2);
- OUT_RING(ring, A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) |
- A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
- A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
- A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
- OUT_RING(ring, A6XX_SP_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
- A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz));
- }
-
- OUT_PKT4(ring, REG_A6XX_SP_CS_OBJ_START, 2);
- OUT_RELOC(ring, v->bo, 0, 0, 0); /* SP_CS_OBJ_START_LO/HI */
-
- if (v->instrlen > 0)
- fd6_emit_shader(ctx, ring, v);
+ const struct ir3_info *i = &v->info;
+ enum a3xx_threadsize thrsz = FOUR_QUADS;
+
+ OUT_PKT4(ring, REG_A6XX_HLSQ_UPDATE_CNTL, 1);
+ OUT_RING(ring, 0xff);
+
+ unsigned constlen = align(v->constlen, 4);
+ OUT_PKT4(ring, REG_A6XX_HLSQ_CS_CNTL, 1);
+ OUT_RING(ring, A6XX_HLSQ_CS_CNTL_CONSTLEN(constlen) |
+ A6XX_HLSQ_CS_CNTL_ENABLED);
+
+ OUT_PKT4(ring, REG_A6XX_SP_CS_CONFIG, 2);
+ OUT_RING(ring, A6XX_SP_CS_CONFIG_ENABLED |
+ A6XX_SP_CS_CONFIG_NIBO(v->image_mapping.num_ibo) |
+ A6XX_SP_CS_CONFIG_NTEX(v->num_samp) |
+ A6XX_SP_CS_CONFIG_NSAMP(v->num_samp)); /* SP_VS_CONFIG */
+ OUT_RING(ring, v->instrlen); /* SP_VS_INSTRLEN */
+
+ OUT_PKT4(ring, REG_A6XX_SP_CS_CTRL_REG0, 1);
+ OUT_RING(ring, A6XX_SP_CS_CTRL_REG0_THREADSIZE(thrsz) |
+ A6XX_SP_CS_CTRL_REG0_FULLREGFOOTPRINT(i->max_reg + 1) |
+ A6XX_SP_CS_CTRL_REG0_MERGEDREGS |
+ A6XX_SP_CS_CTRL_REG0_BRANCHSTACK(v->branchstack) |
+ COND(v->need_pixlod, A6XX_SP_CS_CTRL_REG0_PIXLODENABLE));
+
+ OUT_PKT4(ring, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
+ OUT_RING(ring, 0x41);
+
+ uint32_t local_invocation_id, work_group_id;
+ local_invocation_id = ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID);
+ work_group_id = ir3_find_sysval_regid(v, SYSTEM_VALUE_WORK_GROUP_ID);
+
+ OUT_PKT4(ring, REG_A6XX_HLSQ_CS_CNTL_0, 2);
+ OUT_RING(ring, A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
+ A6XX_HLSQ_CS_CNTL_0_UNK0(regid(63, 0)) |
+ A6XX_HLSQ_CS_CNTL_0_UNK1(regid(63, 0)) |
+ A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
+ OUT_RING(ring, 0x2fc); /* HLSQ_CS_UNKNOWN_B998 */
+
+ OUT_PKT4(ring, REG_A6XX_SP_CS_OBJ_START_LO, 2);
+ OUT_RELOC(ring, v->bo, 0, 0, 0); /* SP_CS_OBJ_START_LO/HI */
+
+ if (v->instrlen > 0)
+ fd6_emit_shader(ring, v);
}
static void
-fd6_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info) in_dt
+fd6_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info)
{
- struct ir3_shader_key key = {};
- struct ir3_shader_variant *v;
- struct fd_ringbuffer *ring = ctx->batch->draw;
- unsigned nglobal = 0;
-
- v = ir3_shader_variant(ir3_get_shader(ctx->compute), key, false, &ctx->debug);
- if (!v)
- return;
-
- if (ctx->dirty_shader[PIPE_SHADER_COMPUTE] & FD_DIRTY_SHADER_PROG)
- cs_program_emit(ctx, ring, v);
-
- fd6_emit_cs_state(ctx, ring, v);
- fd6_emit_cs_consts(v, ring, ctx, info);
-
- u_foreach_bit (i, ctx->global_bindings.enabled_mask)
- nglobal++;
-
- if (nglobal > 0) {
- /* global resources don't otherwise get an OUT_RELOC(), since
- * the raw ptr address is emitted in ir3_emit_cs_consts().
- * So to make the kernel aware that these buffers are referenced
- * by the batch, emit dummy reloc's as part of a no-op packet
- * payload:
- */
- OUT_PKT7(ring, CP_NOP, 2 * nglobal);
- u_foreach_bit (i, ctx->global_bindings.enabled_mask) {
- struct pipe_resource *prsc = ctx->global_bindings.buf[i];
- OUT_RELOC(ring, fd_resource(prsc)->bo, 0, 0, 0);
- }
- }
-
- OUT_PKT7(ring, CP_SET_MARKER, 1);
- OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE));
-
- const unsigned *local_size =
- info->block; // v->shader->nir->info->workgroup_size;
- const unsigned *num_groups = info->grid;
- /* for some reason, mesa/st doesn't set info->work_dim, so just assume 3: */
- const unsigned work_dim = info->work_dim ? info->work_dim : 3;
- OUT_PKT4(ring, REG_A6XX_HLSQ_CS_NDRANGE_0, 7);
- OUT_RING(ring, A6XX_HLSQ_CS_NDRANGE_0_KERNELDIM(work_dim) |
- A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEX(local_size[0] - 1) |
- A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEY(local_size[1] - 1) |
- A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEZ(local_size[2] - 1));
- OUT_RING(ring,
- A6XX_HLSQ_CS_NDRANGE_1_GLOBALSIZE_X(local_size[0] * num_groups[0]));
- OUT_RING(ring, 0); /* HLSQ_CS_NDRANGE_2_GLOBALOFF_X */
- OUT_RING(ring,
- A6XX_HLSQ_CS_NDRANGE_3_GLOBALSIZE_Y(local_size[1] * num_groups[1]));
- OUT_RING(ring, 0); /* HLSQ_CS_NDRANGE_4_GLOBALOFF_Y */
- OUT_RING(ring,
- A6XX_HLSQ_CS_NDRANGE_5_GLOBALSIZE_Z(local_size[2] * num_groups[2]));
- OUT_RING(ring, 0); /* HLSQ_CS_NDRANGE_6_GLOBALOFF_Z */
-
- OUT_PKT4(ring, REG_A6XX_HLSQ_CS_KERNEL_GROUP_X, 3);
- OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_X */
- OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Y */
- OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Z */
-
- trace_grid_info(&ctx->batch->trace, ring, info);
- trace_start_compute(&ctx->batch->trace, ring);
-
- if (info->indirect) {
- struct fd_resource *rsc = fd_resource(info->indirect);
-
- OUT_PKT7(ring, CP_EXEC_CS_INDIRECT, 4);
- OUT_RING(ring, 0x00000000);
- OUT_RELOC(ring, rsc->bo, info->indirect_offset, 0, 0); /* ADDR_LO/HI */
- OUT_RING(ring,
- A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) |
- A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) |
- A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1));
- } else {
- OUT_PKT7(ring, CP_EXEC_CS, 4);
- OUT_RING(ring, 0x00000000);
- OUT_RING(ring, CP_EXEC_CS_1_NGROUPS_X(info->grid[0]));
- OUT_RING(ring, CP_EXEC_CS_2_NGROUPS_Y(info->grid[1]));
- OUT_RING(ring, CP_EXEC_CS_3_NGROUPS_Z(info->grid[2]));
- }
-
- trace_end_compute(&ctx->batch->trace, ring);
-
- OUT_WFI5(ring);
-
- fd6_cache_flush(ctx->batch, ring);
+ struct fd6_compute_stateobj *so = ctx->compute;
+ struct ir3_shader_key key = {};
+ struct ir3_shader_variant *v;
+ struct fd_ringbuffer *ring = ctx->batch->draw;
+ unsigned i, nglobal = 0;
+
+ fd6_emit_restore(ctx->batch, ring);
+
+ v = ir3_shader_variant(so->shader, key, false, &ctx->debug);
+ if (!v)
+ return;
+
+ if (ctx->dirty_shader[PIPE_SHADER_COMPUTE] & FD_DIRTY_SHADER_PROG)
+ cs_program_emit(ring, v, info);
+
+ fd6_emit_cs_state(ctx, ring, v);
+ ir3_emit_cs_consts(v, ring, ctx, info);
+
+ foreach_bit(i, ctx->global_bindings.enabled_mask)
+ nglobal++;
+
+ if (nglobal > 0) {
+ /* global resources don't otherwise get an OUT_RELOC(), since
+ * the raw ptr address is emitted in ir3_emit_cs_consts().
+ * So to make the kernel aware that these buffers are referenced
+ * by the batch, emit dummy relocs as part of a no-op packet
+ * payload:
+ */
+ OUT_PKT7(ring, CP_NOP, 2 * nglobal);
+ foreach_bit(i, ctx->global_bindings.enabled_mask) {
+ struct pipe_resource *prsc = ctx->global_bindings.buf[i];
+ OUT_RELOCW(ring, fd_resource(prsc)->bo, 0, 0, 0);
+ }
+ }
+
+ OUT_PKT7(ring, CP_SET_MARKER, 1);
+ OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(0x8));
+
+ const unsigned *local_size = info->block; // v->shader->nir->info->cs.local_size;
+ const unsigned *num_groups = info->grid;
+ /* for some reason, mesa/st doesn't set info->work_dim, so just assume 3: */
+ const unsigned work_dim = info->work_dim ? info->work_dim : 3;
+ OUT_PKT4(ring, REG_A6XX_HLSQ_CS_NDRANGE_0, 7);
+ OUT_RING(ring, A6XX_HLSQ_CS_NDRANGE_0_KERNELDIM(work_dim) |
+ A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEX(local_size[0] - 1) |
+ A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEY(local_size[1] - 1) |
+ A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEZ(local_size[2] - 1));
+ OUT_RING(ring, A6XX_HLSQ_CS_NDRANGE_1_GLOBALSIZE_X(local_size[0] * num_groups[0]));
+ OUT_RING(ring, 0); /* HLSQ_CS_NDRANGE_2_GLOBALOFF_X */
+ OUT_RING(ring, A6XX_HLSQ_CS_NDRANGE_3_GLOBALSIZE_Y(local_size[1] * num_groups[1]));
+ OUT_RING(ring, 0); /* HLSQ_CS_NDRANGE_4_GLOBALOFF_Y */
+ OUT_RING(ring, A6XX_HLSQ_CS_NDRANGE_5_GLOBALSIZE_Z(local_size[2] * num_groups[2]));
+ OUT_RING(ring, 0); /* HLSQ_CS_NDRANGE_6_GLOBALOFF_Z */
+
+ OUT_PKT4(ring, REG_A6XX_HLSQ_CS_KERNEL_GROUP_X, 3);
+ OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_X */
+ OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Y */
+ OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Z */
+
+ if (info->indirect) {
+ struct fd_resource *rsc = fd_resource(info->indirect);
+
+ OUT_PKT7(ring, CP_EXEC_CS_INDIRECT, 4);
+ OUT_RING(ring, 0x00000000);
+ OUT_RELOC(ring, rsc->bo, info->indirect_offset, 0, 0); /* ADDR_LO/HI */
+ OUT_RING(ring, A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) |
+ A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) |
+ A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1));
+ } else {
+ OUT_PKT7(ring, CP_EXEC_CS, 4);
+ OUT_RING(ring, 0x00000000);
+ OUT_RING(ring, CP_EXEC_CS_1_NGROUPS_X(info->grid[0]));
+ OUT_RING(ring, CP_EXEC_CS_2_NGROUPS_Y(info->grid[1]));
+ OUT_RING(ring, CP_EXEC_CS_3_NGROUPS_Z(info->grid[2]));
+ }
+
+ OUT_WFI5(ring);
+
+ fd6_cache_flush(ctx->batch, ring);
}
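
For the HLSQ_CS_NDRANGE programming in both versions of fd6_launch_grid() above: the LOCALSIZE fields are written minus one, and the per-axis global size is the local size times the workgroup count. A standalone arithmetic check with an assumed 8x8x1 local size and 16x16x4 grid (not driver code):

#include <assert.h>
#include <stdio.h>

int main(void)
{
   const unsigned local[3] = { 8, 8, 1 };   /* assumed workgroup size   */
   const unsigned grid[3]  = { 16, 16, 4 }; /* assumed workgroup counts */
   unsigned global[3], localsize_field[3];

   for (int i = 0; i < 3; i++) {
      global[i] = local[i] * grid[i];       /* HLSQ_CS_NDRANGE_{1,3,5}  */
      localsize_field[i] = local[i] - 1;    /* HLSQ_CS_NDRANGE_0 fields */
   }

   assert(global[0] == 128 && global[1] == 128 && global[2] == 4);
   assert(localsize_field[0] == 7 && localsize_field[2] == 0);

   printf("global %ux%ux%u, LOCALSIZE-1 fields %u/%u/%u\n",
          global[0], global[1], global[2],
          localsize_field[0], localsize_field[1], localsize_field[2]);
   return 0;
}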
void
-fd6_compute_init(struct pipe_context *pctx) disable_thread_safety_analysis
+fd6_compute_init(struct pipe_context *pctx)
{
- struct fd_context *ctx = fd_context(pctx);
- ctx->launch_grid = fd6_launch_grid;
- pctx->create_compute_state = ir3_shader_compute_state_create;
- pctx->delete_compute_state = ir3_shader_state_delete;
+ struct fd_context *ctx = fd_context(pctx);
+ ctx->launch_grid = fd6_launch_grid;
+ pctx->create_compute_state = fd6_create_compute_state;
+ pctx->delete_compute_state = fd6_delete_compute_state;
}