diff options
-rw-r--r-- | lib/mesa/src/amd/llvm/ac_llvm_cull.c | 512 | ||||
-rw-r--r-- | lib/mesa/src/amd/llvm/ac_llvm_cull.h | 52 | ||||
-rw-r--r-- | lib/mesa/src/gallium/drivers/freedreno/a6xx/fd6_pack.h | 209 | ||||
-rw-r--r-- | lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c | 92 |
4 files changed, 333 insertions, 532 deletions
diff --git a/lib/mesa/src/amd/llvm/ac_llvm_cull.c b/lib/mesa/src/amd/llvm/ac_llvm_cull.c index d37a9f847..c76d4e1f9 100644 --- a/lib/mesa/src/amd/llvm/ac_llvm_cull.c +++ b/lib/mesa/src/amd/llvm/ac_llvm_cull.c @@ -24,297 +24,205 @@ */ #include "ac_llvm_cull.h" - #include <llvm-c/Core.h> struct ac_position_w_info { - /* If a primitive intersects the W=0 plane, it causes a reflection - * of the determinant used for face culling. Every vertex behind - * the W=0 plane negates the determinant, so having 2 vertices behind - * the plane has no effect. This is i1 true if the determinant should be - * negated. - */ - LLVMValueRef w_reflection; - - /* If we simplify the "-w <= p <= w" view culling equation, we get - * "-w <= w", which can't be satisfied when w is negative. - * In perspective projection, a negative W means that the primitive - * is behind the viewer, but the equation is independent of the type - * of projection. - * - * w_accepted is false when all W are negative and therefore - * the primitive is invisible. - */ - LLVMValueRef w_accepted; - - /* The bounding box culling doesn't work and should be skipped when this is true. */ - LLVMValueRef any_w_negative; + /* If a primitive intersects the W=0 plane, it causes a reflection + * of the determinant used for face culling. Every vertex behind + * the W=0 plane negates the determinant, so having 2 vertices behind + * the plane has no effect. This is i1 true if the determinant should be + * negated. + */ + LLVMValueRef w_reflection; + + /* If we simplify the "-w <= p <= w" view culling equation, we get + * "-w <= w", which can't be satisfied when w is negative. + * In perspective projection, a negative W means that the primitive + * is behind the viewer, but the equation is independent of the type + * of projection. + * + * w_accepted is false when all W are negative and therefore + * the primitive is invisible. + */ + LLVMValueRef w_accepted; + + LLVMValueRef all_w_positive; + LLVMValueRef any_w_negative; }; -static void ac_analyze_position_w(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4], - struct ac_position_w_info *w, unsigned num_vertices) +static void ac_analyze_position_w(struct ac_llvm_context *ctx, + LLVMValueRef pos[3][4], + struct ac_position_w_info *w) { - LLVMBuilderRef builder = ctx->builder; - LLVMValueRef all_w_negative = ctx->i1true; - - w->w_reflection = ctx->i1false; - w->any_w_negative = ctx->i1false; - - for (unsigned i = 0; i < num_vertices; i++) { - LLVMValueRef neg_w; - - neg_w = LLVMBuildFCmp(builder, LLVMRealOLT, pos[i][3], ctx->f32_0, ""); - /* If neg_w is true, negate w_reflection. */ - w->w_reflection = LLVMBuildXor(builder, w->w_reflection, neg_w, ""); - w->any_w_negative = LLVMBuildOr(builder, w->any_w_negative, neg_w, ""); - all_w_negative = LLVMBuildAnd(builder, all_w_negative, neg_w, ""); - } - w->w_accepted = LLVMBuildNot(builder, all_w_negative, ""); + LLVMBuilderRef builder = ctx->builder; + LLVMValueRef all_w_negative = ctx->i1true; + + w->w_reflection = ctx->i1false; + w->any_w_negative = ctx->i1false; + + for (unsigned i = 0; i < 3; i++) { + LLVMValueRef neg_w; + + neg_w = LLVMBuildFCmp(builder, LLVMRealOLT, pos[i][3], ctx->f32_0, ""); + /* If neg_w is true, negate w_reflection. */ + w->w_reflection = LLVMBuildXor(builder, w->w_reflection, neg_w, ""); + w->any_w_negative = LLVMBuildOr(builder, w->any_w_negative, neg_w, ""); + all_w_negative = LLVMBuildAnd(builder, all_w_negative, neg_w, ""); + } + w->all_w_positive = LLVMBuildNot(builder, w->any_w_negative, ""); + w->w_accepted = LLVMBuildNot(builder, all_w_negative, ""); } /* Perform front/back face culling and return true if the primitive is accepted. */ -static LLVMValueRef ac_cull_face(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4], - struct ac_position_w_info *w, bool cull_front, bool cull_back, - bool cull_zero_area) -{ - LLVMBuilderRef builder = ctx->builder; - - if (cull_front && cull_back) - return ctx->i1false; - - if (!cull_front && !cull_back && !cull_zero_area) - return ctx->i1true; - - /* Front/back face culling. Also if the determinant == 0, the triangle - * area is 0. - */ - LLVMValueRef det_t0 = LLVMBuildFSub(builder, pos[2][0], pos[0][0], ""); - LLVMValueRef det_t1 = LLVMBuildFSub(builder, pos[1][1], pos[0][1], ""); - LLVMValueRef det_t2 = LLVMBuildFSub(builder, pos[0][0], pos[1][0], ""); - LLVMValueRef det_t3 = LLVMBuildFSub(builder, pos[0][1], pos[2][1], ""); - /* t0 * t1 - t2 * t3 = t2 * -t3 + t0 * t1 = fma(t2, -t3, t0 * t1) */ - LLVMValueRef det = ac_build_fmad(ctx, det_t2, LLVMBuildFNeg(builder, det_t3, ""), - LLVMBuildFMul(builder, det_t0, det_t1, "")); - - /* Negative W negates the determinant. */ - det = LLVMBuildSelect(builder, w->w_reflection, LLVMBuildFNeg(builder, det, ""), det, ""); - - LLVMValueRef accepted = NULL; - if (cull_front) { - LLVMRealPredicate cond = cull_zero_area ? LLVMRealOGT : LLVMRealOGE; - accepted = LLVMBuildFCmp(builder, cond, det, ctx->f32_0, ""); - } else if (cull_back) { - LLVMRealPredicate cond = cull_zero_area ? LLVMRealOLT : LLVMRealOLE; - accepted = LLVMBuildFCmp(builder, cond, det, ctx->f32_0, ""); - } else if (cull_zero_area) { - accepted = LLVMBuildFCmp(builder, LLVMRealONE, det, ctx->f32_0, ""); - } - - if (accepted) { - /* Don't reject NaN and +/-infinity, these are tricky. - * Just trust fixed-function HW to handle these cases correctly. - */ - accepted = LLVMBuildOr(builder, accepted, ac_build_is_inf_or_nan(ctx, det), ""); - } - - return accepted; -} - -static void rotate_45degrees(struct ac_llvm_context *ctx, LLVMValueRef v[2]) +static LLVMValueRef ac_cull_face(struct ac_llvm_context *ctx, + LLVMValueRef pos[3][4], + struct ac_position_w_info *w, + bool cull_front, + bool cull_back, + bool cull_zero_area) { - /* sin(45) == cos(45) */ - LLVMValueRef sincos45 = LLVMConstReal(ctx->f32, 0.707106781); - - /* x2 = x*cos45 - y*sin45 = x*sincos45 - y*sincos45 - * y2 = x*sin45 + y*cos45 = x*sincos45 + y*sincos45 - */ - LLVMValueRef first = LLVMBuildFMul(ctx->builder, v[0], sincos45, ""); - - /* Doing 2x ffma while duplicating the multiplication is 33% faster than fmul+fadd+fadd. */ - LLVMValueRef result[2] = { - ac_build_fmad(ctx, LLVMBuildFNeg(ctx->builder, v[1], ""), sincos45, first), - ac_build_fmad(ctx, v[1], sincos45, first), - }; - - memcpy(v, result, sizeof(result)); + LLVMBuilderRef builder = ctx->builder; + + if (cull_front && cull_back) + return ctx->i1false; + + if (!cull_front && !cull_back && !cull_zero_area) + return ctx->i1true; + + /* Front/back face culling. Also if the determinant == 0, the triangle + * area is 0. + */ + LLVMValueRef det_t0 = LLVMBuildFSub(builder, pos[2][0], pos[0][0], ""); + LLVMValueRef det_t1 = LLVMBuildFSub(builder, pos[1][1], pos[0][1], ""); + LLVMValueRef det_t2 = LLVMBuildFSub(builder, pos[0][0], pos[1][0], ""); + LLVMValueRef det_t3 = LLVMBuildFSub(builder, pos[0][1], pos[2][1], ""); + LLVMValueRef det_p0 = LLVMBuildFMul(builder, det_t0, det_t1, ""); + LLVMValueRef det_p1 = LLVMBuildFMul(builder, det_t2, det_t3, ""); + LLVMValueRef det = LLVMBuildFSub(builder, det_p0, det_p1, ""); + + /* Negative W negates the determinant. */ + det = LLVMBuildSelect(builder, w->w_reflection, + LLVMBuildFNeg(builder, det, ""), + det, ""); + + LLVMValueRef accepted = NULL; + if (cull_front) { + LLVMRealPredicate cond = cull_zero_area ? LLVMRealOGT : LLVMRealOGE; + accepted = LLVMBuildFCmp(builder, cond, det, ctx->f32_0, ""); + } else if (cull_back) { + LLVMRealPredicate cond = cull_zero_area ? LLVMRealOLT : LLVMRealOLE; + accepted = LLVMBuildFCmp(builder, cond, det, ctx->f32_0, ""); + } else if (cull_zero_area) { + accepted = LLVMBuildFCmp(builder, LLVMRealONE, det, ctx->f32_0, ""); + } + return accepted; } /* Perform view culling and small primitive elimination and return true * if the primitive is accepted and initially_accepted == true. */ -static void cull_bbox(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4], - LLVMValueRef initially_accepted, struct ac_position_w_info *w, - LLVMValueRef vp_scale[2], LLVMValueRef vp_translate[2], - LLVMValueRef small_prim_precision, - LLVMValueRef clip_half_line_width[2], - struct ac_cull_options *options, - ac_cull_accept_func accept_func, void *userdata) +static LLVMValueRef cull_bbox(struct ac_llvm_context *ctx, + LLVMValueRef pos[3][4], + LLVMValueRef initially_accepted, + struct ac_position_w_info *w, + LLVMValueRef vp_scale[2], + LLVMValueRef vp_translate[2], + LLVMValueRef small_prim_precision, + bool cull_view_xy, + bool cull_view_near_z, + bool cull_view_far_z, + bool cull_small_prims, + bool use_halfz_clip_space) { - LLVMBuilderRef builder = ctx->builder; - - if (!options->cull_view_xy && !options->cull_view_near_z && !options->cull_view_far_z && - !options->cull_small_prims) { - if (accept_func) - accept_func(ctx, initially_accepted, userdata); - return; - } - - ac_build_ifcc(ctx, initially_accepted, 10000000); - { - LLVMValueRef bbox_min[3], bbox_max[3]; - LLVMValueRef accepted = ctx->i1true; - - /* Compute the primitive bounding box for easy culling. */ - for (unsigned chan = 0; chan < (options->cull_view_near_z || - options->cull_view_far_z ? 3 : 2); chan++) { - assert(options->num_vertices >= 2); - bbox_min[chan] = ac_build_fmin(ctx, pos[0][chan], pos[1][chan]); - bbox_max[chan] = ac_build_fmax(ctx, pos[0][chan], pos[1][chan]); - - if (options->num_vertices == 3) { - bbox_min[chan] = ac_build_fmin(ctx, bbox_min[chan], pos[2][chan]); - bbox_max[chan] = ac_build_fmax(ctx, bbox_max[chan], pos[2][chan]); - } - - if (clip_half_line_width[chan]) { - bbox_min[chan] = LLVMBuildFSub(builder, bbox_min[chan], clip_half_line_width[chan], ""); - bbox_max[chan] = LLVMBuildFAdd(builder, bbox_max[chan], clip_half_line_width[chan], ""); - } - } - - /* View culling. */ - if (options->cull_view_xy || options->cull_view_near_z || options->cull_view_far_z) { - for (unsigned chan = 0; chan < 3; chan++) { - LLVMValueRef visible; - - if ((options->cull_view_xy && chan <= 1) || (options->cull_view_near_z && chan == 2)) { - float t = chan == 2 && options->use_halfz_clip_space ? 0 : -1; - visible = LLVMBuildFCmp(builder, LLVMRealOGE, bbox_max[chan], - LLVMConstReal(ctx->f32, t), ""); - accepted = LLVMBuildAnd(builder, accepted, visible, ""); - } - - if ((options->cull_view_xy && chan <= 1) || (options->cull_view_far_z && chan == 2)) { - visible = LLVMBuildFCmp(builder, LLVMRealOLE, bbox_min[chan], ctx->f32_1, ""); - accepted = LLVMBuildAnd(builder, accepted, visible, ""); - } - } - } - - /* Small primitive culling - triangles. */ - if (options->cull_small_prims && options->num_vertices == 3) { - /* Assuming a sample position at (0.5, 0.5), if we round - * the bounding box min/max extents and the results of - * the rounding are equal in either the X or Y direction, - * the bounding box does not intersect the sample. - * - * See these GDC slides for pictures: - * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf - */ - LLVMValueRef min, max, not_equal[2], visible; - - for (unsigned chan = 0; chan < 2; chan++) { - /* Convert the position to screen-space coordinates. */ - min = ac_build_fmad(ctx, bbox_min[chan], vp_scale[chan], vp_translate[chan]); - max = ac_build_fmad(ctx, bbox_max[chan], vp_scale[chan], vp_translate[chan]); - /* Scale the bounding box according to the precision of - * the rasterizer and the number of MSAA samples. */ - min = LLVMBuildFSub(builder, min, small_prim_precision, ""); - max = LLVMBuildFAdd(builder, max, small_prim_precision, ""); - - /* Determine if the bbox intersects the sample point. - * It also works for MSAA, but vp_scale, vp_translate, - * and small_prim_precision are computed differently. - */ - min = ac_build_round(ctx, min); - max = ac_build_round(ctx, max); - not_equal[chan] = LLVMBuildFCmp(builder, LLVMRealONE, min, max, ""); - } - visible = LLVMBuildAnd(builder, not_equal[0], not_equal[1], ""); - accepted = LLVMBuildAnd(builder, accepted, visible, ""); - } - - /* Small primitive culling - lines. */ - if (options->cull_small_prims && options->num_vertices == 2) { - /* This only works with lines without perpendicular end caps (lines with perpendicular - * end caps are rasterized as quads and thus can't be culled as small prims in 99% of - * cases because line_width >= 1). - * - * This takes advantage of the diamont exit rule, which says that every pixel - * has a diamond inside it touching the pixel boundary and only if a line exits - * the diamond, that pixel is filled. If a line enters the diamond or stays - * outside the diamond, the pixel isn't filled. - * - * This algorithm is a little simpler than that. The space outside all diamonds also - * has the same diamond shape, which we'll call corner diamonds. - * - * The idea is to cull all lines that are entirely inside a diamond, including - * corner diamonds. If a line is entirely inside a diamond, it can be culled because - * it doesn't exit it. If a line is entirely inside a corner diamond, it can be culled - * because it doesn't enter any diamond and thus can't exit any diamond. - * - * The viewport is rotated by 45 degress to turn diamonds into squares, and a bounding - * box test is used to determine whether a line is entirely inside any square (diamond). - * - * The line width doesn't matter. Wide lines only duplicate filled pixels in either X or - * Y direction from the filled pixels. MSAA also doesn't matter. MSAA should ideally use - * perpendicular end caps that enable quad rasterization for lines. Thus, this should - * always use non-MSAA viewport transformation and non-MSAA small prim precision. - * - * A good test is piglit/lineloop because it draws 10k subpixel lines in a circle. - * It should contain no holes if this matches hw behavior. - */ - LLVMValueRef v0[2], v1[2]; - - /* Get vertex positions in pixels. */ - for (unsigned chan = 0; chan < 2; chan++) { - v0[chan] = ac_build_fmad(ctx, pos[0][chan], vp_scale[chan], vp_translate[chan]); - v1[chan] = ac_build_fmad(ctx, pos[1][chan], vp_scale[chan], vp_translate[chan]); - } - - /* Rotate the viewport by 45 degress, so that diamonds become squares. */ - rotate_45degrees(ctx, v0); - rotate_45degrees(ctx, v1); - - LLVMValueRef not_equal[2]; - - for (unsigned chan = 0; chan < 2; chan++) { - /* The width of each square is sqrt(0.5), so scale it to 1 because we want - * round() to give us the position of the closest center of a square (diamond). - */ - v0[chan] = LLVMBuildFMul(builder, v0[chan], LLVMConstReal(ctx->f32, 1.414213562), ""); - v1[chan] = LLVMBuildFMul(builder, v1[chan], LLVMConstReal(ctx->f32, 1.414213562), ""); - - /* Compute the bounding box around both vertices. We do this because we must - * enlarge the line area by the precision of the rasterizer. - */ - LLVMValueRef min = ac_build_fmin(ctx, v0[chan], v1[chan]); - LLVMValueRef max = ac_build_fmax(ctx, v0[chan], v1[chan]); - - /* Enlarge the bounding box by the precision of the rasterizer. */ - min = LLVMBuildFSub(builder, min, small_prim_precision, ""); - max = LLVMBuildFAdd(builder, max, small_prim_precision, ""); - - /* Round the bounding box corners. If both rounded corners are equal, - * the bounding box is entirely inside a square (diamond). - */ - min = ac_build_round(ctx, min); - max = ac_build_round(ctx, max); - not_equal[chan] = LLVMBuildFCmp(builder, LLVMRealONE, min, max, ""); - } - - accepted = LLVMBuildAnd(builder, accepted, - LLVMBuildOr(builder, not_equal[0], not_equal[1], ""), ""); - } - - /* Disregard the bounding box culling if any W is negative because the code - * doesn't work with that. - */ - accepted = LLVMBuildOr(builder, accepted, w->any_w_negative, ""); - - if (accept_func) - accept_func(ctx, accepted, userdata); - } - ac_build_endif(ctx, 10000000); + LLVMBuilderRef builder = ctx->builder; + + if (!cull_view_xy && !cull_view_near_z && !cull_view_far_z && !cull_small_prims) + return initially_accepted; + + /* Skip the culling if the primitive has already been rejected or + * if any W is negative. The bounding box culling doesn't work when + * W is negative. + */ + LLVMValueRef cond = LLVMBuildAnd(builder, initially_accepted, + w->all_w_positive, ""); + LLVMValueRef accepted_var = ac_build_alloca_undef(ctx, ctx->i1, ""); + LLVMBuildStore(builder, initially_accepted, accepted_var); + + ac_build_ifcc(ctx, cond, 10000000 /* does this matter? */); + { + LLVMValueRef bbox_min[3], bbox_max[3]; + LLVMValueRef accepted = initially_accepted; + + /* Compute the primitive bounding box for easy culling. */ + for (unsigned chan = 0; chan < (cull_view_near_z || cull_view_far_z ? 3 : 2); chan++) { + bbox_min[chan] = ac_build_fmin(ctx, pos[0][chan], pos[1][chan]); + bbox_min[chan] = ac_build_fmin(ctx, bbox_min[chan], pos[2][chan]); + + bbox_max[chan] = ac_build_fmax(ctx, pos[0][chan], pos[1][chan]); + bbox_max[chan] = ac_build_fmax(ctx, bbox_max[chan], pos[2][chan]); + } + + /* View culling. */ + if (cull_view_xy || cull_view_near_z || cull_view_far_z) { + for (unsigned chan = 0; chan < 3; chan++) { + LLVMValueRef visible; + + if ((cull_view_xy && chan <= 1) || + (cull_view_near_z && chan == 2)) { + float t = chan == 2 && use_halfz_clip_space ? 0 : -1; + visible = LLVMBuildFCmp(builder, LLVMRealOGE, bbox_max[chan], + LLVMConstReal(ctx->f32, t), ""); + accepted = LLVMBuildAnd(builder, accepted, visible, ""); + } + + if ((cull_view_xy && chan <= 1) || + (cull_view_far_z && chan == 2)) { + visible = LLVMBuildFCmp(builder, LLVMRealOLE, bbox_min[chan], + ctx->f32_1, ""); + accepted = LLVMBuildAnd(builder, accepted, visible, ""); + } + } + } + + /* Small primitive elimination. */ + if (cull_small_prims) { + /* Assuming a sample position at (0.5, 0.5), if we round + * the bounding box min/max extents and the results of + * the rounding are equal in either the X or Y direction, + * the bounding box does not intersect the sample. + * + * See these GDC slides for pictures: + * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf + */ + LLVMValueRef min, max, not_equal[2], visible; + + for (unsigned chan = 0; chan < 2; chan++) { + /* Convert the position to screen-space coordinates. */ + min = ac_build_fmad(ctx, bbox_min[chan], + vp_scale[chan], vp_translate[chan]); + max = ac_build_fmad(ctx, bbox_max[chan], + vp_scale[chan], vp_translate[chan]); + /* Scale the bounding box according to the precision of + * the rasterizer and the number of MSAA samples. */ + min = LLVMBuildFSub(builder, min, small_prim_precision, ""); + max = LLVMBuildFAdd(builder, max, small_prim_precision, ""); + + /* Determine if the bbox intersects the sample point. + * It also works for MSAA, but vp_scale, vp_translate, + * and small_prim_precision are computed differently. + */ + min = ac_build_round(ctx, min); + max = ac_build_round(ctx, max); + not_equal[chan] = LLVMBuildFCmp(builder, LLVMRealONE, min, max, ""); + } + visible = LLVMBuildAnd(builder, not_equal[0], not_equal[1], ""); + accepted = LLVMBuildAnd(builder, accepted, visible, ""); + } + + LLVMBuildStore(builder, accepted, accepted_var); + } + ac_build_endif(ctx, 10000000); + + return LLVMBuildLoad(builder, accepted_var, ""); } /** @@ -332,28 +240,36 @@ static void cull_bbox(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4], * the rasterizer. Set to num_samples / 2^subpixel_bits. * subpixel_bits are defined by the quantization mode. * \param options See ac_cull_options. - * \param accept_func Callback invoked in the inner-most branch where the primitive is accepted. */ -void ac_cull_primitive(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4], - LLVMValueRef initially_accepted, LLVMValueRef vp_scale[2], - LLVMValueRef vp_translate[2], LLVMValueRef small_prim_precision, - LLVMValueRef clip_half_line_width[2], struct ac_cull_options *options, - ac_cull_accept_func accept_func, void *userdata) +LLVMValueRef ac_cull_triangle(struct ac_llvm_context *ctx, + LLVMValueRef pos[3][4], + LLVMValueRef initially_accepted, + LLVMValueRef vp_scale[2], + LLVMValueRef vp_translate[2], + LLVMValueRef small_prim_precision, + struct ac_cull_options *options) { - struct ac_position_w_info w; - ac_analyze_position_w(ctx, pos, &w, options->num_vertices); - - /* W culling. */ - LLVMValueRef accepted = options->cull_w ? w.w_accepted : ctx->i1true; - accepted = LLVMBuildAnd(ctx->builder, accepted, initially_accepted, ""); - - /* Face culling. */ - accepted = LLVMBuildAnd( - ctx->builder, accepted, - ac_cull_face(ctx, pos, &w, options->cull_front, options->cull_back, options->cull_zero_area), - ""); - - /* View culling and small primitive elimination. */ - cull_bbox(ctx, pos, accepted, &w, vp_scale, vp_translate, small_prim_precision, - clip_half_line_width, options, accept_func, userdata); + struct ac_position_w_info w; + ac_analyze_position_w(ctx, pos, &w); + + /* W culling. */ + LLVMValueRef accepted = options->cull_w ? w.w_accepted : ctx->i1true; + accepted = LLVMBuildAnd(ctx->builder, accepted, initially_accepted, ""); + + /* Face culling. */ + accepted = LLVMBuildAnd(ctx->builder, accepted, + ac_cull_face(ctx, pos, &w, + options->cull_front, + options->cull_back, + options->cull_zero_area), ""); + + /* View culling and small primitive elimination. */ + accepted = cull_bbox(ctx, pos, accepted, &w, vp_scale, vp_translate, + small_prim_precision, + options->cull_view_xy, + options->cull_view_near_z, + options->cull_view_far_z, + options->cull_small_prims, + options->use_halfz_clip_space); + return accepted; } diff --git a/lib/mesa/src/amd/llvm/ac_llvm_cull.h b/lib/mesa/src/amd/llvm/ac_llvm_cull.h index dc978d3fe..0aa6c902a 100644 --- a/lib/mesa/src/amd/llvm/ac_llvm_cull.h +++ b/lib/mesa/src/amd/llvm/ac_llvm_cull.h @@ -29,35 +29,31 @@ #include "ac_llvm_build.h" struct ac_cull_options { - /* In general, I recommend setting all to true except view Z culling, - * which isn't so effective because W culling is cheaper and partially - * replaces near Z culling, and you don't need to set Position.z - * if Z culling is disabled. - * - * If something doesn't work, turn some of these off to find out what. - */ - bool cull_front; - bool cull_back; - bool cull_view_xy; - bool cull_view_near_z; - bool cull_view_far_z; - bool cull_small_prims; - bool cull_zero_area; - bool cull_w; /* cull primitives with all W < 0 */ - - bool use_halfz_clip_space; - - uint8_t num_vertices; /* 1..3 */ + /* In general, I recommend setting all to true except view Z culling, + * which isn't so effective because W culling is cheaper and partially + * replaces near Z culling, and you don't need to set Position.z + * if Z culling is disabled. + * + * If something doesn't work, turn some of these off to find out what. + */ + bool cull_front; + bool cull_back; + bool cull_view_xy; + bool cull_view_near_z; + bool cull_view_far_z; + bool cull_small_prims; + bool cull_zero_area; + bool cull_w; /* cull primitives with all W < 0 */ + + bool use_halfz_clip_space; }; -/* Callback invoked in the inner-most branch where the primitive is accepted. */ -typedef void (*ac_cull_accept_func)(struct ac_llvm_context *ctx, LLVMValueRef accepted, - void *userdata); - -void ac_cull_primitive(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4], - LLVMValueRef initially_accepted, LLVMValueRef vp_scale[2], - LLVMValueRef vp_translate[2], LLVMValueRef small_prim_precision, - LLVMValueRef clip_half_line_width[2], struct ac_cull_options *options, - ac_cull_accept_func accept_func, void *userdata); +LLVMValueRef ac_cull_triangle(struct ac_llvm_context *ctx, + LLVMValueRef pos[3][4], + LLVMValueRef initially_accepted, + LLVMValueRef vp_scale[2], + LLVMValueRef vp_translate[2], + LLVMValueRef small_prim_precision, + struct ac_cull_options *options); #endif diff --git a/lib/mesa/src/gallium/drivers/freedreno/a6xx/fd6_pack.h b/lib/mesa/src/gallium/drivers/freedreno/a6xx/fd6_pack.h index f6d1fbf09..b3b9bf924 100644 --- a/lib/mesa/src/gallium/drivers/freedreno/a6xx/fd6_pack.h +++ b/lib/mesa/src/gallium/drivers/freedreno/a6xx/fd6_pack.h @@ -27,149 +27,86 @@ #include "a6xx.xml.h" struct fd_reg_pair { - uint32_t reg; - uint64_t value; - struct fd_bo *bo; - bool is_address; - bool bo_write; - uint32_t bo_offset; - uint32_t bo_shift; + uint32_t reg; + uint64_t value; + struct fd_bo *bo; + bool is_address; + bool bo_write; + uint32_t bo_offset; + uint32_t bo_shift; }; #define __bo_type struct fd_bo * #include "a6xx-pack.xml.h" -#include "adreno-pm4-pack.xml.h" -#define __assert_eq(a, b) \ - do { \ - if ((a) != (b)) { \ - fprintf(stderr, "assert failed: " #a " (0x%x) != " #b " (0x%x)\n", a, \ - b); \ - assert((a) == (b)); \ - } \ - } while (0) +#define __assert_eq(a, b) \ + do { \ + if ((a) != (b)) { \ + fprintf(stderr, "assert failed: " #a " (0x%x) != " #b " (0x%x)\n", a, b); \ + assert((a) == (b)); \ + } \ + } while (0) -#if !FD_BO_NO_HARDPIN -# error 'Hardpin unsupported' -#endif +#define __ONE_REG(i, ...) \ + do { \ + const struct fd_reg_pair regs[] = { __VA_ARGS__ }; \ + if (i < ARRAY_SIZE(regs) && regs[i].reg > 0) { \ + __assert_eq(regs[0].reg + i, regs[i].reg); \ + if (regs[i].bo) { \ + struct fd_reloc reloc = { \ + .bo = regs[i].bo, \ + .flags = FD_RELOC_READ | \ + (regs[i].bo_write ? FD_RELOC_WRITE : 0), \ + \ + .offset = regs[i].bo_offset, \ + .or = regs[i].value, \ + .shift = regs[i].bo_shift, \ + .orhi = regs[i].value >> 32 \ + }; \ + ring->cur = p; \ + p += 2; \ + fd_ringbuffer_reloc(ring, &reloc); \ + } else { \ + *p++ = regs[i].value; \ + if (regs[i].is_address) \ + *p++ = regs[i].value >> 32; \ + } \ + } \ + } while (0) -#define __ONE_REG(i, ...) \ - do { \ - const struct fd_reg_pair regs[] = {__VA_ARGS__}; \ - /* NOTE: allow regs[0].reg==0, this happens in OUT_PKT() */ \ - if (i < ARRAY_SIZE(regs) && (i == 0 || regs[i].reg > 0)) { \ - __assert_eq(regs[0].reg + i, regs[i].reg); \ - if (regs[i].bo) { \ - uint64_t *p64 = (uint64_t *)p; \ - *p64 = __reloc_iova(regs[i].bo, regs[i].bo_offset, regs[i].value, \ - regs[i].bo_shift); \ - p += 2; \ - fd_ringbuffer_attach_bo(ring, regs[i].bo); \ - } else { \ - *p++ = regs[i].value; \ - if (regs[i].is_address) \ - *p++ = regs[i].value >> 32; \ - } \ - } \ - } while (0) - -#define OUT_REG(ring, ...) \ - do { \ - const struct fd_reg_pair regs[] = {__VA_ARGS__}; \ - unsigned count = ARRAY_SIZE(regs); \ - \ - STATIC_ASSERT(ARRAY_SIZE(regs) > 0); \ - STATIC_ASSERT(ARRAY_SIZE(regs) <= 16); \ - \ - BEGIN_RING(ring, count + 1); \ - uint32_t *p = ring->cur; \ - *p++ = pm4_pkt4_hdr(regs[0].reg, count); \ - \ - __ONE_REG(0, __VA_ARGS__); \ - __ONE_REG(1, __VA_ARGS__); \ - __ONE_REG(2, __VA_ARGS__); \ - __ONE_REG(3, __VA_ARGS__); \ - __ONE_REG(4, __VA_ARGS__); \ - __ONE_REG(5, __VA_ARGS__); \ - __ONE_REG(6, __VA_ARGS__); \ - __ONE_REG(7, __VA_ARGS__); \ - __ONE_REG(8, __VA_ARGS__); \ - __ONE_REG(9, __VA_ARGS__); \ - __ONE_REG(10, __VA_ARGS__); \ - __ONE_REG(11, __VA_ARGS__); \ - __ONE_REG(12, __VA_ARGS__); \ - __ONE_REG(13, __VA_ARGS__); \ - __ONE_REG(14, __VA_ARGS__); \ - __ONE_REG(15, __VA_ARGS__); \ - ring->cur = p; \ - } while (0) - -#define OUT_PKT(ring, opcode, ...) \ - do { \ - const struct fd_reg_pair regs[] = {__VA_ARGS__}; \ - unsigned count = ARRAY_SIZE(regs); \ - \ - STATIC_ASSERT(ARRAY_SIZE(regs) <= 16); \ - \ - BEGIN_RING(ring, count + 1); \ - uint32_t *p = ring->cur; \ - *p++ = pm4_pkt7_hdr(opcode, count); \ - \ - __ONE_REG(0, __VA_ARGS__); \ - __ONE_REG(1, __VA_ARGS__); \ - __ONE_REG(2, __VA_ARGS__); \ - __ONE_REG(3, __VA_ARGS__); \ - __ONE_REG(4, __VA_ARGS__); \ - __ONE_REG(5, __VA_ARGS__); \ - __ONE_REG(6, __VA_ARGS__); \ - __ONE_REG(7, __VA_ARGS__); \ - __ONE_REG(8, __VA_ARGS__); \ - __ONE_REG(9, __VA_ARGS__); \ - __ONE_REG(10, __VA_ARGS__); \ - __ONE_REG(11, __VA_ARGS__); \ - __ONE_REG(12, __VA_ARGS__); \ - __ONE_REG(13, __VA_ARGS__); \ - __ONE_REG(14, __VA_ARGS__); \ - __ONE_REG(15, __VA_ARGS__); \ - ring->cur = p; \ - } while (0) - -/* similar to OUT_PKT() but appends specified # of dwords - * copied for buf to the end of the packet (ie. for use- - * cases like CP_LOAD_STATE) - */ -#define OUT_PKTBUF(ring, opcode, dwords, sizedwords, ...) \ - do { \ - const struct fd_reg_pair regs[] = {__VA_ARGS__}; \ - unsigned count = ARRAY_SIZE(regs); \ - \ - STATIC_ASSERT(ARRAY_SIZE(regs) <= 16); \ - count += sizedwords; \ - \ - BEGIN_RING(ring, count + 1); \ - uint32_t *p = ring->cur; \ - *p++ = pm4_pkt7_hdr(opcode, count); \ - \ - __ONE_REG(0, __VA_ARGS__); \ - __ONE_REG(1, __VA_ARGS__); \ - __ONE_REG(2, __VA_ARGS__); \ - __ONE_REG(3, __VA_ARGS__); \ - __ONE_REG(4, __VA_ARGS__); \ - __ONE_REG(5, __VA_ARGS__); \ - __ONE_REG(6, __VA_ARGS__); \ - __ONE_REG(7, __VA_ARGS__); \ - __ONE_REG(8, __VA_ARGS__); \ - __ONE_REG(9, __VA_ARGS__); \ - __ONE_REG(10, __VA_ARGS__); \ - __ONE_REG(11, __VA_ARGS__); \ - __ONE_REG(12, __VA_ARGS__); \ - __ONE_REG(13, __VA_ARGS__); \ - __ONE_REG(14, __VA_ARGS__); \ - __ONE_REG(15, __VA_ARGS__); \ - memcpy(p, dwords, 4 * sizedwords); \ - p += sizedwords; \ - ring->cur = p; \ - } while (0) +#define OUT_REG(ring, ...) \ + do { \ + const struct fd_reg_pair regs[] = { __VA_ARGS__ }; \ + unsigned count = ARRAY_SIZE(regs); \ + \ + STATIC_ASSERT(count > 0); \ + STATIC_ASSERT(count <= 16); \ + \ + BEGIN_RING(ring, count + 1); \ + uint32_t *p = ring->cur; \ + *p++ = CP_TYPE4_PKT | count | \ + (_odd_parity_bit(count) << 7) | \ + ((regs[0].reg & 0x3ffff) << 8) | \ + ((_odd_parity_bit(regs[0].reg) << 27)); \ + \ + __ONE_REG( 0, __VA_ARGS__); \ + __ONE_REG( 1, __VA_ARGS__); \ + __ONE_REG( 2, __VA_ARGS__); \ + __ONE_REG( 3, __VA_ARGS__); \ + __ONE_REG( 4, __VA_ARGS__); \ + __ONE_REG( 5, __VA_ARGS__); \ + __ONE_REG( 6, __VA_ARGS__); \ + __ONE_REG( 7, __VA_ARGS__); \ + __ONE_REG( 8, __VA_ARGS__); \ + __ONE_REG( 9, __VA_ARGS__); \ + __ONE_REG(10, __VA_ARGS__); \ + __ONE_REG(11, __VA_ARGS__); \ + __ONE_REG(12, __VA_ARGS__); \ + __ONE_REG(13, __VA_ARGS__); \ + __ONE_REG(14, __VA_ARGS__); \ + __ONE_REG(15, __VA_ARGS__); \ + ring->cur = p; \ + } while (0) #endif /* FD6_PACK_H */ diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c b/lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c index 7900ba9da..122e69762 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c @@ -72,11 +72,8 @@ static LLVMValueRef load_const_buffer_desc_fast_path(struct si_shader_context *c uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); - if (ctx->screen->info.gfx_level >= GFX11) - rsrc3 |= S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW); - else if (ctx->screen->info.gfx_level >= GFX10) - rsrc3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) | + if (ctx->screen->info.chip_class >= GFX10) + rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); else rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | @@ -94,7 +91,9 @@ static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index) struct si_shader_context *ctx = si_shader_context_from_abi(abi); struct si_shader_selector *sel = ctx->shader->selector; - if (sel->info.base.num_ubos == 1 && sel->info.base.num_ssbos == 0) { + LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers); + + if (sel->info.const_buffers_declared == 1 && sel->info.shader_buffers_declared == 0) { return load_const_buffer_desc_fast_path(ctx); } @@ -102,27 +101,19 @@ static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index) index = LLVMBuildAdd(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS, 0), ""); - return ac_build_load_to_sgpr(&ctx->ac, - ac_get_ptr_arg(&ctx->ac, &ctx->args, ctx->const_and_shader_buffers), - index); + return ac_build_load_to_sgpr(&ctx->ac, ptr, index); } -static LLVMValueRef load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write, bool non_uniform) +static LLVMValueRef load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write) { struct si_shader_context *ctx = si_shader_context_from_abi(abi); - - /* Fast path if the shader buffer is in user SGPRs. */ - if (LLVMIsConstant(index) && - LLVMConstIntGetZExtValue(index) < ctx->shader->selector->cs_num_shaderbufs_in_user_sgprs) - return ac_get_arg(&ctx->ac, ctx->cs_shaderbuf[LLVMConstIntGetZExtValue(index)]); + LLVMValueRef rsrc_ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers); index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers); index = LLVMBuildSub(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS - 1, 0), index, ""); - return ac_build_load_to_sgpr(&ctx->ac, - ac_get_ptr_arg(&ctx->ac, &ctx->args, ctx->const_and_shader_buffers), - index); + return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index); } /** @@ -138,7 +129,7 @@ static LLVMValueRef load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, boo */ static LLVMValueRef force_dcc_off(struct si_shader_context *ctx, LLVMValueRef rsrc) { - if (ctx->screen->info.gfx_level <= GFX7) { + if (ctx->screen->info.chip_class <= GFX7) { return rsrc; } else { LLVMValueRef i32_6 = LLVMConstInt(ctx->ac.i32, 6, 0); @@ -151,41 +142,18 @@ static LLVMValueRef force_dcc_off(struct si_shader_context *ctx, LLVMValueRef rs } } -static LLVMValueRef force_write_compress_off(struct si_shader_context *ctx, LLVMValueRef rsrc) -{ - LLVMValueRef i32_6 = LLVMConstInt(ctx->ac.i32, 6, 0); - LLVMValueRef i32_C = LLVMConstInt(ctx->ac.i32, C_00A018_WRITE_COMPRESS_ENABLE, 0); - LLVMValueRef tmp; - - tmp = LLVMBuildExtractElement(ctx->ac.builder, rsrc, i32_6, ""); - tmp = LLVMBuildAnd(ctx->ac.builder, tmp, i32_C, ""); - return LLVMBuildInsertElement(ctx->ac.builder, rsrc, tmp, i32_6, ""); -} - -static LLVMValueRef fixup_image_desc(struct si_shader_context *ctx, LLVMValueRef rsrc, - bool uses_store) -{ - if (uses_store && ctx->ac.gfx_level <= GFX9) - rsrc = force_dcc_off(ctx, rsrc); - - if (!uses_store && ctx->screen->info.has_image_load_dcc_bug && - ctx->screen->always_allow_dcc_stores) - rsrc = force_write_compress_off(ctx, rsrc); - - return rsrc; -} - /* AC_DESC_FMASK is handled exactly like AC_DESC_IMAGE. The caller should * adjust "index" to point to FMASK. */ -static LLVMValueRef si_load_image_desc(struct si_shader_context *ctx, struct ac_llvm_pointer list, +static LLVMValueRef si_load_image_desc(struct si_shader_context *ctx, LLVMValueRef list, LLVMValueRef index, enum ac_descriptor_type desc_type, bool uses_store, bool bindless) { + LLVMBuilderRef builder = ctx->ac.builder; LLVMValueRef rsrc; if (desc_type == AC_DESC_BUFFER) { index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1); - list.pointee_type = ctx->ac.v4i32; + list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), ""); } else { assert(desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_FMASK); } @@ -195,16 +163,15 @@ static LLVMValueRef si_load_image_desc(struct si_shader_context *ctx, struct ac_ else rsrc = ac_build_load_to_sgpr(&ctx->ac, list, index); - if (desc_type == AC_DESC_IMAGE) - rsrc = fixup_image_desc(ctx, rsrc, uses_store); - + if (desc_type == AC_DESC_IMAGE && uses_store) + rsrc = force_dcc_off(ctx, rsrc); return rsrc; } /** * Load an image view, fmask view. or sampler state descriptor. */ -static LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx, struct ac_llvm_pointer list, +static LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx, LLVMValueRef list, LLVMValueRef index, enum ac_descriptor_type type) { LLVMBuilderRef builder = ctx->ac.builder; @@ -217,18 +184,17 @@ static LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx, struct a case AC_DESC_BUFFER: /* The buffer is in [4:7]. */ index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), ctx->ac.i32_1); - list.pointee_type = ctx->ac.v4i32; + list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), ""); break; case AC_DESC_FMASK: /* The FMASK is at [8:15]. */ - assert(ctx->screen->info.gfx_level < GFX11); index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1); break; case AC_DESC_SAMPLER: /* The sampler state is at [12:15]. */ index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), LLVMConstInt(ctx->ac.i32, 3, 0)); - list.pointee_type = ctx->ac.v4i32; + list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), ""); break; case AC_DESC_PLANE_0: case AC_DESC_PLANE_1: @@ -256,7 +222,7 @@ static LLVMValueRef si_nir_load_sampler_desc(struct ac_shader_abi *abi, unsigned assert(desc_type <= AC_DESC_BUFFER); if (bindless) { - struct ac_llvm_pointer list = ac_get_ptr_arg(&ctx->ac, &ctx->args, ctx->bindless_samplers_and_images); + LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->bindless_samplers_and_images); /* dynamic_index is the bindless handle */ if (image) { @@ -278,17 +244,14 @@ static LLVMValueRef si_nir_load_sampler_desc(struct ac_shader_abi *abi, unsigned */ dynamic_index = LLVMBuildMul(ctx->ac.builder, dynamic_index, LLVMConstInt(ctx->ac.i64, 2, 0), ""); - list.v = ac_build_pointer_add(&ctx->ac, ctx->ac.v8i32, list.v, dynamic_index); + list = ac_build_pointer_add(&ctx->ac, list, dynamic_index); return si_load_sampler_desc(ctx, list, ctx->ac.i32_0, desc_type); } unsigned num_slots = image ? ctx->num_images : ctx->num_samplers; + assert(const_index < num_slots || dynamic_index); - /* Redirect invalid resource indices to the first array element. */ - if (const_index >= num_slots) - const_index = base_index; - - struct ac_llvm_pointer list = ac_get_ptr_arg(&ctx->ac, &ctx->args, ctx->samplers_and_images); + LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->samplers_and_images); LLVMValueRef index = LLVMConstInt(ctx->ac.i32, const_index, false); if (dynamic_index) { @@ -307,17 +270,6 @@ static LLVMValueRef si_nir_load_sampler_desc(struct ac_shader_abi *abi, unsigned } if (image) { - /* Fast path if the image is in user SGPRs. */ - if (!dynamic_index && - const_index < ctx->shader->selector->cs_num_images_in_user_sgprs && - (desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_BUFFER)) { - LLVMValueRef rsrc = ac_get_arg(&ctx->ac, ctx->cs_image[const_index]); - - if (desc_type == AC_DESC_IMAGE) - rsrc = fixup_image_desc(ctx, rsrc, write); - return rsrc; - } - /* FMASKs are separate from images. */ if (desc_type == AC_DESC_FMASK) { index = |