4 files changed, 333 insertions, 532 deletions
diff --git a/lib/mesa/src/amd/llvm/ac_llvm_cull.c b/lib/mesa/src/amd/llvm/ac_llvm_cull.c
index d37a9f847..c76d4e1f9 100644
--- a/lib/mesa/src/amd/llvm/ac_llvm_cull.c
+++ b/lib/mesa/src/amd/llvm/ac_llvm_cull.c
@@ -24,297 +24,205 @@
  */
 
 #include "ac_llvm_cull.h"
-
 #include <llvm-c/Core.h>
 
 struct ac_position_w_info {
-   /* If a primitive intersects the W=0 plane, it causes a reflection
-    * of the determinant used for face culling. Every vertex behind
-    * the W=0 plane negates the determinant, so having 2 vertices behind
-    * the plane has no effect. This is i1 true if the determinant should be
-    * negated.
-    */
-   LLVMValueRef w_reflection;
-
-   /* If we simplify the "-w <= p <= w" view culling equation, we get
-    * "-w <= w", which can't be satisfied when w is negative.
-    * In perspective projection, a negative W means that the primitive
-    * is behind the viewer, but the equation is independent of the type
-    * of projection.
-    *
-    * w_accepted is false when all W are negative and therefore
-    * the primitive is invisible.
-    */
-   LLVMValueRef w_accepted;
-
-   /* The bounding box culling doesn't work and should be skipped when this is true. */
-   LLVMValueRef any_w_negative;
+	/* If a primitive intersects the W=0 plane, it causes a reflection
+	 * of the determinant used for face culling. Every vertex behind
+	 * the W=0 plane negates the determinant, so having 2 vertices behind
+	 * the plane has no effect. This is i1 true if the determinant should be
+	 * negated.
+	 */
+	LLVMValueRef w_reflection;
+
+	/* If we simplify the "-w <= p <= w" view culling equation, we get
+	 * "-w <= w", which can't be satisfied when w is negative.
+	 * In perspective projection, a negative W means that the primitive
+	 * is behind the viewer, but the equation is independent of the type
+	 * of projection.
+	 *
+	 * w_accepted is false when all W are negative and therefore
+	 * the primitive is invisible.
+	 */
+	LLVMValueRef w_accepted;
+
+	LLVMValueRef all_w_positive; /* no W is < 0 (inverse of any_w_negative) */
+	LLVMValueRef any_w_negative; /* at least one W is < 0 */
 };
 
-static void ac_analyze_position_w(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4],
-                                  struct ac_position_w_info *w, unsigned num_vertices)
+static void ac_analyze_position_w(struct ac_llvm_context *ctx,
+				  LLVMValueRef pos[3][4],
+				  struct ac_position_w_info *w)
 {
-   LLVMBuilderRef builder = ctx->builder;
-   LLVMValueRef all_w_negative = ctx->i1true;
-
-   w->w_reflection = ctx->i1false;
-   w->any_w_negative = ctx->i1false;
-
-   for (unsigned i = 0; i < num_vertices; i++) {
-      LLVMValueRef neg_w;
-
-      neg_w = LLVMBuildFCmp(builder, LLVMRealOLT, pos[i][3], ctx->f32_0, "");
-      /* If neg_w is true, negate w_reflection. */
-      w->w_reflection = LLVMBuildXor(builder, w->w_reflection, neg_w, "");
-      w->any_w_negative = LLVMBuildOr(builder, w->any_w_negative, neg_w, "");
-      all_w_negative = LLVMBuildAnd(builder, all_w_negative, neg_w, "");
-   }
-   w->w_accepted = LLVMBuildNot(builder, all_w_negative, "");
+	LLVMBuilderRef builder = ctx->builder;
+	LLVMValueRef all_w_negative = ctx->i1true;
+
+	w->w_reflection = ctx->i1false;
+	w->any_w_negative = ctx->i1false;
+
+	for (unsigned i = 0; i < 3; i++) {
+		LLVMValueRef neg_w;
+
+		neg_w = LLVMBuildFCmp(builder, LLVMRealOLT, pos[i][3], ctx->f32_0, "");
+		/* If neg_w is true, negate w_reflection. */
+		w->w_reflection = LLVMBuildXor(builder, w->w_reflection, neg_w, "");
+		w->any_w_negative = LLVMBuildOr(builder, w->any_w_negative, neg_w, "");
+		all_w_negative = LLVMBuildAnd(builder, all_w_negative, neg_w, "");
+	}
+	w->all_w_positive = LLVMBuildNot(builder, w->any_w_negative, "");
+	w->w_accepted = LLVMBuildNot(builder, all_w_negative, "");
 }
 
 /* Perform front/back face culling and return true if the primitive is accepted. */
-static LLVMValueRef ac_cull_face(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4],
-                                 struct ac_position_w_info *w, bool cull_front, bool cull_back,
-                                 bool cull_zero_area)
-{
-   LLVMBuilderRef builder = ctx->builder;
-
-   if (cull_front && cull_back)
-      return ctx->i1false;
-
-   if (!cull_front && !cull_back && !cull_zero_area)
-      return ctx->i1true;
-
-   /* Front/back face culling. Also if the determinant == 0, the triangle
-    * area is 0.
-    */
-   LLVMValueRef det_t0 = LLVMBuildFSub(builder, pos[2][0], pos[0][0], "");
-   LLVMValueRef det_t1 = LLVMBuildFSub(builder, pos[1][1], pos[0][1], "");
-   LLVMValueRef det_t2 = LLVMBuildFSub(builder, pos[0][0], pos[1][0], "");
-   LLVMValueRef det_t3 = LLVMBuildFSub(builder, pos[0][1], pos[2][1], "");
-   /* t0 * t1 - t2 * t3  =  t2 * -t3 + t0 * t1  =  fma(t2, -t3, t0 * t1) */
-   LLVMValueRef det = ac_build_fmad(ctx, det_t2, LLVMBuildFNeg(builder, det_t3, ""),
-                                    LLVMBuildFMul(builder, det_t0, det_t1, ""));
-
-   /* Negative W negates the determinant. */
-   det = LLVMBuildSelect(builder, w->w_reflection, LLVMBuildFNeg(builder, det, ""), det, "");
-
-   LLVMValueRef accepted = NULL;
-   if (cull_front) {
-      LLVMRealPredicate cond = cull_zero_area ? LLVMRealOGT : LLVMRealOGE;
-      accepted = LLVMBuildFCmp(builder, cond, det, ctx->f32_0, "");
-   } else if (cull_back) {
-      LLVMRealPredicate cond = cull_zero_area ? LLVMRealOLT : LLVMRealOLE;
-      accepted = LLVMBuildFCmp(builder, cond, det, ctx->f32_0, "");
-   } else if (cull_zero_area) {
-      accepted = LLVMBuildFCmp(builder, LLVMRealONE, det, ctx->f32_0, "");
-   }
-
-   if (accepted) {
-      /* Don't reject NaN and +/-infinity, these are tricky.
-       * Just trust fixed-function HW to handle these cases correctly.
-       */
-      accepted = LLVMBuildOr(builder, accepted, ac_build_is_inf_or_nan(ctx, det), "");
-   }
-
-   return accepted;
-}
-
-static void rotate_45degrees(struct ac_llvm_context *ctx, LLVMValueRef v[2])
+static LLVMValueRef ac_cull_face(struct ac_llvm_context *ctx,
+				 LLVMValueRef pos[3][4],
+				 struct ac_position_w_info *w,
+				 bool cull_front,
+				 bool cull_back,
+				 bool cull_zero_area)
 {
-   /* sin(45) == cos(45) */
-   LLVMValueRef sincos45 = LLVMConstReal(ctx->f32, 0.707106781);
-
-   /* x2  =  x*cos45 - y*sin45  =  x*sincos45 - y*sincos45
-    * y2  =  x*sin45 + y*cos45  =  x*sincos45 + y*sincos45
-    */
-   LLVMValueRef first = LLVMBuildFMul(ctx->builder, v[0], sincos45, "");
-
-   /* Doing 2x ffma while duplicating the multiplication is 33% faster than fmul+fadd+fadd. */
-   LLVMValueRef result[2] = {
-      ac_build_fmad(ctx, LLVMBuildFNeg(ctx->builder, v[1], ""), sincos45, first),
-      ac_build_fmad(ctx, v[1], sincos45, first),
-   };
-
-   memcpy(v, result, sizeof(result));
+	LLVMBuilderRef builder = ctx->builder;
+
+	if (cull_front && cull_back)
+		return ctx->i1false;
+
+	if (!cull_front && !cull_back && !cull_zero_area)
+		return ctx->i1true;
+
+	/* Front/back face culling. Also if the determinant == 0, the triangle
+	 * area is 0.
+	 */
+	LLVMValueRef det_t0 = LLVMBuildFSub(builder, pos[2][0], pos[0][0], "");
+	LLVMValueRef det_t1 = LLVMBuildFSub(builder, pos[1][1], pos[0][1], "");
+	LLVMValueRef det_t2 = LLVMBuildFSub(builder, pos[0][0], pos[1][0], "");
+	LLVMValueRef det_t3 = LLVMBuildFSub(builder, pos[0][1], pos[2][1], "");
+	LLVMValueRef det_p0 = LLVMBuildFMul(builder, det_t0, det_t1, "");
+	LLVMValueRef det_p1 = LLVMBuildFMul(builder, det_t2, det_t3, "");
+	LLVMValueRef det = LLVMBuildFSub(builder, det_p0, det_p1, "");
+
+	/* Negative W negates the determinant. */
+	det = LLVMBuildSelect(builder, w->w_reflection,
+			      LLVMBuildFNeg(builder, det, ""),
+			      det, "");
+
+	LLVMValueRef accepted = NULL;
+	if (cull_front) {
+		LLVMRealPredicate cond = cull_zero_area ? LLVMRealOGT : LLVMRealOGE;
+		accepted = LLVMBuildFCmp(builder, cond, det, ctx->f32_0, "");
+	} else if (cull_back) {
+		LLVMRealPredicate cond = cull_zero_area ? LLVMRealOLT : LLVMRealOLE;
+		accepted = LLVMBuildFCmp(builder, cond, det, ctx->f32_0, "");
+	} else if (cull_zero_area) {
+		accepted = LLVMBuildFCmp(builder, LLVMRealONE, det, ctx->f32_0, "");
+	}
+	return accepted;
 }
 
 /* Perform view culling and small primitive elimination and return true
  * if the primitive is accepted and initially_accepted == true. */
-static void cull_bbox(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4],
-                      LLVMValueRef initially_accepted, struct ac_position_w_info *w,
-                      LLVMValueRef vp_scale[2], LLVMValueRef vp_translate[2],
-                      LLVMValueRef small_prim_precision,
-                      LLVMValueRef clip_half_line_width[2],
-                      struct ac_cull_options *options,
-                      ac_cull_accept_func accept_func, void *userdata)
+static LLVMValueRef cull_bbox(struct ac_llvm_context *ctx,
+			      LLVMValueRef pos[3][4],
+			      LLVMValueRef initially_accepted,
+			      struct ac_position_w_info *w,
+			      LLVMValueRef vp_scale[2],
+			      LLVMValueRef vp_translate[2],
+			      LLVMValueRef small_prim_precision,
+			      bool cull_view_xy,
+			      bool cull_view_near_z,
+			      bool cull_view_far_z,
+			      bool cull_small_prims,
+			      bool use_halfz_clip_space)
 {
-   LLVMBuilderRef builder = ctx->builder;
-
-   if (!options->cull_view_xy && !options->cull_view_near_z && !options->cull_view_far_z &&
-       !options->cull_small_prims) {
-      if (accept_func)
-         accept_func(ctx, initially_accepted, userdata);
-      return;
-   }
-
-   ac_build_ifcc(ctx, initially_accepted, 10000000);
-   {
-      LLVMValueRef bbox_min[3], bbox_max[3];
-      LLVMValueRef accepted = ctx->i1true;
-
-      /* Compute the primitive bounding box for easy culling. */
-      for (unsigned chan = 0; chan < (options->cull_view_near_z ||
-                                      options->cull_view_far_z ? 3 : 2); chan++) {
-         assert(options->num_vertices >= 2);
-         bbox_min[chan] = ac_build_fmin(ctx, pos[0][chan], pos[1][chan]);
-         bbox_max[chan] = ac_build_fmax(ctx, pos[0][chan], pos[1][chan]);
-
-         if (options->num_vertices == 3) {
-            bbox_min[chan] = ac_build_fmin(ctx, bbox_min[chan], pos[2][chan]);
-            bbox_max[chan] = ac_build_fmax(ctx, bbox_max[chan], pos[2][chan]);
-         }
-
-         if (clip_half_line_width[chan]) {
-            bbox_min[chan] = LLVMBuildFSub(builder, bbox_min[chan], clip_half_line_width[chan], "");
-            bbox_max[chan] = LLVMBuildFAdd(builder, bbox_max[chan], clip_half_line_width[chan], "");
-         }
-      }
-
-      /* View culling. */
-      if (options->cull_view_xy || options->cull_view_near_z || options->cull_view_far_z) {
-         for (unsigned chan = 0; chan < 3; chan++) {
-            LLVMValueRef visible;
-
-            if ((options->cull_view_xy && chan <= 1) || (options->cull_view_near_z && chan == 2)) {
-               float t = chan == 2 && options->use_halfz_clip_space ? 0 : -1;
-               visible = LLVMBuildFCmp(builder, LLVMRealOGE, bbox_max[chan],
-                                       LLVMConstReal(ctx->f32, t), "");
-               accepted = LLVMBuildAnd(builder, accepted, visible, "");
-            }
-
-            if ((options->cull_view_xy && chan <= 1) || (options->cull_view_far_z && chan == 2)) {
-               visible = LLVMBuildFCmp(builder, LLVMRealOLE, bbox_min[chan], ctx->f32_1, "");
-               accepted = LLVMBuildAnd(builder, accepted, visible, "");
-            }
-         }
-      }
-
-      /* Small primitive culling - triangles. */
-      if (options->cull_small_prims && options->num_vertices == 3) {
-         /* Assuming a sample position at (0.5, 0.5), if we round
-          * the bounding box min/max extents and the results of
-          * the rounding are equal in either the X or Y direction,
-          * the bounding box does not intersect the sample.
-          *
-          * See these GDC slides for pictures:
-          * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf
-          */
-         LLVMValueRef min, max, not_equal[2], visible;
-
-         for (unsigned chan = 0; chan < 2; chan++) {
-            /* Convert the position to screen-space coordinates. */
-            min = ac_build_fmad(ctx, bbox_min[chan], vp_scale[chan], vp_translate[chan]);
-            max = ac_build_fmad(ctx, bbox_max[chan], vp_scale[chan], vp_translate[chan]);
-            /* Scale the bounding box according to the precision of
-             * the rasterizer and the number of MSAA samples. */
-            min = LLVMBuildFSub(builder, min, small_prim_precision, "");
-            max = LLVMBuildFAdd(builder, max, small_prim_precision, "");
-
-            /* Determine if the bbox intersects the sample point.
-             * It also works for MSAA, but vp_scale, vp_translate,
-             * and small_prim_precision are computed differently.
-             */
-            min = ac_build_round(ctx, min);
-            max = ac_build_round(ctx, max);
-            not_equal[chan] = LLVMBuildFCmp(builder, LLVMRealONE, min, max, "");
-         }
-         visible = LLVMBuildAnd(builder, not_equal[0], not_equal[1], "");
-         accepted = LLVMBuildAnd(builder, accepted, visible, "");
-      }
-
-      /* Small primitive culling - lines. */
-      if (options->cull_small_prims && options->num_vertices == 2) {
-         /* This only works with lines without perpendicular end caps (lines with perpendicular
-          * end caps are rasterized as quads and thus can't be culled as small prims in 99% of
-          * cases because line_width >= 1).
-          *
-          * This takes advantage of the diamont exit rule, which says that every pixel
-          * has a diamond inside it touching the pixel boundary and only if a line exits
-          * the diamond, that pixel is filled. If a line enters the diamond or stays
-          * outside the diamond, the pixel isn't filled.
-          *
-          * This algorithm is a little simpler than that. The space outside all diamonds also
-          * has the same diamond shape, which we'll call corner diamonds.
-          *
-          * The idea is to cull all lines that are entirely inside a diamond, including
-          * corner diamonds. If a line is entirely inside a diamond, it can be culled because
-          * it doesn't exit it. If a line is entirely inside a corner diamond, it can be culled
-          * because it doesn't enter any diamond and thus can't exit any diamond.
-          *
-          * The viewport is rotated by 45 degress to turn diamonds into squares, and a bounding
-          * box test is used to determine whether a line is entirely inside any square (diamond).
-          *
-          * The line width doesn't matter. Wide lines only duplicate filled pixels in either X or
-          * Y direction from the filled pixels. MSAA also doesn't matter. MSAA should ideally use
-          * perpendicular end caps that enable quad rasterization for lines. Thus, this should
-          * always use non-MSAA viewport transformation and non-MSAA small prim precision.
-          *
-          * A good test is piglit/lineloop because it draws 10k subpixel lines in a circle.
-          * It should contain no holes if this matches hw behavior.
-          */
-         LLVMValueRef v0[2], v1[2];
-
-         /* Get vertex positions in pixels. */
-         for (unsigned chan = 0; chan < 2; chan++) {
-            v0[chan] = ac_build_fmad(ctx, pos[0][chan], vp_scale[chan], vp_translate[chan]);
-            v1[chan] = ac_build_fmad(ctx, pos[1][chan], vp_scale[chan], vp_translate[chan]);
-         }
-
-         /* Rotate the viewport by 45 degress, so that diamonds become squares. */
-         rotate_45degrees(ctx, v0);
-         rotate_45degrees(ctx, v1);
-
-         LLVMValueRef not_equal[2];
-
-         for (unsigned chan = 0; chan < 2; chan++) {
-            /* The width of each square is sqrt(0.5), so scale it to 1 because we want
-             * round() to give us the position of the closest center of a square (diamond).
-             */
-            v0[chan] = LLVMBuildFMul(builder, v0[chan], LLVMConstReal(ctx->f32, 1.414213562), "");
-            v1[chan] = LLVMBuildFMul(builder, v1[chan], LLVMConstReal(ctx->f32, 1.414213562), "");
-
-            /* Compute the bounding box around both vertices. We do this because we must
-             * enlarge the line area by the precision of the rasterizer.
-             */
-            LLVMValueRef min = ac_build_fmin(ctx, v0[chan], v1[chan]);
-            LLVMValueRef max = ac_build_fmax(ctx, v0[chan], v1[chan]);
-
-            /* Enlarge the bounding box by the precision of the rasterizer. */
-            min = LLVMBuildFSub(builder, min, small_prim_precision, "");
-            max = LLVMBuildFAdd(builder, max, small_prim_precision, "");
-
-            /* Round the bounding box corners. If both rounded corners are equal,
-             * the bounding box is entirely inside a square (diamond).
-             */
-            min = ac_build_round(ctx, min);
-            max = ac_build_round(ctx, max);
-            not_equal[chan] = LLVMBuildFCmp(builder, LLVMRealONE, min, max, "");
-         }
-
-         accepted = LLVMBuildAnd(builder, accepted,
-                                 LLVMBuildOr(builder, not_equal[0], not_equal[1], ""), "");
-      }
-
-      /* Disregard the bounding box culling if any W is negative because the code
-       * doesn't work with that.
-       */
-      accepted = LLVMBuildOr(builder, accepted, w->any_w_negative, "");
-
-      if (accept_func)
-         accept_func(ctx, accepted, userdata);
-   }
-   ac_build_endif(ctx, 10000000);
+	LLVMBuilderRef builder = ctx->builder;
+
+	if (!cull_view_xy && !cull_view_near_z && !cull_view_far_z && !cull_small_prims)
+		return initially_accepted;
+
+	/* Skip the culling if the primitive has already been rejected or
+	 * if any W is negative. The bounding box culling doesn't work when
+	 * any W is negative, so the result stays initially_accepted then.
+	 */
+	LLVMValueRef cond = LLVMBuildAnd(builder, initially_accepted,
+					 w->all_w_positive, "");
+	LLVMValueRef accepted_var = ac_build_alloca_undef(ctx, ctx->i1, "");
+	LLVMBuildStore(builder, initially_accepted, accepted_var);
+
+	ac_build_ifcc(ctx, cond, 10000000 /* same id as the ac_build_endif below */);
+	{
+		LLVMValueRef bbox_min[3], bbox_max[3];
+		LLVMValueRef accepted = initially_accepted;
+
+		/* Compute the primitive bounding box for easy culling. */
+		for (unsigned chan = 0; chan < (cull_view_near_z || cull_view_far_z ? 3 : 2); chan++) {
+			bbox_min[chan] = ac_build_fmin(ctx, pos[0][chan], pos[1][chan]);
+			bbox_min[chan] = ac_build_fmin(ctx, bbox_min[chan], pos[2][chan]);
+
+			bbox_max[chan] = ac_build_fmax(ctx, pos[0][chan], pos[1][chan]);
+			bbox_max[chan] = ac_build_fmax(ctx, bbox_max[chan], pos[2][chan]);
+		}
+
+		/* View culling. */
+		if (cull_view_xy || cull_view_near_z || cull_view_far_z) {
+			for (unsigned chan = 0; chan < 3; chan++) {
+				LLVMValueRef visible;
+
+				if ((cull_view_xy && chan <= 1) ||
+				    (cull_view_near_z && chan == 2)) {
+					float t = chan == 2 && use_halfz_clip_space ? 0 : -1;
+					visible = LLVMBuildFCmp(builder, LLVMRealOGE, bbox_max[chan],
+								LLVMConstReal(ctx->f32, t), "");
+					accepted = LLVMBuildAnd(builder, accepted, visible, "");
+				}
+
+				if ((cull_view_xy && chan <= 1) ||
+				    (cull_view_far_z && chan == 2)) {
+					visible = LLVMBuildFCmp(builder, LLVMRealOLE, bbox_min[chan],
+								ctx->f32_1, "");
+					accepted = LLVMBuildAnd(builder, accepted, visible, "");
+				}
+			}
+		}
+
+		/* Small primitive elimination. */
+		if (cull_small_prims) {
+			/* Assuming a sample position at (0.5, 0.5), if we round
+			 * the bounding box min/max extents and the results of
+			 * the rounding are equal in either the X or Y direction,
+			 * the bounding box does not intersect the sample.
+			 *
+			 * See these GDC slides for pictures:
+			 * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf
+			 */
+			LLVMValueRef min, max, not_equal[2], visible;
+
+			for (unsigned chan = 0; chan < 2; chan++) {
+				/* Convert the position to screen-space coordinates. */
+				min = ac_build_fmad(ctx, bbox_min[chan],
+						    vp_scale[chan], vp_translate[chan]);
+				max = ac_build_fmad(ctx, bbox_max[chan],
+						    vp_scale[chan], vp_translate[chan]);
+				/* Expand the bounding box by small_prim_precision (precision
+				 * of the rasterizer and the number of MSAA samples). */
+				min = LLVMBuildFSub(builder, min, small_prim_precision, "");
+				max = LLVMBuildFAdd(builder, max, small_prim_precision, "");
+
+				/* Determine if the bbox intersects the sample point.
+				 * It also works for MSAA, but vp_scale, vp_translate,
+				 * and small_prim_precision are computed differently.
+				 */
+				min = ac_build_round(ctx, min);
+				max = ac_build_round(ctx, max);
+				not_equal[chan] = LLVMBuildFCmp(builder, LLVMRealONE, min, max, "");
+			}
+			visible = LLVMBuildAnd(builder, not_equal[0], not_equal[1], "");
+			accepted = LLVMBuildAnd(builder, accepted, visible, "");
+		}
+
+		LLVMBuildStore(builder, accepted, accepted_var);
+	}
+	ac_build_endif(ctx, 10000000);
+
+	return LLVMBuildLoad(builder, accepted_var, "");
 }
 
 /**
@@ -332,28 +240,36 @@ static void cull_bbox(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4],
  *                              the rasterizer. Set to num_samples / 2^subpixel_bits.
  *                              subpixel_bits are defined by the quantization mode.
  * \param options               See ac_cull_options.
- * \param accept_func           Callback invoked in the inner-most branch where the primitive is accepted.
  */
-void ac_cull_primitive(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4],
-                       LLVMValueRef initially_accepted, LLVMValueRef vp_scale[2],
-                       LLVMValueRef vp_translate[2], LLVMValueRef small_prim_precision,
-                       LLVMValueRef clip_half_line_width[2], struct ac_cull_options *options,
-                       ac_cull_accept_func accept_func, void *userdata)
+LLVMValueRef ac_cull_triangle(struct ac_llvm_context *ctx,
+			      LLVMValueRef pos[3][4],
+			      LLVMValueRef initially_accepted,
+			      LLVMValueRef vp_scale[2],
+			      LLVMValueRef vp_translate[2],
+			      LLVMValueRef small_prim_precision,
+			      struct ac_cull_options *options)
 {
-   struct ac_position_w_info w;
-   ac_analyze_position_w(ctx, pos, &w, options->num_vertices);
-
-   /* W culling. */
-   LLVMValueRef accepted = options->cull_w ? w.w_accepted : ctx->i1true;
-   accepted = LLVMBuildAnd(ctx->builder, accepted, initially_accepted, "");
-
-   /* Face culling. */
-   accepted = LLVMBuildAnd(
-      ctx->builder, accepted,
-      ac_cull_face(ctx, pos, &w, options->cull_front, options->cull_back, options->cull_zero_area),
-      "");
-
-   /* View culling and small primitive elimination. */
-   cull_bbox(ctx, pos, accepted, &w, vp_scale, vp_translate, small_prim_precision,
-             clip_half_line_width, options, accept_func, userdata);
+	struct ac_position_w_info w;
+	ac_analyze_position_w(ctx, pos, &w);
+
+	/* W culling. */
+	LLVMValueRef accepted = options->cull_w ? w.w_accepted : ctx->i1true;
+	accepted = LLVMBuildAnd(ctx->builder, accepted, initially_accepted, "");
+
+	/* Face culling. */
+	accepted = LLVMBuildAnd(ctx->builder, accepted,
+				ac_cull_face(ctx, pos, &w,
+					     options->cull_front,
+					     options->cull_back,
+					     options->cull_zero_area), "");
+
+	/* View culling and small primitive elimination. */
+	accepted = cull_bbox(ctx, pos, accepted, &w, vp_scale, vp_translate,
+			     small_prim_precision,
+			     options->cull_view_xy,
+			     options->cull_view_near_z,
+			     options->cull_view_far_z,
+			     options->cull_small_prims,
+			     options->use_halfz_clip_space);
+	return accepted;
 }
diff --git a/lib/mesa/src/amd/llvm/ac_llvm_cull.h b/lib/mesa/src/amd/llvm/ac_llvm_cull.h
index dc978d3fe..0aa6c902a 100644
--- a/lib/mesa/src/amd/llvm/ac_llvm_cull.h
+++ b/lib/mesa/src/amd/llvm/ac_llvm_cull.h
@@ -29,35 +29,31 @@
 #include "ac_llvm_build.h"
 
 struct ac_cull_options {
-   /* In general, I recommend setting all to true except view Z culling,
-    * which isn't so effective because W culling is cheaper and partially
-    * replaces near Z culling, and you don't need to set Position.z
-    * if Z culling is disabled.
-    *
-    * If something doesn't work, turn some of these off to find out what.
-    */
-   bool cull_front;
-   bool cull_back;
-   bool cull_view_xy;
-   bool cull_view_near_z;
-   bool cull_view_far_z;
-   bool cull_small_prims;
-   bool cull_zero_area;
-   bool cull_w; /* cull primitives with all W < 0 */
-
-   bool use_halfz_clip_space;
-
-   uint8_t num_vertices; /* 1..3 */
+	/* In general, I recommend setting all to true except view Z culling,
+	 * which isn't so effective because W culling is cheaper and partially
+	 * replaces near Z culling, and you don't need to set Position.z
+	 * if Z culling is disabled.
+	 *
+	 * If something doesn't work, turn some of these off to find out what.
+	 */
+	bool cull_front;
+	bool cull_back;
+	bool cull_view_xy;
+	bool cull_view_near_z;
+	bool cull_view_far_z;
+	bool cull_small_prims;
+	bool cull_zero_area;
+	bool cull_w; /* cull primitives with all W < 0 */
+
+	bool use_halfz_clip_space;
 };
 
-/* Callback invoked in the inner-most branch where the primitive is accepted. */
-typedef void (*ac_cull_accept_func)(struct ac_llvm_context *ctx, LLVMValueRef accepted,
-                                    void *userdata);
-
-void ac_cull_primitive(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4],
-                       LLVMValueRef initially_accepted, LLVMValueRef vp_scale[2],
-                       LLVMValueRef vp_translate[2], LLVMValueRef small_prim_precision,
-                       LLVMValueRef clip_half_line_width[2], struct ac_cull_options *options,
-                       ac_cull_accept_func accept_func, void *userdata);
+LLVMValueRef ac_cull_triangle(struct ac_llvm_context *ctx,
+			      LLVMValueRef pos[3][4],
+			      LLVMValueRef initially_accepted,
+			      LLVMValueRef vp_scale[2],
+			      LLVMValueRef vp_translate[2],
+			      LLVMValueRef small_prim_precision,
+			      struct ac_cull_options *options);
 
 #endif
diff --git a/lib/mesa/src/gallium/drivers/freedreno/a6xx/fd6_pack.h b/lib/mesa/src/gallium/drivers/freedreno/a6xx/fd6_pack.h
index f6d1fbf09..b3b9bf924 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/a6xx/fd6_pack.h
+++ b/lib/mesa/src/gallium/drivers/freedreno/a6xx/fd6_pack.h
@@ -27,149 +27,86 @@
 #include "a6xx.xml.h"
 
 struct fd_reg_pair {
-   uint32_t reg;
-   uint64_t value;
-   struct fd_bo *bo;
-   bool is_address;
-   bool bo_write;
-   uint32_t bo_offset;
-   uint32_t bo_shift;
+	uint32_t reg;
+	uint64_t value;
+	struct fd_bo *bo;
+	bool is_address;
+	bool bo_write;
+	uint32_t bo_offset;
+	uint32_t bo_shift;
 };
 
 #define __bo_type struct fd_bo *
 
 #include "a6xx-pack.xml.h"
-#include "adreno-pm4-pack.xml.h"
 
-#define __assert_eq(a, b)                                                      \
-   do {                                                                        \
-      if ((a) != (b)) {                                                        \
-         fprintf(stderr, "assert failed: " #a " (0x%x) != " #b " (0x%x)\n", a, \
-                 b);                                                           \
-         assert((a) == (b));                                                   \
-      }                                                                        \
-   } while (0)
+#define __assert_eq(a, b)													\
+	do {																\
+		if ((a) != (b)) {												\
+			fprintf(stderr, "assert failed: " #a " (0x%x) != " #b " (0x%x)\n", a, b); \
+			assert((a) == (b));											\
+		}																\
+	} while (0)
 
-#if !FD_BO_NO_HARDPIN
-#  error 'Hardpin unsupported'
-#endif
+#define __ONE_REG(i, ...)	/* emit slot i of the register list, if any */	\
+	do {															\
+		const struct fd_reg_pair regs[] = { __VA_ARGS__ };			\
+		if (i < ARRAY_SIZE(regs) && regs[i].reg > 0) { /* else slot emits nothing */ \
+			__assert_eq(regs[0].reg + i, regs[i].reg); /* regs must be consecutive */ \
+			if (regs[i].bo) {										\
+				struct fd_reloc reloc = {							\
+					.bo = regs[i].bo,								\
+					.flags = FD_RELOC_READ |						\
+						(regs[i].bo_write ? FD_RELOC_WRITE : 0),	\
+					/* .or takes regs[i].value, .orhi its upper 32 bits */ \
+					.offset = regs[i].bo_offset,					\
+					.or = regs[i].value,							\
+					.shift = regs[i].bo_shift,						\
+					.orhi = regs[i].value >> 32						\
+				};													\
+				ring->cur = p;	/* sync ring cursor before the reloc call */ \
+				p += 2;		/* NOTE(review): presumably reloc fills 2 dwords */ \
+				fd_ringbuffer_reloc(ring, &reloc);					\
+			} else {												\
+				*p++ = regs[i].value;								\
+				if (regs[i].is_address)	/* addresses also emit bits 63:32 */ \
+					*p++ = regs[i].value >> 32;						\
+			}														\
+		}															\
+	} while (0)
 
-#define __ONE_REG(i, ...)                                                      \
-   do {                                                                        \
-      const struct fd_reg_pair regs[] = {__VA_ARGS__};                         \
-      /* NOTE: allow regs[0].reg==0, this happens in OUT_PKT() */              \
-      if (i < ARRAY_SIZE(regs) && (i == 0 || regs[i].reg > 0)) {               \
-         __assert_eq(regs[0].reg + i, regs[i].reg);                            \
-         if (regs[i].bo) {                                                     \
-            uint64_t *p64 = (uint64_t *)p;                                     \
-            *p64 = __reloc_iova(regs[i].bo, regs[i].bo_offset, regs[i].value,  \
-                                regs[i].bo_shift);                             \
-            p += 2;                                                            \
-            fd_ringbuffer_attach_bo(ring, regs[i].bo);                         \
-         } else {                                                              \
-            *p++ = regs[i].value;                                              \
-            if (regs[i].is_address)                                            \
-               *p++ = regs[i].value >> 32;                                     \
-         }                                                                     \
-      }                                                                        \
-   } while (0)
-
-#define OUT_REG(ring, ...)                                                     \
-   do {                                                                        \
-      const struct fd_reg_pair regs[] = {__VA_ARGS__};                         \
-      unsigned count = ARRAY_SIZE(regs);                                       \
-                                                                               \
-      STATIC_ASSERT(ARRAY_SIZE(regs) > 0);                                     \
-      STATIC_ASSERT(ARRAY_SIZE(regs) <= 16);                                   \
-                                                                               \
-      BEGIN_RING(ring, count + 1);                                             \
-      uint32_t *p = ring->cur;                                                 \
-      *p++ = pm4_pkt4_hdr(regs[0].reg, count);                                 \
-                                                                               \
-      __ONE_REG(0, __VA_ARGS__);                                               \
-      __ONE_REG(1, __VA_ARGS__);                                               \
-      __ONE_REG(2, __VA_ARGS__);                                               \
-      __ONE_REG(3, __VA_ARGS__);                                               \
-      __ONE_REG(4, __VA_ARGS__);                                               \
-      __ONE_REG(5, __VA_ARGS__);                                               \
-      __ONE_REG(6, __VA_ARGS__);                                               \
-      __ONE_REG(7, __VA_ARGS__);                                               \
-      __ONE_REG(8, __VA_ARGS__);                                               \
-      __ONE_REG(9, __VA_ARGS__);                                               \
-      __ONE_REG(10, __VA_ARGS__);                                              \
-      __ONE_REG(11, __VA_ARGS__);                                              \
-      __ONE_REG(12, __VA_ARGS__);                                              \
-      __ONE_REG(13, __VA_ARGS__);                                              \
-      __ONE_REG(14, __VA_ARGS__);                                              \
-      __ONE_REG(15, __VA_ARGS__);                                              \
-      ring->cur = p;                                                           \
-   } while (0)
-
-#define OUT_PKT(ring, opcode, ...)                                             \
-   do {                                                                        \
-      const struct fd_reg_pair regs[] = {__VA_ARGS__};                         \
-      unsigned count = ARRAY_SIZE(regs);                                       \
-                                                                               \
-      STATIC_ASSERT(ARRAY_SIZE(regs) <= 16);                                   \
-                                                                               \
-      BEGIN_RING(ring, count + 1);                                             \
-      uint32_t *p = ring->cur;                                                 \
-      *p++ = pm4_pkt7_hdr(opcode, count);                                      \
-                                                                               \
-      __ONE_REG(0, __VA_ARGS__);                                               \
-      __ONE_REG(1, __VA_ARGS__);                                               \
-      __ONE_REG(2, __VA_ARGS__);                                               \
-      __ONE_REG(3, __VA_ARGS__);                                               \
-      __ONE_REG(4, __VA_ARGS__);                                               \
-      __ONE_REG(5, __VA_ARGS__);                                               \
-      __ONE_REG(6, __VA_ARGS__);                                               \
-      __ONE_REG(7, __VA_ARGS__);                                               \
-      __ONE_REG(8, __VA_ARGS__);                                               \
-      __ONE_REG(9, __VA_ARGS__);                                               \
-      __ONE_REG(10, __VA_ARGS__);                                              \
-      __ONE_REG(11, __VA_ARGS__);                                              \
-      __ONE_REG(12, __VA_ARGS__);                                              \
-      __ONE_REG(13, __VA_ARGS__);                                              \
-      __ONE_REG(14, __VA_ARGS__);                                              \
-      __ONE_REG(15, __VA_ARGS__);                                              \
-      ring->cur = p;                                                           \
-   } while (0)
-
-/* similar to OUT_PKT() but appends specified # of dwords
- * copied for buf to the end of the packet (ie. for use-
- * cases like CP_LOAD_STATE)
- */
-#define OUT_PKTBUF(ring, opcode, dwords, sizedwords, ...)                      \
-   do {                                                                        \
-      const struct fd_reg_pair regs[] = {__VA_ARGS__};                         \
-      unsigned count = ARRAY_SIZE(regs);                                       \
-                                                                               \
-      STATIC_ASSERT(ARRAY_SIZE(regs) <= 16);                                   \
-      count += sizedwords;                                                     \
-                                                                               \
-      BEGIN_RING(ring, count + 1);                                             \
-      uint32_t *p = ring->cur;                                                 \
-      *p++ = pm4_pkt7_hdr(opcode, count);                                      \
-                                                                               \
-      __ONE_REG(0, __VA_ARGS__);                                               \
-      __ONE_REG(1, __VA_ARGS__);                                               \
-      __ONE_REG(2, __VA_ARGS__);                                               \
-      __ONE_REG(3, __VA_ARGS__);                                               \
-      __ONE_REG(4, __VA_ARGS__);                                               \
-      __ONE_REG(5, __VA_ARGS__);                                               \
-      __ONE_REG(6, __VA_ARGS__);                                               \
-      __ONE_REG(7, __VA_ARGS__);                                               \
-      __ONE_REG(8, __VA_ARGS__);                                               \
-      __ONE_REG(9, __VA_ARGS__);                                               \
-      __ONE_REG(10, __VA_ARGS__);                                              \
-      __ONE_REG(11, __VA_ARGS__);                                              \
-      __ONE_REG(12, __VA_ARGS__);                                              \
-      __ONE_REG(13, __VA_ARGS__);                                              \
-      __ONE_REG(14, __VA_ARGS__);                                              \
-      __ONE_REG(15, __VA_ARGS__);                                              \
-      memcpy(p, dwords, 4 * sizedwords);                                       \
-      p += sizedwords;                                                         \
-      ring->cur = p;                                                           \
-   } while (0)
+#define OUT_REG(ring, ...)									\
+	do {													\
+		const struct fd_reg_pair regs[] = { __VA_ARGS__ };	\
+		unsigned count = ARRAY_SIZE(regs);					\
+		/* assert on the constant expression, not the 'count' variable */ \
+		STATIC_ASSERT(ARRAY_SIZE(regs) > 0);				\
+		STATIC_ASSERT(ARRAY_SIZE(regs) <= 16);				\
+		/* begin the packet and write its type-4 header dword */ \
+		BEGIN_RING(ring, count + 1);						\
+		uint32_t *p = ring->cur;							\
+		*p++ = CP_TYPE4_PKT | count |						\
+			(_odd_parity_bit(count) << 7) |					\
+			((regs[0].reg & 0x3ffff) << 8) |				\
+			((_odd_parity_bit(regs[0].reg) << 27));			\
+		/* 16 fixed slots; those past the end of the list emit nothing */ \
+		__ONE_REG( 0, __VA_ARGS__);							\
+		__ONE_REG( 1, __VA_ARGS__);							\
+		__ONE_REG( 2, __VA_ARGS__);							\
+		__ONE_REG( 3, __VA_ARGS__);							\
+		__ONE_REG( 4, __VA_ARGS__);							\
+		__ONE_REG( 5, __VA_ARGS__);							\
+		__ONE_REG( 6, __VA_ARGS__);							\
+		__ONE_REG( 7, __VA_ARGS__);							\
+		__ONE_REG( 8, __VA_ARGS__);							\
+		__ONE_REG( 9, __VA_ARGS__);							\
+		__ONE_REG(10, __VA_ARGS__);							\
+		__ONE_REG(11, __VA_ARGS__);							\
+		__ONE_REG(12, __VA_ARGS__);							\
+		__ONE_REG(13, __VA_ARGS__);							\
+		__ONE_REG(14, __VA_ARGS__);							\
+		__ONE_REG(15, __VA_ARGS__);							\
+		ring->cur = p;										\
+	} while (0)
 
 #endif /* FD6_PACK_H */
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c b/lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c
index 7900ba9da..122e69762 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c
@@ -72,11 +72,8 @@ static LLVMValueRef load_const_buffer_desc_fast_path(struct si_shader_context *c
    uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
                     S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
 
-   if (ctx->screen->info.gfx_level >= GFX11)
-      rsrc3 |= S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_FLOAT) |
-               S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
-   else if (ctx->screen->info.gfx_level >= GFX10)
-      rsrc3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
+   if (ctx->screen->info.chip_class >= GFX10)
+      rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
                S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
    else
       rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
@@ -94,7 +91,9 @@ static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index)
    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
    struct si_shader_selector *sel = ctx->shader->selector;
 
-   if (sel->info.base.num_ubos == 1 && sel->info.base.num_ssbos == 0) {
+   LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);
+
+   if (sel->info.const_buffers_declared == 1 && sel->info.shader_buffers_declared == 0) {
       return load_const_buffer_desc_fast_path(ctx);
    }
 
@@ -102,27 +101,19 @@ static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index)
    index =
       LLVMBuildAdd(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS, 0), "");
 
-   return ac_build_load_to_sgpr(&ctx->ac,
-                                ac_get_ptr_arg(&ctx->ac, &ctx->args, ctx->const_and_shader_buffers),
-                                index);
+   return ac_build_load_to_sgpr(&ctx->ac, ptr, index);
 }
 
-static LLVMValueRef load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write, bool non_uniform)
+static LLVMValueRef load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write)
 {
    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-
-   /* Fast path if the shader buffer is in user SGPRs. */
-   if (LLVMIsConstant(index) &&
-       LLVMConstIntGetZExtValue(index) < ctx->shader->selector->cs_num_shaderbufs_in_user_sgprs)
-      return ac_get_arg(&ctx->ac, ctx->cs_shaderbuf[LLVMConstIntGetZExtValue(index)]);
+   LLVMValueRef rsrc_ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);
 
    index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers);
    index = LLVMBuildSub(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS - 1, 0),
                         index, "");
 
-   return ac_build_load_to_sgpr(&ctx->ac,
-                                ac_get_ptr_arg(&ctx->ac, &ctx->args, ctx->const_and_shader_buffers),
-                                index);
+   return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index);
 }
 
 /**
@@ -138,7 +129,7 @@ static LLVMValueRef load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, boo
  */
 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx, LLVMValueRef rsrc)
 {
-   if (ctx->screen->info.gfx_level <= GFX7) {
+   if (ctx->screen->info.chip_class <= GFX7) {
       return rsrc;
    } else {
       LLVMValueRef i32_6 = LLVMConstInt(ctx->ac.i32, 6, 0);
@@ -151,41 +142,18 @@ static LLVMValueRef force_dcc_off(struct si_shader_context *ctx, LLVMValueRef rs
    }
 }
 
-static LLVMValueRef force_write_compress_off(struct si_shader_context *ctx, LLVMValueRef rsrc)
-{
-   LLVMValueRef i32_6 = LLVMConstInt(ctx->ac.i32, 6, 0);
-   LLVMValueRef i32_C = LLVMConstInt(ctx->ac.i32, C_00A018_WRITE_COMPRESS_ENABLE, 0);
-   LLVMValueRef tmp;
-
-   tmp = LLVMBuildExtractElement(ctx->ac.builder, rsrc, i32_6, "");
-   tmp = LLVMBuildAnd(ctx->ac.builder, tmp, i32_C, "");
-   return LLVMBuildInsertElement(ctx->ac.builder, rsrc, tmp, i32_6, "");
-}
-
-static LLVMValueRef fixup_image_desc(struct si_shader_context *ctx, LLVMValueRef rsrc,
-                                     bool uses_store)
-{
-   if (uses_store && ctx->ac.gfx_level <= GFX9)
-      rsrc = force_dcc_off(ctx, rsrc);
-
-   if (!uses_store && ctx->screen->info.has_image_load_dcc_bug &&
-       ctx->screen->always_allow_dcc_stores)
-      rsrc = force_write_compress_off(ctx, rsrc);
-
-   return rsrc;
-}
-
 /* AC_DESC_FMASK is handled exactly like AC_DESC_IMAGE. The caller should
  * adjust "index" to point to FMASK. */
-static LLVMValueRef si_load_image_desc(struct si_shader_context *ctx, struct ac_llvm_pointer list,
+static LLVMValueRef si_load_image_desc(struct si_shader_context *ctx, LLVMValueRef list,
                                        LLVMValueRef index, enum ac_descriptor_type desc_type,
                                        bool uses_store, bool bindless)
 {
+   LLVMBuilderRef builder = ctx->ac.builder;
    LLVMValueRef rsrc;
 
    if (desc_type == AC_DESC_BUFFER) {
       index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1);
-      list.pointee_type = ctx->ac.v4i32;
+      list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
    } else {
       assert(desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_FMASK);
    }
@@ -195,16 +163,15 @@ static LLVMValueRef si_load_image_desc(struct si_shader_context *ctx, struct ac_
    else
       rsrc = ac_build_load_to_sgpr(&ctx->ac, list, index);
 
-   if (desc_type == AC_DESC_IMAGE)
-      rsrc = fixup_image_desc(ctx, rsrc, uses_store);
-
+   if (desc_type == AC_DESC_IMAGE && uses_store)
+      rsrc = force_dcc_off(ctx, rsrc);
    return rsrc;
 }
 
 /**
  * Load an image view, fmask view. or sampler state descriptor.
  */
-static LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx, struct ac_llvm_pointer list,
+static LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx, LLVMValueRef list,
                                          LLVMValueRef index, enum ac_descriptor_type type)
 {
    LLVMBuilderRef builder = ctx->ac.builder;
@@ -217,18 +184,17 @@ static LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx, struct a
    case AC_DESC_BUFFER:
       /* The buffer is in [4:7]. */
       index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), ctx->ac.i32_1);
-      list.pointee_type = ctx->ac.v4i32;
+      list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
       break;
    case AC_DESC_FMASK:
       /* The FMASK is at [8:15]. */
-      assert(ctx->screen->info.gfx_level < GFX11);
       index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1);
       break;
    case AC_DESC_SAMPLER:
       /* The sampler state is at [12:15]. */
       index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0),
                             LLVMConstInt(ctx->ac.i32, 3, 0));
-      list.pointee_type = ctx->ac.v4i32;
+      list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
       break;
    case AC_DESC_PLANE_0:
    case AC_DESC_PLANE_1:
@@ -256,7 +222,7 @@ static LLVMValueRef si_nir_load_sampler_desc(struct ac_shader_abi *abi, unsigned
    assert(desc_type <= AC_DESC_BUFFER);
 
    if (bindless) {
-      struct ac_llvm_pointer list = ac_get_ptr_arg(&ctx->ac, &ctx->args, ctx->bindless_samplers_and_images);
+      LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->bindless_samplers_and_images);
 
       /* dynamic_index is the bindless handle */
       if (image) {
@@ -278,17 +244,14 @@ static LLVMValueRef si_nir_load_sampler_desc(struct ac_shader_abi *abi, unsigned
        */
       dynamic_index =
          LLVMBuildMul(ctx->ac.builder, dynamic_index, LLVMConstInt(ctx->ac.i64, 2, 0), "");
-      list.v = ac_build_pointer_add(&ctx->ac, ctx->ac.v8i32, list.v, dynamic_index);
+      list = ac_build_pointer_add(&ctx->ac, list, dynamic_index);
       return si_load_sampler_desc(ctx, list, ctx->ac.i32_0, desc_type);
    }
 
    unsigned num_slots = image ? ctx->num_images : ctx->num_samplers;
+   assert(const_index < num_slots || dynamic_index);
 
-   /* Redirect invalid resource indices to the first array element. */
-   if (const_index >= num_slots)
-      const_index = base_index;
-
-   struct ac_llvm_pointer list = ac_get_ptr_arg(&ctx->ac, &ctx->args, ctx->samplers_and_images);
+   LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->samplers_and_images);
    LLVMValueRef index = LLVMConstInt(ctx->ac.i32, const_index, false);
 
    if (dynamic_index) {
@@ -307,17 +270,6 @@ static LLVMValueRef si_nir_load_sampler_desc(struct ac_shader_abi *abi, unsigned
    }
 
    if (image) {
-      /* Fast path if the image is in user SGPRs. */
-      if (!dynamic_index &&
-          const_index < ctx->shader->selector->cs_num_images_in_user_sgprs &&
-          (desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_BUFFER)) {
-         LLVMValueRef rsrc = ac_get_arg(&ctx->ac, ctx->cs_image[const_index]);
-
-         if (desc_type == AC_DESC_IMAGE)
-            rsrc = fixup_image_desc(ctx, rsrc, write);
-         return rsrc;
-      }
-
       /* FMASKs are separate from images. */
       if (desc_type == AC_DESC_FMASK) {
          index =