| author | Jonathan Gray <jsg@cvs.openbsd.org> | 2023-11-02 04:53:47 +0000 |
|---|---|---|
| committer | Jonathan Gray <jsg@cvs.openbsd.org> | 2023-11-02 04:53:47 +0000 |
| commit | b44518130b33cadb5c1d619e9e936ae0e0dbf7cb (patch) | |
| tree | 6069eb03c39fbc79808a7d94f857118cce75cbe3 /lib/mesa/src/intel/compiler/brw_fs.cpp | |
| parent | 32aeb3c41fedbbd7b11aacfec48e8f699d16bff0 (diff) | |
Merge Mesa 23.1.9
Diffstat (limited to 'lib/mesa/src/intel/compiler/brw_fs.cpp')
-rw-r--r-- | lib/mesa/src/intel/compiler/brw_fs.cpp | 1026 |
1 file changed, 583 insertions, 443 deletions
diff --git a/lib/mesa/src/intel/compiler/brw_fs.cpp b/lib/mesa/src/intel/compiler/brw_fs.cpp index db4806b50..21f7071c4 100644 --- a/lib/mesa/src/intel/compiler/brw_fs.cpp +++ b/lib/mesa/src/intel/compiler/brw_fs.cpp @@ -38,11 +38,14 @@ #include "brw_dead_control_flow.h" #include "brw_private.h" #include "dev/intel_debug.h" +#include "dev/intel_wa.h" #include "compiler/glsl_types.h" #include "compiler/nir/nir_builder.h" #include "program/prog_parameter.h" #include "util/u_math.h" +#include <memory> + using namespace brw; static unsigned get_lowered_simd_width(const struct brw_compiler *compiler, @@ -246,7 +249,6 @@ fs_inst::is_control_source(unsigned arg) const { switch (opcode) { case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: - case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7: case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4: return arg == 0; @@ -304,9 +306,6 @@ fs_inst::is_payload(unsigned arg) const case SHADER_OPCODE_BARRIER: return arg == 0; - case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7: - return arg == 1; - case SHADER_OPCODE_SEND: return arg == 2 || arg == 3; @@ -642,10 +641,19 @@ fs_visitor::limit_dispatch_width(unsigned n, const char *msg) bool fs_inst::is_partial_write() const { - return ((this->predicate && this->opcode != BRW_OPCODE_SEL) || - (this->exec_size * type_sz(this->dst.type)) < 32 || - !this->dst.is_contiguous() || - this->dst.offset % REG_SIZE != 0); + if (this->predicate && !this->predicate_trivial && + this->opcode != BRW_OPCODE_SEL) + return true; + + if (this->dst.offset % REG_SIZE != 0) + return true; + + /* SEND instructions always write whole registers */ + if (this->opcode == SHADER_OPCODE_SEND) + return false; + + return this->exec_size * type_sz(this->dst.type) < 32 || + !this->dst.is_contiguous(); } unsigned @@ -745,13 +753,13 @@ fs_inst::components_read(unsigned i) const case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL: case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL: case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL: - assert(src[2].file == IMM); + assert(src[A64_LOGICAL_ARG].file == IMM); return 1; case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL: - assert(src[2].file == IMM); - if (i == 1) { /* data to write */ - const unsigned comps = src[2].ud / exec_size; + assert(src[A64_LOGICAL_ARG].file == IMM); + if (i == A64_LOGICAL_SRC) { /* data to write */ + const unsigned comps = src[A64_LOGICAL_ARG].ud / exec_size; assert(comps > 0); return comps; } else { @@ -773,41 +781,13 @@ fs_inst::components_read(unsigned i) const } case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL: - assert(src[2].file == IMM); - return i == 1 ? src[2].ud : 1; + assert(src[A64_LOGICAL_ARG].file == IMM); + return i == A64_LOGICAL_SRC ? src[A64_LOGICAL_ARG].ud : 1; case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL: - case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL: - case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL: - assert(src[2].file == IMM); - if (i == 1) { - /* Data source */ - const unsigned op = src[2].ud; - switch (op) { - case BRW_AOP_INC: - case BRW_AOP_DEC: - case BRW_AOP_PREDEC: - return 0; - case BRW_AOP_CMPWR: - return 2; - default: - return 1; - } - } else { - return 1; - } - - case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL: - case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL: - case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT64_LOGICAL: - assert(src[2].file == IMM); - if (i == 1) { - /* Data source */ - const unsigned op = src[2].ud; - return op == BRW_AOP_FCMPWR ? 
2 : 1; - } else { - return 1; - } + assert(src[A64_LOGICAL_ARG].file == IMM); + return i == A64_LOGICAL_SRC ? + lsc_op_num_data_values(src[A64_LOGICAL_ARG].ud) : 1; case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL: @@ -837,31 +817,14 @@ fs_inst::components_read(unsigned i) const if (i == SURFACE_LOGICAL_SRC_ADDRESS) return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud; /* Surface operation source. */ - else if (i == SURFACE_LOGICAL_SRC_DATA && op == BRW_AOP_CMPWR) - return 2; - else if (i == SURFACE_LOGICAL_SRC_DATA && - (op == BRW_AOP_INC || op == BRW_AOP_DEC || op == BRW_AOP_PREDEC)) - return 0; + else if (i == SURFACE_LOGICAL_SRC_DATA) + return lsc_op_num_data_values(op); else return 1; } case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: return (i == 0 ? 2 : 1); - case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: { - assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM && - src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM); - const unsigned op = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud; - /* Surface coordinates. */ - if (i == SURFACE_LOGICAL_SRC_ADDRESS) - return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud; - /* Surface operation source. */ - else if (i == SURFACE_LOGICAL_SRC_DATA && op == BRW_AOP_FCMPWR) - return 2; - else - return 1; - } - case SHADER_OPCODE_URB_WRITE_LOGICAL: if (i == URB_LOGICAL_SRC_DATA) return mlen - 1 - @@ -909,12 +872,6 @@ fs_inst::size_read(int arg) const return 1; break; - case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7: - /* The payload is actually stored in src1 */ - if (arg == 1) - return mlen * REG_SIZE; - break; - case FS_OPCODE_LINTERP: if (arg == 1) return 16; @@ -922,7 +879,7 @@ fs_inst::size_read(int arg) const case SHADER_OPCODE_LOAD_PAYLOAD: if (arg < this->header_size) - return REG_SIZE; + return retype(src[arg], BRW_REGISTER_TYPE_UD).component_size(8); break; case CS_OPCODE_CS_TERMINATE: @@ -1120,7 +1077,7 @@ fs_visitor::vgrf(const glsl_type *const type) brw_type_for_base_type(type)); } -fs_reg::fs_reg(enum brw_reg_file file, int nr) +fs_reg::fs_reg(enum brw_reg_file file, unsigned nr) { init(); this->file = file; @@ -1129,7 +1086,7 @@ fs_reg::fs_reg(enum brw_reg_file file, int nr) this->stride = (file == UNIFORM ? 
0 : 1); } -fs_reg::fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type) +fs_reg::fs_reg(enum brw_reg_file file, unsigned nr, enum brw_reg_type type) { init(); this->file = file; @@ -1275,7 +1232,7 @@ fs_visitor::emit_samplepos_setup() const fs_builder abld = bld.annotate("compute sample position"); fs_reg pos = abld.vgrf(BRW_REGISTER_TYPE_F, 2); - if (!wm_prog_data->persample_dispatch) { + if (wm_prog_data->persample_dispatch == BRW_NEVER) { /* From ARB_sample_shading specification: * "When rendering to a non-multisample buffer, or if multisample * rasterization is disabled, gl_SamplePosition will always be @@ -1310,6 +1267,16 @@ fs_visitor::emit_samplepos_setup() abld.MUL(offset(pos, abld, i), tmp_f, brw_imm_f(1 / 16.0f)); } + if (wm_prog_data->persample_dispatch == BRW_SOMETIMES) { + check_dynamic_msaa_flag(abld, wm_prog_data, + BRW_WM_MSAA_FLAG_PERSAMPLE_DISPATCH); + for (unsigned i = 0; i < 2; i++) { + set_predicate(BRW_PREDICATE_NORMAL, + bld.SEL(offset(pos, abld, i), offset(pos, abld, i), + brw_imm_f(0.5f))); + } + } + return pos; } @@ -1318,12 +1285,13 @@ fs_visitor::emit_sampleid_setup() { assert(stage == MESA_SHADER_FRAGMENT); ASSERTED brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; + struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data); assert(devinfo->ver >= 6); const fs_builder abld = bld.annotate("compute sample id"); fs_reg sample_id = abld.vgrf(BRW_REGISTER_TYPE_UD); - assert(key->multisample_fbo); + assert(key->multisample_fbo != BRW_NEVER); if (devinfo->ver >= 8) { /* Sample ID comes in as 4-bit numbers in g1.0: @@ -1413,6 +1381,13 @@ fs_visitor::emit_sampleid_setup() abld.emit(FS_OPCODE_SET_SAMPLE_ID, sample_id, t1, t2); } + if (key->multisample_fbo == BRW_SOMETIMES) { + check_dynamic_msaa_flag(abld, wm_prog_data, + BRW_WM_MSAA_FLAG_MULTISAMPLE_FBO); + set_predicate(BRW_PREDICATE_NORMAL, + abld.SEL(sample_id, sample_id, brw_imm_ud(0))); + } + return sample_id; } @@ -1423,39 +1398,44 @@ fs_visitor::emit_samplemaskin_setup() struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data); assert(devinfo->ver >= 6); - fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_D); - /* The HW doesn't provide us with expected values. */ - assert(!wm_prog_data->per_coarse_pixel_dispatch); + assert(wm_prog_data->coarse_pixel_dispatch != BRW_ALWAYS); fs_reg coverage_mask = fetch_payload_reg(bld, fs_payload().sample_mask_in_reg, BRW_REGISTER_TYPE_D); - if (wm_prog_data->persample_dispatch) { - /* gl_SampleMaskIn[] comes from two sources: the input coverage mask, - * and a mask representing which sample is being processed by the - * current shader invocation. - * - * From the OES_sample_variables specification: - * "When per-sample shading is active due to the use of a fragment input - * qualified by "sample" or due to the use of the gl_SampleID or - * gl_SamplePosition variables, only the bit for the current sample is - * set in gl_SampleMaskIn." - */ - const fs_builder abld = bld.annotate("compute gl_SampleMaskIn"); + if (wm_prog_data->persample_dispatch == BRW_NEVER) + return coverage_mask; + + /* gl_SampleMaskIn[] comes from two sources: the input coverage mask, + * and a mask representing which sample is being processed by the + * current shader invocation. + * + * From the OES_sample_variables specification: + * "When per-sample shading is active due to the use of a fragment input + * qualified by "sample" or due to the use of the gl_SampleID or + * gl_SamplePosition variables, only the bit for the current sample is + * set in gl_SampleMaskIn." 
+ */ + const fs_builder abld = bld.annotate("compute gl_SampleMaskIn"); - if (nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE) - nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(); + if (nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE) + nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(); + + fs_reg one = vgrf(glsl_type::int_type); + fs_reg enabled_mask = vgrf(glsl_type::int_type); + abld.MOV(one, brw_imm_d(1)); + abld.SHL(enabled_mask, one, nir_system_values[SYSTEM_VALUE_SAMPLE_ID]); + fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_D); + abld.AND(mask, enabled_mask, coverage_mask); + + if (wm_prog_data->persample_dispatch == BRW_ALWAYS) + return mask; + + check_dynamic_msaa_flag(abld, wm_prog_data, + BRW_WM_MSAA_FLAG_PERSAMPLE_DISPATCH); + set_predicate(BRW_PREDICATE_NORMAL, abld.SEL(mask, mask, coverage_mask)); - fs_reg one = vgrf(glsl_type::int_type); - fs_reg enabled_mask = vgrf(glsl_type::int_type); - abld.MOV(one, brw_imm_d(1)); - abld.SHL(enabled_mask, one, nir_system_values[SYSTEM_VALUE_SAMPLE_ID]); - abld.AND(mask, enabled_mask, coverage_mask); - } else { - /* In per-pixel mode, the coverage mask is sufficient. */ - mask = coverage_mask; - } return mask; } @@ -1464,37 +1444,44 @@ fs_visitor::emit_shading_rate_setup() { assert(devinfo->ver >= 11); - const fs_builder abld = bld.annotate("compute fragment shading rate"); - fs_reg rate = abld.vgrf(BRW_REGISTER_TYPE_UD); - struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(bld.shader->stage_prog_data); /* Coarse pixel shading size fields overlap with other fields of not in * coarse pixel dispatch mode, so report 0 when that's not the case. */ - if (wm_prog_data->per_coarse_pixel_dispatch) { - /* The shading rates provided in the shader are the actual 2D shading - * rate while the SPIR-V built-in is the enum value that has the shading - * rate encoded as a bitfield. Fortunately, the bitfield value is just - * the shading rate divided by two and shifted. - */ + if (wm_prog_data->coarse_pixel_dispatch == BRW_NEVER) + return brw_imm_ud(0); + + const fs_builder abld = bld.annotate("compute fragment shading rate"); - /* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */ - fs_reg actual_x = fs_reg(retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UB)); - /* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */ - fs_reg actual_y = byte_offset(actual_x, 1); + /* The shading rates provided in the shader are the actual 2D shading + * rate while the SPIR-V built-in is the enum value that has the shading + * rate encoded as a bitfield. Fortunately, the bitfield value is just + * the shading rate divided by two and shifted. 
+ */ - fs_reg int_rate_x = bld.vgrf(BRW_REGISTER_TYPE_UD); - fs_reg int_rate_y = bld.vgrf(BRW_REGISTER_TYPE_UD); + /* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */ + fs_reg actual_x = fs_reg(retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UB)); + /* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */ + fs_reg actual_y = byte_offset(actual_x, 1); - abld.SHR(int_rate_y, actual_y, brw_imm_ud(1)); - abld.SHR(int_rate_x, actual_x, brw_imm_ud(1)); - abld.SHL(int_rate_x, int_rate_x, brw_imm_ud(2)); - abld.OR(rate, int_rate_x, int_rate_y); - } else { - abld.MOV(rate, brw_imm_ud(0)); - } + fs_reg int_rate_x = bld.vgrf(BRW_REGISTER_TYPE_UD); + fs_reg int_rate_y = bld.vgrf(BRW_REGISTER_TYPE_UD); + + abld.SHR(int_rate_y, actual_y, brw_imm_ud(1)); + abld.SHR(int_rate_x, actual_x, brw_imm_ud(1)); + abld.SHL(int_rate_x, int_rate_x, brw_imm_ud(2)); + + fs_reg rate = abld.vgrf(BRW_REGISTER_TYPE_UD); + abld.OR(rate, int_rate_x, int_rate_y); + + if (wm_prog_data->coarse_pixel_dispatch == BRW_ALWAYS) + return rate; + + check_dynamic_msaa_flag(abld, wm_prog_data, + BRW_WM_MSAA_FLAG_COARSE_RT_WRITES); + set_predicate(BRW_PREDICATE_NORMAL, abld.SEL(rate, rate, brw_imm_ud(0))); return rate; } @@ -2137,12 +2124,12 @@ fs_visitor::split_virtual_grfs() */ compact_virtual_grfs(); - int num_vars = this->alloc.count; + unsigned num_vars = this->alloc.count; /* Count the total number of registers */ - int reg_count = 0; - int vgrf_to_reg[num_vars]; - for (int i = 0; i < num_vars; i++) { + unsigned reg_count = 0; + unsigned vgrf_to_reg[num_vars]; + for (unsigned i = 0; i < num_vars; i++) { vgrf_to_reg[i] = reg_count; reg_count += alloc.sizes[i]; } @@ -2159,14 +2146,14 @@ fs_visitor::split_virtual_grfs() /* Mark all used registers as fully splittable */ foreach_block_and_inst(block, fs_inst, inst, cfg) { if (inst->dst.file == VGRF) { - int reg = vgrf_to_reg[inst->dst.nr]; + unsigned reg = vgrf_to_reg[inst->dst.nr]; for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++) split_points[reg + j] = true; } - for (int i = 0; i < inst->sources; i++) { + for (unsigned i = 0; i < inst->sources; i++) { if (inst->src[i].file == VGRF) { - int reg = vgrf_to_reg[inst->src[i].nr]; + unsigned reg = vgrf_to_reg[inst->src[i].nr]; for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++) split_points[reg + j] = true; } @@ -2181,13 +2168,13 @@ fs_visitor::split_virtual_grfs() } if (inst->dst.file == VGRF) { - int reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE; + unsigned reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE; for (unsigned j = 1; j < regs_written(inst); j++) split_points[reg + j] = false; } - for (int i = 0; i < inst->sources; i++) { + for (unsigned i = 0; i < inst->sources; i++) { if (inst->src[i].file == VGRF) { - int reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE; + unsigned reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE; for (unsigned j = 1; j < regs_read(inst, i); j++) split_points[reg + j] = false; } @@ -2198,19 +2185,19 @@ fs_visitor::split_virtual_grfs() bool *vgrf_has_split = new bool[num_vars]; memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split)); - int *new_virtual_grf = new int[reg_count]; - int *new_reg_offset = new int[reg_count]; + unsigned *new_virtual_grf = new unsigned[reg_count]; + unsigned *new_reg_offset = new unsigned[reg_count]; - int reg = 0; + unsigned reg = 0; bool has_splits = false; - for (int i = 0; i < num_vars; i++) { + for (unsigned i = 0; i < num_vars; i++) { /* The first one should always be 0 as a quick sanity 
check. */ assert(split_points[reg] == false); /* j = 0 case */ new_reg_offset[reg] = 0; reg++; - int offset = 1; + unsigned offset = 1; /* j > 0 case */ for (unsigned j = 1; j < alloc.sizes[i]; j++) { @@ -2221,8 +2208,8 @@ fs_visitor::split_virtual_grfs() has_splits = true; vgrf_has_split[i] = true; assert(offset <= MAX_VGRF_SIZE); - int grf = alloc.allocate(offset); - for (int k = reg - offset; k < reg; k++) + unsigned grf = alloc.allocate(offset); + for (unsigned k = reg - offset; k < reg; k++) new_virtual_grf[k] = grf; offset = 0; } @@ -2234,7 +2221,7 @@ fs_visitor::split_virtual_grfs() /* The last one gets the original register number */ assert(offset <= MAX_VGRF_SIZE); alloc.sizes[i] = offset; - for (int k = reg - offset; k < reg; k++) + for (unsigned k = reg - offset; k < reg; k++) new_virtual_grf[k] = i; } assert(reg == reg_count); @@ -2268,7 +2255,7 @@ fs_visitor::split_virtual_grfs() } else { reg = vgrf_to_reg[inst->dst.nr]; assert(new_reg_offset[reg] == 0); - assert(new_virtual_grf[reg] == (int)inst->dst.nr); + assert(new_virtual_grf[reg] == inst->dst.nr); } continue; } @@ -2279,14 +2266,13 @@ fs_visitor::split_virtual_grfs() inst->dst.nr = new_virtual_grf[reg]; inst->dst.offset = new_reg_offset[reg] * REG_SIZE + inst->dst.offset % REG_SIZE; - assert((unsigned)new_reg_offset[reg] < - alloc.sizes[new_virtual_grf[reg]]); + assert(new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]); } else { assert(new_reg_offset[reg] == inst->dst.offset / REG_SIZE); - assert(new_virtual_grf[reg] == (int)inst->dst.nr); + assert(new_virtual_grf[reg] == inst->dst.nr); } } - for (int i = 0; i < inst->sources; i++) { + for (unsigned i = 0; i < inst->sources; i++) { if (inst->src[i].file != VGRF) continue; @@ -2295,11 +2281,10 @@ fs_visitor::split_virtual_grfs() inst->src[i].nr = new_virtual_grf[reg]; inst->src[i].offset = new_reg_offset[reg] * REG_SIZE + inst->src[i].offset % REG_SIZE; - assert((unsigned)new_reg_offset[reg] < - alloc.sizes[new_virtual_grf[reg]]); + assert(new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]); } else { assert(new_reg_offset[reg] == inst->src[i].offset / REG_SIZE); - assert(new_virtual_grf[reg] == (int)inst->src[i].nr); + assert(new_virtual_grf[reg] == inst->src[i].nr); } } } @@ -2507,8 +2492,14 @@ fs_visitor::lower_constant_loads() const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD); const unsigned base = pull_index * 4; - ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, - dst, brw_imm_ud(index), brw_imm_ud(base & ~(block_sz - 1))); + fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS]; + srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = brw_imm_ud(index); + srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1)); + srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz); + + + ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, dst, + srcs, PULL_UNIFORM_CONSTANT_SRCS); /* Rewrite the instruction to use the temporary VGRF. 
*/ inst->src[i].file = VGRF; @@ -2626,6 +2617,7 @@ fs_visitor::opt_algebraic() /* a * 1.0 = a */ if (inst->src[1].is_one()) { inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; inst->src[1] = reg_undef; progress = true; break; @@ -2634,6 +2626,7 @@ fs_visitor::opt_algebraic() /* a * -1.0 = -a */ if (inst->src[1].is_negative_one()) { inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; inst->src[0].negate = !inst->src[0].negate; inst->src[1] = reg_undef; progress = true; @@ -2648,6 +2641,7 @@ fs_visitor::opt_algebraic() if (brw_reg_type_is_integer(inst->src[1].type) && inst->src[1].is_zero()) { inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; inst->src[1] = reg_undef; progress = true; break; @@ -2656,6 +2650,7 @@ fs_visitor::opt_algebraic() if (inst->src[0].file == IMM) { assert(inst->src[0].type == BRW_REGISTER_TYPE_F); inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; inst->src[0].f += inst->src[1].f; inst->src[1] = reg_undef; progress = true; @@ -2671,9 +2666,11 @@ fs_visitor::opt_algebraic() */ if (inst->src[0].negate) { inst->opcode = BRW_OPCODE_NOT; + inst->sources = 1; inst->src[0].negate = false; } else { inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; } inst->src[1] = reg_undef; progress = true; @@ -2720,6 +2717,7 @@ fs_visitor::opt_algebraic() } if (inst->src[0].equals(inst->src[1])) { inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; inst->src[1] = reg_undef; inst->predicate = BRW_PREDICATE_NONE; inst->predicate_inverse = false; @@ -2732,6 +2730,7 @@ fs_visitor::opt_algebraic() case BRW_REGISTER_TYPE_F: if (inst->src[1].f >= 1.0f) { inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; inst->src[1] = reg_undef; inst->conditional_mod = BRW_CONDITIONAL_NONE; progress = true; @@ -2747,6 +2746,7 @@ fs_visitor::opt_algebraic() case BRW_REGISTER_TYPE_F: if (inst->src[1].f <= 0.0f) { inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; inst->src[1] = reg_undef; inst->conditional_mod = BRW_CONDITIONAL_NONE; progress = true; @@ -2767,11 +2767,13 @@ fs_visitor::opt_algebraic() break; if (inst->src[1].is_one()) { inst->opcode = BRW_OPCODE_ADD; + inst->sources = 2; inst->src[1] = inst->src[2]; inst->src[2] = reg_undef; progress = true; } else if (inst->src[2].is_one()) { inst->opcode = BRW_OPCODE_ADD; + inst->sources = 2; inst->src[2] = reg_undef; progress = true; } @@ -2850,9 +2852,9 @@ fs_visitor::opt_zero_samples() /* Gfx4 infers the texturing opcode based on the message length so we can't * change it. Gfx12.5 has restrictions on the number of coordinate * parameters that have to be provided for some texture types - * (Wa_14013363432). + * (Wa_14012688258). */ - if (devinfo->ver < 5 || devinfo->verx10 == 125) + if (devinfo->ver < 5 || intel_needs_workaround(devinfo, 14012688258)) return false; bool progress = false; @@ -3725,106 +3727,6 @@ fs_visitor::insert_gfx4_send_dependency_workarounds() invalidate_analysis(DEPENDENCY_INSTRUCTIONS); } -/** - * Turns the generic expression-style uniform pull constant load instruction - * into a hardware-specific series of instructions for loading a pull - * constant. - * - * The expression style allows the CSE pass before this to optimize out - * repeated loads from the same offset, and gives the pre-register-allocation - * scheduling full flexibility, while the conversion to native instructions - * allows the post-register-allocation scheduler the best information - * possible. 
- * - * Note that execution masking for setting up pull constant loads is special: - * the channels that need to be written are unrelated to the current execution - * mask, since a later instruction will use one of the result channels as a - * source operand for all 8 or 16 of its channels. - */ -void -fs_visitor::lower_uniform_pull_constant_loads() -{ - foreach_block_and_inst (block, fs_inst, inst, cfg) { - if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD) - continue; - - const fs_reg& surface = inst->src[0]; - const fs_reg& offset_B = inst->src[1]; - assert(offset_B.file == IMM); - - if (devinfo->has_lsc) { - const fs_builder ubld = - fs_builder(this, block, inst).group(8, 0).exec_all(); - - const fs_reg payload = ubld.vgrf(BRW_REGISTER_TYPE_UD); - ubld.MOV(payload, offset_B); - - inst->sfid = GFX12_SFID_UGM; - inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, - 1 /* simd_size */, - LSC_ADDR_SURFTYPE_BTI, - LSC_ADDR_SIZE_A32, - 1 /* num_coordinates */, - LSC_DATA_SIZE_D32, - inst->size_written / 4, - true /* transpose */, - LSC_CACHE_LOAD_L1STATE_L3MOCS, - true /* has_dest */); - - fs_reg ex_desc; - if (surface.file == IMM) { - ex_desc = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud)); - } else { - /* We only need the first component for the payload so we can use - * one of the other components for the extended descriptor - */ - ex_desc = component(payload, 1); - ubld.group(1, 0).SHL(ex_desc, surface, brw_imm_ud(24)); - } - - /* Update the original instruction. */ - inst->opcode = SHADER_OPCODE_SEND; - inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc); - inst->ex_mlen = 0; - inst->header_size = 0; - inst->send_has_side_effects = false; - inst->send_is_volatile = true; - inst->exec_size = 1; - - /* Finally, the payload */ - inst->resize_sources(3); - inst->src[0] = brw_imm_ud(0); /* desc */ - inst->src[1] = ex_desc; - inst->src[2] = payload; - - invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); - } else if (devinfo->ver >= 7) { - const fs_builder ubld = fs_builder(this, block, inst).exec_all(); - const fs_reg payload = ubld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD); - - ubld.group(8, 0).MOV(payload, - retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); - ubld.group(1, 0).MOV(component(payload, 2), - brw_imm_ud(offset_B.ud / 16)); - - inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7; - inst->src[1] = payload; - inst->header_size = 1; - inst->mlen = 1; - - invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); - } else { - /* Before register allocation, we didn't tell the scheduler about the - * MRF we use. We know it's safe to use this MRF because nothing - * else does except for register spill/unspill, which generates and - * uses its MRF within a single IR instruction. - */ - inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->ver) + 1; - inst->mlen = 1; - } - } -} - bool fs_visitor::lower_load_payload() { @@ -3933,15 +3835,147 @@ fs_visitor::lower_load_payload() return progress; } +/** + * Factor an unsigned 32-bit integer. + * + * Attempts to factor \c x into two values that are at most 0xFFFF. If no + * such factorization is possible, either because the value is too large or is + * prime, both \c result_a and \c result_b will be zero. + */ +static void +factor_uint32(uint32_t x, unsigned *result_a, unsigned *result_b) +{ + /* This is necessary to prevent various opportunities for division by zero + * below. + */ + assert(x > 0xffff); + + /* This represents the actual expected constraints on the input. 
Namely, + * both the upper and lower words should be > 1. + */ + assert(x >= 0x00020002); + + *result_a = 0; + *result_b = 0; + + /* The value is too large to factor with the constraints. */ + if (x > (0xffffu * 0xffffu)) + return; + + /* A non-prime number will have the form p*q*d where p is some prime + * number, q > 1, and 1 <= d <= q. To meet the constraints of this + * function, (p*d) < 0x10000. This implies d <= floor(0xffff / p). + * Furthermore, since q < 0x10000, d >= floor(x / (0xffff * p)). Finally, + * floor(x / (0xffff * p)) <= d <= floor(0xffff / p). + * + * The observation is finding the largest possible value of p reduces the + * possible range of d. After selecting p, all values of d in this range + * are tested until a factorization is found. The size of the range of + * possible values of d sets an upper bound on the run time of the + * function. + */ + static const uint16_t primes[256] = { + 2, 3, 5, 7, 11, 13, 17, 19, + 23, 29, 31, 37, 41, 43, 47, 53, + 59, 61, 67, 71, 73, 79, 83, 89, + 97, 101, 103, 107, 109, 113, 127, 131, /* 32 */ + 137, 139, 149, 151, 157, 163, 167, 173, + 179, 181, 191, 193, 197, 199, 211, 223, + 227, 229, 233, 239, 241, 251, 257, 263, + 269, 271, 277, 281, 283, 293, 307, 311, /* 64 */ + 313, 317, 331, 337, 347, 349, 353, 359, + 367, 373, 379, 383, 389, 397, 401, 409, + 419, 421, 431, 433, 439, 443, 449, 457, + 461, 463, 467, 479, 487, 491, 499, 503, /* 96 */ + 509, 521, 523, 541, 547, 557, 563, 569, + 571, 577, 587, 593, 599, 601, 607, 613, + 617, 619, 631, 641, 643, 647, 653, 659, + 661, 673, 677, 683, 691, 701, 709, 719, /* 128 */ + 727, 733, 739, 743, 751, 757, 761, 769, + 773, 787, 797, 809, 811, 821, 823, 827, + 829, 839, 853, 857, 859, 863, 877, 881, + 883, 887, 907, 911, 919, 929, 937, 941, /* 160 */ + 947, 953, 967, 971, 977, 983, 991, 997, + 1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049, + 1051, 1061, 1063, 1069, 1087, 1091, 1093, 1097, + 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163, /* 192 */ + 1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223, + 1229, 1231, 1237, 1249, 1259, 1277, 1279, 1283, + 1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321, + 1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423, /* 224 */ + 1427, 1429, 1433, 1439, 1447, 1451, 1453, 1459, + 1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511, + 1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571, + 1579, 1583, 1597, 1601, 1607, 1609, 1613, 1619, /* 256 */ + }; + + unsigned p; + unsigned x_div_p; + + for (int i = ARRAY_SIZE(primes) - 1; i >= 0; i--) { + p = primes[i]; + x_div_p = x / p; + + if ((x_div_p * p) == x) + break; + } + + /* A prime factor was not found. */ + if (x_div_p * p != x) + return; + + /* Terminate early if d=1 is a solution. */ + if (x_div_p < 0x10000) { + *result_a = x_div_p; + *result_b = p; + return; + } + + /* Pick the maximum possible value for 'd'. It's important that the loop + * below execute while d <= max_d because max_d is a valid value. Having + * the wrong loop bound would cause 1627*1367*47 (0x063b0c83) to be + * incorrectly reported as not being factorable. The problem would occur + * with any value that is a factor of two primes in the table and one prime + * not in the table. + */ + const unsigned max_d = 0xffff / p; + + /* Pick an initial value of 'd' that (combined with rejecting too large + * values above) guarantees that 'q' will always be small enough. + * DIV_ROUND_UP is used to prevent 'd' from being zero. 
+ */ + for (unsigned d = DIV_ROUND_UP(x_div_p, 0xffff); d <= max_d; d++) { + unsigned q = x_div_p / d; + + if ((q * d) == x_div_p) { + assert(p * d * q == x); + assert((p * d) < 0x10000); + + *result_a = q; + *result_b = p * d; + break; + } + + /* Since every value of 'd' is tried, as soon as 'd' is larger + * than 'q', we're just re-testing combinations that have + * already been tested. + */ + if (d > q) + break; + } +} + void fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block) { const fs_builder ibld(this, block, inst); - const bool ud = (inst->src[1].type == BRW_REGISTER_TYPE_UD); + /* It is correct to use inst->src[1].d in both end of the comparison. + * Using .ud in the UINT16_MAX comparison would cause any negative value to + * fail the check. + */ if (inst->src[1].file == IMM && - (( ud && inst->src[1].ud <= UINT16_MAX) || - (!ud && inst->src[1].d <= INT16_MAX && inst->src[1].d >= INT16_MIN))) { + (inst->src[1].d >= INT16_MIN && inst->src[1].d <= UINT16_MAX)) { /* The MUL instruction isn't commutative. On Gen <= 6, only the low * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of * src1 are used. @@ -3949,6 +3983,7 @@ fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block) * If multiplying by an immediate value that fits in 16-bits, do a * single MUL instruction with that value in the proper location. */ + const bool ud = (inst->src[1].d >= 0); if (devinfo->ver < 7) { fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8), inst->dst.type); ibld.MOV(imm, inst->src[1]); @@ -4029,6 +4064,7 @@ fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block) high.stride = inst->dst.stride; high.offset = inst->dst.offset % REG_SIZE; + bool do_addition = true; if (devinfo->ver >= 7) { /* From Wa_1604601757: * @@ -4047,10 +4083,37 @@ fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block) lower_src_modifiers(this, block, inst, 1); if (inst->src[1].file == IMM) { - ibld.MUL(low, inst->src[0], - brw_imm_uw(inst->src[1].ud & 0xffff)); - ibld.MUL(high, inst->src[0], - brw_imm_uw(inst->src[1].ud >> 16)); + unsigned a; + unsigned b; + + /* If the immeditate value can be factored into two values, A and + * B, that each fit in 16-bits, the multiplication result can + * instead be calculated as (src1 * (A * B)) = ((src1 * A) * B). + * This saves an operation (the addition) and a temporary register + * (high). + * + * Skip the optimization if either the high word or the low word + * is 0 or 1. In these conditions, at least one of the + * multiplications generated by the straightforward method will be + * eliminated anyway. 
+ */ + if (inst->src[1].ud > 0x0001ffff && + (inst->src[1].ud & 0xffff) > 1) { + factor_uint32(inst->src[1].ud, &a, &b); + + if (a != 0) { + ibld.MUL(low, inst->src[0], brw_imm_uw(a)); + ibld.MUL(low, low, brw_imm_uw(b)); + do_addition = false; + } + } + + if (do_addition) { + ibld.MUL(low, inst->src[0], + brw_imm_uw(inst->src[1].ud & 0xffff)); + ibld.MUL(high, inst->src[0], + brw_imm_uw(inst->src[1].ud >> 16)); + } } else { ibld.MUL(low, inst->src[0], subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0)); @@ -4067,9 +4130,11 @@ fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block) inst->src[1]); } - ibld.ADD(subscript(low, BRW_REGISTER_TYPE_UW, 1), - subscript(low, BRW_REGISTER_TYPE_UW, 1), - subscript(high, BRW_REGISTER_TYPE_UW, 0)); + if (do_addition) { + ibld.ADD(subscript(low, BRW_REGISTER_TYPE_UW, 1), + subscript(low, BRW_REGISTER_TYPE_UW, 1), + subscript(high, BRW_REGISTER_TYPE_UW, 0)); + } if (needs_mov || inst->conditional_mod) set_condmod(inst->conditional_mod, ibld.MOV(orig_dst, low)); @@ -5052,7 +5117,6 @@ get_lowered_simd_width(const struct brw_compiler *compiler, return 8; case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: - case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: @@ -5074,11 +5138,6 @@ get_lowered_simd_width(const struct brw_compiler *compiler, return inst->exec_size; case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL: - case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL: - case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL: - case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL: - case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL: - case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT64_LOGICAL: return devinfo->has_lsc ? MIN2(16, inst->exec_size) : 8; case SHADER_OPCODE_URB_READ_LOGICAL: @@ -6197,7 +6256,7 @@ needs_dummy_fence(const intel_device_info *devinfo, fs_inst *inst) return false; } -/* Wa_14017989577 +/* Wa_14015360517 * * The first instruction of any kernel should have non-zero emask. * Make sure this happens by introducing a dummy mov instruction. 
@@ -6205,7 +6264,7 @@ needs_dummy_fence(const intel_device_info *devinfo, fs_inst *inst) void fs_visitor::emit_dummy_mov_instruction() { - if (devinfo->verx10 < 120) + if (!intel_needs_workaround(devinfo, 14015360517)) return; struct backend_instruction *first_inst = @@ -6240,7 +6299,7 @@ fs_visitor::emit_dummy_memory_fence_before_eot() bool progress = false; bool has_ugm_write_or_atomic = false; - if (!intel_device_info_is_dg2(devinfo)) + if (!intel_needs_workaround(devinfo, 22013689345)) return; foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { @@ -6388,15 +6447,18 @@ fs_visitor::fixup_nomask_control_flow() */ const bool save_flag = flag_liveout & flag_mask(flag, dispatch_width / 8); - const fs_reg tmp = ubld.group(1, 0).vgrf(flag.type); + const fs_reg tmp = ubld.group(8, 0).vgrf(flag.type); - if (save_flag) + if (save_flag) { + ubld.group(8, 0).UNDEF(tmp); ubld.group(1, 0).MOV(tmp, flag); + } ubld.emit(FS_OPCODE_LOAD_LIVE_CHANNELS); set_predicate(pred, inst); inst->flag_subreg = 0; + inst->predicate_trivial = true; if (save_flag) ubld.group(1, 0).at(block, inst->next).MOV(flag, tmp); @@ -6419,6 +6481,18 @@ fs_visitor::fixup_nomask_control_flow() return progress; } +uint32_t +fs_visitor::compute_max_register_pressure() +{ + const register_pressure &rp = regpressure_analysis.require(); + uint32_t ip = 0, max_pressure = 0; + foreach_block_and_inst(block, backend_instruction, inst, cfg) { + max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]); + ip++; + } + return max_pressure; +} + void fs_visitor::allocate_registers(bool allow_spilling) { @@ -6438,6 +6512,11 @@ fs_visitor::allocate_registers(bool allow_spilling) "lifo" }; + compact_virtual_grfs(); + + if (needs_register_pressure) + shader_stats.max_register_pressure = compute_max_register_pressure(); + bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS); /* Before we schedule anything, stash off the instruction order as an array @@ -6587,7 +6666,7 @@ fs_visitor::run_vs() fixup_3src_null_dest(); emit_dummy_memory_fence_before_eot(); - /* Wa_14017989577 */ + /* Wa_14015360517 */ emit_dummy_mov_instruction(); allocate_registers(true /* allow_spilling */); @@ -6713,7 +6792,7 @@ fs_visitor::run_tcs() fixup_3src_null_dest(); emit_dummy_memory_fence_before_eot(); - /* Wa_14017989577 */ + /* Wa_14015360517 */ emit_dummy_mov_instruction(); allocate_registers(true /* allow_spilling */); @@ -6745,7 +6824,7 @@ fs_visitor::run_tes() fixup_3src_null_dest(); emit_dummy_memory_fence_before_eot(); - /* Wa_14017989577 */ + /* Wa_14015360517 */ emit_dummy_mov_instruction(); allocate_registers(true /* allow_spilling */); @@ -6793,7 +6872,7 @@ fs_visitor::run_gs() fixup_3src_null_dest(); emit_dummy_memory_fence_before_eot(); - /* Wa_14017989577 */ + /* Wa_14015360517 */ emit_dummy_mov_instruction(); allocate_registers(true /* allow_spilling */); @@ -6896,7 +6975,7 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send) fixup_3src_null_dest(); emit_dummy_memory_fence_before_eot(); - /* Wa_14017989577 */ + /* Wa_14015360517 */ emit_dummy_mov_instruction(); allocate_registers(allow_spilling); @@ -6936,7 +7015,7 @@ fs_visitor::run_cs(bool allow_spilling) fixup_3src_null_dest(); emit_dummy_memory_fence_before_eot(); - /* Wa_14017989577 */ + /* Wa_14015360517 */ emit_dummy_mov_instruction(); allocate_registers(allow_spilling); @@ -6968,7 +7047,7 @@ fs_visitor::run_bs(bool allow_spilling) fixup_3src_null_dest(); emit_dummy_memory_fence_before_eot(); - /* Wa_14017989577 */ + /* Wa_14015360517 */ emit_dummy_mov_instruction(); 
allocate_registers(allow_spilling); @@ -7001,7 +7080,7 @@ fs_visitor::run_task(bool allow_spilling) fixup_3src_null_dest(); emit_dummy_memory_fence_before_eot(); - /* Wa_14017989577 */ + /* Wa_14015360517 */ emit_dummy_mov_instruction(); allocate_registers(allow_spilling); @@ -7034,7 +7113,7 @@ fs_visitor::run_mesh(bool allow_spilling) fixup_3src_null_dest(); emit_dummy_memory_fence_before_eot(); - /* Wa_14017989577 */ + /* Wa_14015360517 */ emit_dummy_mov_instruction(); allocate_registers(allow_spilling); @@ -7045,7 +7124,10 @@ fs_visitor::run_mesh(bool allow_spilling) static bool is_used_in_not_interp_frag_coord(nir_ssa_def *def) { - nir_foreach_use(src, def) { + nir_foreach_use_including_if(src, def) { + if (src->is_if) + return true; + if (src->parent_instr->type != nir_instr_type_intrinsic) return true; @@ -7054,9 +7136,6 @@ is_used_in_not_interp_frag_coord(nir_ssa_def *def) return true; } - nir_foreach_if_use(src, def) - return true; - return false; } @@ -7088,6 +7167,8 @@ brw_compute_barycentric_interp_modes(const struct intel_device_info *devinfo, case nir_intrinsic_load_barycentric_pixel: case nir_intrinsic_load_barycentric_centroid: case nir_intrinsic_load_barycentric_sample: + case nir_intrinsic_load_barycentric_at_sample: + case nir_intrinsic_load_barycentric_at_offset: break; default: continue; @@ -7182,11 +7263,12 @@ brw_nir_move_interpolation_to_top(nir_shader *nir) continue; nir_block *top = nir_start_block(f->impl); - exec_node *cursor_node = NULL; + nir_cursor cursor = nir_before_instr(nir_block_first_instr(top)); + bool impl_progress = false; - nir_foreach_block(block, f->impl) { - if (block == top) - continue; + for (nir_block *block = nir_block_cf_tree_next(top); + block != NULL; + block = nir_block_cf_tree_next(block)) { nir_foreach_instr_safe(instr, block) { if (instr->type != nir_instr_type_intrinsic) @@ -7212,28 +7294,25 @@ brw_nir_move_interpolation_to_top(nir_shader *nir) for (unsigned i = 0; i < ARRAY_SIZE(move); i++) { if (move[i]->block != top) { - move[i]->block = top; - exec_node_remove(&move[i]->node); - if (cursor_node) { - exec_node_insert_after(cursor_node, &move[i]->node); - } else { - exec_list_push_head(&top->instr_list, &move[i]->node); - } - cursor_node = &move[i]->node; - progress = true; + nir_instr_move(cursor, move[i]); + impl_progress = true; } } } } - nir_metadata_preserve(f->impl, nir_metadata_block_index | - nir_metadata_dominance); + + progress = progress || impl_progress; + + nir_metadata_preserve(f->impl, impl_progress ? 
(nir_metadata_block_index | + nir_metadata_dominance) + : nir_metadata_all); } return progress; } static void -brw_nir_populate_wm_prog_data(const nir_shader *shader, +brw_nir_populate_wm_prog_data(nir_shader *shader, const struct intel_device_info *devinfo, const struct brw_wm_prog_key *key, struct brw_wm_prog_data *prog_data, @@ -7252,10 +7331,28 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader, prog_data->computed_stencil = shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL); - prog_data->persample_dispatch = - key->multisample_fbo && - (key->persample_interp || - shader->info.fs.uses_sample_shading); + prog_data->sample_shading = + shader->info.fs.uses_sample_shading || + shader->info.outputs_read; + + assert(key->multisample_fbo != BRW_NEVER || + key->persample_interp == BRW_NEVER); + + prog_data->persample_dispatch = key->persample_interp; + if (prog_data->sample_shading) + prog_data->persample_dispatch = BRW_ALWAYS; + + /* We can only persample dispatch if we have a multisample FBO */ + prog_data->persample_dispatch = MIN2(prog_data->persample_dispatch, + key->multisample_fbo); + + /* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If + * persample_dispatch & multisample_fbo are not dynamic, Anv should be able + * to definitively tell whether alpha_to_coverage is on or off. + */ + prog_data->alpha_to_coverage = key->alpha_to_coverage; + assert(prog_data->alpha_to_coverage != BRW_SOMETIMES || + prog_data->persample_dispatch == BRW_SOMETIMES); if (devinfo->ver >= 6) { prog_data->uses_sample_mask = @@ -7270,11 +7367,16 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader, * per-sample dispatch. If we need gl_SamplePosition and we don't have * persample dispatch, we hard-code it to 0.5. */ - prog_data->uses_pos_offset = prog_data->persample_dispatch && - (BITSET_TEST(shader->info.system_values_read, - SYSTEM_VALUE_SAMPLE_POS) || - BITSET_TEST(shader->info.system_values_read, - SYSTEM_VALUE_SAMPLE_POS_OR_CENTER)); + prog_data->read_pos_offset_input = + BITSET_TEST(shader->info.system_values_read, + SYSTEM_VALUE_SAMPLE_POS) || + BITSET_TEST(shader->info.system_values_read, + SYSTEM_VALUE_SAMPLE_POS_OR_CENTER); + + if (prog_data->read_pos_offset_input) + prog_data->uses_pos_offset = prog_data->persample_dispatch; + else + prog_data->uses_pos_offset = BRW_NEVER; } prog_data->has_render_target_reads = shader->info.outputs_read != 0ull; @@ -7285,19 +7387,72 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader, prog_data->barycentric_interp_modes = brw_compute_barycentric_interp_modes(devinfo, shader); + + /* From the BDW PRM documentation for 3DSTATE_WM: + * + * "MSDISPMODE_PERSAMPLE is required in order to select Perspective + * Sample or Non- perspective Sample barycentric coordinates." + * + * So cleanup any potentially set sample barycentric mode when not in per + * sample dispatch. 
+ */ + if (prog_data->persample_dispatch == BRW_NEVER) { + prog_data->barycentric_interp_modes &= + ~BITFIELD_BIT(BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE); + } + prog_data->uses_nonperspective_interp_modes |= (prog_data->barycentric_interp_modes & BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) != 0; - /* You can't be coarse and per-sample */ - assert(!key->coarse_pixel || !key->persample_interp); - prog_data->per_coarse_pixel_dispatch = - key->coarse_pixel && - !shader->info.fs.uses_sample_shading && - !prog_data->uses_omask && - !prog_data->uses_sample_mask && - (prog_data->computed_depth_mode == BRW_PSCDEPTH_OFF) && - !prog_data->computed_stencil; + /* The current VK_EXT_graphics_pipeline_library specification requires + * coarse to specified at compile time. But per sample interpolation can be + * dynamic. So we should never be in a situation where coarse & + * persample_interp are both respectively true & BRW_ALWAYS. + * + * Coarse will dynamically turned off when persample_interp is active. + */ + assert(!key->coarse_pixel || key->persample_interp != BRW_ALWAYS); + + prog_data->coarse_pixel_dispatch = + brw_sometimes_invert(prog_data->persample_dispatch); + if (!key->coarse_pixel || + prog_data->uses_omask || + prog_data->sample_shading || + prog_data->uses_sample_mask || + (prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) || + prog_data->computed_stencil) { + prog_data->coarse_pixel_dispatch = BRW_NEVER; + } + + /* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater, + * Message Descriptor : + * + * "Message Type. Specifies the type of message being sent when + * pixel-rate evaluation is requested : + * + * Format = U2 + * 0: Per Message Offset (eval_snapped with immediate offset) + * 1: Sample Position Offset (eval_sindex) + * 2: Centroid Position Offset (eval_centroid) + * 3: Per Slot Offset (eval_snapped with register offset) + * + * Message Type. Specifies the type of message being sent when + * coarse-rate evaluation is requested : + * + * Format = U2 + * 0: Coarse to Pixel Mapping Message (internal message) + * 1: Reserved + * 2: Coarse Centroid Position (eval_centroid) + * 3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)" + * + * The Sample Position Offset is marked as reserved for coarse rate + * evaluation and leads to hangs if we try to use it. So disable coarse + * pixel shading if we have any intrinsic that will result in a pixel + * interpolater message at sample. + */ + if (brw_nir_pulls_at_sample(shader)) + prog_data->coarse_pixel_dispatch = BRW_NEVER; /* We choose to always enable VMask prior to XeHP, as it would cause * us to lose out on the eliminate_find_live_channel() optimization. 
@@ -7305,16 +7460,16 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader, prog_data->uses_vmask = devinfo->verx10 < 125 || shader->info.fs.needs_quad_helper_invocations || shader->info.fs.needs_all_helper_invocations || - prog_data->per_coarse_pixel_dispatch; + prog_data->coarse_pixel_dispatch != BRW_NEVER; prog_data->uses_src_w = BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD); prog_data->uses_src_depth = BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) && - !prog_data->per_coarse_pixel_dispatch; + prog_data->coarse_pixel_dispatch != BRW_ALWAYS; prog_data->uses_depth_w_coefficients = BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) && - prog_data->per_coarse_pixel_dispatch; + prog_data->coarse_pixel_dispatch != BRW_NEVER; calculate_urb_setup(devinfo, key, prog_data, shader, mue_map); brw_compute_flat_inputs(prog_data, shader); @@ -7361,35 +7516,35 @@ brw_compile_fs(const struct brw_compiler *compiler, * "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in * hardware, regardless of the state setting for this feature." */ - if (devinfo->ver > 6 && key->alpha_to_coverage) { + if (devinfo->ver > 6 && key->alpha_to_coverage != BRW_NEVER) { /* Run constant fold optimization in order to get the correct source * offset to determine render target 0 store instruction in * emit_alpha_to_coverage pass. */ - NIR_PASS_V(nir, nir_opt_constant_folding); - NIR_PASS_V(nir, brw_nir_lower_alpha_to_coverage); + NIR_PASS(_, nir, nir_opt_constant_folding); + NIR_PASS(_, nir, brw_nir_lower_alpha_to_coverage, key, prog_data); } - NIR_PASS_V(nir, brw_nir_move_interpolation_to_top); + NIR_PASS(_, nir, brw_nir_move_interpolation_to_top); brw_postprocess_nir(nir, compiler, true, debug_enabled, key->base.robust_buffer_access); brw_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data, params->mue_map); - fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL; + std::unique_ptr<fs_visitor> v8, v16, v32; cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL; float throughput = 0; bool has_spilled = false; - v8 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base, - &prog_data->base, nir, 8, - debug_enabled); + v8 = std::make_unique<fs_visitor>(compiler, params->log_data, mem_ctx, &key->base, + &prog_data->base, nir, 8, + params->stats != NULL, + debug_enabled); if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) { params->error_str = ralloc_strdup(mem_ctx, v8->fail_msg); - delete v8; return NULL; - } else if (!INTEL_DEBUG(DEBUG_NO8)) { + } else if (INTEL_SIMD(FS, 8)) { simd8_cfg = v8->cfg; prog_data->base.dispatch_grf_start_reg = v8->payload().num_regs; prog_data->reg_blocks_8 = brw_register_blocks(v8->grf_used); @@ -7403,7 +7558,7 @@ brw_compile_fs(const struct brw_compiler *compiler, * See: https://gitlab.freedesktop.org/mesa/mesa/-/issues/1917 */ if (devinfo->ver == 8 && prog_data->dual_src_blend && - !INTEL_DEBUG(DEBUG_NO8)) { + INTEL_SIMD(FS, 8)) { assert(!params->use_rep_send); v8->limit_dispatch_width(8, "gfx8 workaround: " "using SIMD8 when dual src blending.\n"); @@ -7423,12 +7578,13 @@ brw_compile_fs(const struct brw_compiler *compiler, if (!has_spilled && v8->max_dispatch_width >= 16 && - (!INTEL_DEBUG(DEBUG_NO16) || params->use_rep_send)) { + (INTEL_SIMD(FS, 16) || params->use_rep_send)) { /* Try a SIMD16 compile */ - v16 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base, - &prog_data->base, nir, 16, - debug_enabled); - v16->import_uniforms(v8); + v16 = 
std::make_unique<fs_visitor>(compiler, params->log_data, mem_ctx, &key->base, + &prog_data->base, nir, 16, + params->stats != NULL, + debug_enabled); + v16->import_uniforms(v8.get()); if (!v16->run_fs(allow_spilling, params->use_rep_send)) { brw_shader_perf_log(compiler, params->log_data, "SIMD16 shader failed to compile: %s\n", @@ -7450,12 +7606,13 @@ brw_compile_fs(const struct brw_compiler *compiler, if (!has_spilled && v8->max_dispatch_width >= 32 && !params->use_rep_send && devinfo->ver >= 6 && !simd16_failed && - !INTEL_DEBUG(DEBUG_NO32)) { + INTEL_SIMD(FS, 32)) { /* Try a SIMD32 compile */ - v32 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base, - &prog_data->base, nir, 32, - debug_enabled); - v32->import_uniforms(v8); + v32 = std::make_unique<fs_visitor>(compiler, params->log_data, mem_ctx, &key->base, + &prog_data->base, nir, 32, + params->stats != NULL, + debug_enabled); + v32->import_uniforms(v8.get()); if (!v32->run_fs(allow_spilling, false)) { brw_shader_perf_log(compiler, params->log_data, "SIMD32 shader failed to compile: %s\n", @@ -7520,12 +7677,14 @@ brw_compile_fs(const struct brw_compiler *compiler, } struct brw_compile_stats *stats = params->stats; + uint32_t max_dispatch_width = 0; if (simd8_cfg) { prog_data->dispatch_8 = true; g.generate_code(simd8_cfg, 8, v8->shader_stats, v8->performance_analysis.require(), stats); stats = stats ? stats + 1 : NULL; + max_dispatch_width = 8; } if (simd16_cfg) { @@ -7534,6 +7693,7 @@ brw_compile_fs(const struct brw_compiler *compiler, simd16_cfg, 16, v16->shader_stats, v16->performance_analysis.require(), stats); stats = stats ? stats + 1 : NULL; + max_dispatch_width = 16; } if (simd32_cfg) { @@ -7542,14 +7702,13 @@ brw_compile_fs(const struct brw_compiler *compiler, simd32_cfg, 32, v32->shader_stats, v32->performance_analysis.require(), stats); stats = stats ? 
stats + 1 : NULL; + max_dispatch_width = 32; } - g.add_const_data(nir->constant_data, nir->constant_data_size); - - delete v8; - delete v16; - delete v32; + for (struct brw_compile_stats *s = params->stats; s != NULL && s != stats; s++) + s->max_dispatch_width = max_dispatch_width; + g.add_const_data(nir->constant_data, nir->constant_data_size); return g.get_assembly(); } @@ -7707,15 +7866,17 @@ brw_compile_cs(const struct brw_compiler *compiler, prog_data->local_size[2] = nir->info.workgroup_size[2]; } - const unsigned required_dispatch_width = - brw_required_dispatch_width(&nir->info); + brw_simd_selection_state simd_state{ + .mem_ctx = mem_ctx, + .devinfo = compiler->devinfo, + .prog_data = prog_data, + .required_width = brw_required_dispatch_width(&nir->info), + }; - fs_visitor *v[3] = {0}; - const char *error[3] = {0}; + std::unique_ptr<fs_visitor> v[3]; for (unsigned simd = 0; simd < 3; simd++) { - if (!brw_simd_should_compile(mem_ctx, simd, compiler->devinfo, prog_data, - required_dispatch_width, &error[simd])) + if (!brw_simd_should_compile(simd_state, simd)) continue; const unsigned dispatch_width = 8u << simd; @@ -7733,24 +7894,23 @@ brw_compile_cs(const struct brw_compiler *compiler, brw_postprocess_nir(shader, compiler, true, debug_enabled, key->base.robust_buffer_access); - v[simd] = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base, - &prog_data->base, shader, dispatch_width, - debug_enabled); + v[simd] = std::make_unique<fs_visitor>(compiler, params->log_data, mem_ctx, &key->base, + &prog_data->base, shader, dispatch_width, + params->stats != NULL, + debug_enabled); - if (prog_data->prog_mask) { - unsigned first = ffs(prog_data->prog_mask) - 1; - v[simd]->import_uniforms(v[first]); - } + const int first = brw_simd_first_compiled(simd_state); + if (first >= 0) + v[simd]->import_uniforms(v[first].get()); - const bool allow_spilling = !prog_data->prog_mask || - nir->info.workgroup_size_variable; + const bool allow_spilling = first < 0 || nir->info.workgroup_size_variable; if (v[simd]->run_cs(allow_spilling)) { cs_fill_push_const_info(compiler->devinfo, prog_data); - brw_simd_mark_compiled(simd, prog_data, v[simd]->spilled_any_registers); + brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers); } else { - error[simd] = ralloc_strdup(mem_ctx, v[simd]->fail_msg); + simd_state.error[simd] = ralloc_strdup(mem_ctx, v[simd]->fail_msg); if (simd > 0) { brw_shader_perf_log(compiler, params->log_data, "SIMD%u shader failed to compile: %s\n", @@ -7759,21 +7919,20 @@ brw_compile_cs(const struct brw_compiler *compiler, } } - const int selected_simd = brw_simd_select(prog_data); + const int selected_simd = brw_simd_select(simd_state); if (selected_simd < 0) { params->error_str = ralloc_asprintf(mem_ctx, "Can't compile shader: %s, %s and %s.\n", - error[0], error[1], error[2]);; + simd_state.error[0], simd_state.error[1], + simd_state.error[2]); return NULL; } assert(selected_simd < 3); - fs_visitor *selected = v[selected_simd]; + fs_visitor *selected = v[selected_simd].get(); if (!nir->info.workgroup_size_variable) prog_data->prog_mask = 1 << selected_simd; - const unsigned *ret = NULL; - fs_generator g(compiler, params->log_data, mem_ctx, &prog_data->base, selected->runtime_check_aads_emit, MESA_SHADER_COMPUTE); if (unlikely(debug_enabled)) { @@ -7784,6 +7943,8 @@ brw_compile_cs(const struct brw_compiler *compiler, g.enable_debug(name); } + uint32_t max_dispatch_width = 8u << (util_last_bit(prog_data->prog_mask) - 1); + struct brw_compile_stats *stats = 
params->stats; for (unsigned simd = 0; simd < 3; simd++) { if (prog_data->prog_mask & (1u << simd)) { @@ -7791,19 +7952,16 @@ brw_compile_cs(const struct brw_compiler *compiler, prog_data->prog_offset[simd] = g.generate_code(v[simd]->cfg, 8u << simd, v[simd]->shader_stats, v[simd]->performance_analysis.require(), stats); + if (stats) + stats->max_dispatch_width = max_dispatch_width; stats = stats ? stats + 1 : NULL; + max_dispatch_width = 8u << simd; } } g.add_const_data(nir->constant_data, nir->constant_data_size); - ret = g.get_assembly(); - - delete v[0]; - delete v[1]; - delete v[2]; - - return ret; + return g.get_assembly(); } struct brw_cs_dispatch_info @@ -7817,9 +7975,7 @@ brw_cs_get_dispatch_info(const struct intel_device_info *devinfo, override_local_size ? override_local_size : prog_data->local_size; - const int simd = - override_local_size ? brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes) : - brw_simd_select(prog_data); + const int simd = brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes); assert(simd >= 0 && simd < 3); info.group_size = sizes[0] * sizes[1] * sizes[2]; @@ -7857,81 +8013,65 @@ compile_single_bs(const struct brw_compiler *compiler, void *log_data, brw_postprocess_nir(shader, compiler, true, debug_enabled, key->base.robust_buffer_access); - fs_visitor *v = NULL, *v8 = NULL, *v16 = NULL; - bool has_spilled = false; + brw_simd_selection_state simd_state{ + .mem_ctx = mem_ctx, + .devinfo = compiler->devinfo, + .prog_data = prog_data, - uint8_t simd_size = 0; - if ((shader->info.subgroup_size == SUBGROUP_SIZE_VARYING || - shader->info.subgroup_size == SUBGROUP_SIZE_REQUIRE_8) && - !INTEL_DEBUG(DEBUG_NO8)) { - v8 = new fs_visitor(compiler, log_data, mem_ctx, &key->base, - &prog_data->base, shader, - 8, debug_enabled); - const bool allow_spilling = true; - if (!v8->run_bs(allow_spilling)) { - if (error_str) - *error_str = ralloc_strdup(mem_ctx, v8->fail_msg); - delete v8; - return 0; - } else { - v = v8; - simd_size = 8; - if (v8->spilled_any_registers) - has_spilled = true; - } - } + /* Since divergence is a lot more likely in RT than compute, it makes + * sense to limit ourselves to SIMD8 for now. 
+ */ + .required_width = 8, + }; - if ((shader->info.subgroup_size == SUBGROUP_SIZE_VARYING || - shader->info.subgroup_size == SUBGROUP_SIZE_REQUIRE_16) && - !has_spilled && !INTEL_DEBUG(DEBUG_NO16)) { - v16 = new fs_visitor(compiler, log_data, mem_ctx, &key->base, - &prog_data->base, shader, - 16, debug_enabled); - const bool allow_spilling = (v == NULL); - if (!v16->run_bs(allow_spilling)) { - brw_shader_perf_log(compiler, log_data, - "SIMD16 shader failed to compile: %s\n", - v16->fail_msg); - if (v == NULL) { - assert(v8 == NULL); - if (error_str) { - *error_str = ralloc_asprintf( - mem_ctx, "SIMD8 disabled and couldn't generate SIMD16: %s", - v16->fail_msg); - } - delete v16; - return 0; - } + std::unique_ptr<fs_visitor> v[2]; + + for (unsigned simd = 0; simd < ARRAY_SIZE(v); simd++) { + if (!brw_simd_should_compile(simd_state, simd)) + continue; + + const unsigned dispatch_width = 8u << simd; + + v[simd] = std::make_unique<fs_visitor>(compiler, log_data, mem_ctx, &key->base, + &prog_data->base, shader, + dispatch_width, + stats != NULL, + debug_enabled); + + const bool allow_spilling = !brw_simd_any_compiled(simd_state); + if (v[simd]->run_bs(allow_spilling)) { + brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers); } else { - v = v16; - simd_size = 16; - if (v16->spilled_any_registers) - has_spilled = true; + simd_state.error[simd] = ralloc_strdup(mem_ctx, v[simd]->fail_msg); + if (simd > 0) { + brw_shader_perf_log(compiler, log_data, + "SIMD%u shader failed to compile: %s", + dispatch_width, v[simd]->fail_msg); + } } } - if (unlikely(v == NULL)) { - assert(INTEL_DEBUG(DEBUG_NO8 | DEBUG_NO16)); - if (error_str) { - *error_str = ralloc_strdup(mem_ctx, - "Cannot satisfy INTEL_DEBUG flags SIMD restrictions"); - } - return false; + const int selected_simd = brw_simd_select(simd_state); + if (selected_simd < 0) { + *error_str = ralloc_asprintf(mem_ctx, "Can't compile shader: %s and %s.", + simd_state.error[0], simd_state.error[1]); + return 0; } - assert(v); + assert(selected_simd < int(ARRAY_SIZE(v))); + fs_visitor *selected = v[selected_simd].get(); + assert(selected); + + const unsigned dispatch_width = selected->dispatch_width; - int offset = g->generate_code(v->cfg, simd_size, v->shader_stats, - v->performance_analysis.require(), stats); + int offset = g->generate_code(selected->cfg, dispatch_width, selected->shader_stats, + selected->performance_analysis.require(), stats); if (prog_offset) *prog_offset = offset; else assert(offset == 0); - delete v8; - delete v16; - - return simd_size; + return dispatch_width; } uint64_t |