Diffstat (limited to 'lib/mesa/src')
15 files changed, 565 insertions, 462 deletions
diff --git a/lib/mesa/src/compiler/nir/nir_to_ssa.c b/lib/mesa/src/compiler/nir/nir_to_ssa.c
index 6accdd24b..44a505477 100644
--- a/lib/mesa/src/compiler/nir/nir_to_ssa.c
+++ b/lib/mesa/src/compiler/nir/nir_to_ssa.c
@@ -27,6 +27,7 @@
 #include "nir.h"
 #include <stdlib.h>
+#include <unistd.h>

 /*
  * Implements the classic to-SSA algorithm described by Cytron et. al. in
@@ -88,7 +89,7 @@ insert_phi_nodes(nir_function_impl *impl)
       w_start = w_end = 0;
       iter_count++;

-      nir_foreach_def(dest, reg) {
+      nir_foreach_def(reg, dest) {
         nir_instr *def = dest->reg.parent_instr;
         if (work[def->block->index] < iter_count)
            W[w_end++] = def->block;
@@ -159,8 +160,7 @@ static nir_ssa_def *get_ssa_src(nir_register *reg, rewrite_state *state)
        * to preserve the information that this source is undefined
        */
       nir_ssa_undef_instr *instr =
-         nir_ssa_undef_instr_create(state->mem_ctx, reg->num_components,
-                                    reg->bit_size);
+         nir_ssa_undef_instr_create(state->mem_ctx, reg->num_components);

       /*
        * We could just insert the undefined instruction before the instruction
@@ -219,9 +219,7 @@ rewrite_def_forwards(nir_dest *dest, void *_state)
                           state->states[index].num_defs);

    list_del(&dest->reg.def_link);
-   nir_ssa_dest_init(state->parent_instr, dest, reg->num_components,
-                     reg->bit_size, name);
-   ralloc_free(name);
+   nir_ssa_dest_init(state->parent_instr, dest, reg->num_components, name);

    /* push our SSA destination on the stack */
    state->states[index].index++;
@@ -273,9 +271,7 @@ rewrite_alu_instr_forward(nir_alu_instr *instr, rewrite_state *state)
    instr->dest.write_mask = (1 << num_components) - 1;
    list_del(&instr->dest.dest.reg.def_link);
-   nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components,
-                     reg->bit_size, name);
-   ralloc_free(name);
+   nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components, name);

    if (nir_op_infos[instr->op].output_size == 0) {
       /*
@@ -381,7 +377,7 @@ rewrite_instr_forward(nir_instr *instr, rewrite_state *state)
 static void
 rewrite_phi_sources(nir_block *block, nir_block *pred, rewrite_state *state)
 {
-   nir_foreach_instr(instr, block) {
+   nir_foreach_instr(block, instr) {
       if (instr->type != nir_instr_type_phi)
         break;

@@ -389,7 +385,7 @@ rewrite_phi_sources(nir_block *block, nir_block *pred, rewrite_state *state)
       state->parent_instr = instr;

-      nir_foreach_phi_src(src, phi_instr) {
+      nir_foreach_phi_src(phi_instr, src) {
         if (src->pred == pred) {
            rewrite_use(&src->src, state);
            break;
@@ -434,7 +430,7 @@ rewrite_block(nir_block *block, rewrite_state *state)
     * what we want because those instructions (vector gather, conditional
     * select) will already be in SSA form.
     */
-   nir_foreach_instr_safe(instr, block) {
+   nir_foreach_instr_safe(block, instr) {
       rewrite_instr_forward(instr, state);
    }

@@ -455,7 +451,7 @@ rewrite_block(nir_block *block, rewrite_state *state)
    for (unsigned i = 0; i < block->num_dom_children; i++)
       rewrite_block(block->dom_children[i], state);

-   nir_foreach_instr_reverse(instr, block) {
+   nir_foreach_instr_reverse(block, instr) {
       rewrite_instr_backwards(instr, state);
    }
 }
@@ -533,7 +529,7 @@ nir_convert_to_ssa_impl(nir_function_impl *impl)
 void
 nir_convert_to_ssa(nir_shader *shader)
 {
-   nir_foreach_function(function, shader) {
+   nir_foreach_function(shader, function) {
       if (function->impl)
         nir_convert_to_ssa_impl(function->impl);
    }
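Most of the churn in this file is the argument order of the nir_foreach_* iteration macros flipping from element-first to container-first. As a hedged illustration of why only call sites change, here is a generic intrusive-list sketch in that style; this is illustrative only, not NIR's actual macro (NIR's real macros walk an exec_list of embedded exec_node links rather than a bare next pointer):

/* Container-first foreach macro in the style of nir_foreach_instr(block, instr). */
struct node {
   struct node *next;
   int value;
};

#define foreach_node(list, it) \
   for (struct node *it = (list); it != NULL; it = it->next)

static int
sum_nodes(struct node *head)
{
   int total = 0;
   foreach_node(head, n)   /* container first, iterator name second */
      total += n->value;
   return total;
}

The macro body is identical either way; swapping the parameter order is purely a source-level convention, which is why the diff touches every caller but never the loop bodies.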
diff --git a/lib/mesa/src/mesa/drivers/dri/i965/brw_compiler.c b/lib/mesa/src/mesa/drivers/dri/i965/brw_compiler.c
index 18145beb2..148920756 100644
--- a/lib/mesa/src/mesa/drivers/dri/i965/brw_compiler.c
+++ b/lib/mesa/src/mesa/drivers/dri/i965/brw_compiler.c
@@ -27,21 +27,62 @@
 #include "main/errors.h"
 #include "util/debug.h"

+static void
+shader_debug_log_mesa(void *data, const char *fmt, ...)
+{
+   struct brw_context *brw = (struct brw_context *)data;
+   va_list args;
+
+   va_start(args, fmt);
+   GLuint msg_id = 0;
+   _mesa_gl_vdebug(&brw->ctx, &msg_id,
+                   MESA_DEBUG_SOURCE_SHADER_COMPILER,
+                   MESA_DEBUG_TYPE_OTHER,
+                   MESA_DEBUG_SEVERITY_NOTIFICATION, fmt, args);
+   va_end(args);
+}
+
+static void
+shader_perf_log_mesa(void *data, const char *fmt, ...)
+{
+   struct brw_context *brw = (struct brw_context *)data;
+
+   va_list args;
+   va_start(args, fmt);
+
+   if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
+      va_list args_copy;
+      va_copy(args_copy, args);
+      vfprintf(stderr, fmt, args_copy);
+      va_end(args_copy);
+   }
+
+   if (brw->perf_debug) {
+      GLuint msg_id = 0;
+      _mesa_gl_vdebug(&brw->ctx, &msg_id,
+                      MESA_DEBUG_SOURCE_SHADER_COMPILER,
+                      MESA_DEBUG_TYPE_PERFORMANCE,
+                      MESA_DEBUG_SEVERITY_MEDIUM, fmt, args);
+   }
+   va_end(args);
+}
+
 #define COMMON_OPTIONS \
+   /* In order to help allow for better CSE at the NIR level we tell NIR to \
+    * split all ffma instructions during opt_algebraic and we then re-combine \
+    * them as a later step. \
+    */ \
+   .lower_ffma = true, \
    .lower_sub = true, \
    .lower_fdiv = true, \
    .lower_scmp = true, \
-   .lower_fmod32 = true, \
-   .lower_fmod64 = false, \
+   .lower_fmod = true, \
    .lower_bitfield_extract = true, \
    .lower_bitfield_insert = true, \
    .lower_uadd_carry = true, \
    .lower_usub_borrow = true, \
    .lower_fdiv = true, \
-   .lower_flrp64 = true, \
-   .native_integers = true, \
-   .use_interpolated_input_intrinsics = true, \
-   .vertex_id_zero_based = true
+   .native_integers = true

 static const struct nir_shader_compiler_options scalar_nir_options = {
    COMMON_OPTIONS,
@@ -66,26 +107,6 @@ static const struct nir_shader_compiler_options vector_nir_options = {
     */
    .fdot_replicates = true,

-   /* Prior to Gen6, there are no three source operations for SIMD4x2. */
-   .lower_flrp32 = true,
-
-   .lower_pack_snorm_2x16 = true,
-   .lower_pack_unorm_2x16 = true,
-   .lower_unpack_snorm_2x16 = true,
-   .lower_unpack_unorm_2x16 = true,
-   .lower_extract_byte = true,
-   .lower_extract_word = true,
-};
-
-static const struct nir_shader_compiler_options vector_nir_options_gen6 = {
-   COMMON_OPTIONS,
-
-   /* In the vec4 backend, our dpN instruction replicates its result to all the
-    * components of a vec4. We would like NIR to give us replicated fdot
-    * instructions because it can optimize better for us.
-    */
-   .fdot_replicates = true,
-
    .lower_pack_snorm_2x16 = true,
    .lower_pack_unorm_2x16 = true,
    .lower_unpack_snorm_2x16 = true,
@@ -95,25 +116,24 @@ static const struct nir_shader_compiler_options vector_nir_options_gen6 = {
 };

 struct brw_compiler *
-brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo)
+brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
 {
    struct brw_compiler *compiler = rzalloc(mem_ctx, struct brw_compiler);

    compiler->devinfo = devinfo;
+   compiler->shader_debug_log = shader_debug_log_mesa;
+   compiler->shader_perf_log = shader_perf_log_mesa;

    brw_fs_alloc_reg_sets(compiler);
    brw_vec4_alloc_reg_set(compiler);

-   compiler->precise_trig = env_var_as_boolean("INTEL_PRECISE_TRIG", false);
-
    compiler->scalar_stage[MESA_SHADER_VERTEX] =
       devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS);
-   compiler->scalar_stage[MESA_SHADER_TESS_CTRL] =
-      devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TCS", true);
+   compiler->scalar_stage[MESA_SHADER_TESS_CTRL] = false;
    compiler->scalar_stage[MESA_SHADER_TESS_EVAL] =
       devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TES", true);
    compiler->scalar_stage[MESA_SHADER_GEOMETRY] =
-      devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_GS", true);
+      devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_GS", false);
    compiler->scalar_stage[MESA_SHADER_FRAGMENT] = true;
    compiler->scalar_stage[MESA_SHADER_COMPUTE] = true;

@@ -123,10 +143,12 @@ brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo)
       compiler->glsl_compiler_options[i].MaxIfDepth =
          devinfo->gen < 6 ? 16 : UINT_MAX;

+      compiler->glsl_compiler_options[i].EmitCondCodes = true;
+      compiler->glsl_compiler_options[i].EmitNoNoise = true;
       compiler->glsl_compiler_options[i].EmitNoMainReturn = true;
       compiler->glsl_compiler_options[i].EmitNoIndirectInput = true;
       compiler->glsl_compiler_options[i].EmitNoIndirectUniform = false;
-      compiler->glsl_compiler_options[i].LowerCombinedClipCullDistance = true;
+      compiler->glsl_compiler_options[i].LowerClipDistance = true;

       bool is_scalar = compiler->scalar_stage[i];

@@ -138,20 +160,14 @@ brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo)
       if (devinfo->gen < 7)
          compiler->glsl_compiler_options[i].EmitNoIndirectSampler = true;

-      if (is_scalar) {
-         compiler->glsl_compiler_options[i].NirOptions = &scalar_nir_options;
-      } else {
-         compiler->glsl_compiler_options[i].NirOptions =
-            devinfo->gen < 6 ? &vector_nir_options : &vector_nir_options_gen6;
-      }
+      compiler->glsl_compiler_options[i].NirOptions =
+         is_scalar ? &scalar_nir_options : &vector_nir_options;

       compiler->glsl_compiler_options[i].LowerBufferInterfaceBlocks = true;
-      compiler->glsl_compiler_options[i].ClampBlockIndicesToArrayBounds = true;
    }

    compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectInput = false;
    compiler->glsl_compiler_options[MESA_SHADER_TESS_EVAL].EmitNoIndirectInput = false;
-   compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectOutput = false;

    if (compiler->scalar_stage[MESA_SHADER_GEOMETRY])
       compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].EmitNoIndirectInput = false;
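The COMMON_OPTIONS comment added above says ffma is split during opt_algebraic so NIR's CSE can see shared multiplies, then re-combined later (that is what brw_nir_opt_peephole_ffma.c further down in this diff is for). A minimal sketch of the rationale, with hypothetical names and no Mesa code:

/* Written as two fused operations, ffma(a, b, c) and ffma(a, b, d) share
 * no subexpression that CSE can factor.  Split into mul+add, the common
 * a*b is exposed and computed once; a later pass may re-fuse the rest. */
static void
split_then_cse(float a, float b, float c, float d, float *x, float *y)
{
   float t = a * b;   /* shared multiply, visible to CSE after lowering */
   *x = t + c;
   *y = t + d;
}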
diff --git a/lib/mesa/src/mesa/drivers/dri/i965/brw_compiler.h b/lib/mesa/src/mesa/drivers/dri/i965/brw_compiler.h
index 447d05b81..27a95a3c6 100644
--- a/lib/mesa/src/mesa/drivers/dri/i965/brw_compiler.h
+++ b/lib/mesa/src/mesa/drivers/dri/i965/brw_compiler.h
@@ -24,9 +24,8 @@
 #pragma once

 #include <stdio.h>
-#include "common/gen_device_info.h"
+#include "brw_device_info.h"
 #include "main/mtypes.h"
-#include "main/macros.h"

 #ifdef __cplusplus
 extern "C" {
@@ -38,7 +37,7 @@ struct brw_geometry_program;
 union gl_constant_value;

 struct brw_compiler {
-   const struct gen_device_info *devinfo;
+   const struct brw_device_info *devinfo;

    struct {
       struct ra_regs *regs;
@@ -86,19 +85,13 @@ struct brw_compiler {
        * appear in *classes.
        */
       int aligned_pairs_class;
-   } fs_reg_sets[3];
+   } fs_reg_sets[2];

    void (*shader_debug_log)(void *, const char *str, ...) PRINTFLIKE(2, 3);
    void (*shader_perf_log)(void *, const char *str, ...) PRINTFLIKE(2, 3);

    bool scalar_stage[MESA_SHADER_STAGES];
    struct gl_shader_compiler_options glsl_compiler_options[MESA_SHADER_STAGES];
-
-   /**
-    * Apply workarounds for SIN and COS output range problems.
-    * This can negatively impact performance.
-    */
-   bool precise_trig;
 };

@@ -160,13 +153,6 @@ struct brw_sampler_prog_key_data {
     * For Sandybridge, which shader w/a we need for gather quirks.
     */
    enum gen6_gather_sampler_wa gen6_gather_wa[MAX_SAMPLERS];
-
-   /**
-    * Texture units that have a YUV image bound.
-    */
-   uint32_t y_u_v_image_mask;
-   uint32_t y_uv_image_mask;
-   uint32_t yx_xuxv_image_mask;
 };

@@ -220,8 +206,6 @@ struct brw_tcs_prog_key
    /** A bitfield of per-vertex outputs written. */
    uint64_t outputs_written;

-   bool quads_workaround;
-
    struct brw_sampler_prog_key_data tex;
 };

@@ -252,15 +236,17 @@ struct brw_wm_prog_key {
    uint8_t iz_lookup;
    bool stats_wm:1;
    bool flat_shade:1;
+   bool persample_shading:1;
+   bool persample_2x:1;
    unsigned nr_color_regions:5;
    bool replicate_alpha:1;
+   bool render_to_fbo:1;
    bool clamp_fragment_color:1;
-   bool persample_interp:1;
-   bool multisample_fbo:1;
+   bool compute_pos_offset:1;
+   bool compute_sample_id:1;
    unsigned line_aa:2;
    bool high_quality_derivatives:1;
    bool force_dual_color_blend:1;
-   bool coherent_fb_fetch:1;

    uint16_t drawable_height;
    uint64_t input_slots_valid;
@@ -338,7 +324,6 @@ struct brw_stage_prog_data {
       uint32_t abo_start;
       uint32_t image_start;
       uint32_t shader_time_start;
-      uint32_t plane_start[3];
       /** @} */
    } binding_table;

@@ -378,18 +363,15 @@ struct brw_wm_prog_data {

    GLuint num_varying_inputs;

-   uint8_t reg_blocks_0;
-   uint8_t reg_blocks_2;
-
-   uint8_t dispatch_grf_start_reg_2;
-   uint32_t prog_offset_2;
+   GLuint dispatch_grf_start_reg_16;
+   GLuint reg_blocks;
+   GLuint reg_blocks_16;

    struct {
       /** @{
        * surface indices the WM-specific surfaces
        */
       uint32_t render_target_start;
-      uint32_t render_target_read_start;
       /** @} */
    } binding_table;

@@ -397,18 +379,16 @@ struct brw_wm_prog_data {
    bool computed_stencil;

    bool early_fragment_tests;
-   bool dispatch_8;
-   bool dispatch_16;
+   bool no_8;
    bool dual_src_blend;
-   bool persample_dispatch;
    bool uses_pos_offset;
    bool uses_omask;
    bool uses_kill;
    bool uses_src_depth;
    bool uses_src_w;
    bool uses_sample_mask;
-   bool has_side_effects;
    bool pulls_bary;
+   uint32_t prog_offset_16;

    /**
     * Mask of which interpolation modes are required by the fragment shader.
@@ -417,12 +397,6 @@ struct brw_wm_prog_data {
    uint32_t barycentric_interp_modes;

    /**
-    * Mask of which FS inputs are marked flat by the shader source. This is
-    * needed for setting up 3DSTATE_SF/SBE.
-    */
-   uint32_t flat_inputs;
-
-   /**
     * Map from gl_varying_slot to the position within the FS setup data
     * payload where the varying's attribute vertex deltas should be delivered.
     * For varying slots that are not used by the FS, the value is -1.
@@ -430,28 +404,15 @@ struct brw_wm_prog_data {
    int urb_setup[VARYING_SLOT_MAX];
 };

-struct brw_push_const_block {
-   unsigned dwords; /* Dword count, not reg aligned */
-   unsigned regs;
-   unsigned size; /* Bytes, register aligned */
-};
-
 struct brw_cs_prog_data {
    struct brw_stage_prog_data base;

    GLuint dispatch_grf_start_reg_16;
    unsigned local_size[3];
    unsigned simd_size;
-   unsigned threads;
    bool uses_barrier;
    bool uses_num_work_groups;
-   int thread_local_id_index;
-
-   struct {
-      struct brw_push_const_block cross_thread;
-      struct brw_push_const_block per_thread;
-      struct brw_push_const_block total;
-   } push;
+   unsigned local_invocation_id_regs;

    struct {
       /** @{
@@ -566,7 +527,7 @@ GLuint brw_varying_to_offset(const struct brw_vue_map *vue_map, GLuint varying)
    return brw_vue_slot_to_offset(vue_map->varying_to_slot[varying]);
 }

-void brw_compute_vue_map(const struct gen_device_info *devinfo,
+void brw_compute_vue_map(const struct brw_device_info *devinfo,
                          struct brw_vue_map *vue_map,
                          GLbitfield64 slots_valid,
                          bool separate_shader);
@@ -620,8 +581,6 @@ struct brw_vue_prog_data {
    GLuint urb_read_length;
    GLuint total_grf;

-   uint32_t cull_distance_mask;
-
    /* Used for calculating urb partitions. In the VS, this is the size of the
     * URB entry used for both input and output to the thread. In the GS, this
     * is the size of the URB entry used for output.
@@ -637,7 +596,6 @@ struct brw_vs_prog_data {
    GLbitfield64 inputs_read;

    unsigned nr_attributes;
-   unsigned nr_attribute_slots;

    bool uses_vertexid;
    bool uses_instanceid;
@@ -731,28 +689,11 @@ struct brw_gs_prog_data
    unsigned char transform_feedback_swizzles[64 /* BRW_MAX_SOL_BINDINGS */];
 };

-#define DEFINE_PROG_DATA_DOWNCAST(stage) \
-static inline struct brw_##stage##_prog_data * \
-brw_##stage##_prog_data(struct brw_stage_prog_data *prog_data) \
-{ \
-   return (struct brw_##stage##_prog_data *) prog_data; \
-}
-DEFINE_PROG_DATA_DOWNCAST(vue)
-DEFINE_PROG_DATA_DOWNCAST(vs)
-DEFINE_PROG_DATA_DOWNCAST(tcs)
-DEFINE_PROG_DATA_DOWNCAST(tes)
-DEFINE_PROG_DATA_DOWNCAST(gs)
-DEFINE_PROG_DATA_DOWNCAST(wm)
-DEFINE_PROG_DATA_DOWNCAST(cs)
-DEFINE_PROG_DATA_DOWNCAST(ff_gs)
-DEFINE_PROG_DATA_DOWNCAST(clip)
-DEFINE_PROG_DATA_DOWNCAST(sf)
-#undef DEFINE_PROG_DATA_DOWNCAST

 /** @} */

 struct brw_compiler *
-brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo);
+brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo);

 /**
  * Compile a vertex shader.
@@ -833,7 +774,6 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
                struct gl_program *prog,
                int shader_time_index8,
                int shader_time_index16,
-               bool allow_spilling,
                bool use_rep_send,
                unsigned *final_assembly_size,
                char **error_str);
@@ -853,86 +793,12 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
                unsigned *final_assembly_size,
                char **error_str);

-static inline uint32_t
-encode_slm_size(unsigned gen, uint32_t bytes)
-{
-   uint32_t slm_size = 0;
-
-   /* Shared Local Memory is specified as powers of two, and encoded in
-    * INTERFACE_DESCRIPTOR_DATA with the following representations:
-    *
-    * Size   | 0 kB | 1 kB | 2 kB | 4 kB | 8 kB | 16 kB | 32 kB | 64 kB |
-    * -------------------------------------------------------------------
-    * Gen7-8 |    0 | none | none |    1 |    2 |     4 |     8 |    16 |
-    * -------------------------------------------------------------------
-    * Gen9+  |    0 |    1 |    2 |    3 |    4 |     5 |     6 |     7 |
-    */
-   assert(bytes <= 64 * 1024);
-
-   if (bytes > 0) {
-      /* Shared Local Memory Size is specified as powers of two. */
-      slm_size = util_next_power_of_two(bytes);
-
-      if (gen >= 9) {
-         /* Use a minimum of 1kB; turn an exponent of 10 (1024 kB) into 1. */
-         slm_size = ffs(MAX2(slm_size, 1024)) - 10;
-      } else {
-         /* Use a minimum of 4kB; convert to the pre-Gen9 representation. */
-         slm_size = MAX2(slm_size, 4096) / 4096;
-      }
-   }
-
-   return slm_size;
-}
-
 /**
- * Return true if the given shader stage is dispatched contiguously by the
- * relevant fixed function starting from channel 0 of the SIMD thread, which
- * implies that the dispatch mask of a thread can be assumed to have the form
- * '2^n - 1' for some n.
+ * Fill out local id payload for compute shader according to cs_prog_data.
  */
-static inline bool
-brw_stage_has_packed_dispatch(const struct gen_device_info *devinfo,
-                              gl_shader_stage stage,
-                              const struct brw_stage_prog_data *prog_data)
-{
-   /* The code below makes assumptions about the hardware's thread dispatch
-    * behavior that could be proven wrong in future generations -- Make sure
-    * to do a full test run with brw_fs_test_dispatch_packing() hooked up to
-    * the NIR front-end before changing this assertion.
-    */
-   assert(devinfo->gen <= 9);
-
-   switch (stage) {
-   case MESA_SHADER_FRAGMENT: {
-      /* The PSD discards subspans coming in with no lit samples, which in the
-       * per-pixel shading case implies that each subspan will either be fully
-       * lit (due to the VMask being used to allow derivative computations),
-       * or not dispatched at all. In per-sample dispatch mode individual
-       * samples from the same subspan have a fixed relative location within
-       * the SIMD thread, so dispatch of unlit samples cannot be avoided in
-       * general and we should return false.
-       */
-      const struct brw_wm_prog_data *wm_prog_data =
-         (const struct brw_wm_prog_data *)prog_data;
-      return !wm_prog_data->persample_dispatch;
-   }
-   case MESA_SHADER_COMPUTE:
-      /* Compute shaders will be spawned with either a fully enabled dispatch
-       * mask or with whatever bottom/right execution mask was given to the
-       * GPGPU walker command to be used along the workgroup edges -- In both
-       * cases the dispatch mask is required to be tightly packed for our
-       * invocation index calculations to work.
-       */
-      return true;
-   default:
-      /* Most remaining fixed functions are limited to use a packed dispatch
-       * mask due to the hardware representation of the dispatch mask as a
-       * single counter representing the number of enabled channels.
-       */
-      return true;
-   }
-}
+void
+brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data,
+                             void *buffer, uint32_t threads, uint32_t stride);

 #ifdef __cplusplus
 } /* extern "C" */
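The encode_slm_size() helper removed above packs shared-local-memory sizes into two different hardware encodings, per the table in its comment. A self-contained re-implementation is easy to check against that table; here next_pow2() is a local stand-in for Mesa's util_next_power_of_two(), and ffs() is POSIX from <strings.h>:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <strings.h>

static uint32_t
next_pow2(uint32_t x)
{
   uint32_t p = 1;
   while (p < x)
      p <<= 1;
   return p;
}

static uint32_t
encode_slm_size(unsigned gen, uint32_t bytes)
{
   uint32_t slm_size = 0;

   assert(bytes <= 64 * 1024);

   if (bytes > 0) {
      slm_size = next_pow2(bytes);
      if (gen >= 9) {
         /* Minimum 1kB; an exponent of 10 (1kB) becomes encoding 1. */
         if (slm_size < 1024)
            slm_size = 1024;
         slm_size = ffs(slm_size) - 10;
      } else {
         /* Minimum 4kB; pre-Gen9 encodes multiples of 4kB. */
         if (slm_size < 4096)
            slm_size = 4096;
         slm_size /= 4096;
      }
   }
   return slm_size;
}

int
main(void)
{
   /* 5000 bytes rounds up to 8kB: encoding 2 on Gen7-8, 4 on Gen9+,
    * matching the 8 kB column of the comment's table. */
   printf("gen7: %u, gen9: %u\n",
          encode_slm_size(7, 5000), encode_slm_size(9, 5000));
   return 0;
}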
diff --git a/lib/mesa/src/mesa/drivers/dri/i965/brw_eu_validate.c b/lib/mesa/src/mesa/drivers/dri/i965/brw_eu_validate.c
index 0e736ed01..2de2ea1ba 100644
--- a/lib/mesa/src/mesa/drivers/dri/i965/brw_eu_validate.c
+++ b/lib/mesa/src/mesa/drivers/dri/i965/brw_eu_validate.c
@@ -55,31 +55,251 @@ cat(struct string *dest, const struct string src)
    } while(0)

 static bool
-src0_is_null(const struct gen_device_info *devinfo, const brw_inst *inst)
+src0_is_null(const struct brw_device_info *devinfo, const brw_inst *inst)
 {
    return brw_inst_src0_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE &&
           brw_inst_src0_da_reg_nr(devinfo, inst) == BRW_ARF_NULL;
 }

 static bool
-src1_is_null(const struct gen_device_info *devinfo, const brw_inst *inst)
+src1_is_null(const struct brw_device_info *devinfo, const brw_inst *inst)
 {
    return brw_inst_src1_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE &&
           brw_inst_src1_da_reg_nr(devinfo, inst) == BRW_ARF_NULL;
 }

-static bool
-src0_is_grf(const struct gen_device_info *devinfo, const brw_inst *inst)
-{
-   return brw_inst_src0_reg_file(devinfo, inst) == BRW_GENERAL_REGISTER_FILE;
-}
+enum gen {
+   GEN4  = (1 << 0),
+   GEN45 = (1 << 1),
+   GEN5  = (1 << 2),
+   GEN6  = (1 << 3),
+   GEN7  = (1 << 4),
+   GEN75 = (1 << 5),
+   GEN8  = (1 << 6),
+   GEN9  = (1 << 7),
+   GEN_ALL = ~0
+};
+
+#define GEN_GE(gen) (~((gen) - 1) | gen)
+#define GEN_LE(gen) (((gen) - 1) | gen)
+
+struct inst_info {
+   enum gen gen;
+};
+
+static const struct inst_info inst_info[128] = {
+   [BRW_OPCODE_ILLEGAL] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_MOV] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_SEL] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_MOVI] = {
+      .gen = GEN_GE(GEN45),
+   },
+   [BRW_OPCODE_NOT] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_AND] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_OR] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_XOR] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_SHR] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_SHL] = {
+      .gen = GEN_ALL,
+   },
+   /* BRW_OPCODE_DIM / BRW_OPCODE_SMOV */
+   /* Reserved - 11 */
+   [BRW_OPCODE_ASR] = {
+      .gen = GEN_ALL,
+   },
+   /* Reserved - 13-15 */
+   [BRW_OPCODE_CMP] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_CMPN] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_CSEL] = {
+      .gen = GEN_GE(GEN8),
+   },
+   [BRW_OPCODE_F32TO16] = {
+      .gen = GEN7 | GEN75,
+   },
+   [BRW_OPCODE_F16TO32] = {
+      .gen = GEN7 | GEN75,
+   },
+   /* Reserved - 21-22 */
+   [BRW_OPCODE_BFREV] = {
+      .gen = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_BFE] = {
+      .gen = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_BFI1] = {
+      .gen = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_BFI2] = {
+      .gen = GEN_GE(GEN7),
+   },
+   /* Reserved - 27-31 */
+   [BRW_OPCODE_JMPI] = {
+      .gen = GEN_ALL,
+   },
+   /* BRW_OPCODE_BRD */
+   [BRW_OPCODE_IF] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_IFF] = { /* also BRW_OPCODE_BRC */
+      .gen = GEN_LE(GEN5),
+   },
+   [BRW_OPCODE_ELSE] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_ENDIF] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_DO] = { /* also BRW_OPCODE_CASE */
+      .gen = GEN_LE(GEN5),
+   },
+   [BRW_OPCODE_WHILE] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_BREAK] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_CONTINUE] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_HALT] = {
+      .gen = GEN_ALL,
+   },
+   /* BRW_OPCODE_CALLA */
+   /* BRW_OPCODE_MSAVE / BRW_OPCODE_CALL */
+   /* BRW_OPCODE_MREST / BRW_OPCODE_RET */
+   /* BRW_OPCODE_PUSH / BRW_OPCODE_FORK / BRW_OPCODE_GOTO */
+   /* BRW_OPCODE_POP */
+   [BRW_OPCODE_WAIT] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_SEND] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_SENDC] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_SENDS] = {
+      .gen = GEN_GE(GEN9),
+   },
+   [BRW_OPCODE_SENDSC] = {
+      .gen = GEN_GE(GEN9),
+   },
+   /* Reserved 53-55 */
+   [BRW_OPCODE_MATH] = {
+      .gen = GEN_GE(GEN6),
+   },
+   /* Reserved 57-63 */
+   [BRW_OPCODE_ADD] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_MUL] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_AVG] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_FRC] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_RNDU] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_RNDD] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_RNDE] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_RNDZ] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_MAC] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_MACH] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_LZD] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_FBH] = {
+      .gen = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_FBL] = {
+      .gen = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_CBIT] = {
+      .gen = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_ADDC] = {
+      .gen = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_SUBB] = {
+      .gen = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_SAD2] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_SADA2] = {
+      .gen = GEN_ALL,
+   },
+   /* Reserved 82-83 */
+   [BRW_OPCODE_DP4] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_DPH] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_DP3] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_DP2] = {
+      .gen = GEN_ALL,
+   },
+   /* Reserved 88 */
+   [BRW_OPCODE_LINE] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_PLN] = {
+      .gen = GEN_GE(GEN45),
+   },
+   [BRW_OPCODE_MAD] = {
+      .gen = GEN_GE(GEN6),
+   },
+   [BRW_OPCODE_LRP] = {
+      .gen = GEN_GE(GEN6),
+   },
+   /* Reserved 93-124 */
+   /* BRW_OPCODE_NENOP */
+   [BRW_OPCODE_NOP] = {
+      .gen = GEN_ALL,
+   },
+};

 static unsigned
-num_sources_from_inst(const struct gen_device_info *devinfo,
+num_sources_from_inst(const struct brw_device_info *devinfo,
                       const brw_inst *inst)
 {
-   const struct opcode_desc *desc =
-      brw_opcode_desc(devinfo, brw_inst_opcode(devinfo, inst));
    unsigned math_function;

    if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_MATH) {
@@ -94,10 +314,8 @@ num_sources_from_inst(const struct gen_device_info *devinfo,
          */
         return 0;
      }
-   } else if (desc) {
-      return desc->nsrc;
    } else {
-      return 0;
+      return opcode_descs[brw_inst_opcode(devinfo, inst)].nsrc;
    }

    switch (math_function) {
@@ -123,18 +341,34 @@
    }
 }

+static enum gen
+gen_from_devinfo(const struct brw_device_info *devinfo)
+{
+   switch (devinfo->gen) {
+   case 4: return devinfo->is_g4x ? GEN45 : GEN4;
+   case 5: return GEN5;
+   case 6: return GEN6;
+   case 7: return devinfo->is_haswell ? GEN75 : GEN7;
+   case 8: return GEN8;
+   case 9: return GEN9;
+   default:
+      unreachable("not reached");
+   }
+}
+
 static bool
-is_unsupported_inst(const struct gen_device_info *devinfo,
+is_unsupported_inst(const struct brw_device_info *devinfo,
                     const brw_inst *inst)
 {
-   return brw_opcode_desc(devinfo, brw_inst_opcode(devinfo, inst)) == NULL;
+   enum gen gen = gen_from_devinfo(devinfo);
+
+   return (inst_info[brw_inst_opcode(devinfo, inst)].gen & gen) == 0;
 }

 bool
 brw_validate_instructions(const struct brw_codegen *p, int start_offset,
                           struct annotation_info *annotation)
 {
-   const struct gen_device_info *devinfo = p->devinfo;
+   const struct brw_device_info *devinfo = p->devinfo;
    const void *store = p->store + start_offset / 16;
    bool valid = true;

@@ -163,18 +397,6 @@ brw_validate_instructions(const struct brw_codegen *p, int start_offset,
       ERROR_IF(is_unsupported_inst(devinfo, inst),
                "Instruction not supported on this Gen");

-      if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND) {
-         ERROR_IF(brw_inst_src0_address_mode(devinfo, inst) !=
-                  BRW_ADDRESS_DIRECT, "send must use direct addressing");
-
-         if (devinfo->gen >= 7) {
-            ERROR_IF(!src0_is_grf(devinfo, inst), "send from non-GRF");
-            ERROR_IF(brw_inst_eot(devinfo, inst) &&
-                     brw_inst_src0_da_reg_nr(devinfo, inst) < 112,
-                     "send with EOT must use g112-g127");
-         }
-      }
-
       if (error_msg.str && annotation) {
          annotation_insert_error(annotation, src_offset, error_msg.str);
       }
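The one-bit-per-generation scheme introduced in brw_eu_validate.c makes a supported-generation set an ordinary bitmask, so GEN_GE()/GEN_LE() build "this gen and later/earlier" ranges with plain integer arithmetic. A standalone sketch (same enum and macros, minus the opcode table) mirrors the is_unsupported_inst() test:

#include <stdio.h>

enum gen {
   GEN4  = (1 << 0),
   GEN45 = (1 << 1),
   GEN5  = (1 << 2),
   GEN6  = (1 << 3),
   GEN7  = (1 << 4),
   GEN75 = (1 << 5),
   GEN8  = (1 << 6),
   GEN9  = (1 << 7),
};

#define GEN_GE(gen) (~((gen) - 1) | (gen))
#define GEN_LE(gen) (((gen) - 1) | (gen))

int
main(void)
{
   /* GEN_GE(GEN7) sets the GEN7 bit and every higher bit, so a check
    * like (inst_info[opcode].gen & gen) passes on Gen9... */
   printf("GEN9 in GEN_GE(GEN7): %s\n",
          (GEN_GE(GEN7) & GEN9) ? "yes" : "no");
   /* ...while pre-Gen7 generations fall outside the mask. */
   printf("GEN6 in GEN_GE(GEN7): %s\n",
          (GEN_GE(GEN7) & GEN6) ? "yes" : "no");
   return 0;
}

This works because the enum values are single bits in ascending generation order: subtracting 1 from a power of two yields all lower bits (GEN_LE), and complementing that yields the bit itself plus all higher bits (GEN_GE).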
diff --git a/lib/mesa/src/mesa/drivers/dri/i965/brw_fs_validate.cpp b/lib/mesa/src/mesa/drivers/dri/i965/brw_fs_validate.cpp
index 676942c19..90edd023b 100644
--- a/lib/mesa/src/mesa/drivers/dri/i965/brw_fs_validate.cpp
+++ b/lib/mesa/src/mesa/drivers/dri/i965/brw_fs_validate.cpp
@@ -43,14 +43,14 @@ fs_visitor::validate()
 {
    foreach_block_and_inst (block, fs_inst, inst, cfg) {
       if (inst->dst.file == VGRF) {
-         fsv_assert(inst->dst.offset / REG_SIZE + regs_written(inst) <=
+         fsv_assert(inst->dst.reg_offset + inst->regs_written <=
                     alloc.sizes[inst->dst.nr]);
       }

       for (unsigned i = 0; i < inst->sources; i++) {
          if (inst->src[i].file == VGRF) {
-            fsv_assert(inst->src[i].offset / REG_SIZE + regs_read(inst, i) <=
-                       alloc.sizes[inst->src[i].nr]);
+            fsv_assert(inst->src[i].reg_offset + inst->regs_read(i) <=
+                       (int)alloc.sizes[inst->src[i].nr]);
          }
       }
    }
 }
diff --git a/lib/mesa/src/mesa/drivers/dri/i965/brw_nir_attribute_workarounds.c b/lib/mesa/src/mesa/drivers/dri/i965/brw_nir_attribute_workarounds.c
index 0bb766d70..9c65e540d 100644
--- a/lib/mesa/src/mesa/drivers/dri/i965/brw_nir_attribute_workarounds.c
+++ b/lib/mesa/src/mesa/drivers/dri/i965/brw_nir_attribute_workarounds.c
@@ -39,11 +39,12 @@ struct attr_wa_state {
 };

 static bool
-apply_attr_wa_block(nir_block *block, struct attr_wa_state *state)
+apply_attr_wa_block(nir_block *block, void *void_state)
 {
+   struct attr_wa_state *state = void_state;
    nir_builder *b = &state->builder;

-   nir_foreach_instr_safe(instr, block) {
+   nir_foreach_instr_safe(block, instr) {
       if (instr->type != nir_instr_type_intrinsic)
          continue;

@@ -155,16 +156,14 @@ brw_nir_apply_attribute_workarounds(nir_shader *shader,
       .wa_flags = attrib_wa_flags,
    };

-   nir_foreach_function(func, shader) {
+   nir_foreach_function(shader, func) {
       if (!func->impl)
          continue;

       nir_builder_init(&state.builder, func->impl);
       state.impl_progress = false;

-      nir_foreach_block(block, func->impl) {
-         apply_attr_wa_block(block, &state);
-      }
+      nir_foreach_block(func->impl, apply_attr_wa_block, &state);

       if (state.impl_progress) {
          nir_metadata_preserve(func->impl, nir_metadata_block_index |
diff --git a/lib/mesa/src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c b/lib/mesa/src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c
index 14a9a0fac..5ff2cba04 100644
--- a/lib/mesa/src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c
+++ b/lib/mesa/src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c
@@ -44,7 +44,7 @@ are_all_uses_fadd(nir_ssa_def *def)
    if (!list_empty(&def->if_uses))
       return false;

-   nir_foreach_use(use_src, def) {
+   nir_foreach_use(def, use_src) {
       nir_instr *use_instr = use_src->parent_instr;

       if (use_instr->type != nir_instr_type_alu)
@@ -84,17 +84,6 @@ get_mul_for_src(nir_alu_src *src, int num_components,
       return NULL;

    nir_alu_instr *alu = nir_instr_as_alu(instr);
-
-   /* We want to bail if any of the other ALU operations involved is labled
-    * exact. One reason for this is that, while the value that is changing is
-    * actually the result of the add and not the multiply, the intention of
-    * the user when they specify an exact multiply is that they want *that*
-    * value and what they don't care about is the add. Another reason is that
-    * SPIR-V explicitly requires this behaviour.
-    */
-   if (alu->exact)
-      return NULL;
-
    switch (alu->op) {
    case nir_op_imov:
    case nir_op_fmov:
@@ -113,7 +102,7 @@ get_mul_for_src(nir_alu_src *src, int num_components,
       break;

    case nir_op_fmul:
-      /* Only absorb a fmul into a ffma if the fmul is only used in fadd
+      /* Only absorb a fmul into a ffma if the fmul is is only used in fadd
        * operations. This prevents us from being too aggressive with our
        * fusing which can actually lead to more instructions.
        */
@@ -167,11 +156,11 @@ any_alu_src_is_a_constant(nir_alu_src srcs[])
 }

 static bool
-brw_nir_opt_peephole_ffma_block(nir_block *block, void *mem_ctx)
+brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state)
 {
-   bool progress = false;
+   struct peephole_ffma_state *state = void_state;

-   nir_foreach_instr_safe(instr, block) {
+   nir_foreach_instr_safe(block, instr) {
       if (instr->type != nir_instr_type_alu)
          continue;

@@ -179,9 +168,7 @@ brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state)
       if (add->op != nir_op_fadd)
          continue;

-      assert(add->dest.dest.is_ssa);
-      if (add->exact)
-         continue;
+      /* TODO: Maybe bail if this expression is considered "precise"? */

       assert(add->src[0].src.is_ssa && add->src[1].src.is_ssa);

@@ -214,8 +201,6 @@ brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state)
       if (mul == NULL)
          continue;

-      unsigned bit_size = add->dest.dest.ssa.bit_size;
-
       nir_ssa_def *mul_src[2];
       mul_src[0] = mul->src[0].src.ssa;
       mul_src[1] = mul->src[1].src.ssa;
@@ -231,10 +216,11 @@ brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state)
       if (abs) {
          for (unsigned i = 0; i < 2; i++) {
-            nir_alu_instr *abs = nir_alu_instr_create(mem_ctx, nir_op_fabs);
+            nir_alu_instr *abs = nir_alu_instr_create(state->mem_ctx,
+                                                      nir_op_fabs);
             abs->src[0].src = nir_src_for_ssa(mul_src[i]);
             nir_ssa_dest_init(&abs->instr, &abs->dest.dest,
-                              mul_src[i]->num_components, bit_size, NULL);
+                              mul_src[i]->num_components, NULL);
             abs->dest.write_mask = (1 << mul_src[i]->num_components) - 1;
             nir_instr_insert_before(&add->instr, &abs->instr);
             mul_src[i] = &abs->dest.dest.ssa;
@@ -242,16 +228,17 @@ brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state)
       }

       if (negate) {
-         nir_alu_instr *neg = nir_alu_instr_create(mem_ctx, nir_op_fneg);
+         nir_alu_instr *neg = nir_alu_instr_create(state->mem_ctx,
+                                                   nir_op_fneg);
          neg->src[0].src = nir_src_for_ssa(mul_src[0]);
          nir_ssa_dest_init(&neg->instr, &neg->dest.dest,
-                           mul_src[0]->num_components, bit_size, NULL);
+                           mul_src[0]->num_components, NULL);
          neg->dest.write_mask = (1 << mul_src[0]->num_components) - 1;
          nir_instr_insert_before(&add->instr, &neg->instr);
          mul_src[0] = &neg->dest.dest.ssa;
       }

-      nir_alu_instr *ffma = nir_alu_instr_create(mem_ctx, nir_op_ffma);
+      nir_alu_instr *ffma = nir_alu_instr_create(state->mem_ctx, nir_op_ffma);
       ffma->dest.saturate = add->dest.saturate;
       ffma->dest.write_mask = add->dest.write_mask;

@@ -266,7 +253,6 @@ brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state)
       nir_ssa_dest_init(&ffma->instr, &ffma->dest.dest,
                         add->dest.dest.ssa.num_components,
-                        bit_size,
                         add->dest.dest.ssa.name);
       nir_ssa_def_rewrite_uses(&add->dest.dest.ssa,
                                nir_src_for_ssa(&ffma->dest.dest.ssa));
@@ -275,27 +261,28 @@ brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state)
       assert(list_empty(&add->dest.dest.ssa.uses));
       nir_instr_remove(&add->instr);

-      progress = true;
+      state->progress = true;
    }

-   return progress;
+   return true;
 }

 static bool
 brw_nir_opt_peephole_ffma_impl(nir_function_impl *impl)
 {
-   bool progress = false;
-   void *mem_ctx = ralloc_parent(impl);
+   struct peephole_ffma_state state;

-   nir_foreach_block(block, impl) {
-      progress |= brw_nir_opt_peephole_ffma_block(block, mem_ctx);
-   }
+   state.mem_ctx = ralloc_parent(impl);
+   state.impl = impl;
+   state.progress = false;
+
+   nir_foreach_block(impl, brw_nir_opt_peephole_ffma_block, &state);

-   if (progress)
+   if (state.progress)
       nir_metadata_preserve(impl, nir_metadata_block_index |
                                   nir_metadata_dominance);

-   return progress;
+   return state.progress;
 }

 bool
@@ -303,7 +290,7 @@ brw_nir_opt_peephole_ffma(nir_shader *shader)
 {
    bool progress = false;

-   nir_foreach_function(function, shader) {
+   nir_foreach_function(shader, function) {
       if (function->impl)
          progress |= brw_nir_opt_peephole_ffma_impl(function->impl);
    }
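One thing the restored TODO ("Maybe bail if this expression is considered 'precise'?") glosses over is that the fusing this pass performs is not value-preserving: an ffma rounds once where mul+add rounds twice. A small standalone demonstration of the numeric effect using C's fmaf() from <math.h> (link with -lm); this illustrates the rounding behavior only and is unrelated to Mesa's implementation:

#include <math.h>
#include <stdio.h>

int
main(void)
{
   float a = 1.0f + 0x1.0p-12f;
   /* fmaf computes a*a exactly before subtracting the rounded a*a,
    * so the single-rounding residual survives... */
   float fused = fmaf(a, a, -a * a);
   /* ...while the split form rounds both products identically: 0. */
   float split = a * a - a * a;
   printf("fused: %a, split: %a\n", fused, split);
   return 0;
}

The fused result is a small nonzero value (the rounding error of a*a), which is exactly the kind of divergence an application-marked "exact"/"precise" expression is supposed to be protected from — the check this hunk deletes on the newer side.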
diff --git a/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_builder.h b/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_builder.h
index dab6e0377..3a8617e05 100644
--- a/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_builder.h
+++ b/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_builder.h
@@ -95,7 +95,7 @@ namespace brw {
       vec4_builder
       at_end() const
       {
-         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
+         return at(NULL, (exec_node *)&shader->instructions.tail);
       }

       /**
@@ -373,7 +373,6 @@ namespace brw {
       ALU1(CBIT)
       ALU2(CMPN)
       ALU3(CSEL)
-      ALU1(DIM)
       ALU2(DP2)
       ALU2(DP3)
       ALU2(DP4)
diff --git a/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_cmod_propagation.cpp b/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_cmod_propagation.cpp
index c531fba03..0c8224f5f 100644
--- a/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_cmod_propagation.cpp
+++ b/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_cmod_propagation.cpp
@@ -68,10 +68,10 @@ opt_cmod_propagation_local(bblock_t *block)
       bool read_flag = false;

       foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst, inst) {
-         if (regions_overlap(inst->src[0], inst->size_read(0),
-                             scan_inst->dst, scan_inst->size_written)) {
+         if (inst->src[0].in_range(scan_inst->dst,
+                                   scan_inst->regs_written)) {
            if ((scan_inst->predicate && scan_inst->opcode != BRW_OPCODE_SEL) ||
-                scan_inst->dst.offset != inst->src[0].offset ||
+                scan_inst->dst.reg_offset != inst->src[0].reg_offset ||
                (scan_inst->dst.writemask != WRITEMASK_X &&
                 scan_inst->dst.writemask != WRITEMASK_XYZW) ||
                (scan_inst->dst.writemask == WRITEMASK_XYZW &&
@@ -115,18 +115,6 @@ opt_cmod_propagation_local(bblock_t *block)
               break;
            }

-            /* The conditional mod of the CMP/CMPN instructions behaves
-             * specially because the flag output is not calculated from the
-             * result of the instruction, but the other way around, which
-             * means that even if the condmod to propagate and the condmod
-             * from the CMP instruction are the same they will in general give
-             * different results because they are evaluated based on different
-             * inputs.
-             */
-            if (scan_inst->opcode == BRW_OPCODE_CMP ||
-                scan_inst->opcode == BRW_OPCODE_CMPN)
-               break;
-
            /* Otherwise, try propagating the conditional. */
            enum brw_conditional_mod cond =
               inst->src[0].negate ? brw_swap_cmod(inst->conditional_mod)
diff --git a/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_surface_builder.cpp b/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_surface_builder.cpp
index 19c685fee..28002c56c 100644
--- a/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_surface_builder.cpp
+++ b/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_surface_builder.cpp
@@ -145,7 +145,7 @@ namespace brw {
          vec4_instruction *inst = bld.emit(op, dst, src_reg(payload),
                                            usurface, brw_imm_ud(arg));
          inst->mlen = sz;
-         inst->size_written = ret_sz * REG_SIZE;
+         inst->regs_written = ret_sz;
          inst->header_size = header_sz;
          inst->predicate = pred;

@@ -221,7 +221,7 @@ namespace brw {
                           emit_insert(bld, addr, dims, has_simd4x2),
                           has_simd4x2 ? 1 : dims,
                           emit_insert(bld, src_reg(srcs), size, has_simd4x2),
-                          has_simd4x2 && size ? 1 : size,
+                          has_simd4x2 ? 1 : size,
                           surface, op, rsize, pred);
       }
diff --git a/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp b/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
index 498fb7cfb..9b49b7df8 100644
--- a/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
+++ b/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
@@ -29,7 +29,6 @@
 #include "brw_nir.h"
 #include "brw_vec4_tcs.h"
-#include "brw_fs.h"

 namespace brw {

@@ -49,12 +48,62 @@ vec4_tcs_visitor::vec4_tcs_visitor(const struct brw_compiler *compiler,

 void
+vec4_tcs_visitor::emit_nir_code()
+{
+   if (key->program_string_id != 0) {
+      /* We have a real application-supplied TCS, emit real code. */
+      vec4_visitor::emit_nir_code();
+   } else {
+      /* There is no TCS; automatically generate a passthrough shader
+       * that writes the API-specified default tessellation levels and
+       * copies VS outputs to TES inputs.
+       */
+      uniforms = 2;
+      uniform_size[0] = 1;
+      uniform_size[1] = 1;
+
+      uint64_t varyings = key->outputs_written;
+
+      src_reg vertex_offset(this, glsl_type::uint_type);
+      emit(MUL(dst_reg(vertex_offset), invocation_id,
+               brw_imm_ud(prog_data->vue_map.num_per_vertex_slots)));
+
+      while (varyings != 0) {
+         const int varying = ffsll(varyings) - 1;
+
+         unsigned in_offset = input_vue_map->varying_to_slot[varying];
+         unsigned out_offset = prog_data->vue_map.varying_to_slot[varying];
+         assert(out_offset >= 2);
+
+         dst_reg val(this, glsl_type::vec4_type);
+         emit_input_urb_read(val, invocation_id, in_offset, src_reg());
+         emit_urb_write(src_reg(val), WRITEMASK_XYZW, out_offset,
+                        vertex_offset);
+
+         varyings &= ~BITFIELD64_BIT(varying);
+      }
+
+      /* Only write the tessellation factors from invocation 0.
+       * There's no point in making other threads do redundant work.
+       */
+      emit(CMP(dst_null_d(), invocation_id, brw_imm_ud(0),
+               BRW_CONDITIONAL_EQ));
+      emit(IF(BRW_PREDICATE_NORMAL));
+      emit_urb_write(src_reg(UNIFORM, 0, glsl_type::vec4_type),
+                     WRITEMASK_XYZW, 0, src_reg());
+      emit_urb_write(src_reg(UNIFORM, 1, glsl_type::vec4_type),
+                     WRITEMASK_XYZW, 1, src_reg());
+      emit(BRW_OPCODE_ENDIF);
+   }
+}
+
+void
 vec4_tcs_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr)
 {
 }

 dst_reg *
-vec4_tcs_visitor::make_reg_for_system_value(int location)
+vec4_tcs_visitor::make_reg_for_system_value(int location, const glsl_type *type)
 {
    return NULL;
 }
@@ -135,9 +184,7 @@ vec4_tcs_visitor::emit_thread_end()
     * we don't have stride in the vec4 world, nor UV immediates in
     * align16, so we need an opcode to get invocation_id<0,4,0>.
     */
-   set_condmod(BRW_CONDITIONAL_Z,
-               emit(TCS_OPCODE_SRC0_010_IS_ZERO, dst_null_d(),
-                    invocation_id));
+   emit(TCS_OPCODE_SRC0_010_IS_ZERO, dst_null_d(), invocation_id);
    emit(IF(BRW_PREDICATE_NORMAL));
    for (unsigned i = 0; i < key->input_vertices; i += 2) {
       /* If we have an odd number of input vertices, the last will be
@@ -166,7 +213,6 @@
 void
 vec4_tcs_visitor::emit_input_urb_read(const dst_reg &dst,
                                       const src_reg &vertex_index,
                                       unsigned base_offset,
-                                      unsigned first_component,
                                       const src_reg &indirect_offset)
 {
    vec4_instruction *inst;
@@ -192,16 +238,13 @@ vec4_tcs_visitor::emit_input_urb_read(const dst_reg &dst,
    if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
       emit(MOV(dst, swizzle(src_reg(temp), BRW_SWIZZLE_WWWW)));
    } else {
-      src_reg src = src_reg(temp);
-      src.swizzle = BRW_SWZ_COMP_INPUT(first_component);
-      emit(MOV(dst, src));
+      emit(MOV(dst, src_reg(temp)));
    }
 }

 void
 vec4_tcs_visitor::emit_output_urb_read(const dst_reg &dst,
                                        unsigned base_offset,
-                                       unsigned first_component,
                                        const src_reg &indirect_offset)
 {
    vec4_instruction *inst;
@@ -217,12 +260,6 @@ vec4_tcs_visitor::emit_output_urb_read(const dst_reg &dst,
    read->offset = base_offset;
    read->mlen = 1;
    read->base_mrf = -1;
-
-   if (first_component) {
-      src_reg src = src_reg(dst);
-      src.swizzle = BRW_SWZ_COMP_INPUT(first_component);
-      emit(MOV(dst, src));
-   }
 }

 void
@@ -249,6 +286,53 @@ vec4_tcs_visitor::emit_urb_write(const src_reg &value,
    inst->base_mrf = -1;
 }

+static unsigned
+tesslevel_outer_components(GLenum tes_primitive_mode)
+{
+   switch (tes_primitive_mode) {
+   case GL_QUADS:
+      return 4;
+   case GL_TRIANGLES:
+      return 3;
+   case GL_ISOLINES:
+      return 2;
+   default:
+      unreachable("Bogus tessellation domain");
+   }
+   return 0;
+}
+
+static unsigned
+tesslevel_inner_components(GLenum tes_primitive_mode)
+{
+   switch (tes_primitive_mode) {
+   case GL_QUADS:
+      return 2;
+   case GL_TRIANGLES:
+      return 1;
+   case GL_ISOLINES:
+      return 0;
+   default:
+      unreachable("Bogus tessellation domain");
+   }
+   return 0;
+}
+
+/**
+ * Given a normal .xyzw writemask, convert it to a writemask for a vector
+ * that's stored backwards, i.e. .wzyx.
+ */
+static unsigned
+writemask_for_backwards_vector(unsigned mask)
+{
+   unsigned new_mask = 0;
+
+   for (int i = 0; i < 4; i++)
+      new_mask |= ((mask >> i) & 1) << (3 - i);
+
+   return new_mask;
+}
+
 void
 vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
 {
@@ -271,14 +355,13 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       nir_const_value *vertex_const = nir_src_as_const_value(instr->src[0]);
       src_reg vertex_index =
-         vertex_const ? src_reg(brw_imm_ud(vertex_const->u32[0]))
+         vertex_const ? src_reg(brw_imm_ud(vertex_const->u[0]))
                       : get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1);

       dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
       dst.writemask = brw_writemask_for_size(instr->num_components);

-      emit_input_urb_read(dst, vertex_index, imm_offset,
-                          nir_intrinsic_component(instr), indirect_offset);
+      emit_input_urb_read(dst, vertex_index, imm_offset, indirect_offset);
       break;
    }
    case nir_intrinsic_load_input:
@@ -287,7 +370,7 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
    case nir_intrinsic_load_output:
    case nir_intrinsic_load_per_vertex_output: {
       src_reg indirect_offset = get_indirect_offset(instr);
-      unsigned imm_offset = instr->const_index[0];
+      unsigned imm_offset = instr->const_index[0];;

       dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
       dst.writemask = brw_writemask_for_size(instr->num_components);
@@ -302,15 +385,14 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
          case GL_QUADS: {
             /* DWords 3-2 (reversed); use offset 0 and WZYX swizzle. */
             dst_reg tmp(this, glsl_type::vec4_type);
-            emit_output_urb_read(tmp, 0, 0, src_reg());
+            emit_output_urb_read(tmp, 0, src_reg());
             emit(MOV(writemask(dst, WRITEMASK_XY),
                      swizzle(src_reg(tmp), BRW_SWIZZLE_WZYX)));
             break;
          }
          case GL_TRIANGLES:
             /* DWord 4; use offset 1 but normal swizzle/writemask. */
-            emit_output_urb_read(writemask(dst, WRITEMASK_X), 1, 0,
-                                 src_reg());
+            emit_output_urb_read(writemask(dst, WRITEMASK_X), 1, src_reg());
             break;
          case GL_ISOLINES:
             /* All channels are undefined. */
@@ -342,11 +424,10 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
          }

          dst_reg tmp(this, glsl_type::vec4_type);
-         emit_output_urb_read(tmp, 1, 0, src_reg());
+         emit_output_urb_read(tmp, 1, src_reg());
          emit(MOV(dst, swizzle(src_reg(tmp), swiz)));
       } else {
-         emit_output_urb_read(dst, imm_offset, nir_intrinsic_component(instr),
-                              indirect_offset);
+         emit_output_urb_read(dst, imm_offset, indirect_offset);
       }
       break;
    }
@@ -359,67 +440,53 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       src_reg indirect_offset = get_indirect_offset(instr);
       unsigned imm_offset = instr->const_index[0];

-      /* The passthrough shader writes the whole patch header as two vec4s;
-       * skip all the gl_TessLevelInner/Outer swizzling.
-       */
-      if (indirect_offset.file == BAD_FILE && !is_passthrough_shader) {
-         if (imm_offset == 0) {
-            value.type = BRW_REGISTER_TYPE_F;
+      if (imm_offset == 0 && indirect_offset.file == BAD_FILE) {
+         value.type = BRW_REGISTER_TYPE_F;

-            mask &=
-               (1 << tesslevel_inner_components(key->tes_primitive_mode)) - 1;
+         mask &= (1 << tesslevel_inner_components(key->tes_primitive_mode)) - 1;

-            /* This is a write to gl_TessLevelInner[], which lives in the
-             * Patch URB header. The layout depends on the domain.
+         /* This is a write to gl_TessLevelInner[], which lives in the
+          * Patch URB header. The layout depends on the domain.
+          */
+         switch (key->tes_primitive_mode) {
+         case GL_QUADS:
+            /* gl_TessLevelInner[].xy lives at DWords 3-2 (reversed).
+             * We use an XXYX swizzle to reverse put .xy in the .wz
+             * channels, and use a .zw writemask.
             */
-            switch (key->tes_primitive_mode) {
-            case GL_QUADS:
-               /* gl_TessLevelInner[].xy lives at DWords 3-2 (reversed).
-                * We use an XXYX swizzle to reverse put .xy in the .wz
-                * channels, and use a .zw writemask.
-                */
-               swiz = BRW_SWIZZLE4(0, 0, 1, 0);
-               mask = writemask_for_backwards_vector(mask);
-               break;
-            case GL_TRIANGLES:
-               /* gl_TessLevelInner[].x lives at DWord 4, so we set the
-                * writemask to X and bump the URB offset by 1.
-                */
-               imm_offset = 1;
-               break;
-            case GL_ISOLINES:
-               /* Skip; gl_TessLevelInner[] doesn't exist for isolines. */
-               return;
-            default:
-               unreachable("Bogus tessellation domain");
-            }
-         } else if (imm_offset == 1) {
-            value.type = BRW_REGISTER_TYPE_F;
-
-            mask &=
-               (1 << tesslevel_outer_components(key->tes_primitive_mode)) - 1;
-
-            /* This is a write to gl_TessLevelOuter[] which lives in the
-             * Patch URB Header at DWords 4-7. However, it's reversed, so
-             * instead of .xyzw we have .wzyx.
+            swiz = BRW_SWIZZLE4(0, 0, 1, 0);
+            mask = writemask_for_backwards_vector(mask);
+            break;
+         case GL_TRIANGLES:
+            /* gl_TessLevelInner[].x lives at DWord 4, so we set the
+             * writemask to X and bump the URB offset by 1.
             */
-            if (key->tes_primitive_mode == GL_ISOLINES) {
-               /* Isolines .xy should be stored in .zw, in order. */
-               swiz = BRW_SWIZZLE4(0, 0, 0, 1);
-               mask <<= 2;
-            } else {
-               /* Other domains are reversed; store .wzyx instead of .xyzw. */
-               swiz = BRW_SWIZZLE_WZYX;
-               mask = writemask_for_backwards_vector(mask);
-            }
+            imm_offset = 1;
+            break;
+         case GL_ISOLINES:
+            /* Skip; gl_TessLevelInner[] doesn't exist for isolines. */
+            return;
+         default:
+            unreachable("Bogus tessellation domain");
          }
-      }
+      } else if (imm_offset == 1 && indirect_offset.file == BAD_FILE) {
+         value.type = BRW_REGISTER_TYPE_F;

-      unsigned first_component = nir_intrinsic_component(instr);
-      if (first_component) {
-         assert(swiz == BRW_SWIZZLE_XYZW);
-         swiz = BRW_SWZ_COMP_OUTPUT(first_component);
-         mask = mask << first_component;
+         mask &= (1 << tesslevel_outer_components(key->tes_primitive_mode)) - 1;
+
+         /* This is a write to gl_TessLevelOuter[] which lives in the
+          * Patch URB Header at DWords 4-7. However, it's reversed, so
+          * instead of .xyzw we have .wzyx.
+          */
+         if (key->tes_primitive_mode == GL_ISOLINES) {
+            /* Isolines .xy should be stored in .zw, in order. */
+            swiz = BRW_SWIZZLE4(0, 0, 0, 1);
+            mask <<= 2;
+         } else {
+            /* Other domains are reversed; store .wzyx instead of .xyzw. */
+            swiz = BRW_SWIZZLE_WZYX;
+            mask = writemask_for_backwards_vector(mask);
+         }
       }

       emit_urb_write(swizzle(value, swiz), mask,
@@ -451,36 +518,23 @@ brw_compile_tcs(const struct brw_compiler *compiler,
                 unsigned *final_assembly_size,
                 char **error_str)
 {
-   const struct gen_device_info *devinfo = compiler->devinfo;
+   const struct brw_device_info *devinfo = compiler->devinfo;
    struct brw_vue_prog_data *vue_prog_data = &prog_data->base;
    const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_CTRL];

    nir_shader *nir = nir_shader_clone(mem_ctx, src_shader);
+   nir = brw_nir_apply_sampler_key(nir, devinfo, &key->tex, is_scalar);
    nir->info.outputs_written = key->outputs_written;
    nir->info.patch_outputs_written = key->patch_outputs_written;
+   nir = brw_nir_lower_io(nir, compiler->devinfo, is_scalar, false, NULL);
+   nir = brw_postprocess_nir(nir, compiler->devinfo, is_scalar);

-   struct brw_vue_map input_vue_map;
-   brw_compute_vue_map(devinfo, &input_vue_map,
-                       nir->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID,
-                       true);
+   prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 2);

    brw_compute_tess_vue_map(&vue_prog_data->vue_map,
                             nir->info.outputs_written,
                             nir->info.patch_outputs_written);

-   nir = brw_nir_apply_sampler_key(nir, devinfo, &key->tex, is_scalar);
-   brw_nir_lower_vue_inputs(nir, is_scalar, &input_vue_map);
-   brw_nir_lower_tcs_outputs(nir, &vue_prog_data->vue_map);
-   if (key->quads_workaround)
-      brw_nir_apply_tcs_quads_workaround(nir);
-
-   nir = brw_postprocess_nir(nir, compiler->devinfo, is_scalar);
-
-   if (is_scalar)
-      prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 8);
-   else
-      prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 2);
-
    /* Compute URB entry size. The maximum allowed URB entry size is 32k.
     * That divides up as follows:
     *
@@ -507,6 +561,11 @@ brw_compile_tcs(const struct brw_compiler *compiler,
    /* URB entry sizes are stored as a multiple of 64 bytes. */
    vue_prog_data->urb_entry_size = ALIGN(output_size_bytes, 64) / 64;

+   struct brw_vue_map input_vue_map;
+   brw_compute_vue_map(devinfo, &input_vue_map,
+                       nir->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID,
+                       true);
+
    /* HS does not use the usual payload pushing from URB to GRFs,
     * because we don't have enough registers for a full-size payload, and
     * the hardware is broken on Haswell anyway.
@@ -520,50 +579,20 @@ brw_compile_tcs(const struct brw_compiler *compiler,
       brw_print_vue_map(stderr, &vue_prog_data->vue_map);
    }

-   if (is_scalar) {
-      fs_visitor v(compiler, log_data, mem_ctx, (void *) key,
-                   &prog_data->base.base, NULL, nir, 8,
-                   shader_time_index, &input_vue_map);
-      if (!v.run_tcs_single_patch()) {
-         if (error_str)
-            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
-         return NULL;
-      }
-
-      prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs;
-      prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
-
-      fs_generator g(compiler, log_data, mem_ctx, (void *) key,
-                     &prog_data->base.base, v.promoted_constants, false,
-                     MESA_SHADER_TESS_CTRL);
-      if (unlikely(INTEL_DEBUG & DEBUG_TCS)) {
-         g.enable_debug(ralloc_asprintf(mem_ctx,
-                                        "%s tessellation control shader %s",
-                                        nir->info.label ? nir->info.label
-                                                        : "unnamed",
-                                        nir->info.name));
-      }
-
-      g.generate_code(v.cfg, 8);
-
-      return g.get_assembly(final_assembly_size);
-   } else {
-      vec4_tcs_visitor v(compiler, log_data, key, prog_data,
-                         nir, mem_ctx, shader_time_index, &input_vue_map);
-      if (!v.run()) {
-         if (error_str)
-            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
-         return NULL;
-      }
-
-      if (unlikely(INTEL_DEBUG & DEBUG_TCS))
-         v.dump_instructions();
+   vec4_tcs_visitor v(compiler, log_data, key, prog_data,
+                      nir, mem_ctx, shader_time_index, &input_vue_map);
+   if (!v.run()) {
+      if (error_str)
+         *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
+      return NULL;
+   }

+   if (unlikely(INTEL_DEBUG & DEBUG_TCS))
+      v.dump_instructions();

-      return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir,
-                                        &prog_data->base, v.cfg,
-                                        final_assembly_size);
-   }
+   return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir,
+                                     &prog_data->base, v.cfg,
+                                     final_assembly_size);
 }
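writemask_for_backwards_vector(), added back in brw_vec4_tcs.cpp above, mirrors a 4-bit writemask because the tessellation factors sit .wzyx-reversed in the patch URB header. The function is small enough to sanity-check standalone; mask values here are the usual one-bit-per-channel .xyzw encoding:

#include <stdio.h>

static unsigned
writemask_for_backwards_vector(unsigned mask)
{
   unsigned new_mask = 0;

   for (int i = 0; i < 4; i++)
      new_mask |= ((mask >> i) & 1) << (3 - i);

   return new_mask;
}

int
main(void)
{
   /* .xy (0b0011) becomes .zw (0b1100); .x alone becomes .w. */
   printf("0x%x -> 0x%x\n", 0x3, writemask_for_backwards_vector(0x3));
   printf("0x%x -> 0x%x\n", 0x1, writemask_for_backwards_vector(0x1));
   return 0;
}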
diff --git a/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tcs.h b/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tcs.h
index 030eb5e66..2c6801b2a 100644
--- a/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tcs.h
+++ b/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tcs.h
@@ -49,7 +49,9 @@ public:
                     const struct brw_vue_map *input_vue_map);

 protected:
-   virtual dst_reg *make_reg_for_system_value(int location);
+   virtual void emit_nir_code();
+   virtual dst_reg *make_reg_for_system_value(int location,
+                                              const glsl_type *type);
    virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr);
    virtual void setup_payload();
    virtual void emit_prolog();
@@ -60,11 +62,9 @@ protected:
    void emit_input_urb_read(const dst_reg &dst,
                             const src_reg &vertex_index,
                             unsigned base_offset,
-                            unsigned first_component,
                             const src_reg &indirect_offset);
    void emit_output_urb_read(const dst_reg &dst,
                              unsigned base_offset,
-                             unsigned first_component,
                              const src_reg &indirect_offset);

    void emit_urb_write(const src_reg &value, unsigned writemask,
diff --git a/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tes.cpp b/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tes.cpp
index 226dcb4f6..7ba494fbf 100644
--- a/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tes.cpp
+++ b/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tes.cpp
@@ -46,7 +46,7 @@ vec4_tes_visitor::vec4_tes_visitor(const struct brw_compiler *compiler,

 dst_reg *
-vec4_tes_visitor::make_reg_for_system_value(int location)
+vec4_tes_visitor::make_reg_for_system_value(int location, const glsl_type *type)
 {
    return NULL;
 }
@@ -177,9 +177,7 @@ vec4_tes_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
    case nir_intrinsic_load_input:
    case nir_intrinsic_load_per_vertex_input: {
       src_reg indirect_offset = get_indirect_offset(instr);
-      dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
       unsigned imm_offset = instr->const_index[0];
-      unsigned first_component = nir_intrinsic_component(instr);
       src_reg header = input_read_header;
       if (indirect_offset.file != BAD_FILE) {
@@ -192,10 +190,8 @@ vec4_tes_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
          */
         const unsigned max_push_slots = 24;
         if (imm_offset < max_push_slots) {
-            src_reg src = src_reg(ATTR, imm_offset, glsl_type::ivec4_type);
-            src.swizzle = BRW_SWZ_COMP_INPUT(first_component);
-
-            emit(MOV(dst, src));
+            emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D),
+                     src_reg(ATTR, imm_offset, glsl_type::ivec4_type)));

            prog_data->urb_read_length =
               MAX2(prog_data->urb_read_length,
                    DIV_ROUND_UP(imm_offset + 1, 2));
@@ -209,14 +205,12 @@ vec4_tes_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
         read->offset = imm_offset;
         read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;

-         src_reg src = src_reg(temp);
-         src.swizzle = BRW_SWZ_COMP_INPUT(first_component);
-
         /* Copy to target. We might end up with some funky writemasks landing
          * in here, but we really don't want them in the above pseudo-ops.
          */
+         dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
         dst.writemask = brw_writemask_for_size(instr->num_components);
-         emit(MOV(dst, src));
+         emit(MOV(dst, src_reg(temp)));
         break;
      }
      default:
diff --git a/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tes.h b/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tes.h
index 31a28f359..4b697aa59 100644
--- a/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tes.h
+++ b/lib/mesa/src/mesa/drivers/dri/i965/brw_vec4_tes.h
@@ -47,7 +47,8 @@ public:
                     int shader_time_index);

 protected:
-   virtual dst_reg *make_reg_for_system_value(int location);
+   virtual dst_reg *make_reg_for_system_value(int location,
+                                              const glsl_type *type);
    virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr);
    virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
diff --git a/lib/mesa/src/mesa/drivers/dri/i965/test_vec4_cmod_propagation.cpp b/lib/mesa/src/mesa/drivers/dri/i965/test_vec4_cmod_propagation.cpp
index 1323b6507..8d4a447a8 100644
--- a/lib/mesa/src/mesa/drivers/dri/i965/test_vec4_cmod_propagation.cpp
+++ b/lib/mesa/src/mesa/drivers/dri/i965/test_vec4_cmod_propagation.cpp
@@ -36,9 +36,10 @@ class cmod_propagation_test : public ::testing::Test {
 public:
    struct brw_compiler *compiler;
-   struct gen_device_info *devinfo;
+   struct brw_device_info *devinfo;
    struct gl_context *ctx;
    struct gl_shader_program *shader_prog;
+   struct brw_vertex_program *vp;
    struct brw_vue_prog_data *prog_data;
    vec4_visitor *v;
 };
@@ -57,7 +58,8 @@ public:

 protected:
    /* Dummy implementation for pure virtual methods */
-   virtual dst_reg *make_reg_for_system_value(int location)
+   virtual dst_reg *make_reg_for_system_value(int location,
+                                              const glsl_type *type)
    {
       unreachable("Not reached");
    }
@@ -98,14 +100,18 @@ void cmod_propagation_test::SetUp()
 {
    ctx = (struct gl_context *)calloc(1, sizeof(*ctx));
    compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler));
-   devinfo = (struct gen_device_info *)calloc(1, sizeof(*devinfo));
+   devinfo = (struct brw_device_info *)calloc(1, sizeof(*devinfo));
    prog_data = (struct brw_vue_prog_data *)calloc(1, sizeof(*prog_data));
    compiler->devinfo = devinfo;

+   vp = ralloc(NULL, struct brw_vertex_program);
+
    nir_shader *shader = nir_shader_create(NULL, MESA_SHADER_VERTEX, NULL);

    v = new cmod_propagation_vec4_visitor(compiler, shader, prog_data);

+   _mesa_init_gl_program(&vp->program.Base, GL_VERTEX_SHADER, 0);
+
    devinfo->gen = 4;
 }
@@ -370,7 +376,7 @@ TEST_F(cmod_propagation_test, intervening_dest_write)
    src_reg zero(brw_imm_f(0.0f));
    bld.ADD(offset(dest, 2), src0, src1);
    bld.emit(SHADER_OPCODE_TEX, dest, src2)
-      ->size_written = 4 * REG_SIZE;
+      ->regs_written = 4;
    bld.CMP(bld.null_reg_f(), offset(src_reg(dest), 2), zero,
            BRW_CONDITIONAL_GE);

    /* = Before = |