| author | Jonathan Gray <jsg@cvs.openbsd.org> | 2017-12-31 07:12:27 +0000 |
|---|---|---|
| committer | Jonathan Gray <jsg@cvs.openbsd.org> | 2017-12-31 07:12:27 +0000 |
| commit | 051645c92924bf915d82bf219f2ed67309b5577a (patch) | |
| tree | 4aae126dd8e5a18c6a9926a5468d1561e6038a07 /lib/mesa/src/gallium/auxiliary/gallivm | |
| parent | 2dae6fe6f74cf7fb9fd65285302c0331d9786b00 (diff) | |
Merge Mesa 17.2.8
Diffstat (limited to 'lib/mesa/src/gallium/auxiliary/gallivm')
27 files changed, 1871 insertions, 740 deletions
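Before the hunks themselves, a note on the headline addition in lp_bld_arit.c: the new lp_build_mul_32_lohi/lp_build_mul_32_lohi_cpu helpers perform a widening 32x32 -> 64 bit multiply, returning the low 32 bits and passing the high 32 bits back through *res_hi. A minimal scalar sketch of those semantics (plain C for reference, not gallivm IR; the function names below are illustrative, not Mesa's):

```c
#include <stdint.h>
#include <stdio.h>

/* Reference semantics of the widening 32x32 -> 64 bit multiply:
 * the low 32 bits are returned, the high 32 bits go to *res_hi.
 * This models what the generic lp_build_mul_32_lohi path emits:
 * zero/sign-extend to 64 bit, multiply, truncate for the low half,
 * shift-then-truncate for the high half. */
static uint32_t mul_32_lohi_u(uint32_t a, uint32_t b, uint32_t *res_hi)
{
   uint64_t wide = (uint64_t)a * (uint64_t)b;  /* zero-extended multiply */
   *res_hi = (uint32_t)(wide >> 32);           /* high bits */
   return (uint32_t)wide;                      /* low bits (truncation) */
}

static int32_t mul_32_lohi_s(int32_t a, int32_t b, int32_t *res_hi)
{
   int64_t wide = (int64_t)a * (int64_t)b;     /* sign-extended multiply */
   /* logical vs. arithmetic shift is moot here: truncation keeps only
    * bits 32..63 of the product either way */
   *res_hi = (int32_t)(wide >> 32);
   return (int32_t)wide;
}

int main(void)
{
   uint32_t hi;
   uint32_t lo = mul_32_lohi_u(0xffffffffu, 0xffffffffu, &hi);
   printf("hi=%08x lo=%08x\n", hi, lo);  /* hi=fffffffe lo=00000001 */
   return 0;
}
```

The separate _cpu variant exists because, per the comment in the hunk below, LLVM does not recognize this zext/mul/trunc pattern as a widening multiply on x86 SIMD and produces poor code, so the patch hand-picks pmuludq/pmuldq-style intrinsics for vector lengths 4 and 8 instead.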
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_arit.c index 3ea073433..04f86bef2 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -1091,6 +1091,180 @@ lp_build_mul(struct lp_build_context *bld, return res; } +/* + * Widening mul, valid for 32x32 bit -> 64bit only. + * Result is low 32bits, high bits returned in res_hi. + * + * Emits code that is meant to be compiled for the host CPU. + */ +LLVMValueRef +lp_build_mul_32_lohi_cpu(struct lp_build_context *bld, + LLVMValueRef a, + LLVMValueRef b, + LLVMValueRef *res_hi) +{ + struct gallivm_state *gallivm = bld->gallivm; + LLVMBuilderRef builder = gallivm->builder; + + assert(bld->type.width == 32); + assert(bld->type.floating == 0); + assert(bld->type.fixed == 0); + assert(bld->type.norm == 0); + + /* + * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces + * for x86 simd is atrocious (even if the high bits weren't required), + * trying to handle real 64bit inputs (which of course can't happen due + * to using 64bit umul with 32bit numbers zero-extended to 64bit, but + * apparently llvm does not recognize this widening mul). This includes 6 + * (instead of 2) pmuludq plus extra adds and shifts + * The same story applies to signed mul, albeit fixing this requires sse41. + * https://llvm.org/bugs/show_bug.cgi?id=30845 + * So, whip up our own code, albeit only for length 4 and 8 (which + * should be good enough)... + */ + if ((bld->type.length == 4 || bld->type.length == 8) && + ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) || + util_cpu_caps.has_sse4_1)) { + const char *intrinsic = NULL; + LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd; + LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec; + struct lp_type type_wide = lp_wider_type(bld->type); + LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide); + unsigned i; + for (i = 0; i < bld->type.length; i += 2) { + shuf[i] = lp_build_const_int32(gallivm, i+1); + shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); + } + shuf_vec = LLVMConstVector(shuf, bld->type.length); + aeven = a; + beven = b; + aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, ""); + bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, ""); + + if (util_cpu_caps.has_avx2 && bld->type.length == 8) { + if (bld->type.sign) { + intrinsic = "llvm.x86.avx2.pmul.dq"; + } else { + intrinsic = "llvm.x86.avx2.pmulu.dq"; + } + muleven = lp_build_intrinsic_binary(builder, intrinsic, + wider_type, aeven, beven); + mulodd = lp_build_intrinsic_binary(builder, intrinsic, + wider_type, aodd, bodd); + } + else { + /* for consistent naming look elsewhere... */ + if (bld->type.sign) { + intrinsic = "llvm.x86.sse41.pmuldq"; + } else { + intrinsic = "llvm.x86.sse2.pmulu.dq"; + } + /* + * XXX If we only have AVX but not AVX2 this is a pain. + * lp_build_intrinsic_binary_anylength() can't handle it + * (due to src and dst type not being identical). 
+ */ + if (bld->type.length == 8) { + LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi; + LLVMValueRef aoddlo, aoddhi, boddlo, boddhi; + LLVMValueRef muleven2[2], mulodd2[2]; + struct lp_type type_wide_half = type_wide; + LLVMTypeRef wtype_half; + type_wide_half.length = 2; + wtype_half = lp_build_vec_type(gallivm, type_wide_half); + aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4); + aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4); + bevenlo = lp_build_extract_range(gallivm, beven, 0, 4); + bevenhi = lp_build_extract_range(gallivm, beven, 4, 4); + aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4); + aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4); + boddlo = lp_build_extract_range(gallivm, bodd, 0, 4); + boddhi = lp_build_extract_range(gallivm, bodd, 4, 4); + muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic, + wtype_half, aevenlo, bevenlo); + mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic, + wtype_half, aoddlo, boddlo); + muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic, + wtype_half, aevenhi, bevenhi); + mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic, + wtype_half, aoddhi, boddhi); + muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2); + mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2); + + } + else { + muleven = lp_build_intrinsic_binary(builder, intrinsic, + wider_type, aeven, beven); + mulodd = lp_build_intrinsic_binary(builder, intrinsic, + wider_type, aodd, bodd); + } + } + muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, ""); + mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, ""); + + for (i = 0; i < bld->type.length; i += 2) { + shuf[i] = lp_build_const_int32(gallivm, i + 1); + shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length); + } + shuf_vec = LLVMConstVector(shuf, bld->type.length); + *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, ""); + + for (i = 0; i < bld->type.length; i += 2) { + shuf[i] = lp_build_const_int32(gallivm, i); + shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length); + } + shuf_vec = LLVMConstVector(shuf, bld->type.length); + return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, ""); + } + else { + return lp_build_mul_32_lohi(bld, a, b, res_hi); + } +} + + +/* + * Widening mul, valid for 32x32 bit -> 64bit only. + * Result is low 32bits, high bits returned in res_hi. + * + * Emits generic code. + */ +LLVMValueRef +lp_build_mul_32_lohi(struct lp_build_context *bld, + LLVMValueRef a, + LLVMValueRef b, + LLVMValueRef *res_hi) +{ + struct gallivm_state *gallivm = bld->gallivm; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef tmp, shift, res_lo; + struct lp_type type_tmp; + LLVMTypeRef wide_type, narrow_type; + + type_tmp = bld->type; + narrow_type = lp_build_vec_type(gallivm, type_tmp); + type_tmp.width *= 2; + wide_type = lp_build_vec_type(gallivm, type_tmp); + shift = lp_build_const_vec(gallivm, type_tmp, 32); + + if (bld->type.sign) { + a = LLVMBuildSExt(builder, a, wide_type, ""); + b = LLVMBuildSExt(builder, b, wide_type, ""); + } else { + a = LLVMBuildZExt(builder, a, wide_type, ""); + b = LLVMBuildZExt(builder, b, wide_type, ""); + } + tmp = LLVMBuildMul(builder, a, b, ""); + + res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, ""); + + /* Since we truncate anyway, LShr and AShr are equivalent. 
*/ + tmp = LLVMBuildLShr(builder, tmp, shift, ""); + *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, ""); + + return res_lo; +} + /* a * b + c */ LLVMValueRef @@ -1198,7 +1372,9 @@ lp_build_div(struct lp_build_context *bld, return LLVMConstUDiv(a, b); } - if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || + /* fast rcp is disabled (just uses div), so makes no sense to try that */ + if(FALSE && + ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) && type.floating) return lp_build_mul(bld, a, lp_build_rcp(bld, b)); diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_arit.h index 622b930a9..2a4137a67 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_arit.h +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_arit.h @@ -77,6 +77,18 @@ lp_build_mul(struct lp_build_context *bld, LLVMValueRef b); LLVMValueRef +lp_build_mul_32_lohi_cpu(struct lp_build_context *bld, + LLVMValueRef a, + LLVMValueRef b, + LLVMValueRef *res_hi); + +LLVMValueRef +lp_build_mul_32_lohi(struct lp_build_context *bld, + LLVMValueRef a, + LLVMValueRef b, + LLVMValueRef *res_hi); + +LLVMValueRef lp_build_mul_imm(struct lp_build_context *bld, LLVMValueRef a, int b); diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_conv.c index 69d24a55b..c688965a7 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_conv.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_conv.c @@ -456,21 +456,21 @@ int lp_build_conv_auto(struct gallivm_state *gallivm, src_type.sign == dst_type->sign) return num_dsts; - /* Special case 4x4f -> 1x16ub or 2x8f -> 1x16ub + /* Special case 4x4x32 -> 1x16x8 or 2x8x32 -> 1x16x8 */ - if (src_type.floating == 1 && - src_type.fixed == 0 && - src_type.sign == 1 && - src_type.norm == 0 && + if (src_type.norm == 0 && src_type.width == 32 && + src_type.fixed == 0 && dst_type->floating == 0 && dst_type->fixed == 0 && - dst_type->sign == 0 && - dst_type->norm == 1 && - dst_type->width == 8) - { - /* Special case 4x4f --> 1x16ub */ + dst_type->width == 8 && + + ((src_type.floating == 1 && src_type.sign == 1 && dst_type->norm == 1) || + (src_type.floating == 0 && dst_type->floating == 0 && + src_type.sign == dst_type->sign && dst_type->norm == 0))) { + + /* Special case 4x4x32 --> 1x16x8 */ if (src_type.length == 4 && (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec)) { @@ -481,7 +481,7 @@ int lp_build_conv_auto(struct gallivm_state *gallivm, return num_dsts; } - /* Special case 2x8f --> 1x16ub */ + /* Special case 2x8x32 --> 1x16x8 */ if (src_type.length == 8 && util_cpu_caps.has_avx) { @@ -497,8 +497,25 @@ int lp_build_conv_auto(struct gallivm_state *gallivm, if (src_type.width == dst_type->width) { lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts); } else { - for (i = 0; i < num_srcs; ++i) { - lp_build_conv(gallivm, src_type, *dst_type, &src[i], 1, &dst[i], 1); + /* + * If dst_width is 16 bits and src_width 32 and the dst vector size + * 64bit, try feeding 2 vectors at once so pack intrinsics can be used. + * (For AVX, this isn't needed, since we usually get 256bit src and + * 128bit dst vectors which works ok. If we do AVX2 pack this should + * be extended but need to be able to tell conversion code about pack + * ordering first.) 
+ */ + unsigned ratio = 1; + if (src_type.width == 2 * dst_type->width && + src_type.length == dst_type->length && + dst_type->floating == 0 && (num_srcs % 2 == 0) && + dst_type->width * dst_type->length == 64) { + ratio = 2; + num_dsts /= 2; + dst_type->length *= 2; + } + for (i = 0; i < num_dsts; i++) { + lp_build_conv(gallivm, src_type, *dst_type, &src[i*ratio], ratio, &dst[i], 1); } } @@ -541,21 +558,25 @@ lp_build_conv(struct gallivm_state *gallivm, num_tmps = num_srcs; - /* Special case 4x4f --> 1x16ub, 2x4f -> 1x8ub, 1x4f -> 1x4ub + /* + * Special case 4x4x32 --> 1x16x8, 2x4x32 -> 1x8x8, 1x4x32 -> 1x4x8 + * Only float -> s/unorm8 and (u)int32->(u)int8. + * XXX: This should cover all interesting backend cases for 8 bit, + * but should use same strategy if dst is 16 bit. */ - if (src_type.floating == 1 && - src_type.fixed == 0 && - src_type.sign == 1 && - src_type.norm == 0 && + if (src_type.norm == 0 && src_type.width == 32 && src_type.length == 4 && + src_type.fixed == 0 && dst_type.floating == 0 && dst_type.fixed == 0 && - dst_type.sign == 0 && - dst_type.norm == 1 && dst_type.width == 8 && + ((src_type.floating == 1 && src_type.sign == 1 && dst_type.norm == 1) || + (src_type.floating == 0 && dst_type.floating == 0 && + src_type.sign == dst_type.sign && dst_type.norm == 0)) && + ((dst_type.length == 16 && 4 * num_dsts == num_srcs) || (num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) && @@ -564,7 +585,7 @@ lp_build_conv(struct gallivm_state *gallivm, struct lp_build_context bld; struct lp_type int16_type, int32_type; struct lp_type dst_type_ext = dst_type; - LLVMValueRef const_255f; + LLVMValueRef const_scale; unsigned i, j; lp_build_context_init(&bld, gallivm, src_type); @@ -580,14 +601,54 @@ lp_build_conv(struct gallivm_state *gallivm, int32_type.length /= 4; int32_type.sign = 1; - const_255f = lp_build_const_vec(gallivm, src_type, 255.0f); + const_scale = lp_build_const_vec(gallivm, src_type, lp_const_scale(dst_type)); for (i = 0; i < num_dsts; ++i, src += 4) { LLVMValueRef lo, hi; - for (j = 0; j < dst_type.length / 4; ++j) { - tmp[j] = LLVMBuildFMul(builder, src[j], const_255f, ""); - tmp[j] = lp_build_iround(&bld, tmp[j]); + if (src_type.floating) { + for (j = 0; j < dst_type.length / 4; ++j) { + /* + * XXX This is not actually fully correct. The float to int + * conversion will produce 0x80000000 value for everything + * out of range and NaNs (on x86, llvm.x86.sse2.cvtps2dq). + * Hence, NaNs and negatives will get clamped just fine to zero + * (relying on clamping pack behavior) when converting to unorm, + * however too large values (both finite and infinite) will also + * end up as zero, not 255. + * For snorm, for now we'll keep bug compatibility with generic + * conversion path (meaning too large values are fine, but + * NaNs get converted to -128 (purely by luck, as we don't + * specify nan behavior for the max there) instead of 0). + */ + if (dst_type.sign) { + tmp[j] = lp_build_min(&bld, bld.one, src[j]); + + } + else { + if (0) { + tmp[j] = lp_build_min_ext(&bld, bld.one, src[j], + GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN); + } + tmp[j] = src[j]; + } + tmp[j] = LLVMBuildFMul(builder, tmp[j], const_scale, ""); + tmp[j] = lp_build_iround(&bld, tmp[j]); + } + } else { + for (j = 0; j < dst_type.length / 4; ++j) { + if (!dst_type.sign) { + /* + * Pack clamp is always signed->unsigned (or signed->signed). + * Hence need min. 
+ */ + LLVMValueRef const_max; + const_max = lp_build_const_int_vec(gallivm, src_type, 255); + tmp[j] = lp_build_min(&bld, src[j], const_max); + } else { + tmp[j] = src[j]; + } + } } if (num_srcs == 1) { @@ -612,20 +673,20 @@ lp_build_conv(struct gallivm_state *gallivm, return; } - /* Special case 2x8f --> 1x16ub, 1x8f ->1x8ub + /* Special case 2x8x32 --> 1x16x8, 1x8x32 ->1x8x8 */ - else if (src_type.floating == 1 && - src_type.fixed == 0 && - src_type.sign == 1 && - src_type.norm == 0 && - src_type.width == 32 && - src_type.length == 8 && - - dst_type.floating == 0 && - dst_type.fixed == 0 && - dst_type.sign == 0 && - dst_type.norm == 1 && - dst_type.width == 8 && + else if (src_type.norm == 0 && + src_type.width == 32 && + src_type.length == 8 && + src_type.fixed == 0 && + + dst_type.floating == 0 && + dst_type.fixed == 0 && + dst_type.width == 8 && + + ((src_type.floating == 1 && src_type.sign == 1 && dst_type.norm == 1) || + (src_type.floating == 0 && dst_type.floating == 0 && + src_type.sign == dst_type.sign && dst_type.norm == 0)) && ((dst_type.length == 16 && 2 * num_dsts == num_srcs) || (num_dsts == 1 && dst_type.length * num_srcs == 8)) && @@ -635,7 +696,7 @@ lp_build_conv(struct gallivm_state *gallivm, struct lp_build_context bld; struct lp_type int16_type, int32_type; struct lp_type dst_type_ext = dst_type; - LLVMValueRef const_255f; + LLVMValueRef const_scale; unsigned i; lp_build_context_init(&bld, gallivm, src_type); @@ -651,30 +712,44 @@ lp_build_conv(struct gallivm_state *gallivm, int32_type.length /= 4; int32_type.sign = 1; - const_255f = lp_build_const_vec(gallivm, src_type, 255.0f); + const_scale = lp_build_const_vec(gallivm, src_type, lp_const_scale(dst_type)); for (i = 0; i < num_dsts; ++i, src += 2) { - LLVMValueRef lo, hi, a, b; - - a = LLVMBuildFMul(builder, src[0], const_255f, ""); - a = lp_build_iround(&bld, a); - tmp[0] = lp_build_extract_range(gallivm, a, 0, 4); - tmp[1] = lp_build_extract_range(gallivm, a, 4, 4); - /* relying on clamping behavior of sse2 intrinsics here */ - lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]); - - if (num_srcs == 1) { - hi = lo; + unsigned j; + for (j = 0; j < (num_srcs == 1 ? 
1 : 2); j++) { + LLVMValueRef lo, hi, a; + + a = src[j]; + if (src_type.floating) { + if (dst_type.sign) { + a = lp_build_min(&bld, bld.one, a); + + } + else { + if (0) { + a = lp_build_min_ext(&bld, bld.one, a, + GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN); + } + } + a = LLVMBuildFMul(builder, a, const_scale, ""); + a = lp_build_iround(&bld, a); + } else { + if (!dst_type.sign) { + LLVMValueRef const_max; + const_max = lp_build_const_int_vec(gallivm, src_type, 255); + a = lp_build_min(&bld, a, const_max); + } + } + lo = lp_build_extract_range(gallivm, a, 0, 4); + hi = lp_build_extract_range(gallivm, a, 4, 4); + /* relying on clamping behavior of sse2 intrinsics here */ + tmp[j] = lp_build_pack2(gallivm, int32_type, int16_type, lo, hi); } - else { - b = LLVMBuildFMul(builder, src[1], const_255f, ""); - b = lp_build_iround(&bld, b); - tmp[2] = lp_build_extract_range(gallivm, b, 0, 4); - tmp[3] = lp_build_extract_range(gallivm, b, 4, 4); - hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]); + if (num_srcs == 1) { + tmp[1] = tmp[0]; } - dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi); + dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, tmp[0], tmp[1]); } if (num_srcs == 1) { @@ -841,6 +916,10 @@ lp_build_conv(struct gallivm_state *gallivm, new_type.width = dst_type.width; new_type.length = dst_type.length; + /* + * Note that resize when using packs can sometimes get min/max + * clamping for free. Should be able to exploit this... + */ lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts); tmp_type = new_type; diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format.h b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format.h index 5c866f420..6540caaa2 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format.h +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format.h @@ -143,6 +143,7 @@ void lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, const struct util_format_description *format_desc, struct lp_type type, + boolean aligned, LLVMValueRef base_ptr, LLVMValueRef offsets, LLVMValueRef i, diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c index 9f6b9e9fb..2f723857f 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c @@ -38,6 +38,7 @@ #include "util/u_math.h" #include "util/u_pointer.h" #include "util/u_string.h" +#include "util/u_cpu_detect.h" #include "lp_bld_arit.h" #include "lp_bld_init.h" @@ -49,7 +50,10 @@ #include "lp_bld_gather.h" #include "lp_bld_debug.h" #include "lp_bld_format.h" +#include "lp_bld_pack.h" #include "lp_bld_intr.h" +#include "lp_bld_logic.h" +#include "lp_bld_bitarit.h" /** @@ -137,6 +141,73 @@ format_matches_type(const struct util_format_description *desc, return TRUE; } +/* + * Do rounding when converting small unorm values to larger ones. + * Not quite 100% accurate, as it's done by appending MSBs, but + * should be good enough. + */ + +static inline LLVMValueRef +scale_bits_up(struct gallivm_state *gallivm, + int src_bits, + int dst_bits, + LLVMValueRef src, + struct lp_type src_type) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef result = src; + + if (src_bits == 1 && dst_bits > 1) { + /* + * Useful for a1 - we'd need quite some repeated copies otherwise. 
+ */ + struct lp_build_context bld; + LLVMValueRef dst_mask; + lp_build_context_init(&bld, gallivm, src_type); + dst_mask = lp_build_const_int_vec(gallivm, src_type, + (1 << dst_bits) - 1), + result = lp_build_cmp(&bld, PIPE_FUNC_EQUAL, src, + lp_build_const_int_vec(gallivm, src_type, 0)); + result = lp_build_andnot(&bld, dst_mask, result); + } + else if (dst_bits > src_bits) { + /* Scale up bits */ + int db = dst_bits - src_bits; + + /* Shift left by difference in bits */ + result = LLVMBuildShl(builder, + src, + lp_build_const_int_vec(gallivm, src_type, db), + ""); + + if (db <= src_bits) { + /* Enough bits in src to fill the remainder */ + LLVMValueRef lower = LLVMBuildLShr(builder, + src, + lp_build_const_int_vec(gallivm, src_type, + src_bits - db), + ""); + + result = LLVMBuildOr(builder, result, lower, ""); + } else if (db > src_bits) { + /* Need to repeatedly copy src bits to fill remainder in dst */ + unsigned n; + + for (n = src_bits; n < dst_bits; n *= 2) { + LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n); + + result = LLVMBuildOr(builder, + result, + LLVMBuildLShr(builder, result, shuv, ""), + ""); + } + } + } else { + assert (dst_bits == src_bits); + } + + return result; +} /** * Unpack a single pixel into its XYZW components. @@ -156,6 +227,7 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm, LLVMValueRef shifts[4]; LLVMValueRef masks[4]; LLVMValueRef scales[4]; + LLVMTypeRef vec32_type; boolean normalized; boolean needs_uitofp; @@ -171,19 +243,17 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm, * matches floating point size */ assert (LLVMTypeOf(packed) == LLVMInt32TypeInContext(gallivm->context)); + vec32_type = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4); + /* Broadcast the packed value to all four channels * before: packed = BGRA * after: packed = {BGRA, BGRA, BGRA, BGRA} */ - packed = LLVMBuildInsertElement(builder, - LLVMGetUndef(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)), - packed, + packed = LLVMBuildInsertElement(builder, LLVMGetUndef(vec32_type), packed, LLVMConstNull(LLVMInt32TypeInContext(gallivm->context)), ""); - packed = LLVMBuildShuffleVector(builder, - packed, - LLVMGetUndef(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)), - LLVMConstNull(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)), + packed = LLVMBuildShuffleVector(builder, packed, LLVMGetUndef(vec32_type), + LLVMConstNull(vec32_type), ""); /* Initialize vector constants */ @@ -224,8 +294,40 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm, /* Ex: convert packed = {XYZW, XYZW, XYZW, XYZW} * into masked = {X, Y, Z, W} */ - shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), ""); - masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), ""); + if (desc->block.bits < 32 && normalized) { + /* + * Note: we cannot do the shift below on x86 natively until AVX2. + * + * Old llvm versions will resort to scalar extract/shift insert, + * which is definitely terrible, new versions will just do + * several vector shifts and shuffle/blend results together. + * We could turn this into a variable left shift plus a constant + * right shift, and llvm would then turn the variable left shift + * into a mul for us (albeit without sse41 the mul needs emulation + * too...). However, since we're going to do a float mul + * anyway, we just adjust that mul instead (plus the mask), skipping + * the shift completely. 
+ * We could also use a extra mul when the format isn't normalized and + * we don't have AVX2 support, but don't bother for now. Unfortunately, + * this strategy doesn't work for 32bit formats (such as rgb10a2 or even + * rgba8 if it ends up here), as that would require UIToFP, albeit that + * would be fixable with easy 16bit shuffle (unless there's channels + * crossing 16bit boundaries). + */ + for (i = 0; i < 4; ++i) { + if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) { + unsigned bits = desc->channel[i].size; + unsigned shift = desc->channel[i].shift; + unsigned long long mask = ((1ULL << bits) - 1) << shift; + scales[i] = lp_build_const_float(gallivm, 1.0 / mask); + masks[i] = lp_build_const_int32(gallivm, mask); + } + } + masked = LLVMBuildAnd(builder, packed, LLVMConstVector(masks, 4), ""); + } else { + shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), ""); + masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), ""); + } if (!needs_uitofp) { /* UIToFP can't be expressed in SSE2 */ @@ -234,8 +336,10 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm, casted = LLVMBuildUIToFP(builder, masked, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), ""); } - /* At this point 'casted' may be a vector of floats such as - * {255.0, 255.0, 255.0, 255.0}. Next, if the pixel values are normalized + /* + * At this point 'casted' may be a vector of floats such as + * {255.0, 255.0, 255.0, 255.0}. (Normalized values may be multiplied + * by powers of two). Next, if the pixel values are normalized * we'll scale this to {1.0, 1.0, 1.0, 1.0}. */ @@ -391,9 +495,11 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, if (format_matches_type(format_desc, type) && format_desc->block.bits <= type.width * 4 && + /* XXX this shouldn't be needed */ util_is_power_of_two(format_desc->block.bits)) { LLVMValueRef packed; LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, type); + struct lp_type fetch_type; unsigned vec_len = type.width * type.length; /* @@ -401,8 +507,9 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, * scaling or converting. */ + fetch_type = lp_type_uint(type.width*4); packed = lp_build_gather(gallivm, type.length/4, - format_desc->block.bits, type.width*4, + format_desc->block.bits, fetch_type, aligned, base_ptr, offset, TRUE); assert(format_desc->block.bits <= vec_len); @@ -413,6 +520,86 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, } /* + * Bit arithmetic for converting small_unorm to unorm8. + * + * This misses some opportunities for optimizations (like skipping mask + * for the highest channel for instance, or doing bit scaling in parallel + * for channels with the same bit width) but it should be passable for + * all arithmetic formats. 
+ */ + if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && + format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB && + util_format_fits_8unorm(format_desc) && + type.width == 8 && type.norm == 1 && type.sign == 0 && + type.fixed == 0 && type.floating == 0) { + LLVMValueRef packed, res, chans[4], rgba[4]; + LLVMTypeRef dst_vec_type, conv_vec_type; + struct lp_type fetch_type, conv_type; + struct lp_build_context bld_conv; + unsigned j; + + fetch_type = lp_type_uint(type.width*4); + conv_type = lp_type_int_vec(type.width*4, type.width * type.length); + dst_vec_type = lp_build_vec_type(gallivm, type); + conv_vec_type = lp_build_vec_type(gallivm, conv_type); + lp_build_context_init(&bld_conv, gallivm, conv_type); + + packed = lp_build_gather(gallivm, type.length/4, + format_desc->block.bits, fetch_type, + aligned, base_ptr, offset, TRUE); + + assert(format_desc->block.bits * type.length / 4 <= + type.width * type.length); + + packed = LLVMBuildBitCast(gallivm->builder, packed, conv_vec_type, ""); + + for (j = 0; j < format_desc->nr_channels; ++j) { + unsigned mask = 0; + unsigned sa = format_desc->channel[j].shift; + + mask = (1 << format_desc->channel[j].size) - 1; + + /* Extract bits from source */ + chans[j] = LLVMBuildLShr(builder, packed, + lp_build_const_int_vec(gallivm, conv_type, sa), + ""); + + chans[j] = LLVMBuildAnd(builder, chans[j], + lp_build_const_int_vec(gallivm, conv_type, mask), + ""); + + /* Scale bits */ + if (type.norm) { + chans[j] = scale_bits_up(gallivm, format_desc->channel[j].size, + type.width, chans[j], conv_type); + } + } + /* + * This is a hacked lp_build_format_swizzle_soa() since we need a + * normalized 1 but only 8 bits in a 32bit vector... + */ + for (j = 0; j < 4; ++j) { + enum pipe_swizzle swizzle = format_desc->swizzle[j]; + if (swizzle == PIPE_SWIZZLE_1) { + rgba[j] = lp_build_const_int_vec(gallivm, conv_type, (1 << type.width) - 1); + } else { + rgba[j] = lp_build_swizzle_soa_channel(&bld_conv, chans, swizzle); + } + if (j == 0) { + res = rgba[j]; + } else { + rgba[j] = LLVMBuildShl(builder, rgba[j], + lp_build_const_int_vec(gallivm, conv_type, + j * type.width), ""); + res = LLVMBuildOr(builder, res, rgba[j], ""); + } + } + res = LLVMBuildBitCast(gallivm->builder, res, dst_vec_type, ""); + + return res; + } + + /* * Bit arithmetic */ @@ -421,6 +608,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) && format_desc->block.width == 1 && format_desc->block.height == 1 && + /* XXX this shouldn't be needed */ util_is_power_of_two(format_desc->block.bits) && format_desc->block.bits <= 32 && format_desc->is_bitmask && @@ -430,8 +618,15 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, !format_desc->channel[0].pure_integer) { LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4]; - LLVMValueRef res; - unsigned k; + LLVMValueRef res[LP_MAX_VECTOR_WIDTH / 128]; + struct lp_type conv_type; + unsigned k, num_conv_src, num_conv_dst; + + /* + * Note this path is generally terrible for fetching multiple pixels. + * We should make sure we cannot hit this code path for anything but + * single pixels. 
+ */ /* * Unpack a pixel at a time into a <4 x float> RGBA vector @@ -461,12 +656,38 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, __FUNCTION__, format_desc->short_name); } - lp_build_conv(gallivm, - lp_float32_vec4_type(), - type, - tmps, num_pixels, &res, 1); + conv_type = lp_float32_vec4_type(); + num_conv_src = num_pixels; + num_conv_dst = 1; + + if (num_pixels % 8 == 0) { + lp_build_concat_n(gallivm, lp_float32_vec4_type(), + tmps, num_pixels, tmps, num_pixels / 2); + conv_type.length *= num_pixels / 4; + num_conv_src = 4 * num_pixels / 8; + if (type.width == 8 && type.floating == 0 && type.fixed == 0) { + /* + * FIXME: The fast float->unorm path (which is basically + * skipping the MIN/MAX which are extremely pointless in any + * case) requires that there's 2 destinations... + * In any case, we really should make sure we don't hit this + * code with multiple pixels for unorm8 dst types, it's + * completely hopeless even if we do hit the right conversion. + */ + type.length /= num_pixels / 4; + num_conv_dst = num_pixels / 4; + } + } + + lp_build_conv(gallivm, conv_type, type, + tmps, num_conv_src, res, num_conv_dst); + + if (num_pixels % 8 == 0 && + (type.width == 8 && type.floating == 0 && type.fixed == 0)) { + lp_build_concat_n(gallivm, type, res, num_conv_dst, res, 1); + } - return lp_build_format_swizzle_aos(format_desc, &bld, res); + return lp_build_format_swizzle_aos(format_desc, &bld, res[0]); } /* If all channels are of same type and we are not using half-floats */ diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c index 8cad3a6fc..636a4a623 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c @@ -70,7 +70,14 @@ lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm, src_vec_type = lp_build_vec_type(gallivm, src_type); - /* Read whole vector from memory, unaligned */ + /* + * Read whole vector from memory, unaligned. + * XXX: Note it's actually aligned to element type. Not sure if all + * callers are able to guarantee that (whereas for others, we should + * be able to use full alignment when there's 2 or 4 channels). + * (If all callers can guarantee element type alignment, we should + * relax alignment restrictions elsewhere.) 
+ */ ptr = LLVMBuildGEP(builder, base_ptr, &offset, 1, ""); ptr = LLVMBuildPointerCast(builder, ptr, LLVMPointerType(src_vec_type, 0), ""); res = LLVMBuildLoad(builder, ptr, ""); diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c index 7fc4e8d24..22c19b10d 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c @@ -31,6 +31,7 @@ #include "util/u_format.h" #include "util/u_memory.h" #include "util/u_string.h" +#include "util/u_math.h" #include "lp_bld_type.h" #include "lp_bld_const.h" @@ -40,6 +41,39 @@ #include "lp_bld_debug.h" #include "lp_bld_format.h" #include "lp_bld_arit.h" +#include "lp_bld_pack.h" + + +static void +convert_to_soa(struct gallivm_state *gallivm, + LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32], + LLVMValueRef dst_soa[4], + const struct lp_type soa_type) +{ + unsigned j, k; + struct lp_type aos_channel_type = soa_type; + + LLVMValueRef aos_channels[4]; + unsigned pixels_per_channel = soa_type.length / 4; + + debug_assert((soa_type.length % 4) == 0); + + aos_channel_type.length >>= 1; + + for (j = 0; j < 4; ++j) { + LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 }; + + assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH); + + for (k = 0; k < pixels_per_channel; ++k) { + channel[k] = src_aos[j + 4 * k]; + } + + aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel); + } + + lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa); +} void @@ -48,9 +82,6 @@ lp_build_format_swizzle_soa(const struct util_format_description *format_desc, const LLVMValueRef *unswizzled, LLVMValueRef swizzled_out[4]) { - assert(PIPE_SWIZZLE_0 == (int)PIPE_SWIZZLE_0); - assert(PIPE_SWIZZLE_1 == (int)PIPE_SWIZZLE_1); - if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { enum pipe_swizzle swizzle; LLVMValueRef depth_or_stencil; @@ -83,6 +114,166 @@ lp_build_format_swizzle_soa(const struct util_format_description *format_desc, } + +static LLVMValueRef +lp_build_extract_soa_chan(struct lp_build_context *bld, + unsigned blockbits, + boolean srgb_chan, + struct util_format_channel_description chan_desc, + LLVMValueRef packed) +{ + struct gallivm_state *gallivm = bld->gallivm; + LLVMBuilderRef builder = gallivm->builder; + struct lp_type type = bld->type; + LLVMValueRef input = packed; + const unsigned width = chan_desc.size; + const unsigned start = chan_desc.shift; + const unsigned stop = start + width; + + /* Decode the input vector component */ + + switch(chan_desc.type) { + case UTIL_FORMAT_TYPE_VOID: + input = bld->undef; + break; + + case UTIL_FORMAT_TYPE_UNSIGNED: + /* + * Align the LSB + */ + if (start) { + input = LLVMBuildLShr(builder, input, + lp_build_const_int_vec(gallivm, type, start), ""); + } + + /* + * Zero the MSBs + */ + if (stop < blockbits) { + unsigned mask = ((unsigned long long)1 << width) - 1; + input = LLVMBuildAnd(builder, input, + lp_build_const_int_vec(gallivm, type, mask), ""); + } + + /* + * Type conversion + */ + if (type.floating) { + if (srgb_chan) { + struct lp_type conv_type = lp_uint_type(type); + input = lp_build_srgb_to_linear(gallivm, conv_type, width, input); + } + else { + if(chan_desc.normalized) + input = lp_build_unsigned_norm_to_float(gallivm, width, type, input); + else + input = LLVMBuildSIToFP(builder, input, bld->vec_type, ""); + } + } + else if (chan_desc.pure_integer) { + /* Nothing to do */ + } else { + /* FIXME */ + assert(0); + } + 
break; + + case UTIL_FORMAT_TYPE_SIGNED: + /* + * Align the sign bit first. + */ + if (stop < type.width) { + unsigned bits = type.width - stop; + LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits); + input = LLVMBuildShl(builder, input, bits_val, ""); + } + + /* + * Align the LSB (with an arithmetic shift to preserve the sign) + */ + if (chan_desc.size < type.width) { + unsigned bits = type.width - chan_desc.size; + LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits); + input = LLVMBuildAShr(builder, input, bits_val, ""); + } + + /* + * Type conversion + */ + if (type.floating) { + input = LLVMBuildSIToFP(builder, input, bld->vec_type, ""); + if (chan_desc.normalized) { + double scale = 1.0 / ((1 << (chan_desc.size - 1)) - 1); + LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale); + input = LLVMBuildFMul(builder, input, scale_val, ""); + /* + * The formula above will produce value below -1.0 for most negative + * value but everything seems happy with that hence disable for now. + */ + if (0) + input = lp_build_max(bld, input, + lp_build_const_vec(gallivm, type, -1.0f)); + } + } + else if (chan_desc.pure_integer) { + /* Nothing to do */ + } else { + /* FIXME */ + assert(0); + } + break; + + case UTIL_FORMAT_TYPE_FLOAT: + if (type.floating) { + if (chan_desc.size == 16) { + struct lp_type f16i_type = type; + f16i_type.width /= 2; + f16i_type.floating = 0; + if (start) { + input = LLVMBuildLShr(builder, input, + lp_build_const_int_vec(gallivm, type, start), ""); + } + input = LLVMBuildTrunc(builder, input, + lp_build_vec_type(gallivm, f16i_type), ""); + input = lp_build_half_to_float(gallivm, input); + } else { + assert(start == 0); + assert(stop == 32); + assert(type.width == 32); + } + input = LLVMBuildBitCast(builder, input, bld->vec_type, ""); + } + else { + /* FIXME */ + assert(0); + input = bld->undef; + } + break; + + case UTIL_FORMAT_TYPE_FIXED: + if (type.floating) { + double scale = 1.0 / ((1 << (chan_desc.size/2)) - 1); + LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale); + input = LLVMBuildSIToFP(builder, input, bld->vec_type, ""); + input = LLVMBuildFMul(builder, input, scale_val, ""); + } + else { + /* FIXME */ + assert(0); + input = bld->undef; + } + break; + + default: + assert(0); + input = bld->undef; + break; + } + + return input; +} + + /** * Unpack several pixels in SoA. 
* @@ -113,7 +304,6 @@ lp_build_unpack_rgba_soa(struct gallivm_state *gallivm, LLVMValueRef packed, LLVMValueRef rgba_out[4]) { - LLVMBuilderRef builder = gallivm->builder; struct lp_build_context bld; LLVMValueRef inputs[4]; unsigned chan; @@ -129,149 +319,19 @@ lp_build_unpack_rgba_soa(struct gallivm_state *gallivm, /* Decode the input vector components */ for (chan = 0; chan < format_desc->nr_channels; ++chan) { - const unsigned width = format_desc->channel[chan].size; - const unsigned start = format_desc->channel[chan].shift; - const unsigned stop = start + width; - LLVMValueRef input; - - input = packed; - - switch(format_desc->channel[chan].type) { - case UTIL_FORMAT_TYPE_VOID: - input = lp_build_undef(gallivm, type); - break; - - case UTIL_FORMAT_TYPE_UNSIGNED: - /* - * Align the LSB - */ - - if (start) { - input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(gallivm, type, start), ""); - } - - /* - * Zero the MSBs - */ - - if (stop < format_desc->block.bits) { - unsigned mask = ((unsigned long long)1 << width) - 1; - input = LLVMBuildAnd(builder, input, lp_build_const_int_vec(gallivm, type, mask), ""); - } - - /* - * Type conversion - */ - - if (type.floating) { - if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) { - if (format_desc->swizzle[3] == chan) { - input = lp_build_unsigned_norm_to_float(gallivm, width, type, input); - } - else { - struct lp_type conv_type = lp_uint_type(type); - input = lp_build_srgb_to_linear(gallivm, conv_type, width, input); - } - } - else { - if(format_desc->channel[chan].normalized) - input = lp_build_unsigned_norm_to_float(gallivm, width, type, input); - else - input = LLVMBuildSIToFP(builder, input, - lp_build_vec_type(gallivm, type), ""); - } - } - else if (format_desc->channel[chan].pure_integer) { - /* Nothing to do */ - } else { - /* FIXME */ - assert(0); - } - - break; - - case UTIL_FORMAT_TYPE_SIGNED: - /* - * Align the sign bit first. 
- */ - - if (stop < type.width) { - unsigned bits = type.width - stop; - LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits); - input = LLVMBuildShl(builder, input, bits_val, ""); - } - - /* - * Align the LSB (with an arithmetic shift to preserve the sign) - */ - - if (format_desc->channel[chan].size < type.width) { - unsigned bits = type.width - format_desc->channel[chan].size; - LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits); - input = LLVMBuildAShr(builder, input, bits_val, ""); - } - - /* - * Type conversion - */ + struct util_format_channel_description chan_desc = format_desc->channel[chan]; + boolean srgb_chan = FALSE; - if (type.floating) { - input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(gallivm, type), ""); - if (format_desc->channel[chan].normalized) { - double scale = 1.0 / ((1 << (format_desc->channel[chan].size - 1)) - 1); - LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale); - input = LLVMBuildFMul(builder, input, scale_val, ""); - /* the formula above will produce value below -1.0 for most negative - * value but everything seems happy with that hence disable for now */ - if (0) - input = lp_build_max(&bld, input, - lp_build_const_vec(gallivm, type, -1.0f)); - } - } - else if (format_desc->channel[chan].pure_integer) { - /* Nothing to do */ - } else { - /* FIXME */ - assert(0); - } - - break; - - case UTIL_FORMAT_TYPE_FLOAT: - if (type.floating) { - assert(start == 0); - assert(stop == 32); - assert(type.width == 32); - input = LLVMBuildBitCast(builder, input, lp_build_vec_type(gallivm, type), ""); - } - else { - /* FIXME */ - assert(0); - input = lp_build_undef(gallivm, type); - } - break; - - case UTIL_FORMAT_TYPE_FIXED: - if (type.floating) { - double scale = 1.0 / ((1 << (format_desc->channel[chan].size/2)) - 1); - LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale); - input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(gallivm, type), ""); - input = LLVMBuildFMul(builder, input, scale_val, ""); - } - else { - /* FIXME */ - assert(0); - input = lp_build_undef(gallivm, type); - } - break; - - default: - assert(0); - input = lp_build_undef(gallivm, type); - break; + if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && + format_desc->swizzle[3] != chan) { + srgb_chan = TRUE; } - inputs[chan] = input; + inputs[chan] = lp_build_extract_soa_chan(&bld, + format_desc->block.bits, + srgb_chan, + chan_desc, + packed); } lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out); @@ -336,6 +396,7 @@ lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm, * * \param type the desired return type for 'rgba'. The vector length * is the number of texels to fetch + * \param aligned if the offset is guaranteed to be aligned to element width * * \param base_ptr points to the base of the texture mip tree. * \param offset offset to start of the texture image block. 
For non- @@ -352,6 +413,7 @@ void lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, const struct util_format_description *format_desc, struct lp_type type, + boolean aligned, LLVMValueRef base_ptr, LLVMValueRef offset, LLVMValueRef i, @@ -360,6 +422,8 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, LLVMValueRef rgba_out[4]) { LLVMBuilderRef builder = gallivm->builder; + enum pipe_format format = format_desc->format; + struct lp_type fetch_type; if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB || @@ -369,7 +433,8 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, format_desc->block.height == 1 && format_desc->block.bits <= type.width && (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT || - format_desc->channel[0].size == 32)) + format_desc->channel[0].size == 32 || + format_desc->channel[0].size == 16)) { /* * The packed pixel fits into an element of the destination format. Put @@ -384,11 +449,12 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, * Ex: packed = {XYZW, XYZW, XYZW, XYZW} */ assert(format_desc->block.bits <= type.width); + fetch_type = lp_type_uint(type.width); packed = lp_build_gather(gallivm, type.length, format_desc->block.bits, - type.width, - TRUE, + fetch_type, + aligned, base_ptr, offset, FALSE); /* @@ -401,22 +467,232 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, return; } - if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT || - format_desc->format == PIPE_FORMAT_R9G9B9E5_FLOAT) { + + if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && + (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) && + format_desc->block.width == 1 && + format_desc->block.height == 1 && + format_desc->block.bits > type.width && + ((format_desc->block.bits <= type.width * type.length && + format_desc->channel[0].size <= type.width) || + (format_desc->channel[0].size == 64 && + format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT && + type.floating))) + { + /* + * Similar to above, but the packed pixel is larger than what fits + * into an element of the destination format. The packed pixels will be + * shuffled into SoA vectors appropriately, and then the extraction will + * be done in parallel as much as possible. + * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so + * the gathered vectors can be shuffled easily (even with avx). + * 64xn float -> 32xn float is handled too but it's a bit special as + * it does the conversion pre-shuffle. + */ + + LLVMValueRef packed[4], dst[4], output[4], shuffles[LP_MAX_VECTOR_WIDTH/32]; + struct lp_type fetch_type, gather_type = type; + unsigned num_gather, fetch_width, i, j; + struct lp_build_context bld; + boolean fp64 = format_desc->channel[0].size == 64; + + lp_build_context_init(&bld, gallivm, type); + + assert(type.width == 32); + assert(format_desc->block.bits > type.width); + + /* + * First, figure out fetch order. + */ + fetch_width = util_next_power_of_two(format_desc->block.bits); + /* + * fp64 are treated like fp32 except we fetch twice wide values + * (as we shuffle after trunc). The shuffles for that work out + * mostly fine (slightly suboptimal for 4-wide, perfect for AVX) + * albeit we miss the potential opportunity for hw gather (as it + * only handles native size). 
+ */ + num_gather = fetch_width / type.width; + gather_type.width *= num_gather; + if (fp64) { + num_gather /= 2; + } + gather_type.length /= num_gather; + + for (i = 0; i < num_gather; i++) { + LLVMValueRef offsetr, shuf_vec; + if(num_gather == 4) { + for (j = 0; j < gather_type.length; j++) { + unsigned idx = i + 4*j; + shuffles[j] = lp_build_const_int32(gallivm, idx); + } + shuf_vec = LLVMConstVector(shuffles, gather_type.length); + offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, ""); + + } + else if (num_gather == 2) { + assert(num_gather == 2); + for (j = 0; j < gather_type.length; j++) { + unsigned idx = i*2 + (j%2) + (j/2)*4; + shuffles[j] = lp_build_const_int32(gallivm, idx); + } + shuf_vec = LLVMConstVector(shuffles, gather_type.length); + offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, ""); + } + else { + assert(num_gather == 1); + offsetr = offset; + } + if (gather_type.length == 1) { + LLVMValueRef zero = lp_build_const_int32(gallivm, 0); + offsetr = LLVMBuildExtractElement(builder, offsetr, zero, ""); + } + + /* + * Determine whether to use float or int loads. This is mostly + * to outsmart the (stupid) llvm int/float shuffle logic, we + * don't really care much if the data is floats or ints... + * But llvm will refuse to use single float shuffle with int data + * and instead use 3 int shuffles instead, the code looks atrocious. + * (Note bitcasts often won't help, as llvm is too smart to be + * fooled by that.) + * Nobody cares about simd float<->int domain transition penalties, + * which usually don't even exist for shuffles anyway. + * With 4x32bit (and 3x32bit) fetch, we use float vec (the data is + * going into transpose, which is unpacks, so doesn't really matter + * much). + * With 2x32bit or 4x16bit fetch, we use float vec, since those + * go into the weird channel separation shuffle. With floats, + * this is (with 128bit vectors): + * - 2 movq, 2 movhpd, 2 shufps + * With ints it would be: + * - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw + * I've seen texture functions increase in code size by 15% just due + * to that (there's lots of such fetches in them...) + * (We could chose a different gather order to improve this somewhat + * for the int path, but it would basically just drop the blends, + * so the float path with this order really is optimal.) + * Albeit it is tricky sometimes llvm doesn't ignore the float->int + * casts so must avoid them until we're done with the float shuffle... + * 3x16bit formats (the same is also true for 3x8) are pretty bad but + * there's nothing we can do about them (we could overallocate by + * those couple bytes and use unaligned but pot sized load). + * Note that this is very much x86 specific. I don't know if this + * affect other archs at all. + */ + if (num_gather > 1) { + /* + * We always want some float type here (with x86) + * due to shuffles being float ones afterwards (albeit for + * the num_gather == 4 case int should work fine too + * (unless there's some problems with avx but not avx2). 
+ */ + if (format_desc->channel[0].size == 64) { + fetch_type = lp_type_float_vec(64, gather_type.width); + } else { + fetch_type = lp_type_int_vec(32, gather_type.width); + } + } + else { + /* type doesn't matter much */ + if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT && + (format_desc->channel[0].size == 32 || + format_desc->channel[0].size == 64)) { + fetch_type = lp_type_float(gather_type.width); + } else { + fetch_type = lp_type_uint(gather_type.width); + } + } + + /* Now finally gather the values */ + packed[i] = lp_build_gather(gallivm, gather_type.length, + format_desc->block.bits, + fetch_type, aligned, + base_ptr, offsetr, FALSE); + if (fp64) { + struct lp_type conv_type = type; + conv_type.width *= 2; + packed[i] = LLVMBuildBitCast(builder, packed[i], + lp_build_vec_type(gallivm, conv_type), ""); + packed[i] = LLVMBuildFPTrunc(builder, packed[i], bld.vec_type, ""); + } + } + + /* shuffle the gathered values to SoA */ + if (num_gather == 2) { + for (i = 0; i < num_gather; i++) { + for (j = 0; j < type.length; j++) { + unsigned idx = (j%2)*2 + (j/4)*4 + i; + if ((j/2)%2) + idx += type.length; + shuffles[j] = lp_build_const_int32(gallivm, idx); + } + dst[i] = LLVMBuildShuffleVector(builder, packed[0], packed[1], + LLVMConstVector(shuffles, type.length), ""); + } + } + else if (num_gather == 4) { + lp_build_transpose_aos(gallivm, lp_int_type(type), packed, dst); + } + else { + assert(num_gather == 1); + dst[0] = packed[0]; + } + + /* + * And finally unpack exactly as above, except that + * chan shift is adjusted and the right vector selected. + */ + if (!fp64) { + for (i = 0; i < num_gather; i++) { + dst[i] = LLVMBuildBitCast(builder, dst[i], bld.int_vec_type, ""); + } + for (i = 0; i < format_desc->nr_channels; i++) { + struct util_format_channel_description chan_desc = format_desc->channel[i]; + unsigned blockbits = type.width; + unsigned vec_nr; + +#ifdef PIPE_ARCH_BIG_ENDIAN + vec_nr = (format_desc->block.bits - (chan_desc.shift + chan_desc.size)) / type.width; +#else + vec_nr = chan_desc.shift / type.width; +#endif + chan_desc.shift %= type.width; + + output[i] = lp_build_extract_soa_chan(&bld, + blockbits, + FALSE, + chan_desc, + dst[vec_nr]); + } + } + else { + for (i = 0; i < format_desc->nr_channels; i++) { + output[i] = dst[i]; + } + } + + lp_build_format_swizzle_soa(format_desc, &bld, output, rgba_out); + return; + } + + if (format == PIPE_FORMAT_R11G11B10_FLOAT || + format == PIPE_FORMAT_R9G9B9E5_FLOAT) { /* * similar conceptually to above but requiring special * AoS packed -> SoA float conversion code. */ LLVMValueRef packed; + struct lp_type fetch_type = lp_type_uint(type.width); assert(type.floating); assert(type.width == 32); packed = lp_build_gather(gallivm, type.length, format_desc->block.bits, - type.width, TRUE, + fetch_type, aligned, base_ptr, offset, FALSE); - if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) { + if (format == PIPE_FORMAT_R11G11B10_FLOAT) { lp_build_r11g11b10_to_float(gallivm, packed, rgba_out); } else { @@ -432,8 +708,9 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, * 32bit (or 8bit) from each block. */ LLVMValueRef packed; + struct lp_type fetch_type = lp_type_uint(type.width); - if (format_desc->format == PIPE_FORMAT_X32_S8X24_UINT) { + if (format == PIPE_FORMAT_X32_S8X24_UINT) { /* * for stencil simply fix up offsets - could in fact change * base_ptr instead even outside the shader. 
@@ -441,15 +718,15 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, unsigned mask = (1 << 8) - 1; LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4); offset = LLVMBuildAdd(builder, offset, s_offset, ""); - packed = lp_build_gather(gallivm, type.length, 32, type.width, - TRUE, base_ptr, offset, FALSE); + packed = lp_build_gather(gallivm, type.length, 32, fetch_type, + aligned, base_ptr, offset, FALSE); packed = LLVMBuildAnd(builder, packed, lp_build_const_int_vec(gallivm, type, mask), ""); } else { - assert (format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT); - packed = lp_build_gather(gallivm, type.length, 32, type.width, - TRUE, base_ptr, offset, TRUE); + assert (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT); + packed = lp_build_gather(gallivm, type.length, 32, fetch_type, + aligned, base_ptr, offset, TRUE); packed = LLVMBuildBitCast(builder, packed, lp_build_vec_type(gallivm, type), ""); } @@ -461,63 +738,69 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, /* * Try calling lp_build_fetch_rgba_aos for all pixels. + * Should only really hit subsampled, compressed + * (for s3tc srgb too, for rgtc the unorm ones only) by now. + * (This is invalid for plain 8unorm formats because we're lazy with + * the swizzle since some results would arrive swizzled, some not.) */ - if (util_format_fits_8unorm(format_desc) && + if ((format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) && + (util_format_fits_8unorm(format_desc) || + format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) && type.floating && type.width == 32 && (type.length == 1 || (type.length % 4 == 0))) { struct lp_type tmp_type; - LLVMValueRef tmp; + struct lp_build_context bld; + LLVMValueRef packed, rgba[4]; + const struct util_format_description *flinear_desc; + const struct util_format_description *frgba8_desc; + unsigned chan; + + lp_build_context_init(&bld, gallivm, type); + /* + * Make sure the conversion in aos really only does convert to rgba8 + * and not anything more (so use linear format, adjust type). + */ + flinear_desc = util_format_description(util_format_linear(format)); memset(&tmp_type, 0, sizeof tmp_type); tmp_type.width = 8; tmp_type.length = type.length * 4; tmp_type.norm = TRUE; - tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type, - TRUE, base_ptr, offset, i, j, cache); + packed = lp_build_fetch_rgba_aos(gallivm, flinear_desc, tmp_type, + aligned, base_ptr, offset, i, j, cache); + packed = LLVMBuildBitCast(builder, packed, bld.int_vec_type, ""); - lp_build_rgba8_to_fi32_soa(gallivm, - type, - tmp, - rgba_out); - - return; - } - - if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC && - /* non-srgb case is already handled above */ - format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && - type.floating && type.width == 32 && - (type.length == 1 || (type.length % 4 == 0)) && - cache) { - const struct util_format_description *format_decompressed; - const struct util_format_description *flinear_desc; - LLVMValueRef packed; - flinear_desc = util_format_description(util_format_linear(format_desc->format)); - packed = lp_build_fetch_cached_texels(gallivm, - flinear_desc, - type.length, - base_ptr, - offset, - i, j, - cache); - packed = LLVMBuildBitCast(builder, packed, - lp_build_int_vec_type(gallivm, type), ""); /* - * The values are now packed so they match ordinary srgb RGBA8 format, + * The values are now packed so they match ordinary (srgb) RGBA8 format, * hence need to use matching format for unpack. 
*/ - format_decompressed = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB); - + frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_UNORM); + if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) { + assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC); + frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB); + } lp_build_unpack_rgba_soa(gallivm, - format_decompressed, + frgba8_desc, type, - packed, rgba_out); + packed, rgba); + /* + * We converted 4 channels. Make sure llvm can drop unneeded ones + * (luckily the rgba order is fixed, only LA needs special case). + */ + for (chan = 0; chan < 4; chan++) { + enum pipe_swizzle swizzle = format_desc->swizzle[chan]; + if (chan == 3 && util_format_is_luminance_alpha(format)) { + swizzle = PIPE_SWIZZLE_W; + } + rgba_out[chan] = lp_build_swizzle_soa_channel(&bld, rgba, swizzle); + } return; } + /* * Fallback to calling lp_build_fetch_rgba_aos for each pixel. * @@ -525,30 +808,40 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, * miss some opportunities to do vectorization, but this is * convenient for formats or scenarios for which there was no * opportunity or incentive to optimize. + * + * We do NOT want to end up here, this typically is quite terrible, + * in particular if the formats have less than 4 channels. + * + * Right now, this should only be hit for: + * - RGTC snorm formats + * (those miss fast fetch functions hence they are terrible anyway) */ { - unsigned k, chan; + unsigned k; struct lp_type tmp_type; + LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32]; if (gallivm_debug & GALLIVM_DEBUG_PERF) { - debug_printf("%s: scalar unpacking of %s\n", + debug_printf("%s: AoS fetch fallback for %s\n", __FUNCTION__, format_desc->short_name); } tmp_type = type; tmp_type.length = 4; - for (chan = 0; chan < 4; ++chan) { - rgba_out[chan] = lp_build_undef(gallivm, type); - } + /* + * Note that vector transpose can be worse compared to insert/extract + * for aos->soa conversion (for formats with 1 or 2 channels). However, + * we should try to avoid getting here for just about all formats, so + * don't bother. + */ /* loop over number of pixels */ for(k = 0; k < type.length; ++k) { LLVMValueRef index = lp_build_const_int32(gallivm, k); LLVMValueRef offset_elem; LLVMValueRef i_elem, j_elem; - LLVMValueRef tmp; offset_elem = LLVMBuildExtractElement(builder, offset, index, ""); @@ -557,20 +850,11 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, j_elem = LLVMBuildExtractElement(builder, j, index, ""); /* Get a single float[4]={R,G,B,A} pixel */ - tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type, - TRUE, base_ptr, offset_elem, - i_elem, j_elem, cache); + aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type, + aligned, base_ptr, offset_elem, + i_elem, j_elem, cache); - /* - * Insert the AoS tmp value channels into the SoA result vectors at - * position = 'index'. 
- */ - for (chan = 0; chan < 4; ++chan) { - LLVMValueRef chan_val = lp_build_const_int32(gallivm, chan), - tmp_chan = LLVMBuildExtractElement(builder, tmp, chan_val, ""); - rgba_out[chan] = LLVMBuildInsertElement(builder, rgba_out[chan], - tmp_chan, index, ""); - } } + convert_to_soa(gallivm, aos_fetch, rgba_out, type); } } diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c index fa0e8b656..d6d755298 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c @@ -491,13 +491,15 @@ lp_build_fetch_subsampled_rgba_aos(struct gallivm_state *gallivm, { LLVMValueRef packed; LLVMValueRef rgba; + struct lp_type fetch_type; assert(format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED); assert(format_desc->block.bits == 32); assert(format_desc->block.width == 2); assert(format_desc->block.height == 1); - packed = lp_build_gather(gallivm, n, 32, 32, TRUE, base_ptr, offset, FALSE); + fetch_type = lp_type_uint(32); + packed = lp_build_gather(gallivm, n, 32, fetch_type, TRUE, base_ptr, offset, FALSE); (void)j; diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_gather.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_gather.c index 439bbb679..7d11dcd3b 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_gather.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_gather.c @@ -28,13 +28,16 @@ #include "util/u_debug.h" #include "util/u_cpu_detect.h" +#include "util/u_math.h" #include "lp_bld_debug.h" #include "lp_bld_const.h" #include "lp_bld_format.h" #include "lp_bld_gather.h" #include "lp_bld_swizzle.h" +#include "lp_bld_type.h" #include "lp_bld_init.h" #include "lp_bld_intr.h" +#include "lp_bld_pack.h" /** @@ -113,14 +116,29 @@ lp_build_gather_elem(struct gallivm_state *gallivm, * translation of offsets to first_elem in sampler_views it actually seems * gallium could not do anything else except 16 no matter what... */ - if (!aligned) { + if (!aligned) { LLVMSetAlignment(res, 1); + } else if (!util_is_power_of_two(src_width)) { + /* + * Full alignment is impossible, assume the caller really meant + * the individual elements were aligned (e.g. 3x32bit format). + * And yes the generated code may otherwise crash, llvm will + * really assume 128bit alignment with a 96bit fetch (I suppose + * that makes sense as it can just assume the upper 32bit to be + * whatever). + * Maybe the caller should be able to explicitly set this, but + * this should cover all the 3-channel formats. + */ + if (((src_width / 24) * 24 == src_width) && + util_is_power_of_two(src_width / 24)) { + LLVMSetAlignment(res, src_width / 24); + } else { + LLVMSetAlignment(res, 1); + } } assert(src_width <= dst_width); - if (src_width > dst_width) { - res = LLVMBuildTrunc(gallivm->builder, res, dst_elem_type, ""); - } else if (src_width < dst_width) { + if (src_width < dst_width) { res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, ""); if (vector_justify) { #ifdef PIPE_ARCH_BIG_ENDIAN @@ -134,28 +152,162 @@ lp_build_gather_elem(struct gallivm_state *gallivm, } +/** + * Gather one element from scatter positions in memory. + * Nearly the same as above, however the individual elements + * may be vectors themselves, and fetches may be float type. + * Can also do pad vector instead of ZExt. 
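Note the explicit-alignment fallback above only triggers for non-power-of-two fetch widths; power-of-two loads keep LLVM's natural alignment. A sketch of the byte alignment it derives (util_is_power_of_two is the real helper used in the patch; the wrapper function is illustrative):

   static unsigned
   npot_fetch_alignment(unsigned src_width_bits)
   {
      /* 3-channel widths are multiples of 24 bits; element alignment is
       * width/24 bytes: 96 -> 4, 48 -> 2, 24 -> 1 */
      if ((src_width_bits / 24) * 24 == src_width_bits &&
          util_is_power_of_two(src_width_bits / 24))
         return src_width_bits / 24;
      return 1;  /* unknown layout, assume unaligned */
   }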
+ * + * @sa lp_build_gather() + */ +static LLVMValueRef +lp_build_gather_elem_vec(struct gallivm_state *gallivm, + unsigned length, + unsigned src_width, + LLVMTypeRef src_type, + struct lp_type dst_type, + boolean aligned, + LLVMValueRef base_ptr, + LLVMValueRef offsets, + unsigned i, + boolean vector_justify) +{ + LLVMValueRef ptr, res; + LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0); + assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0)); + + ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i); + ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, ""); + res = LLVMBuildLoad(gallivm->builder, ptr, ""); + + /* XXX + * On some archs we probably really want to avoid having to deal + * with alignments lower than 4 bytes (if fetch size is a power of + * two >= 32). On x86 it doesn't matter, however. + * We should be able to guarantee full alignment for any kind of texture + * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch + * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends + * but I don't think that's quite what we wanted). + * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT + * looks like a good fit, but it seems this cap bit (and OpenGL) aren't + * enforcing what we want (which is what d3d10 does, the offset needs to + * be aligned to element size, but GL has bytes regardless of element + * size which would only leave us with minimum alignment restriction of 16 + * which doesn't make much sense if the type isn't 4x32bit). Due to + * translation of offsets to first_elem in sampler_views it actually seems + * gallium could not do anything else except 16 no matter what... + */ + if (!aligned) { + LLVMSetAlignment(res, 1); + } else if (!util_is_power_of_two(src_width)) { + /* + * Full alignment is impossible, assume the caller really meant + * the individual elements were aligned (e.g. 3x32bit format). + * And yes the generated code may otherwise crash, llvm will + * really assume 128bit alignment with a 96bit fetch (I suppose + * that makes sense as it can just assume the upper 32bit to be + * whatever). + * Maybe the caller should be able to explicitly set this, but + * this should cover all the 3-channel formats. + */ + if (((src_width / 24) * 24 == src_width) && + util_is_power_of_two(src_width / 24)) { + LLVMSetAlignment(res, src_width / 24); + } else { + LLVMSetAlignment(res, 1); + } + } + + assert(src_width <= dst_type.width * dst_type.length); + if (src_width < dst_type.width * dst_type.length) { + if (dst_type.length > 1) { + res = lp_build_pad_vector(gallivm, res, dst_type.length); + /* + * vector_justify hopefully a non-issue since we only deal + * with src_width >= 32 here? + */ + } else { + LLVMTypeRef dst_elem_type = lp_build_vec_type(gallivm, dst_type); + + /* + * Only valid if src_ptr_type is int type... + */ + res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, ""); + +#ifdef PIPE_ARCH_BIG_ENDIAN + if (vector_justify) { + res = LLVMBuildShl(gallivm->builder, res, + LLVMConstInt(dst_elem_type, + dst_type.width - src_width, 0), ""); + } + if (src_width == 48) { + /* Load 3x16 bit vector. + * The sequence of loads on big-endian hardware proceeds as follows. + * 16-bit fields are denoted by X, Y, Z, and 0. In memory, the sequence + * of three fields appears in the order X, Y, Z. 
+ * + * Load 32-bit word: 0.0.X.Y + * Load 16-bit halfword: 0.0.0.Z + * Rotate left: 0.X.Y.0 + * Bitwise OR: 0.X.Y.Z + * + * The order in which we need the fields in the result is 0.Z.Y.X, + * the same as on little-endian; permute 16-bit fields accordingly + * within 64-bit register: + */ + LLVMValueRef shuffles[4] = { + lp_build_const_int32(gallivm, 2), + lp_build_const_int32(gallivm, 1), + lp_build_const_int32(gallivm, 0), + lp_build_const_int32(gallivm, 3), + }; + res = LLVMBuildBitCast(gallivm->builder, res, + lp_build_vec_type(gallivm, lp_type_uint_vec(16, 4*16)), ""); + res = LLVMBuildShuffleVector(gallivm->builder, res, res, LLVMConstVector(shuffles, 4), ""); + res = LLVMBuildBitCast(gallivm->builder, res, dst_elem_type, ""); + } +#endif + } + } + return res; +} + + + + static LLVMValueRef lp_build_gather_avx2(struct gallivm_state *gallivm, unsigned length, unsigned src_width, - unsigned dst_width, + struct lp_type dst_type, LLVMValueRef base_ptr, LLVMValueRef offsets) { LLVMBuilderRef builder = gallivm->builder; - LLVMTypeRef dst_type = LLVMIntTypeInContext(gallivm->context, dst_width); - LLVMTypeRef dst_vec_type = LLVMVectorType(dst_type, length); - LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, src_width); - LLVMTypeRef src_vec_type = LLVMVectorType(src_type, length); + LLVMTypeRef src_type, src_vec_type; LLVMValueRef res; + struct lp_type res_type = dst_type; + res_type.length *= length; + + if (dst_type.floating) { + src_type = src_width == 64 ? LLVMDoubleTypeInContext(gallivm->context) : + LLVMFloatTypeInContext(gallivm->context); + } else { + src_type = LLVMIntTypeInContext(gallivm->context, src_width); + } + src_vec_type = LLVMVectorType(src_type, length); + /* XXX should allow hw scaling (can handle i8, i16, i32, i64 for x86) */ assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0)); if (0) { /* * XXX: This will cause LLVM pre 3.7 to hang; it works on LLVM 3.8 but - * will not use the AVX2 gather instrinsics. See + * will not use the AVX2 gather instrinsics (even with llvm 4.0), at + * least with Haswell. See * http://lists.llvm.org/pipermail/llvm-dev/2016-January/094448.html + * And the generated code doing the emulation is quite a bit worse + * than what we get by doing it ourselves too. */ LLVMTypeRef i32_type = LLVMIntTypeInContext(gallivm->context, 32); LLVMTypeRef i32_vec_type = LLVMVectorType(i32_type, length); @@ -175,7 +327,8 @@ lp_build_gather_avx2(struct gallivm_state *gallivm, src_ptr = LLVMBuildGEP(builder, base_ptr, &offsets, 1, "vector-gep"); char intrinsic[64]; - util_snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%ui%u", length, src_width); + util_snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%u%s%u", + length, dst_type.floating ? "f" : "i", src_width); LLVMValueRef alignment = LLVMConstInt(i32_type, src_width/8, 0); LLVMValueRef mask = LLVMConstAllOnes(i1_vec_type); LLVMValueRef passthru = LLVMGetUndef(src_vec_type); @@ -184,26 +337,35 @@ lp_build_gather_avx2(struct gallivm_state *gallivm, res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 4, 0); } else { - assert(src_width == 32); - LLVMTypeRef i8_type = LLVMIntTypeInContext(gallivm->context, 8); - - /* - * We should get the caller to give more type information so we can use - * the intrinsics for the right int/float domain. Int should be the most - * common. 
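The generic llvm.masked.gather path above builds the intrinsic name from the result shape; with float results the mangling switches from iN to fN. An illustrative expansion (dst_is_float stands in for dst_type.floating and is not a name from the patch):

   char name[64];
   /* e.g. length 4, floating, 32-bit elements -> "llvm.masked.gather.v4f32" */
   util_snprintf(name, sizeof name, "llvm.masked.gather.v%u%s%u",
                 4, dst_is_float ? "f" : "i", 32);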
- */ const char *intrinsic = NULL; - switch (length) { - case 4: - intrinsic = "llvm.x86.avx2.gather.d.d"; - break; - case 8: - intrinsic = "llvm.x86.avx2.gather.d.d.256"; - break; - default: - assert(0); + unsigned l_idx = 0; + + assert(src_width == 32 || src_width == 64); + if (src_width == 32) { + assert(length == 4 || length == 8); + } else { + assert(length == 2 || length == 4); + } + + static const char *intrinsics[2][2][2] = { + + {{"llvm.x86.avx2.gather.d.d", + "llvm.x86.avx2.gather.d.d.256"}, + {"llvm.x86.avx2.gather.d.q", + "llvm.x86.avx2.gather.d.q.256"}}, + + {{"llvm.x86.avx2.gather.d.ps", + "llvm.x86.avx2.gather.d.ps.256"}, + {"llvm.x86.avx2.gather.d.pd", + "llvm.x86.avx2.gather.d.pd.256"}}, + }; + + if ((src_width == 32 && length == 8) || + (src_width == 64 && length == 4)) { + l_idx = 1; } + intrinsic = intrinsics[dst_type.floating][src_width == 64][l_idx]; LLVMValueRef passthru = LLVMGetUndef(src_vec_type); LLVMValueRef mask = LLVMConstAllOnes(src_vec_type); @@ -214,12 +376,7 @@ lp_build_gather_avx2(struct gallivm_state *gallivm, res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 5, 0); } - - if (src_width > dst_width) { - res = LLVMBuildTrunc(builder, res, dst_vec_type, ""); - } else if (src_width < dst_width) { - res = LLVMBuildZExt(builder, res, dst_vec_type, ""); - } + res = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, res_type), ""); return res; } @@ -240,9 +397,11 @@ lp_build_gather_avx2(struct gallivm_state *gallivm, * * @param length length of the offsets * @param src_width src element width in bits - * @param dst_width result element width in bits (src will be expanded to fit) + * @param dst_type result element type (src will be expanded to fit, + * but truncation is not allowed) + * (this may be a vector, must be pot sized) * @param aligned whether the data is guaranteed to be aligned (to src_width) - * @param base_ptr base pointer, should be a i8 pointer type. + * @param base_ptr base pointer, needs to be a i8 pointer type. * @param offsets vector with offsets * @param vector_justify select vector rather than integer justification */ @@ -250,36 +409,174 @@ LLVMValueRef lp_build_gather(struct gallivm_state *gallivm, unsigned length, unsigned src_width, - unsigned dst_width, + struct lp_type dst_type, boolean aligned, LLVMValueRef base_ptr, LLVMValueRef offsets, boolean vector_justify) { LLVMValueRef res; + boolean need_expansion = src_width < dst_type.width * dst_type.length; + boolean vec_fetch; + struct lp_type fetch_type, fetch_dst_type; + LLVMTypeRef src_type; + + assert(src_width <= dst_type.width * dst_type.length); + + /* + * This is quite a mess... + * Figure out if the fetch should be done as: + * a) scalar or vector + * b) float or int + * + * As an example, for a 96bit fetch expanded into 4x32bit, it is better + * to use (3x32bit) vector type (then pad the vector). Otherwise, the + * zext will cause extra instructions. + * However, the same isn't true for 3x16bit (the codegen for that is + * completely worthless on x86 simd, and for 3x8bit is is way worse + * still, don't try that... (To get really good code out of llvm for + * these cases, the only way is to decompose the fetches manually + * into 1x32bit/1x16bit, or 1x16/1x8bit respectively, although the latter + * case requires sse41, otherwise simple scalar zext is way better. + * But probably not important enough, so don't bother.) 
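The heuristic sketched in the comment above boils down to a simple shape test; a condensed model with illustrative parameter names:

   static bool
   use_vector_fetch(unsigned src_width, unsigned dst_width, unsigned dst_len)
   {
      /* e.g. a 96-bit fetch into 4x32: 96 % 32 == 0, so load <3 x i32>
       * and pad the vector, instead of loading i96 and zext'ing */
      return (src_width % 32) == 0 &&
             (src_width % dst_width) == 0 &&
             dst_len > 1;
   }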
+ * Also, we try to honor the floating bit of destination (but isn't + * possible if caller asks for instance for 2x32bit dst_type with + * 48bit fetch - the idea would be to use 3x16bit fetch, pad and + * cast to 2x32f type, so the fetch is always int and on top of that + * we avoid the vec pad and use scalar zext due the above mentioned + * issue). + * Note this is optimized for x86 sse2 and up backend. Could be tweaked + * for other archs if necessary... + */ + if (((src_width % 32) == 0) && ((src_width % dst_type.width) == 0) && + (dst_type.length > 1)) { + /* use vector fetch (if dst_type is vector) */ + vec_fetch = TRUE; + if (dst_type.floating) { + fetch_type = lp_type_float_vec(dst_type.width, src_width); + } else { + fetch_type = lp_type_int_vec(dst_type.width, src_width); + } + /* intentionally not using lp_build_vec_type here */ + src_type = LLVMVectorType(lp_build_elem_type(gallivm, fetch_type), + fetch_type.length); + fetch_dst_type = fetch_type; + fetch_dst_type.length = dst_type.length; + } else { + /* use scalar fetch */ + vec_fetch = FALSE; + if (dst_type.floating && ((src_width == 32) || (src_width == 64))) { + fetch_type = lp_type_float(src_width); + } else { + fetch_type = lp_type_int(src_width); + } + src_type = lp_build_vec_type(gallivm, fetch_type); + fetch_dst_type = fetch_type; + fetch_dst_type.width = dst_type.width * dst_type.length; + } if (length == 1) { /* Scalar */ - return lp_build_gather_elem(gallivm, length, - src_width, dst_width, aligned, - base_ptr, offsets, 0, vector_justify); - } else if (util_cpu_caps.has_avx2 && src_width == 32 && (length == 4 || length == 8)) { - return lp_build_gather_avx2(gallivm, length, src_width, dst_width, base_ptr, offsets); + res = lp_build_gather_elem_vec(gallivm, length, + src_width, src_type, fetch_dst_type, + aligned, base_ptr, offsets, 0, + vector_justify); + return LLVMBuildBitCast(gallivm->builder, res, + lp_build_vec_type(gallivm, dst_type), ""); + /* + * Excluding expansion from these paths because if you need it for + * 32bit/64bit fetches you're doing it wrong (this is gather, not + * conversion) and it would be awkward for floats. + */ + } else if (util_cpu_caps.has_avx2 && !need_expansion && + src_width == 32 && (length == 4 || length == 8)) { + return lp_build_gather_avx2(gallivm, length, src_width, dst_type, + base_ptr, offsets); + /* + * This looks bad on paper wrt throughtput/latency on Haswell. + * Even on Broadwell it doesn't look stellar. + * Albeit no measurements were done (but tested to work). + * Should definitely enable on Skylake. + * (In general, should be more of a win if the fetch is 256bit wide - + * this is true for the 32bit case above too.) + */ + } else if (0 && util_cpu_caps.has_avx2 && !need_expansion && + src_width == 64 && (length == 2 || length == 4)) { + return lp_build_gather_avx2(gallivm, length, src_width, dst_type, + base_ptr, offsets); } else { /* Vector */ - LLVMTypeRef dst_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width); - LLVMTypeRef dst_vec_type = LLVMVectorType(dst_elem_type, length); + LLVMValueRef elems[LP_MAX_VECTOR_WIDTH / 8]; unsigned i; - - res = LLVMGetUndef(dst_vec_type); + boolean vec_zext = FALSE; + struct lp_type res_type, gather_res_type; + LLVMTypeRef res_t, gather_res_t; + + res_type = fetch_dst_type; + res_type.length *= length; + gather_res_type = res_type; + + if (src_width == 16 && dst_type.width == 32 && dst_type.length == 1) { + /* + * Note that llvm is never able to optimize zext/insert combos + * directly (i.e. 
zero the simd reg, then place the elements into + * the appropriate place directly). (I think this has to do with + * scalar/vector transition.) And scalar 16->32bit zext simd loads + * aren't possible (instead loading to scalar reg first). + * No idea about other archs... + * We could do this manually, but instead we just use a vector + * zext, which is simple enough (and, in fact, llvm might optimize + * this away). + * (We're not trying that with other bit widths as that might not be + * easier, in particular with 8 bit values at least with only sse2.) + */ + assert(vec_fetch == FALSE); + gather_res_type.width /= 2; + fetch_dst_type = fetch_type; + src_type = lp_build_vec_type(gallivm, fetch_type); + vec_zext = TRUE; + } + res_t = lp_build_vec_type(gallivm, res_type); + gather_res_t = lp_build_vec_type(gallivm, gather_res_type); + res = LLVMGetUndef(gather_res_t); for (i = 0; i < length; ++i) { LLVMValueRef index = lp_build_const_int32(gallivm, i); - LLVMValueRef elem; - elem = lp_build_gather_elem(gallivm, length, - src_width, dst_width, aligned, - base_ptr, offsets, i, vector_justify); - res = LLVMBuildInsertElement(gallivm->builder, res, elem, index, ""); + elems[i] = lp_build_gather_elem_vec(gallivm, length, + src_width, src_type, fetch_dst_type, + aligned, base_ptr, offsets, i, + vector_justify); + if (!vec_fetch) { + res = LLVMBuildInsertElement(gallivm->builder, res, elems[i], index, ""); + } + } + if (vec_zext) { + res = LLVMBuildZExt(gallivm->builder, res, res_t, ""); + if (vector_justify) { +#ifdef PIPE_ARCH_BIG_ENDIAN + unsigned sv = dst_type.width - src_width; + res = LLVMBuildShl(gallivm->builder, res, + lp_build_const_int_vec(gallivm, res_type, sv), ""); +#endif + } + } + if (vec_fetch) { + /* + * Do bitcast now otherwise llvm might get some funny ideas wrt + * float/int types... 
+ */ + for (i = 0; i < length; i++) { + elems[i] = LLVMBuildBitCast(gallivm->builder, elems[i], + lp_build_vec_type(gallivm, dst_type), ""); + } + res = lp_build_concat(gallivm, elems, dst_type, length); + } else { + struct lp_type really_final_type = dst_type; + assert(res_type.length * res_type.width == + dst_type.length * dst_type.width * length); + really_final_type.length *= length; + res = LLVMBuildBitCast(gallivm->builder, res, + lp_build_vec_type(gallivm, really_final_type), ""); } } diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_gather.h b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_gather.h index 3ede4763a..7930864e6 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_gather.h +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_gather.h @@ -55,7 +55,7 @@ LLVMValueRef lp_build_gather(struct gallivm_state *gallivm, unsigned length, unsigned src_width, - unsigned dst_width, + struct lp_type dst_type, boolean aligned, LLVMValueRef base_ptr, LLVMValueRef offsets, diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_init.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_init.c index fed43e99e..c456a97eb 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_init.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_init.c @@ -48,8 +48,12 @@ # define USE_MCJIT 1 #elif defined(PIPE_ARCH_PPC_64) || defined(PIPE_ARCH_S390) || defined(PIPE_ARCH_ARM) || defined(PIPE_ARCH_AARCH64) # define USE_MCJIT 1 +#endif + +#if defined(USE_MCJIT) +static const bool use_mcjit = USE_MCJIT; #else -static bool USE_MCJIT = 0; +static bool use_mcjit = FALSE; #endif @@ -121,19 +125,6 @@ create_pass_manager(struct gallivm_state *gallivm) LLVMAddTargetData(gallivm->target, gallivm->passmgr); #endif - /* Setting the module's DataLayout to an empty string will cause the - * ExecutionEngine to copy to the DataLayout string from its target - * machine to the module. As of LLVM 3.8 the module and the execution - * engine are required to have the same DataLayout. - * - * TODO: This is just a temporary work-around. The correct solution is - * for gallivm_init_state() to create a TargetMachine and pull the - * DataLayout from there. Currently, the TargetMachine used by llvmpipe - * is being implicitly created by the EngineBuilder in - * lp_build_create_jit_compiler_for_module() - */ - -#if HAVE_LLVM < 0x0308 { char *td_str; // New ones from the Module. @@ -141,9 +132,6 @@ create_pass_manager(struct gallivm_state *gallivm) LLVMSetDataLayout(gallivm->module, td_str); free(td_str); } -#else - LLVMSetDataLayout(gallivm->module, ""); -#endif if ((gallivm_debug & GALLIVM_DEBUG_NO_OPT) == 0) { /* These are the passes currently listed in llvm-c/Transforms/Scalar.h, @@ -190,7 +178,7 @@ gallivm_free_ir(struct gallivm_state *gallivm) FREE(gallivm->module_name); - if (!USE_MCJIT) { + if (!use_mcjit) { /* Don't free the TargetData, it's owned by the exec engine */ } else { if (gallivm->target) { @@ -248,7 +236,7 @@ init_gallivm_engine(struct gallivm_state *gallivm) gallivm->module, gallivm->memorymgr, (unsigned) optlevel, - USE_MCJIT, + use_mcjit, &error); if (ret) { _debug_printf("%s\n", error); @@ -257,7 +245,7 @@ init_gallivm_engine(struct gallivm_state *gallivm) } } - if (!USE_MCJIT) { + if (!use_mcjit) { gallivm->target = LLVMGetExecutionEngineTargetData(gallivm->engine); if (!gallivm->target) goto fail; @@ -336,7 +324,7 @@ init_gallivm_state(struct gallivm_state *gallivm, const char *name, * complete when MC-JIT is created. So defer the MC-JIT engine creation for * now. 
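With the interface change above, gather callers now pass a full lp_type instead of a destination bit width. A usage sketch mirroring the subsampled-format caller earlier in this patch:

   struct lp_type fetch_type = lp_type_uint(32);
   packed = lp_build_gather(gallivm, n, 32, fetch_type,
                            TRUE /*aligned*/, base_ptr, offset,
                            FALSE /*vector_justify*/);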
*/ - if (!USE_MCJIT) { + if (!use_mcjit) { if (!init_gallivm_engine(gallivm)) { goto fail; } @@ -395,10 +383,21 @@ lp_build_init(void) if (gallivm_initialized) return TRUE; - LLVMLinkInMCJIT(); -#if !defined(USE_MCJIT) - USE_MCJIT = debug_get_bool_option("GALLIVM_MCJIT", 0); + + /* LLVMLinkIn* are no-ops at runtime. They just ensure the respective + * component is linked at buildtime, which is sufficient for its static + * constructors to be called at load time. + */ +#if defined(USE_MCJIT) +# if USE_MCJIT + LLVMLinkInMCJIT(); +# else + LLVMLinkInJIT(); +# endif +#else + use_mcjit = debug_get_bool_option("GALLIVM_MCJIT", FALSE); LLVMLinkInJIT(); + LLVMLinkInMCJIT(); #endif #ifdef DEBUG @@ -457,7 +456,7 @@ lp_build_init(void) util_cpu_caps.has_f16c = 0; util_cpu_caps.has_fma = 0; } - if (HAVE_LLVM < 0x0304 || !USE_MCJIT) { + if (HAVE_LLVM < 0x0304 || !use_mcjit) { /* AVX2 support has only been tested with LLVM 3.4, and it requires * MCJIT. */ util_cpu_caps.has_avx2 = 0; @@ -607,12 +606,30 @@ gallivm_compile_module(struct gallivm_state *gallivm) LLVMWriteBitcodeToFile(gallivm->module, filename); debug_printf("%s written\n", filename); debug_printf("Invoke as \"llc %s%s -o - %s\"\n", - (HAVE_LLVM >= 0x0305) ? "[-mcpu=<-mcpu option] " : "", + (HAVE_LLVM >= 0x0305) ? "[-mcpu=<-mcpu option>] " : "", "[-mattr=<-mattr option(s)>]", filename); } - if (USE_MCJIT) { + if (use_mcjit) { + /* Setting the module's DataLayout to an empty string will cause the + * ExecutionEngine to copy to the DataLayout string from its target + * machine to the module. As of LLVM 3.8 the module and the execution + * engine are required to have the same DataLayout. + * + * We must make sure we do this after running the optimization passes, + * because those passes need a correct datalayout string. For example, + * if those optimization passes see an empty datalayout, they will assume + * this is a little endian target and will do optimizations that break big + * endian machines. + * + * TODO: This is just a temporary work-around. The correct solution is + * for gallivm_init_state() to create a TargetMachine and pull the + * DataLayout from there. 
Currently, the TargetMachine used by llvmpipe + * is being implicitly created by the EngineBuilder in + * lp_build_create_jit_compiler_for_module() + */ + LLVMSetDataLayout(gallivm->module, ""); assert(!gallivm->engine); if (!init_gallivm_engine(gallivm)) { assert(0); diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_intr.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_intr.c index f12e735b5..b92455593 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_intr.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_intr.c @@ -46,6 +46,7 @@ #include "util/u_debug.h" #include "util/u_string.h" +#include "util/bitscan.h" #include "lp_bld_const.h" #include "lp_bld_intr.h" @@ -120,16 +121,113 @@ lp_declare_intrinsic(LLVMModuleRef module, } +#if HAVE_LLVM < 0x0400 +static LLVMAttribute lp_attr_to_llvm_attr(enum lp_func_attr attr) +{ + switch (attr) { + case LP_FUNC_ATTR_ALWAYSINLINE: return LLVMAlwaysInlineAttribute; + case LP_FUNC_ATTR_BYVAL: return LLVMByValAttribute; + case LP_FUNC_ATTR_INREG: return LLVMInRegAttribute; + case LP_FUNC_ATTR_NOALIAS: return LLVMNoAliasAttribute; + case LP_FUNC_ATTR_NOUNWIND: return LLVMNoUnwindAttribute; + case LP_FUNC_ATTR_READNONE: return LLVMReadNoneAttribute; + case LP_FUNC_ATTR_READONLY: return LLVMReadOnlyAttribute; + default: + _debug_printf("Unhandled function attribute: %x\n", attr); + return 0; + } +} + +#else + +static const char *attr_to_str(enum lp_func_attr attr) +{ + switch (attr) { + case LP_FUNC_ATTR_ALWAYSINLINE: return "alwaysinline"; + case LP_FUNC_ATTR_BYVAL: return "byval"; + case LP_FUNC_ATTR_INREG: return "inreg"; + case LP_FUNC_ATTR_NOALIAS: return "noalias"; + case LP_FUNC_ATTR_NOUNWIND: return "nounwind"; + case LP_FUNC_ATTR_READNONE: return "readnone"; + case LP_FUNC_ATTR_READONLY: return "readonly"; + case LP_FUNC_ATTR_WRITEONLY: return "writeonly"; + case LP_FUNC_ATTR_INACCESSIBLE_MEM_ONLY: return "inaccessiblememonly"; + case LP_FUNC_ATTR_CONVERGENT: return "convergent"; + default: + _debug_printf("Unhandled function attribute: %x\n", attr); + return 0; + } +} + +#endif + +void +lp_add_function_attr(LLVMValueRef function_or_call, + int attr_idx, enum lp_func_attr attr) +{ + +#if HAVE_LLVM < 0x0400 + LLVMAttribute llvm_attr = lp_attr_to_llvm_attr(attr); + if (LLVMIsAFunction(function_or_call)) { + if (attr_idx == -1) { + LLVMAddFunctionAttr(function_or_call, llvm_attr); + } else { + LLVMAddAttribute(LLVMGetParam(function_or_call, attr_idx - 1), llvm_attr); + } + } else { + LLVMAddInstrAttribute(function_or_call, attr_idx, llvm_attr); + } +#else + + LLVMModuleRef module; + if (LLVMIsAFunction(function_or_call)) { + module = LLVMGetGlobalParent(function_or_call); + } else { + LLVMBasicBlockRef bb = LLVMGetInstructionParent(function_or_call); + LLVMValueRef function = LLVMGetBasicBlockParent(bb); + module = LLVMGetGlobalParent(function); + } + LLVMContextRef ctx = LLVMGetModuleContext(module); + + const char *attr_name = attr_to_str(attr); + unsigned kind_id = LLVMGetEnumAttributeKindForName(attr_name, + strlen(attr_name)); + LLVMAttributeRef llvm_attr = LLVMCreateEnumAttribute(ctx, kind_id, 0); + + if (LLVMIsAFunction(function_or_call)) + LLVMAddAttributeAtIndex(function_or_call, attr_idx, llvm_attr); + else + LLVMAddCallSiteAttribute(function_or_call, attr_idx, llvm_attr); +#endif +} + +static void +lp_add_func_attributes(LLVMValueRef function, unsigned attrib_mask) +{ + /* NoUnwind indicates that the intrinsic never raises a C++ exception. + * Set it for all intrinsics. 
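lp_add_function_attr unifies the pre-4.0 LLVMAttribute path and the newer string-attribute path behind one call. A usage sketch (attr_idx is 1-based for parameters; -1 targets the function itself, matching the sampler-function change later in this patch):

   /* mark pointer parameter i of 'function' noalias */
   lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);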
+ */ + attrib_mask |= LP_FUNC_ATTR_NOUNWIND; + attrib_mask &= ~LP_FUNC_ATTR_LEGACY; + + while (attrib_mask) { + enum lp_func_attr attr = 1u << u_bit_scan(&attrib_mask); + lp_add_function_attr(function, -1, attr); + } +} + LLVMValueRef lp_build_intrinsic(LLVMBuilderRef builder, const char *name, LLVMTypeRef ret_type, LLVMValueRef *args, unsigned num_args, - LLVMAttribute attr) + unsigned attr_mask) { LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder))); - LLVMValueRef function; + LLVMValueRef function, call; + bool set_callsite_attrs = HAVE_LLVM >= 0x0400 && + !(attr_mask & LP_FUNC_ATTR_LEGACY); function = LLVMGetNamedFunction(module, name); if(!function) { @@ -145,17 +243,18 @@ lp_build_intrinsic(LLVMBuilderRef builder, function = lp_declare_intrinsic(module, name, ret_type, arg_types, num_args); - /* NoUnwind indicates that the intrinsic never raises a C++ exception. - * Set it for all intrinsics. - */ - LLVMAddFunctionAttr(function, attr | LLVMNoUnwindAttribute); + if (!set_callsite_attrs) + lp_add_func_attributes(function, attr_mask); if (gallivm_debug & GALLIVM_DEBUG_IR) { lp_debug_dump_value(function); } } - return LLVMBuildCall(builder, function, args, num_args, ""); + call = LLVMBuildCall(builder, function, args, num_args, ""); + if (set_callsite_attrs) + lp_add_func_attributes(call, attr_mask); + return call; } @@ -243,9 +342,9 @@ lp_build_intrinsic_binary_anylength(struct gallivm_state *gallivm, unsigned num_vec = src_type.length / intrin_length; LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH]; - /*Â don't support arbitrary size here as this is so yuck */ + /* don't support arbitrary size here as this is so yuck */ if (src_type.length % intrin_length) { - /*Â FIXME: This is something which should be supported + /* FIXME: This is something which should be supported * but there doesn't seem to be any need for it currently * so crash and burn. */ diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_intr.h b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_intr.h index 7d80ac28f..0a929c519 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_intr.h +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_intr.h @@ -46,6 +46,24 @@ */ #define LP_MAX_FUNC_ARGS 32 +enum lp_func_attr { + LP_FUNC_ATTR_ALWAYSINLINE = (1 << 0), + LP_FUNC_ATTR_BYVAL = (1 << 1), + LP_FUNC_ATTR_INREG = (1 << 2), + LP_FUNC_ATTR_NOALIAS = (1 << 3), + LP_FUNC_ATTR_NOUNWIND = (1 << 4), + LP_FUNC_ATTR_READNONE = (1 << 5), + LP_FUNC_ATTR_READONLY = (1 << 6), + LP_FUNC_ATTR_WRITEONLY = HAVE_LLVM >= 0x0400 ? (1 << 7) : 0, + LP_FUNC_ATTR_INACCESSIBLE_MEM_ONLY = HAVE_LLVM >= 0x0400 ? (1 << 8) : 0, + LP_FUNC_ATTR_CONVERGENT = HAVE_LLVM >= 0x0400 ? (1 << 9) : 0, + + /* Legacy intrinsic that needs attributes on function declarations + * and they must match the internal LLVM definition exactly, otherwise + * intrinsic selection fails. 
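With the attribute-mask parameter, callers OR together lp_func_attr flags instead of passing a raw LLVMAttribute; on LLVM >= 4.0 (non-legacy intrinsics) the attributes land on the call site rather than the declaration. An illustrative call, using a real LLVM intrinsic purely as an example:

   LLVMValueRef res =
      lp_build_intrinsic(builder, "llvm.sqrt.f32", f32_type,
                         &arg, 1, LP_FUNC_ATTR_READNONE);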
+ */ + LP_FUNC_ATTR_LEGACY = (1u << 31), +}; void lp_format_intrinsic(char *name, @@ -60,13 +78,17 @@ lp_declare_intrinsic(LLVMModuleRef module, LLVMTypeRef *arg_types, unsigned num_args); +void +lp_add_function_attr(LLVMValueRef function_or_call, + int attr_idx, enum lp_func_attr attr); + LLVMValueRef lp_build_intrinsic(LLVMBuilderRef builder, const char *name, LLVMTypeRef ret_type, LLVMValueRef *args, unsigned num_args, - LLVMAttribute attr); + unsigned attr_mask); LLVMValueRef diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_limits.h b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_limits.h index 32addec97..354e2a46b 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_limits.h +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_limits.h @@ -49,8 +49,6 @@ #define LP_MAX_TGSI_IMMEDIATES 4096 -#define LP_MAX_TGSI_PREDS 16 - #define LP_MAX_TGSI_CONSTS 4096 #define LP_MAX_TGSI_CONST_BUFFERS 16 @@ -109,8 +107,6 @@ gallivm_get_shader_param(enum pipe_shader_cap param) return PIPE_MAX_CONSTANT_BUFFERS; case PIPE_SHADER_CAP_MAX_TEMPS: return LP_MAX_TGSI_TEMPS; - case PIPE_SHADER_CAP_MAX_PREDS: - return LP_MAX_TGSI_PREDS; case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: return 1; case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: @@ -133,13 +129,13 @@ gallivm_get_shader_param(enum pipe_shader_cap param) case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED: case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: return 1; - case PIPE_SHADER_CAP_DOUBLES: - return 1; case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: + case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD: + case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS: return 0; case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: return 32; diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_logic.c index 1a50e82c2..524917abe 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_logic.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_logic.c @@ -327,6 +327,8 @@ lp_build_select(struct lp_build_context *bld, * supported yet for a long time, and LLVM will generate poor code when * the mask is not the result of a comparison. * Also, llvm 3.7 may miscompile them (bug 94972). + * XXX: Even if the instruction was an SExt, this may still produce + * terrible code. Try piglit stencil-twoside. */ /* Convert the mask to a vector of booleans. 
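The select caveat above comes down to mask shape: unless each lane is all-ones or all-zeros (a comparison result, or a sign-extended boolean), a bitwise blend is meaningless. A scalar model of the per-lane select this code depends on:

   static inline uint32_t
   select_by_mask(uint32_t mask, uint32_t a, uint32_t b)
   {
      /* mask is expected to be 0xffffffff or 0x00000000 per lane */
      return (a & mask) | (b & ~mask);
   }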
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp index 3efb6a8e7..d988910a7 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp @@ -32,14 +32,6 @@ */ -#ifndef __STDC_LIMIT_MACROS -#define __STDC_LIMIT_MACROS -#endif - -#ifndef __STDC_CONSTANT_MACROS -#define __STDC_CONSTANT_MACROS -#endif - // Undef these vars just to silence warnings #undef PACKAGE_BUGREPORT #undef PACKAGE_NAME @@ -57,6 +49,9 @@ #endif #include <llvm-c/Core.h> +#if HAVE_LLVM >= 0x0306 +#include <llvm-c/Support.h> +#endif #include <llvm-c/ExecutionEngine.h> #include <llvm/Target/TargetOptions.h> #include <llvm/ExecutionEngine/ExecutionEngine.h> @@ -77,6 +72,9 @@ #include <llvm/Support/TargetSelect.h> +#if HAVE_LLVM >= 0x0305 +#include <llvm/IR/CallSite.h> +#endif #include <llvm/IR/IRBuilder.h> #include <llvm/IR/Module.h> #include <llvm/Support/CBindingWrapping.h> @@ -127,20 +125,26 @@ static void init_native_targets() llvm::InitializeNativeTargetAsmPrinter(); llvm::InitializeNativeTargetDisassembler(); -} - -/** - * The llvm target registry is not thread-safe, so drivers and state-trackers - * that want to initialize targets should use the gallivm_init_llvm_targets() - * function to safely initialize targets. - * - * LLVM targets should be initialized before the driver or state-tracker tries - * to access the registry. - */ -extern "C" void -gallivm_init_llvm_targets(void) -{ - call_once(&init_native_targets_once_flag, init_native_targets); +#if DEBUG && HAVE_LLVM >= 0x0306 + { + char *env_llc_options = getenv("GALLIVM_LLC_OPTIONS"); + if (env_llc_options) { + char *option; + char *options[64] = {(char *) "llc"}; // Warning without cast + int n; + for (n = 0, option = strtok(env_llc_options, " "); option; n++, option = strtok(NULL, " ")) { + options[n + 1] = option; + } + if (gallivm_debug & (GALLIVM_DEBUG_IR | GALLIVM_DEBUG_ASM | GALLIVM_DEBUG_DUMP_BC)) { + debug_printf("llc additional options (%d):\n", n); + for (int i = 1; i <= n; i++) + debug_printf("\t%s\n", options[i]); + debug_printf("\n"); + } + LLVMParseCommandLineOptions(n + 1, options, NULL); + } + } +#endif } extern "C" void @@ -155,7 +159,14 @@ lp_set_target_options(void) llvm::DisablePrettyStackTrace = true; #endif - gallivm_init_llvm_targets(); + /* The llvm target registry is not thread-safe, so drivers and state-trackers + * that want to initialize targets should use the lp_set_target_options() + * function to safely initialize targets. + * + * LLVM targets should be initialized before the driver or state-tracker tries + * to access the registry. 
+ */ + call_once(&init_native_targets_once_flag, init_native_targets); } extern "C" @@ -347,14 +358,20 @@ class DelegatingJITMemoryManager : public BaseMemoryManager { virtual void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) { mgr()->registerEHFrames(Addr, LoadAddr, Size); } - virtual void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) { - mgr()->deregisterEHFrames(Addr, LoadAddr, Size); - } #else virtual void registerEHFrames(llvm::StringRef SectionData) { mgr()->registerEHFrames(SectionData); } #endif +#if HAVE_LLVM >= 0x0500 + virtual void deregisterEHFrames() { + mgr()->deregisterEHFrames(); + } +#elif HAVE_LLVM >= 0x0304 + virtual void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) { + mgr()->deregisterEHFrames(Addr, LoadAddr, Size); + } +#endif virtual void *getPointerToNamedFunction(const std::string &Name, bool AbortOnFailure=true) { return mgr()->getPointerToNamedFunction(Name, AbortOnFailure); @@ -540,6 +557,20 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT, llvm::SmallVector<std::string, 16> MAttrs; #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) +#if HAVE_LLVM >= 0x0400 + /* llvm-3.7+ implements sys::getHostCPUFeatures for x86, + * which allows us to enable/disable code generation based + * on the results of cpuid. + */ + llvm::StringMap<bool> features; + llvm::sys::getHostCPUFeatures(features); + + for (StringMapIterator<bool> f = features.begin(); + f != features.end(); + ++f) { + MAttrs.push_back(((*f).second ? "+" : "-") + (*f).first().str()); + } +#else /* * We need to unset attributes because sometimes LLVM mistakenly assumes * certain features are present given the processor name. @@ -594,27 +625,51 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT, MAttrs.push_back("-avx512vl"); #endif #endif +#endif #if defined(PIPE_ARCH_PPC) MAttrs.push_back(util_cpu_caps.has_altivec ? "+altivec" : "-altivec"); #if (HAVE_LLVM >= 0x0304) -#if (HAVE_LLVM <= 0x0307) || (HAVE_LLVM == 0x0308 && MESA_LLVM_VERSION_PATCH == 0) +#if (HAVE_LLVM < 0x0400) /* * Make sure VSX instructions are disabled - * See LLVM bug https://llvm.org/bugs/show_bug.cgi?id=25503#c7 + * See LLVM bugs: + * https://llvm.org/bugs/show_bug.cgi?id=25503#c7 (fixed in 3.8.1) + * https://llvm.org/bugs/show_bug.cgi?id=26775 (fixed in 3.8.1) + * https://llvm.org/bugs/show_bug.cgi?id=33531 (fixed in 4.0) + * https://llvm.org/bugs/show_bug.cgi?id=34647 (llc performance on certain unusual shader IR; intro'd in 4.0, pending as of 5.0) */ if (util_cpu_caps.has_altivec) { MAttrs.push_back("-vsx"); } #else /* - * However, bug 25503 is fixed, by the same fix that fixed - * bug 26775, in versions of LLVM later than 3.8 (starting with 3.8.1): - * Make sure VSX instructions are ENABLED - * See LLVM bug https://llvm.org/bugs/show_bug.cgi?id=26775 + * Bug 25503 is fixed, by the same fix that fixed + * bug 26775, in versions of LLVM later than 3.8 (starting with 3.8.1). + * BZ 33531 actually comprises more than one bug, all of + * which are fixed in LLVM 4.0. + * + * With LLVM 4.0 or higher: + * Make sure VSX instructions are ENABLED, unless + * a) the entire -mattr option is overridden via GALLIVM_MATTRS, or + * b) VSX instructions are explicitly enabled/disabled via GALLIVM_VSX=1 or 0. 
*/ if (util_cpu_caps.has_altivec) { - MAttrs.push_back("+vsx"); + char *env_mattrs = getenv("GALLIVM_MATTRS"); + if (env_mattrs) { + MAttrs.push_back(env_mattrs); + } + else { + boolean enable_vsx = true; + char *env_vsx = getenv("GALLIVM_VSX"); + if (env_vsx && env_vsx[0] == '0') { + enable_vsx = false; + } + if (enable_vsx) + MAttrs.push_back("+vsx"); + else + MAttrs.push_back("-vsx"); + } } #endif #endif @@ -737,13 +792,49 @@ lp_free_memory_manager(LLVMMCJITMemoryManagerRef memorymgr) delete reinterpret_cast<BaseMemoryManager*>(memorymgr); } -extern "C" void -lp_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes) +extern "C" LLVMValueRef +lp_get_called_value(LLVMValueRef call) { -#if HAVE_LLVM >= 0x0306 - llvm::Argument *A = llvm::unwrap<llvm::Argument>(val); - llvm::AttrBuilder B; - B.addDereferenceableAttr(bytes); - A->addAttr(llvm::AttributeSet::get(A->getContext(), A->getArgNo() + 1, B)); +#if HAVE_LLVM >= 0x0309 + return LLVMGetCalledValue(call); +#elif HAVE_LLVM >= 0x0305 + return llvm::wrap(llvm::CallSite(llvm::unwrap<llvm::Instruction>(call)).getCalledValue()); +#else + return NULL; /* radeonsi doesn't support so old LLVM. */ +#endif +} + +extern "C" bool +lp_is_function(LLVMValueRef v) +{ +#if HAVE_LLVM >= 0x0309 + return LLVMGetValueKind(v) == LLVMFunctionValueKind; +#else + return llvm::isa<llvm::Function>(llvm::unwrap(v)); +#endif +} + +extern "C" LLVMBuilderRef +lp_create_builder(LLVMContextRef ctx, enum lp_float_mode float_mode) +{ + LLVMBuilderRef builder = LLVMCreateBuilderInContext(ctx); + +#if HAVE_LLVM >= 0x0308 + llvm::FastMathFlags flags; + + switch (float_mode) { + case LP_FLOAT_MODE_DEFAULT: + break; + case LP_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH: + flags.setNoSignedZeros(); + llvm::unwrap(builder)->setFastMathFlags(flags); + break; + case LP_FLOAT_MODE_UNSAFE_FP_MATH: + flags.setUnsafeAlgebra(); + llvm::unwrap(builder)->setFastMathFlags(flags); + break; + } #endif + + return builder; } diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.h b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.h index c127c480d..1b725d10d 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.h +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.h @@ -42,9 +42,6 @@ extern "C" { struct lp_generated_code; -extern void -gallivm_init_llvm_targets(void); - extern LLVMTargetLibraryInfoRef gallivm_create_target_library_info(const char *triple); @@ -73,8 +70,20 @@ lp_get_default_memory_manager(); extern void lp_free_memory_manager(LLVMMCJITMemoryManagerRef memorymgr); -extern void -lp_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes); +extern LLVMValueRef +lp_get_called_value(LLVMValueRef call); + +extern bool +lp_is_function(LLVMValueRef v); + +enum lp_float_mode { + LP_FLOAT_MODE_DEFAULT, + LP_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH, + LP_FLOAT_MODE_UNSAFE_FP_MATH, +}; + +extern LLVMBuilderRef +lp_create_builder(LLVMContextRef ctx, enum lp_float_mode float_mode); #ifdef __cplusplus } diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample.c index a4b3a7b83..a1dc61d40 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample.c @@ -1412,8 +1412,8 @@ lp_build_unnormalized_coords(struct lp_build_sample_context *bld, { const unsigned dims = bld->dims; LLVMValueRef width; - LLVMValueRef height; - LLVMValueRef depth; + LLVMValueRef height = NULL; + LLVMValueRef depth = NULL; lp_build_extract_image_sizes(bld, 
&bld->float_size_bld, diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c index f91b761dc..c46749dba 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c @@ -579,10 +579,12 @@ lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld, LLVMValueRef rgba8; struct lp_build_context u8n; LLVMTypeRef u8n_vec_type; + struct lp_type fetch_type; lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width)); u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type); + fetch_type = lp_type_uint(bld->texel_type.width); if (util_format_is_rgba8_variant(bld->format_desc)) { /* * Given the format is a rgba8, just read the pixels as is, @@ -591,7 +593,7 @@ lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld, rgba8 = lp_build_gather(bld->gallivm, bld->texel_type.length, bld->format_desc->block.bits, - bld->texel_type.width, + fetch_type, TRUE, data_ptr, offset, TRUE); @@ -925,14 +927,16 @@ lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld, LLVMValueRef rgba8; if (util_format_is_rgba8_variant(bld->format_desc)) { + struct lp_type fetch_type; /* * Given the format is a rgba8, just read the pixels as is, * without any swizzling. Swizzling will be done later. */ + fetch_type = lp_type_uint(bld->texel_type.width); rgba8 = lp_build_gather(bld->gallivm, bld->texel_type.length, bld->format_desc->block.bits, - bld->texel_type.width, + fetch_type, TRUE, data_ptr, offset[k][j][i], TRUE); diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c index 1477a72d6..cb4660e42 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c @@ -60,6 +60,7 @@ #include "lp_bld_struct.h" #include "lp_bld_quad.h" #include "lp_bld_pack.h" +#include "lp_bld_intr.h" /** @@ -158,7 +159,7 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld, lp_build_fetch_rgba_soa(bld->gallivm, bld->format_desc, - bld->texel_type, + bld->texel_type, TRUE, data_ptr, offset, i, j, bld->cache, @@ -2405,7 +2406,7 @@ lp_build_fetch_texel(struct lp_build_sample_context *bld, lp_build_fetch_rgba_soa(bld->gallivm, bld->format_desc, - bld->texel_type, + bld->texel_type, TRUE, bld->base_ptr, offset, i, j, bld->cache, @@ -3316,7 +3317,8 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm, for (i = 0; i < num_param; ++i) { if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) { - LLVMAddAttribute(LLVMGetParam(function, i), LLVMNoAliasAttribute); + + lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS); } } @@ -3460,7 +3462,7 @@ lp_build_size_query_soa(struct gallivm_state *gallivm, struct lp_sampler_dynamic_state *dynamic_state, const struct lp_sampler_size_query_params *params) { - LLVMValueRef lod, level, size; + LLVMValueRef lod, level = 0, size; LLVMValueRef first_level = NULL; int dims, i; boolean has_array; diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c index 68ac69538..69863ab93 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c @@ -323,16 +323,14 @@ lp_build_tgsi_inst_llvm( LLVMValueRef -lp_build_emit_fetch( +lp_build_emit_fetch_src( struct lp_build_tgsi_context *bld_base, - const struct 
tgsi_full_instruction *inst, - unsigned src_op, + const struct tgsi_full_src_register *reg, + enum tgsi_opcode_type stype, const unsigned chan_index) { - const struct tgsi_full_src_register *reg = &inst->Src[src_op]; unsigned swizzle; LLVMValueRef res; - enum tgsi_opcode_type stype = tgsi_opcode_infer_src_type(inst->Instruction.Opcode); if (chan_index == LP_CHAN_ALL) { swizzle = ~0u; @@ -360,7 +358,7 @@ lp_build_emit_fetch( case TGSI_TYPE_DOUBLE: case TGSI_TYPE_UNTYPED: /* modifiers on movs assume data is float */ - res = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_ABS, res); + res = lp_build_abs(&bld_base->base, res); break; case TGSI_TYPE_UNSIGNED: case TGSI_TYPE_SIGNED: @@ -413,7 +411,21 @@ lp_build_emit_fetch( } return res; +} + + +LLVMValueRef +lp_build_emit_fetch( + struct lp_build_tgsi_context *bld_base, + const struct tgsi_full_instruction *inst, + unsigned src_op, + const unsigned chan_index) +{ + const struct tgsi_full_src_register *reg = &inst->Src[src_op]; + enum tgsi_opcode_type stype = + tgsi_opcode_infer_src_type(inst->Instruction.Opcode); + return lp_build_emit_fetch_src(bld_base, reg, stype, chan_index); } diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h index b6b3fe369..eb632b700 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h @@ -458,7 +458,6 @@ struct lp_build_tgsi_soa_context LLVMValueRef immediates[LP_MAX_INLINED_IMMEDIATES][TGSI_NUM_CHANNELS]; LLVMValueRef temps[LP_MAX_INLINED_TEMPS][TGSI_NUM_CHANNELS]; LLVMValueRef addr[LP_MAX_TGSI_ADDRS][TGSI_NUM_CHANNELS]; - LLVMValueRef preds[LP_MAX_TGSI_PREDS][TGSI_NUM_CHANNELS]; /* We allocate/use this array of temps if (1 << TGSI_FILE_TEMPORARY) is * set in the indirect_files field. @@ -552,7 +551,6 @@ struct lp_build_tgsi_aos_context LLVMValueRef immediates[LP_MAX_INLINED_IMMEDIATES]; LLVMValueRef temps[LP_MAX_INLINED_TEMPS]; LLVMValueRef addr[LP_MAX_TGSI_ADDRS]; - LLVMValueRef preds[LP_MAX_TGSI_PREDS]; /* We allocate/use this array of temps if (1 << TGSI_FILE_TEMPORARY) is * set in the indirect_files field. 
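After this split, lp_build_emit_fetch is a thin wrapper, and code holding a bare source register can fetch directly. A usage sketch applying the same type inference the wrapper performs:

   enum tgsi_opcode_type stype =
      tgsi_opcode_infer_src_type(inst->Instruction.Opcode);
   LLVMValueRef val =
      lp_build_emit_fetch_src(bld_base, &inst->Src[0], stype, TGSI_CHAN_X);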
@@ -645,6 +643,13 @@ lp_build_tgsi_inst_llvm( const struct tgsi_full_instruction *inst); LLVMValueRef +lp_build_emit_fetch_src( + struct lp_build_tgsi_context *bld_base, + const struct tgsi_full_src_register *reg, + enum tgsi_opcode_type stype, + const unsigned chan_index); + +LLVMValueRef lp_build_emit_fetch( struct lp_build_tgsi_context *bld_base, const struct tgsi_full_instruction *inst, diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c index 2e837afe2..dc6568a2d 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c @@ -110,21 +110,6 @@ arr_emit( bld_base->uint_bld.vec_type, ""); } -/* TGSI_OPCODE_CLAMP */ -static void -clamp_emit( - const struct lp_build_tgsi_action * action, - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) -{ - LLVMValueRef tmp; - tmp = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX, - emit_data->args[0], - emit_data->args[1]); - emit_data->output[emit_data->chan] = lp_build_emit_llvm_binary(bld_base, - TGSI_OPCODE_MIN, tmp, emit_data->args[2]); -} - /* DP* Helper */ static void @@ -368,8 +353,8 @@ exp_emit( TGSI_OPCODE_EX2, floor_x); /* src0.x - floor( src0.x ) */ - emit_data->output[TGSI_CHAN_Y] = lp_build_emit_llvm_binary(bld_base, - TGSI_OPCODE_SUB, emit_data->args[0] /* src0.x */, floor_x); + emit_data->output[TGSI_CHAN_Y] = + lp_build_sub(&bld_base->base, emit_data->args[0] /* src0.x */, floor_x); /* 2 ^ src0.x */ emit_data->output[TGSI_CHAN_Z] = lp_build_emit_llvm_unary(bld_base, @@ -394,8 +379,8 @@ frc_emit( LLVMValueRef tmp; tmp = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_FLR, emit_data->args[0]); - emit_data->output[emit_data->chan] = lp_build_emit_llvm_binary(bld_base, - TGSI_OPCODE_SUB, emit_data->args[0], tmp); + emit_data->output[emit_data->chan] = + lp_build_sub(&bld_base->base, emit_data->args[0], tmp); } /* TGSI_OPCODE_KILL_IF */ @@ -499,8 +484,7 @@ log_emit( LLVMValueRef abs_x, log_abs_x, flr_log_abs_x, ex2_flr_log_abs_x; /* abs( src0.x) */ - abs_x = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_ABS, - emit_data->args[0] /* src0.x */); + abs_x = lp_build_abs(&bld_base->base, emit_data->args[0] /* src0.x */); /* log( abs( src0.x ) ) */ log_abs_x = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_LG2, @@ -771,19 +755,6 @@ const struct lp_build_tgsi_action scs_action = { scs_emit /* emit */ }; -/* TGSI_OPCODE_SUB */ -static void -sub_emit( - const struct lp_build_tgsi_action * action, - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) -{ - emit_data->output[emit_data->chan] = - LLVMBuildFSub(bld_base->base.gallivm->builder, - emit_data->args[0], - emit_data->args[1], ""); -} - /* TGSI_OPCODE_F2U */ static void f2u_emit( @@ -842,26 +813,32 @@ imul_hi_emit( struct lp_build_tgsi_context * bld_base, struct lp_build_emit_data * emit_data) { - LLVMBuilderRef builder = bld_base->base.gallivm->builder; struct lp_build_context *int_bld = &bld_base->int_bld; - struct lp_type type = int_bld->type; - LLVMValueRef src0, src1; - LLVMValueRef dst64; - LLVMTypeRef typeRef; - - assert(type.width == 32); - type.width = 64; - typeRef = lp_build_vec_type(bld_base->base.gallivm, type); - src0 = LLVMBuildSExt(builder, emit_data->args[0], typeRef, ""); - src1 = LLVMBuildSExt(builder, emit_data->args[1], typeRef, ""); - dst64 = LLVMBuildMul(builder, src0, src1, ""); - dst64 = LLVMBuildAShr( - builder, dst64, - 
lp_build_const_vec(bld_base->base.gallivm, type, 32), ""); - type.width = 32; - typeRef = lp_build_vec_type(bld_base->base.gallivm, type); - emit_data->output[emit_data->chan] = - LLVMBuildTrunc(builder, dst64, typeRef, ""); + LLVMValueRef hi_bits; + + assert(int_bld->type.width == 32); + + /* low result bits are tossed away */ + lp_build_mul_32_lohi(int_bld, emit_data->args[0], + emit_data->args[1], &hi_bits); + emit_data->output[emit_data->chan] = hi_bits; +} + +static void +imul_hi_emit_cpu( + const struct lp_build_tgsi_action * action, + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + struct lp_build_context *int_bld = &bld_base->int_bld; + LLVMValueRef hi_bits; + + assert(int_bld->type.width == 32); + + /* low result bits are tossed away */ + lp_build_mul_32_lohi_cpu(int_bld, emit_data->args[0], + emit_data->args[1], &hi_bits); + emit_data->output[emit_data->chan] = hi_bits; } /* TGSI_OPCODE_UMUL_HI */ @@ -871,26 +848,32 @@ umul_hi_emit( struct lp_build_tgsi_context * bld_base, struct lp_build_emit_data * emit_data) { - LLVMBuilderRef builder = bld_base->base.gallivm->builder; struct lp_build_context *uint_bld = &bld_base->uint_bld; - struct lp_type type = uint_bld->type; - LLVMValueRef src0, src1; - LLVMValueRef dst64; - LLVMTypeRef typeRef; - - assert(type.width == 32); - type.width = 64; - typeRef = lp_build_vec_type(bld_base->base.gallivm, type); - src0 = LLVMBuildZExt(builder, emit_data->args[0], typeRef, ""); - src1 = LLVMBuildZExt(builder, emit_data->args[1], typeRef, ""); - dst64 = LLVMBuildMul(builder, src0, src1, ""); - dst64 = LLVMBuildLShr( - builder, dst64, - lp_build_const_vec(bld_base->base.gallivm, type, 32), ""); - type.width = 32; - typeRef = lp_build_vec_type(bld_base->base.gallivm, type); - emit_data->output[emit_data->chan] = - LLVMBuildTrunc(builder, dst64, typeRef, ""); + LLVMValueRef hi_bits; + + assert(uint_bld->type.width == 32); + + /* low result bits are tossed away */ + lp_build_mul_32_lohi(uint_bld, emit_data->args[0], + emit_data->args[1], &hi_bits); + emit_data->output[emit_data->chan] = hi_bits; +} + +static void +umul_hi_emit_cpu( + const struct lp_build_tgsi_action * action, + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + struct lp_build_context *uint_bld = &bld_base->uint_bld; + LLVMValueRef hi_bits; + + assert(uint_bld->type.width == 32); + + /* low result bits are tossed away */ + lp_build_mul_32_lohi_cpu(uint_bld, emit_data->args[0], + emit_data->args[1], &hi_bits); + emit_data->output[emit_data->chan] = hi_bits; } /* TGSI_OPCODE_MAX */ @@ -945,7 +928,7 @@ xpd_helper( tmp0 = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MUL, a, b); tmp1 = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MUL, c, d); - return lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_SUB, tmp0, tmp1); + return lp_build_sub(&bld_base->base, tmp0, tmp1); } static void @@ -1332,7 +1315,6 @@ lp_set_default_actions(struct lp_build_tgsi_context * bld_base) bld_base->op_actions[TGSI_OPCODE_ADD].emit = add_emit; bld_base->op_actions[TGSI_OPCODE_ARR].emit = arr_emit; - bld_base->op_actions[TGSI_OPCODE_CLAMP].emit = clamp_emit; bld_base->op_actions[TGSI_OPCODE_END].emit = end_emit; bld_base->op_actions[TGSI_OPCODE_FRC].emit = frc_emit; bld_base->op_actions[TGSI_OPCODE_LRP].emit = lrp_emit; @@ -1341,7 +1323,6 @@ lp_set_default_actions(struct lp_build_tgsi_context * bld_base) bld_base->op_actions[TGSI_OPCODE_MUL].emit = mul_emit; bld_base->op_actions[TGSI_OPCODE_DIV].emit = fdiv_emit; 
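The rewritten IMUL_HI/UMUL_HI emitters defer the widening multiply to lp_build_mul_32_lohi (or its _cpu variant from this patch) and keep only the high half. For reference, the scalar arithmetic being computed:

   static inline uint32_t
   umul_hi32(uint32_t a, uint32_t b)
   {
      return (uint32_t)(((uint64_t)a * b) >> 32);
   }

   static inline int32_t
   imul_hi32(int32_t a, int32_t b)
   {
      /* signed variant: widen with sign extension, arithmetic shift */
      return (int32_t)(((int64_t)a * b) >> 32);
   }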
bld_base->op_actions[TGSI_OPCODE_RCP].emit = rcp_emit; - bld_base->op_actions[TGSI_OPCODE_SUB].emit = sub_emit; bld_base->op_actions[TGSI_OPCODE_UARL].emit = mov_emit; bld_base->op_actions[TGSI_OPCODE_F2U].emit = f2u_emit; @@ -1358,6 +1339,7 @@ lp_set_default_actions(struct lp_build_tgsi_context * bld_base) bld_base->op_actions[TGSI_OPCODE_DMAX].emit = fmax_emit; bld_base->op_actions[TGSI_OPCODE_DMIN].emit = fmin_emit; bld_base->op_actions[TGSI_OPCODE_DMUL].emit = mul_emit; + bld_base->op_actions[TGSI_OPCODE_DDIV].emit = fdiv_emit; bld_base->op_actions[TGSI_OPCODE_D2F].emit = d2f_emit; bld_base->op_actions[TGSI_OPCODE_D2I].emit = d2i_emit; @@ -1400,18 +1382,6 @@ lp_set_default_actions(struct lp_build_tgsi_context * bld_base) * intrinsics. */ -/* TGSI_OPCODE_ABS (CPU Only)*/ - -static void -abs_emit_cpu( - const struct lp_build_tgsi_action * action, - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) -{ - emit_data->output[emit_data->chan] = lp_build_abs(&bld_base->base, - emit_data->args[0]); -} - /* TGSI_OPCODE_ADD (CPU Only) */ static void add_emit_cpu( @@ -2072,19 +2042,6 @@ ssg_emit_cpu( emit_data->args[0]); } -/* TGSI_OPCODE_SUB (CPU Only) */ - -static void -sub_emit_cpu( - const struct lp_build_tgsi_action * action, - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) -{ - emit_data->output[emit_data->chan] = lp_build_sub(&bld_base->base, - emit_data->args[0], - emit_data->args[1]); -} - /* TGSI_OPCODE_TRUNC (CPU Only) */ static void @@ -2576,7 +2533,6 @@ lp_set_default_actions_cpu( struct lp_build_tgsi_context * bld_base) { lp_set_default_actions(bld_base); - bld_base->op_actions[TGSI_OPCODE_ABS].emit = abs_emit_cpu; bld_base->op_actions[TGSI_OPCODE_ADD].emit = add_emit_cpu; bld_base->op_actions[TGSI_OPCODE_AND].emit = and_emit_cpu; bld_base->op_actions[TGSI_OPCODE_ARL].emit = arl_emit_cpu; @@ -2603,6 +2559,8 @@ lp_set_default_actions_cpu( bld_base->op_actions[TGSI_OPCODE_ISHR].emit = ishr_emit_cpu; bld_base->op_actions[TGSI_OPCODE_ISLT].emit = islt_emit_cpu; bld_base->op_actions[TGSI_OPCODE_ISSG].emit = issg_emit_cpu; + bld_base->op_actions[TGSI_OPCODE_IMUL_HI].emit = imul_hi_emit_cpu; + bld_base->op_actions[TGSI_OPCODE_UMUL_HI].emit = umul_hi_emit_cpu; bld_base->op_actions[TGSI_OPCODE_LG2].emit = lg2_emit_cpu; bld_base->op_actions[TGSI_OPCODE_LOG].emit = log_emit_cpu; @@ -2624,7 +2582,6 @@ lp_set_default_actions_cpu( bld_base->op_actions[TGSI_OPCODE_SLT].emit = slt_emit_cpu; bld_base->op_actions[TGSI_OPCODE_SNE].emit = sne_emit_cpu; bld_base->op_actions[TGSI_OPCODE_SSG].emit = ssg_emit_cpu; - bld_base->op_actions[TGSI_OPCODE_SUB].emit = sub_emit_cpu; bld_base->op_actions[TGSI_OPCODE_TRUNC].emit = trunc_emit_cpu; bld_base->rsq_action.emit = recip_sqrt_emit_cpu; diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c index 610283d79..58c39facf 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c @@ -256,10 +256,6 @@ lp_emit_store_aos( ptr = bld->addr[reg->Indirect.Index]; break; - case TGSI_FILE_PREDICATE: - ptr = bld->preds[reg->Register.Index]; - break; - default: assert(0); return; @@ -267,43 +263,6 @@ lp_emit_store_aos( if (!ptr) return; - /* - * Predicate - */ - - if (inst->Instruction.Predicate) { - LLVMValueRef pred; - - assert(inst->Predicate.Index < LP_MAX_TGSI_PREDS); - - pred = LLVMBuildLoad(builder, - bld->preds[inst->Predicate.Index], ""); - - /* - * Convert 
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
index 610283d79..58c39facf 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
@@ -256,10 +256,6 @@ lp_emit_store_aos(
       ptr = bld->addr[reg->Indirect.Index];
       break;
 
-   case TGSI_FILE_PREDICATE:
-      ptr = bld->preds[reg->Register.Index];
-      break;
-
    default:
       assert(0);
       return;
@@ -267,43 +263,6 @@ lp_emit_store_aos(
    if (!ptr)
       return;
 
-   /*
-    * Predicate
-    */
-
-   if (inst->Instruction.Predicate) {
-      LLVMValueRef pred;
-
-      assert(inst->Predicate.Index < LP_MAX_TGSI_PREDS);
-
-      pred = LLVMBuildLoad(builder,
-                           bld->preds[inst->Predicate.Index], "");
-
-      /*
-       * Convert the value to an integer mask.
-       */
-      pred = lp_build_compare(bld->bld_base.base.gallivm,
-                              bld->bld_base.base.type,
-                              PIPE_FUNC_NOTEQUAL,
-                              pred,
-                              bld->bld_base.base.zero);
-
-      if (inst->Predicate.Negate) {
-         pred = LLVMBuildNot(builder, pred, "");
-      }
-
-      pred = bld->bld_base.emit_swizzle(&bld->bld_base, pred,
-                                        inst->Predicate.SwizzleX,
-                                        inst->Predicate.SwizzleY,
-                                        inst->Predicate.SwizzleZ,
-                                        inst->Predicate.SwizzleW);
-
-      if (mask) {
-         mask = LLVMBuildAnd(builder, mask, pred, "");
-      } else {
-         mask = pred;
-      }
-   }
 
    /*
     * Writemask
@@ -442,11 +401,6 @@ lp_emit_declaration_aos(
          bld->addr[idx] = lp_build_alloca(gallivm, vec_type, "");
          break;
 
-      case TGSI_FILE_PREDICATE:
-         assert(idx < LP_MAX_TGSI_PREDS);
-         bld->preds[idx] = lp_build_alloca(gallivm, vec_type, "");
-         break;
-
       case TGSI_FILE_SAMPLER_VIEW:
          /*
          * The target stored here MUST match whatever there actually
@@ -521,7 +475,7 @@ lp_emit_instruction_aos(
 
    case TGSI_OPCODE_RSQ:
    /* TGSI_OPCODE_RECIPSQRT */
      src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
-     tmp0 = lp_build_emit_llvm_unary(&bld->bld_base, TGSI_OPCODE_ABS, src0);
+     tmp0 = lp_build_abs(&bld->bld_base.base, src0);
      dst0 = lp_build_rsqrt(&bld->bld_base.base, tmp0);
      break;
@@ -591,12 +545,6 @@ lp_emit_instruction_aos(
       dst0 = lp_build_add(&bld->bld_base.base, tmp0, src2);
       break;
 
-   case TGSI_OPCODE_SUB:
-      src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
-      src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1, LP_CHAN_ALL);
-      dst0 = lp_build_sub(&bld->bld_base.base, src0, src1);
-      break;
-
    case TGSI_OPCODE_LRP:
       src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
       src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1, LP_CHAN_ALL);
@@ -615,14 +563,6 @@ lp_emit_instruction_aos(
       dst0 = lp_build_sub(&bld->bld_base.base, src0, tmp0);
       break;
 
-   case TGSI_OPCODE_CLAMP:
-      src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
-      src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1, LP_CHAN_ALL);
-      src2 = lp_build_emit_fetch(&bld->bld_base, inst, 2, LP_CHAN_ALL);
-      tmp0 = lp_build_max(&bld->bld_base.base, src0, src1);
-      dst0 = lp_build_min(&bld->bld_base.base, tmp0, src2);
-      break;
-
    case TGSI_OPCODE_FLR:
       src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
       dst0 = lp_build_floor(&bld->bld_base.base, src0);
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
index f8f43a561..e0cc0af27 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
@@ -305,8 +305,7 @@ analyse_instruction(struct analysis_context *ctx,
       } else if (dst->File == TGSI_FILE_OUTPUT) {
          regs = info->output;
          max_regs = ARRAY_SIZE(info->output);
-      } else if (dst->File == TGSI_FILE_ADDRESS ||
-                 dst->File == TGSI_FILE_PREDICATE) {
+      } else if (dst->File == TGSI_FILE_ADDRESS) {
          continue;
       } else {
          assert(0);
@@ -389,8 +388,7 @@ analyse_instruction(struct analysis_context *ctx,
 
       memset(res, 0, sizeof res);
 
-      if (!inst->Instruction.Predicate &&
-          !inst->Instruction.Saturate) {
+      if (!inst->Instruction.Saturate) {
          for (chan = 0; chan < 4; ++chan) {
             if (dst->WriteMask & (1 << chan)) {
                if (inst->Instruction.Opcode == TGSI_OPCODE_MOV) {
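The AOS interpreter above simply drops its TGSI_OPCODE_SUB and TGSI_OPCODE_CLAMP cases; both opcodes were retired from TGSI, so frontends now express them through the remaining opcodes. For illustration only, here is how the two operations decompose onto the surviving lp_build helpers (hypothetical standalone helpers, not part of the patch):

/* Illustration: sub(a, b) == add(a, -b), and
 * clamp(x, lo, hi) == min(max(x, lo), hi). */
static LLVMValueRef
lowered_sub(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b)
{
   return lp_build_add(bld, a, lp_build_negate(bld, b));
}

static LLVMValueRef
lowered_clamp(struct lp_build_context *bld,
              LLVMValueRef x, LLVMValueRef lo, LLVMValueRef hi)
{
   return lp_build_min(bld, lp_build_max(bld, x, lo), hi);
}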
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 6871795b4..bfa32b9ad 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -753,30 +753,21 @@ static void lp_exec_default(struct lp_exec_mask *mask,
  */
 static void lp_exec_mask_store(struct lp_exec_mask *mask,
                                struct lp_build_context *bld_store,
-                               LLVMValueRef pred,
                                LLVMValueRef val,
                                LLVMValueRef dst_ptr)
 {
    LLVMBuilderRef builder = mask->bld->gallivm->builder;
+   LLVMValueRef exec_mask = mask->has_mask ? mask->exec_mask : NULL;
 
    assert(lp_check_value(bld_store->type, val));
    assert(LLVMGetTypeKind(LLVMTypeOf(dst_ptr)) == LLVMPointerTypeKind);
    assert(LLVMGetElementType(LLVMTypeOf(dst_ptr)) == LLVMTypeOf(val));
 
-   /* Mix the predicate and execution mask */
-   if (mask->has_mask) {
-      if (pred) {
-         pred = LLVMBuildAnd(builder, pred, mask->exec_mask, "");
-      } else {
-         pred = mask->exec_mask;
-      }
-   }
-
-   if (pred) {
+   if (exec_mask) {
       LLVMValueRef res, dst;
 
       dst = LLVMBuildLoad(builder, dst_ptr, "");
-      res = lp_build_select(bld_store, pred, val, dst);
+      res = lp_build_select(bld_store, exec_mask, val, dst);
       LLVMBuildStore(builder, res, dst_ptr);
    } else
       LLVMBuildStore(builder, val, dst_ptr);
@@ -1036,22 +1027,12 @@ emit_mask_scatter(struct lp_build_tgsi_soa_context *bld,
                   LLVMValueRef base_ptr,
                   LLVMValueRef indexes,
                   LLVMValueRef values,
-                  struct lp_exec_mask *mask,
-                  LLVMValueRef pred)
+                  struct lp_exec_mask *mask)
 {
    struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
    LLVMBuilderRef builder = gallivm->builder;
    unsigned i;
-
-   /* Mix the predicate and execution mask */
-   if (mask->has_mask) {
-      if (pred) {
-         pred = LLVMBuildAnd(builder, pred, mask->exec_mask, "");
-      }
-      else {
-         pred = mask->exec_mask;
-      }
-   }
+   LLVMValueRef pred = mask->has_mask ? mask->exec_mask : NULL;
 
    /*
    * Loop over elements of index_vec, store scalar value.
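The trailing comment introduces a scalar loop: with no native scatter, each lane of index_vec is stored one element at a time. A sketch of that idiom under the new mask-only scheme (a reconstruction under assumptions, reusing the surrounding function's locals; not the verbatim loop body):

   /* For each lane i: base_ptr[indexes[i]] = values[i]; masked-off lanes
    * keep their old contents via a scalar load/select/store. */
   for (i = 0; i < bld->bld_base.base.type.length; i++) {
      LLVMValueRef ii = lp_build_const_int32(gallivm, i);
      LLVMValueRef index = LLVMBuildExtractElement(builder, indexes, ii, "");
      LLVMValueRef val = LLVMBuildExtractElement(builder, values, ii, "");
      LLVMValueRef scalar_ptr = LLVMBuildGEP(builder, base_ptr, &index, 1, "");

      if (pred) {
         /* Inactive lanes preserve the old destination value. */
         LLVMValueRef pred_i = LLVMBuildExtractElement(builder, pred, ii, "");
         LLVMValueRef dst_val = LLVMBuildLoad(builder, scalar_ptr, "");
         val = lp_build_select(&bld->elem_bld, pred_i, val, dst_val);
      }
      LLVMBuildStore(builder, val, scalar_ptr);
   }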
@@ -1733,74 +1714,6 @@ emit_fetch_deriv(
    *ddy = lp_build_ddy(&bld->bld_base.base, src);
 }
 
-
-/**
- * Predicate.
- */
-static void
-emit_fetch_predicate(
-   struct lp_build_tgsi_soa_context *bld,
-   const struct tgsi_full_instruction *inst,
-   LLVMValueRef *pred)
-{
-   LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
-   unsigned index;
-   unsigned char swizzles[4];
-   LLVMValueRef unswizzled[4] = {NULL, NULL, NULL, NULL};
-   LLVMValueRef value;
-   unsigned chan;
-
-   if (!inst->Instruction.Predicate) {
-      TGSI_FOR_EACH_CHANNEL( chan ) {
-         pred[chan] = NULL;
-      }
-      return;
-   }
-
-   swizzles[0] = inst->Predicate.SwizzleX;
-   swizzles[1] = inst->Predicate.SwizzleY;
-   swizzles[2] = inst->Predicate.SwizzleZ;
-   swizzles[3] = inst->Predicate.SwizzleW;
-
-   index = inst->Predicate.Index;
-   assert(index < LP_MAX_TGSI_PREDS);
-
-   TGSI_FOR_EACH_CHANNEL( chan ) {
-      unsigned swizzle = swizzles[chan];
-
-      /*
-       * Only fetch the predicate register channels that are actually listed
-       * in the swizzles
-       */
-      if (!unswizzled[swizzle]) {
-         value = LLVMBuildLoad(builder,
-                               bld->preds[index][swizzle], "");
-
-         /*
-          * Convert the value to an integer mask.
-          *
-          * TODO: Short-circuit this comparison -- a D3D setp_xx instructions
-          * is needlessly causing two comparisons due to storing the intermediate
-          * result as float vector instead of an integer mask vector.
-          */
-         value = lp_build_compare(bld->bld_base.base.gallivm,
-                                  bld->bld_base.base.type,
-                                  PIPE_FUNC_NOTEQUAL,
-                                  value,
-                                  bld->bld_base.base.zero);
-         if (inst->Predicate.Negate) {
-            value = LLVMBuildNot(builder, value, "");
-         }
-
-         unswizzled[swizzle] = value;
-      } else {
-         value = unswizzled[swizzle];
-      }
-
-      pred[chan] = value;
-   }
-}
-
 /**
  * store an array of 8 64-bit into two arrays of 8 floats
  * i.e.
@@ -1813,7 +1726,6 @@ emit_fetch_predicate(
 static void
 emit_store_64bit_chan(struct lp_build_tgsi_context *bld_base,
                       LLVMValueRef chan_ptr, LLVMValueRef chan_ptr2,
-                      LLVMValueRef pred,
                       LLVMValueRef value)
 {
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
@@ -1841,8 +1753,8 @@ emit_store_64bit_chan(struct lp_build_tgsi_context *bld_base,
                                            bld_base->base.type.length),
                           "");
 
-   lp_exec_mask_store(&bld->exec_mask, float_bld, pred, temp, chan_ptr);
-   lp_exec_mask_store(&bld->exec_mask, float_bld, pred, temp2, chan_ptr2);
+   lp_exec_mask_store(&bld->exec_mask, float_bld, temp, chan_ptr);
+   lp_exec_mask_store(&bld->exec_mask, float_bld, temp2, chan_ptr2);
 }
 
 /**
@@ -1854,7 +1766,6 @@ emit_store_chan(
    const struct tgsi_full_instruction *inst,
    unsigned index,
    unsigned chan_index,
-   LLVMValueRef pred,
    LLVMValueRef value)
 {
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
@@ -1917,7 +1828,7 @@ emit_store_chan(
 
          /* Scatter store values into output registers */
          emit_mask_scatter(bld, outputs_array, index_vec, value,
-                           &bld->exec_mask, pred);
+                           &bld->exec_mask);
       } else {
          LLVMValueRef out_ptr = lp_get_output_ptr(bld, reg->Register.Index,
                                                   chan_index);
         if (tgsi_type_is_64bit(dtype)) {
            LLVMValueRef out_ptr2 = lp_get_output_ptr(bld, reg->Register.Index,
                                                      chan_index + 1);
            emit_store_64bit_chan(bld_base, out_ptr, out_ptr2,
-                                 pred, value);
+                                 value);
         } else
-           lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, out_ptr);
+           lp_exec_mask_store(&bld->exec_mask, float_bld, value, out_ptr);
      }
      break;
@@ -1955,7 +1866,7 @@ emit_store_chan(
 
         /* Scatter store values into temp registers */
        emit_mask_scatter(bld, temps_array, index_vec, value,
-                         &bld->exec_mask, pred);
+                         &bld->exec_mask);
     } else {
        LLVMValueRef temp_ptr;
        if (tgsi_type_is_64bit(dtype)) {
           LLVMValueRef temp_ptr2 = lp_get_temp_ptr_soa(bld,
                                                        reg->Register.Index,
                                                        chan_index + 1);
           emit_store_64bit_chan(bld_base, temp_ptr, temp_ptr2,
-                                pred, value);
+                                value);
       } else
-         lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, temp_ptr);
+         lp_exec_mask_store(&bld->exec_mask, float_bld, value, temp_ptr);
     }
     break;
@@ -1977,17 +1888,10 @@ emit_store_chan(
       assert(dtype == TGSI_TYPE_SIGNED);
       assert(LLVMTypeOf(value) == int_bld->vec_type);
       value = LLVMBuildBitCast(builder, value, int_bld->vec_type, "");
-      lp_exec_mask_store(&bld->exec_mask, int_bld, pred, value,
+      lp_exec_mask_store(&bld->exec_mask, int_bld, value,
                          bld->addr[reg->Register.Index][chan_index]);
       break;
 
-   case TGSI_FILE_PREDICATE:
-      assert(LLVMTypeOf(value) == float_bld->vec_type);
-      value = LLVMBuildBitCast(builder, value, float_bld->vec_type, "");
-      lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value,
-                         bld->preds[reg->Register.Index][chan_index]);
-      break;
-
    default:
      assert( 0 );
    }
@@ -2037,18 +1941,14 @@ emit_store(
 {
    unsigned chan_index;
-   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
    enum tgsi_opcode_type dtype = tgsi_opcode_infer_dst_type(inst->Instruction.Opcode);
 
-   if(info->num_dst) {
-      LLVMValueRef pred[TGSI_NUM_CHANNELS];
-
-      emit_fetch_predicate( bld, inst, pred );
+   if(info->num_dst) {
       TGSI_FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
         if (tgsi_type_is_64bit(dtype) && (chan_index == 1 || chan_index == 3))
            continue;
-        emit_store_chan(bld_base, inst, 0, chan_index, pred[chan_index], dst[chan_index]);
+        emit_store_chan(bld_base, inst, 0, chan_index, dst[chan_index]);
      }
   }
 }
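In the emit_store_64bit_chan hunks above, only the final two masked stores are visible; by that point the 64-bit value has already been split into its low and high 32-bit halves. A sketch of the even/odd shuffle split that plausibly precedes those stores (a reconstruction under assumptions; builder, gallivm and the temporaries come from the surrounding function):

   /* View the 64-bit SoA vector as 2*length 32-bit lanes, then peel off
    * the even lanes (low halves) and the odd lanes (high halves). */
   value = LLVMBuildBitCast(builder, value,
                            LLVMVectorType(LLVMFloatTypeInContext(gallivm->context),
                                           bld_base->base.type.length * 2), "");
   for (i = 0; i < bld_base->base.type.length; i++) {
      shuffles[i] = lp_build_const_int32(gallivm, i * 2);        /* low halves  */
      shuffles2[i] = lp_build_const_int32(gallivm, (i * 2) + 1); /* high halves */
   }
   temp = LLVMBuildShuffleVector(builder, value,
                                 LLVMGetUndef(LLVMTypeOf(value)),
                                 LLVMConstVector(shuffles,
                                                 bld_base->base.type.length), "");
   temp2 = LLVMBuildShuffleVector(builder, value,
                                  LLVMGetUndef(LLVMTypeOf(value)),
                                  LLVMConstVector(shuffles2,
                                                  bld_base->base.type.length), "");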
@@ -2998,15 +2898,6 @@ lp_emit_declaration_soa(
          }
          break;
 
-      case TGSI_FILE_PREDICATE:
-         assert(last < LP_MAX_TGSI_PREDS);
-         for (idx = first; idx <= last; ++idx) {
-            for (i = 0; i < TGSI_NUM_CHANNELS; i++)
-               bld->preds[idx][i] = lp_build_alloca(gallivm, vec_type,
-                                                    "predicate");
-         }
-         break;
-
       case TGSI_FILE_SAMPLER_VIEW:
          /*
          * The target stored here MUST match whatever there actually
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_type.h b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_type.h
index 7fb449fd0..afe8722b0 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_type.h
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_type.h
@@ -41,6 +41,10 @@
 #include "pipe/p_compiler.h"
 #include "gallivm/lp_bld.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /**
  * Native SIMD architecture width available at runtime.
  *
@@ -449,5 +453,8 @@ lp_build_context_init(struct lp_build_context *bld,
 unsigned
 lp_build_count_ir_module(LLVMModuleRef module);
 
+#ifdef __cplusplus
+}
+#endif
 
 #endif /* !LP_BLD_TYPE_H */
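The new guards make lp_bld_type.h safe to include from C++ translation units: its declarations keep C linkage instead of being C++ name-mangled. A minimal illustration (hypothetical consumer file, not part of the tree):

/* hypothetical_user.cpp -- compiled as C++. Thanks to the in-header
 * #ifdef __cplusplus / extern "C" guards, the call below resolves to the
 * unmangled C symbol at link time; without them the C++ compiler would
 * emit a mangled reference and linking against the C object would fail. */
#include "gallivm/lp_bld_type.h"

unsigned
count_ir(LLVMModuleRef module)
{
   return lp_build_count_ir_module(module);
}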