author     Jonathan Gray <jsg@cvs.openbsd.org>    2017-12-31 07:12:27 +0000
committer  Jonathan Gray <jsg@cvs.openbsd.org>    2017-12-31 07:12:27 +0000
commit     051645c92924bf915d82bf219f2ed67309b5577a
tree       4aae126dd8e5a18c6a9926a5468d1561e6038a07 /lib/mesa/src/gallium/auxiliary/gallivm
parent     2dae6fe6f74cf7fb9fd65285302c0331d9786b00
Merge Mesa 17.2.8
Diffstat (limited to 'lib/mesa/src/gallium/auxiliary/gallivm')
-rw-r--r--  lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_arit.c             | 178
-rw-r--r--  lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_arit.h             |  12
-rw-r--r--  lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_conv.c             | 195
-rw-r--r--  lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format.h           |   1
-rw-r--r--  lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c       | 259
-rw-r--r--  lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c |   9
-rw-r--r--  lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c       | 708
-rw-r--r--  lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c       |   4
-rw-r--r--  lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_gather.c           | 399
-rw-r--r--  lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_gather.h           |   2
-rw-r--r--  lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_init.c             |  71
-rw-r--r--  lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_intr.c             | 117
-rw-r--r--  lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_intr.h             |  24
-rw-r--r--  lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_limits.h           |   8
-rw-r--r--  lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_logic.c            |   2
-rw-r--r--  lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp           | 171
-rw-r--r--  lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.h             |  19
-rw-r--r--  lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample.c           |   4
-rw-r--r--  lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c       |   8
-rw-r--r--  lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c       |  10
-rw-r--r--  lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c             |  24
-rw-r--r--  lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h             |   9
-rw-r--r--  lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c      | 161
-rw-r--r--  lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c         |  62
-rw-r--r--  lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c        |   6
-rw-r--r--  lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c         | 141
-rw-r--r--  lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_type.h             |   7
27 files changed, 1871 insertions(+), 740 deletions(-)
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 3ea073433..04f86bef2 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -1091,6 +1091,180 @@ lp_build_mul(struct lp_build_context *bld,
return res;
}
+/*
+ * Widening mul, valid for 32x32 bit -> 64bit only.
+ * Result is low 32bits, high bits returned in res_hi.
+ *
+ * Emits code that is meant to be compiled for the host CPU.
+ */
+LLVMValueRef
+lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
+ LLVMValueRef a,
+ LLVMValueRef b,
+ LLVMValueRef *res_hi)
+{
+ struct gallivm_state *gallivm = bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+
+ assert(bld->type.width == 32);
+ assert(bld->type.floating == 0);
+ assert(bld->type.fixed == 0);
+ assert(bld->type.norm == 0);
+
+ /*
+ * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
+ * for x86 simd is atrocious (even if the high bits weren't required),
+ * trying to handle real 64bit inputs (which of course can't happen due
+ * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
+ * apparently llvm does not recognize this widening mul). This includes 6
+ * (instead of 2) pmuludq plus extra adds and shifts.
+ * The same story applies to signed mul, albeit fixing this requires sse41.
+ * https://llvm.org/bugs/show_bug.cgi?id=30845
+ * So, whip up our own code, albeit only for length 4 and 8 (which
+ * should be good enough)...
+ */
+ if ((bld->type.length == 4 || bld->type.length == 8) &&
+ ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
+ util_cpu_caps.has_sse4_1)) {
+ const char *intrinsic = NULL;
+ LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
+ LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
+ struct lp_type type_wide = lp_wider_type(bld->type);
+ LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
+ unsigned i;
+ for (i = 0; i < bld->type.length; i += 2) {
+ shuf[i] = lp_build_const_int32(gallivm, i+1);
+ shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
+ }
+ shuf_vec = LLVMConstVector(shuf, bld->type.length);
+ aeven = a;
+ beven = b;
+ aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
+ bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
+
+ if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
+ if (bld->type.sign) {
+ intrinsic = "llvm.x86.avx2.pmul.dq";
+ } else {
+ intrinsic = "llvm.x86.avx2.pmulu.dq";
+ }
+ muleven = lp_build_intrinsic_binary(builder, intrinsic,
+ wider_type, aeven, beven);
+ mulodd = lp_build_intrinsic_binary(builder, intrinsic,
+ wider_type, aodd, bodd);
+ }
+ else {
+ /* for consistent naming look elsewhere... */
+ if (bld->type.sign) {
+ intrinsic = "llvm.x86.sse41.pmuldq";
+ } else {
+ intrinsic = "llvm.x86.sse2.pmulu.dq";
+ }
+ /*
+ * XXX If we only have AVX but not AVX2 this is a pain.
+ * lp_build_intrinsic_binary_anylength() can't handle it
+ * (due to src and dst type not being identical).
+ */
+ if (bld->type.length == 8) {
+ LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
+ LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
+ LLVMValueRef muleven2[2], mulodd2[2];
+ struct lp_type type_wide_half = type_wide;
+ LLVMTypeRef wtype_half;
+ type_wide_half.length = 2;
+ wtype_half = lp_build_vec_type(gallivm, type_wide_half);
+ aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
+ aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
+ bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
+ bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
+ aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
+ aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
+ boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
+ boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
+ muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
+ wtype_half, aevenlo, bevenlo);
+ mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
+ wtype_half, aoddlo, boddlo);
+ muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
+ wtype_half, aevenhi, bevenhi);
+ mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
+ wtype_half, aoddhi, boddhi);
+ muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
+ mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
+
+ }
+ else {
+ muleven = lp_build_intrinsic_binary(builder, intrinsic,
+ wider_type, aeven, beven);
+ mulodd = lp_build_intrinsic_binary(builder, intrinsic,
+ wider_type, aodd, bodd);
+ }
+ }
+ muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
+ mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
+
+ for (i = 0; i < bld->type.length; i += 2) {
+ shuf[i] = lp_build_const_int32(gallivm, i + 1);
+ shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
+ }
+ shuf_vec = LLVMConstVector(shuf, bld->type.length);
+ *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
+
+ for (i = 0; i < bld->type.length; i += 2) {
+ shuf[i] = lp_build_const_int32(gallivm, i);
+ shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
+ }
+ shuf_vec = LLVMConstVector(shuf, bld->type.length);
+ return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
+ }
+ else {
+ return lp_build_mul_32_lohi(bld, a, b, res_hi);
+ }
+}
+
+
+/*
+ * Widening mul, valid for 32x32 bit -> 64bit only.
+ * Result is low 32bits, high bits returned in res_hi.
+ *
+ * Emits generic code.
+ */
+LLVMValueRef
+lp_build_mul_32_lohi(struct lp_build_context *bld,
+ LLVMValueRef a,
+ LLVMValueRef b,
+ LLVMValueRef *res_hi)
+{
+ struct gallivm_state *gallivm = bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef tmp, shift, res_lo;
+ struct lp_type type_tmp;
+ LLVMTypeRef wide_type, narrow_type;
+
+ type_tmp = bld->type;
+ narrow_type = lp_build_vec_type(gallivm, type_tmp);
+ type_tmp.width *= 2;
+ wide_type = lp_build_vec_type(gallivm, type_tmp);
+ shift = lp_build_const_vec(gallivm, type_tmp, 32);
+
+ if (bld->type.sign) {
+ a = LLVMBuildSExt(builder, a, wide_type, "");
+ b = LLVMBuildSExt(builder, b, wide_type, "");
+ } else {
+ a = LLVMBuildZExt(builder, a, wide_type, "");
+ b = LLVMBuildZExt(builder, b, wide_type, "");
+ }
+ tmp = LLVMBuildMul(builder, a, b, "");
+
+ res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
+
+ /* Since we truncate anyway, LShr and AShr are equivalent. */
+ tmp = LLVMBuildLShr(builder, tmp, shift, "");
+ *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
+
+ return res_lo;
+}
+
/* a * b + c */
LLVMValueRef
@@ -1198,7 +1372,9 @@ lp_build_div(struct lp_build_context *bld,
return LLVMConstUDiv(a, b);
}
- if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+   /* fast rcp is disabled (just uses div), so it makes no sense to try that */
+ if(FALSE &&
+ ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
(util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
type.floating)
return lp_build_mul(bld, a, lp_build_rcp(bld, b));
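
Both functions added above implement the same contract: return the low 32 bits
of a 32x32 widening multiply and store the high 32 bits through res_hi. A
minimal scalar model of the generic path, assuming C99 fixed-width types
(mul32_lohi_u and the test values below are illustrative, not part of the
patch):

   #include <stdint.h>
   #include <stdio.h>

   /* Scalar model: zero-extend to 64 bit, multiply, split the product.
    * Because the high half is truncated back to 32 bits, LShr and AShr
    * by 32 are interchangeable here, as the comment in
    * lp_build_mul_32_lohi() notes. */
   static uint32_t mul32_lohi_u(uint32_t a, uint32_t b, uint32_t *res_hi)
   {
      uint64_t wide = (uint64_t)a * (uint64_t)b;   /* ZExt + Mul */
      *res_hi = (uint32_t)(wide >> 32);            /* LShr + Trunc */
      return (uint32_t)wide;                       /* Trunc */
   }

   int main(void)
   {
      uint32_t hi, lo = mul32_lohi_u(0x80000000u, 4u, &hi);
      printf("lo=0x%08x hi=0x%08x\n", lo, hi);     /* lo=0x00000000 hi=0x00000002 */
      return 0;
   }

The _cpu variant computes the same values; it merely replaces the
zext/mul/trunc sequence with pmuludq/pmuldq even/odd lane multiplies plus
shuffles when SSE2/SSE4.1/AVX2 are available.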
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_arit.h
index 622b930a9..2a4137a67 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_arit.h
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_arit.h
@@ -77,6 +77,18 @@ lp_build_mul(struct lp_build_context *bld,
LLVMValueRef b);
LLVMValueRef
+lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
+ LLVMValueRef a,
+ LLVMValueRef b,
+ LLVMValueRef *res_hi);
+
+LLVMValueRef
+lp_build_mul_32_lohi(struct lp_build_context *bld,
+ LLVMValueRef a,
+ LLVMValueRef b,
+ LLVMValueRef *res_hi);
+
+LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
LLVMValueRef a,
int b);
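
A possible caller sketch for the two prototypes above, for an emitter that
only wants the high half of the product (what the TGSI UMUL_HI/IMUL_HI opcodes
need); emit_mul_hi is a hypothetical helper, not part of this header:

   #include "lp_bld_arit.h"

   /* Hypothetical helper: lp_build_mul_32_lohi_cpu() picks the
    * pmuludq/pmuldq-based path when the host supports it and falls
    * back to the generic lp_build_mul_32_lohi() otherwise. */
   static LLVMValueRef
   emit_mul_hi(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b)
   {
      LLVMValueRef res_hi;
      lp_build_mul_32_lohi_cpu(bld, a, b, &res_hi);   /* low half unused */
      return res_hi;
   }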
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index 69d24a55b..c688965a7 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -456,21 +456,21 @@ int lp_build_conv_auto(struct gallivm_state *gallivm,
src_type.sign == dst_type->sign)
return num_dsts;
- /* Special case 4x4f -> 1x16ub or 2x8f -> 1x16ub
+ /* Special case 4x4x32 -> 1x16x8 or 2x8x32 -> 1x16x8
*/
- if (src_type.floating == 1 &&
- src_type.fixed == 0 &&
- src_type.sign == 1 &&
- src_type.norm == 0 &&
+ if (src_type.norm == 0 &&
src_type.width == 32 &&
+ src_type.fixed == 0 &&
dst_type->floating == 0 &&
dst_type->fixed == 0 &&
- dst_type->sign == 0 &&
- dst_type->norm == 1 &&
- dst_type->width == 8)
- {
- /* Special case 4x4f --> 1x16ub */
+ dst_type->width == 8 &&
+
+ ((src_type.floating == 1 && src_type.sign == 1 && dst_type->norm == 1) ||
+ (src_type.floating == 0 && dst_type->floating == 0 &&
+ src_type.sign == dst_type->sign && dst_type->norm == 0))) {
+
+ /* Special case 4x4x32 --> 1x16x8 */
if (src_type.length == 4 &&
(util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
{
@@ -481,7 +481,7 @@ int lp_build_conv_auto(struct gallivm_state *gallivm,
return num_dsts;
}
- /* Special case 2x8f --> 1x16ub */
+ /* Special case 2x8x32 --> 1x16x8 */
if (src_type.length == 8 &&
util_cpu_caps.has_avx)
{
@@ -497,8 +497,25 @@ int lp_build_conv_auto(struct gallivm_state *gallivm,
if (src_type.width == dst_type->width) {
lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
} else {
- for (i = 0; i < num_srcs; ++i) {
- lp_build_conv(gallivm, src_type, *dst_type, &src[i], 1, &dst[i], 1);
+ /*
+ * If dst_width is 16 bits and src_width 32 and the dst vector size
+ * 64bit, try feeding 2 vectors at once so pack intrinsics can be used.
+ * (For AVX, this isn't needed, since we usually get 256bit src and
+       * 128bit dst vectors, which works ok. If we do AVX2 pack this should
+       * be extended, but we'd need to be able to tell the conversion code
+       * about pack ordering first.)
+ */
+ unsigned ratio = 1;
+ if (src_type.width == 2 * dst_type->width &&
+ src_type.length == dst_type->length &&
+ dst_type->floating == 0 && (num_srcs % 2 == 0) &&
+ dst_type->width * dst_type->length == 64) {
+ ratio = 2;
+ num_dsts /= 2;
+ dst_type->length *= 2;
+ }
+ for (i = 0; i < num_dsts; i++) {
+ lp_build_conv(gallivm, src_type, *dst_type, &src[i*ratio], ratio, &dst[i], 1);
}
}
@@ -541,21 +558,25 @@ lp_build_conv(struct gallivm_state *gallivm,
num_tmps = num_srcs;
- /* Special case 4x4f --> 1x16ub, 2x4f -> 1x8ub, 1x4f -> 1x4ub
+ /*
+ * Special case 4x4x32 --> 1x16x8, 2x4x32 -> 1x8x8, 1x4x32 -> 1x4x8
+ * Only float -> s/unorm8 and (u)int32->(u)int8.
+ * XXX: This should cover all interesting backend cases for 8 bit,
+    * but the same strategy should be used if dst is 16 bit.
*/
- if (src_type.floating == 1 &&
- src_type.fixed == 0 &&
- src_type.sign == 1 &&
- src_type.norm == 0 &&
+ if (src_type.norm == 0 &&
src_type.width == 32 &&
src_type.length == 4 &&
+ src_type.fixed == 0 &&
dst_type.floating == 0 &&
dst_type.fixed == 0 &&
- dst_type.sign == 0 &&
- dst_type.norm == 1 &&
dst_type.width == 8 &&
+ ((src_type.floating == 1 && src_type.sign == 1 && dst_type.norm == 1) ||
+ (src_type.floating == 0 && dst_type.floating == 0 &&
+ src_type.sign == dst_type.sign && dst_type.norm == 0)) &&
+
((dst_type.length == 16 && 4 * num_dsts == num_srcs) ||
(num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) &&
@@ -564,7 +585,7 @@ lp_build_conv(struct gallivm_state *gallivm,
struct lp_build_context bld;
struct lp_type int16_type, int32_type;
struct lp_type dst_type_ext = dst_type;
- LLVMValueRef const_255f;
+ LLVMValueRef const_scale;
unsigned i, j;
lp_build_context_init(&bld, gallivm, src_type);
@@ -580,14 +601,54 @@ lp_build_conv(struct gallivm_state *gallivm,
int32_type.length /= 4;
int32_type.sign = 1;
- const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);
+ const_scale = lp_build_const_vec(gallivm, src_type, lp_const_scale(dst_type));
for (i = 0; i < num_dsts; ++i, src += 4) {
LLVMValueRef lo, hi;
- for (j = 0; j < dst_type.length / 4; ++j) {
- tmp[j] = LLVMBuildFMul(builder, src[j], const_255f, "");
- tmp[j] = lp_build_iround(&bld, tmp[j]);
+ if (src_type.floating) {
+ for (j = 0; j < dst_type.length / 4; ++j) {
+ /*
+ * XXX This is not actually fully correct. The float to int
+             * conversion will produce the value 0x80000000 for everything
+             * out of range and for NaNs (on x86, llvm.x86.sse2.cvtps2dq).
+ * Hence, NaNs and negatives will get clamped just fine to zero
+ * (relying on clamping pack behavior) when converting to unorm,
+ * however too large values (both finite and infinite) will also
+ * end up as zero, not 255.
+             * For snorm, for now we'll keep bug compatibility with the generic
+             * conversion path (meaning too large values are fine, but
+ * NaNs get converted to -128 (purely by luck, as we don't
+ * specify nan behavior for the max there) instead of 0).
+ */
+ if (dst_type.sign) {
+ tmp[j] = lp_build_min(&bld, bld.one, src[j]);
+
+ }
+ else {
+ if (0) {
+ tmp[j] = lp_build_min_ext(&bld, bld.one, src[j],
+ GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
+ }
+ tmp[j] = src[j];
+ }
+ tmp[j] = LLVMBuildFMul(builder, tmp[j], const_scale, "");
+ tmp[j] = lp_build_iround(&bld, tmp[j]);
+ }
+ } else {
+ for (j = 0; j < dst_type.length / 4; ++j) {
+ if (!dst_type.sign) {
+ /*
+ * Pack clamp is always signed->unsigned (or signed->signed).
+ * Hence need min.
+ */
+ LLVMValueRef const_max;
+ const_max = lp_build_const_int_vec(gallivm, src_type, 255);
+ tmp[j] = lp_build_min(&bld, src[j], const_max);
+ } else {
+ tmp[j] = src[j];
+ }
+ }
}
if (num_srcs == 1) {
@@ -612,20 +673,20 @@ lp_build_conv(struct gallivm_state *gallivm,
return;
}
- /* Special case 2x8f --> 1x16ub, 1x8f ->1x8ub
+ /* Special case 2x8x32 --> 1x16x8, 1x8x32 ->1x8x8
*/
- else if (src_type.floating == 1 &&
- src_type.fixed == 0 &&
- src_type.sign == 1 &&
- src_type.norm == 0 &&
- src_type.width == 32 &&
- src_type.length == 8 &&
-
- dst_type.floating == 0 &&
- dst_type.fixed == 0 &&
- dst_type.sign == 0 &&
- dst_type.norm == 1 &&
- dst_type.width == 8 &&
+ else if (src_type.norm == 0 &&
+ src_type.width == 32 &&
+ src_type.length == 8 &&
+ src_type.fixed == 0 &&
+
+ dst_type.floating == 0 &&
+ dst_type.fixed == 0 &&
+ dst_type.width == 8 &&
+
+ ((src_type.floating == 1 && src_type.sign == 1 && dst_type.norm == 1) ||
+ (src_type.floating == 0 && dst_type.floating == 0 &&
+ src_type.sign == dst_type.sign && dst_type.norm == 0)) &&
((dst_type.length == 16 && 2 * num_dsts == num_srcs) ||
(num_dsts == 1 && dst_type.length * num_srcs == 8)) &&
@@ -635,7 +696,7 @@ lp_build_conv(struct gallivm_state *gallivm,
struct lp_build_context bld;
struct lp_type int16_type, int32_type;
struct lp_type dst_type_ext = dst_type;
- LLVMValueRef const_255f;
+ LLVMValueRef const_scale;
unsigned i;
lp_build_context_init(&bld, gallivm, src_type);
@@ -651,30 +712,44 @@ lp_build_conv(struct gallivm_state *gallivm,
int32_type.length /= 4;
int32_type.sign = 1;
- const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);
+ const_scale = lp_build_const_vec(gallivm, src_type, lp_const_scale(dst_type));
for (i = 0; i < num_dsts; ++i, src += 2) {
- LLVMValueRef lo, hi, a, b;
-
- a = LLVMBuildFMul(builder, src[0], const_255f, "");
- a = lp_build_iround(&bld, a);
- tmp[0] = lp_build_extract_range(gallivm, a, 0, 4);
- tmp[1] = lp_build_extract_range(gallivm, a, 4, 4);
- /* relying on clamping behavior of sse2 intrinsics here */
- lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
-
- if (num_srcs == 1) {
- hi = lo;
+ unsigned j;
+ for (j = 0; j < (num_srcs == 1 ? 1 : 2); j++) {
+ LLVMValueRef lo, hi, a;
+
+ a = src[j];
+ if (src_type.floating) {
+ if (dst_type.sign) {
+ a = lp_build_min(&bld, bld.one, a);
+
+ }
+ else {
+ if (0) {
+ a = lp_build_min_ext(&bld, bld.one, a,
+ GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
+ }
+ }
+ a = LLVMBuildFMul(builder, a, const_scale, "");
+ a = lp_build_iround(&bld, a);
+ } else {
+ if (!dst_type.sign) {
+ LLVMValueRef const_max;
+ const_max = lp_build_const_int_vec(gallivm, src_type, 255);
+ a = lp_build_min(&bld, a, const_max);
+ }
+ }
+ lo = lp_build_extract_range(gallivm, a, 0, 4);
+ hi = lp_build_extract_range(gallivm, a, 4, 4);
+ /* relying on clamping behavior of sse2 intrinsics here */
+ tmp[j] = lp_build_pack2(gallivm, int32_type, int16_type, lo, hi);
}
- else {
- b = LLVMBuildFMul(builder, src[1], const_255f, "");
- b = lp_build_iround(&bld, b);
- tmp[2] = lp_build_extract_range(gallivm, b, 0, 4);
- tmp[3] = lp_build_extract_range(gallivm, b, 4, 4);
- hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
+ if (num_srcs == 1) {
+ tmp[1] = tmp[0];
}
- dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi);
+ dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, tmp[0], tmp[1]);
}
if (num_srcs == 1) {
@@ -841,6 +916,10 @@ lp_build_conv(struct gallivm_state *gallivm,
new_type.width = dst_type.width;
new_type.length = dst_type.length;
+ /*
+ * Note that resize when using packs can sometimes get min/max
+ * clamping for free. Should be able to exploit this...
+ */
lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);
tmp_type = new_type;
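
Stripped of the SIMD packing, the float paths in the special cases above
amount to scale, round, clamp, where the clamping largely comes for free from
the saturating sse2 pack intrinsics. A scalar sketch, assuming lp_const_scale()
yields 255.0 for unorm8 and 127.0 for snorm8 (the helper names below are
illustrative and NaN handling is ignored):

   #include <math.h>
   #include <stdint.h>

   static uint8_t float_to_unorm8(float f)
   {
      long i = lrintf(f * 255.0f);   /* FMul by const_scale + iround */
      if (i < 0)   i = 0;            /* pack saturation, low bound  */
      if (i > 255) i = 255;          /* pack saturation, high bound */
      return (uint8_t)i;
   }

   static int8_t float_to_snorm8(float f)
   {
      if (f > 1.0f) f = 1.0f;        /* the explicit lp_build_min() above */
      long i = lrintf(f * 127.0f);
      if (i < -128) i = -128;        /* pack saturation */
      return (int8_t)i;
   }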
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format.h b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format.h
index 5c866f420..6540caaa2 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format.h
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format.h
@@ -143,6 +143,7 @@ void
lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
const struct util_format_description *format_desc,
struct lp_type type,
+ boolean aligned,
LLVMValueRef base_ptr,
LLVMValueRef offsets,
LLVMValueRef i,
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index 9f6b9e9fb..2f723857f 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -38,6 +38,7 @@
#include "util/u_math.h"
#include "util/u_pointer.h"
#include "util/u_string.h"
+#include "util/u_cpu_detect.h"
#include "lp_bld_arit.h"
#include "lp_bld_init.h"
@@ -49,7 +50,10 @@
#include "lp_bld_gather.h"
#include "lp_bld_debug.h"
#include "lp_bld_format.h"
+#include "lp_bld_pack.h"
#include "lp_bld_intr.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_bitarit.h"
/**
@@ -137,6 +141,73 @@ format_matches_type(const struct util_format_description *desc,
return TRUE;
}
+/*
+ * Do rounding when converting small unorm values to larger ones.
+ * Not quite 100% accurate, as it's done by appending MSBs, but
+ * should be good enough.
+ */
+
+static inline LLVMValueRef
+scale_bits_up(struct gallivm_state *gallivm,
+ int src_bits,
+ int dst_bits,
+ LLVMValueRef src,
+ struct lp_type src_type)
+{
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef result = src;
+
+ if (src_bits == 1 && dst_bits > 1) {
+ /*
+ * Useful for a1 - we'd need quite some repeated copies otherwise.
+ */
+ struct lp_build_context bld;
+ LLVMValueRef dst_mask;
+ lp_build_context_init(&bld, gallivm, src_type);
+ dst_mask = lp_build_const_int_vec(gallivm, src_type,
+                                        (1 << dst_bits) - 1);
+ result = lp_build_cmp(&bld, PIPE_FUNC_EQUAL, src,
+ lp_build_const_int_vec(gallivm, src_type, 0));
+ result = lp_build_andnot(&bld, dst_mask, result);
+ }
+ else if (dst_bits > src_bits) {
+ /* Scale up bits */
+ int db = dst_bits - src_bits;
+
+ /* Shift left by difference in bits */
+ result = LLVMBuildShl(builder,
+ src,
+ lp_build_const_int_vec(gallivm, src_type, db),
+ "");
+
+ if (db <= src_bits) {
+ /* Enough bits in src to fill the remainder */
+ LLVMValueRef lower = LLVMBuildLShr(builder,
+ src,
+ lp_build_const_int_vec(gallivm, src_type,
+ src_bits - db),
+ "");
+
+ result = LLVMBuildOr(builder, result, lower, "");
+ } else if (db > src_bits) {
+ /* Need to repeatedly copy src bits to fill remainder in dst */
+ unsigned n;
+
+ for (n = src_bits; n < dst_bits; n *= 2) {
+ LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n);
+
+ result = LLVMBuildOr(builder,
+ result,
+ LLVMBuildLShr(builder, result, shuv, ""),
+ "");
+ }
+ }
+ } else {
+ assert (dst_bits == src_bits);
+ }
+
+ return result;
+}
/**
* Unpack a single pixel into its XYZW components.
@@ -156,6 +227,7 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
LLVMValueRef shifts[4];
LLVMValueRef masks[4];
LLVMValueRef scales[4];
+ LLVMTypeRef vec32_type;
boolean normalized;
boolean needs_uitofp;
@@ -171,19 +243,17 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
* matches floating point size */
assert (LLVMTypeOf(packed) == LLVMInt32TypeInContext(gallivm->context));
+ vec32_type = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4);
+
/* Broadcast the packed value to all four channels
* before: packed = BGRA
* after: packed = {BGRA, BGRA, BGRA, BGRA}
*/
- packed = LLVMBuildInsertElement(builder,
- LLVMGetUndef(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)),
- packed,
+ packed = LLVMBuildInsertElement(builder, LLVMGetUndef(vec32_type), packed,
LLVMConstNull(LLVMInt32TypeInContext(gallivm->context)),
"");
- packed = LLVMBuildShuffleVector(builder,
- packed,
- LLVMGetUndef(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)),
- LLVMConstNull(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)),
+ packed = LLVMBuildShuffleVector(builder, packed, LLVMGetUndef(vec32_type),
+ LLVMConstNull(vec32_type),
"");
/* Initialize vector constants */
@@ -224,8 +294,40 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
/* Ex: convert packed = {XYZW, XYZW, XYZW, XYZW}
* into masked = {X, Y, Z, W}
*/
- shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
- masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
+ if (desc->block.bits < 32 && normalized) {
+ /*
+ * Note: we cannot do the shift below on x86 natively until AVX2.
+ *
+ * Old llvm versions will resort to scalar extract/shift insert,
+    * which is definitely terrible; new versions will just do
+ * several vector shifts and shuffle/blend results together.
+ * We could turn this into a variable left shift plus a constant
+ * right shift, and llvm would then turn the variable left shift
+ * into a mul for us (albeit without sse41 the mul needs emulation
+ * too...). However, since we're going to do a float mul
+ * anyway, we just adjust that mul instead (plus the mask), skipping
+ * the shift completely.
+ * We could also use a extra mul when the format isn't normalized and
+ * we don't have AVX2 support, but don't bother for now. Unfortunately,
+ * this strategy doesn't work for 32bit formats (such as rgb10a2 or even
+ * rgba8 if it ends up here), as that would require UIToFP, albeit that
+ * would be fixable with easy 16bit shuffle (unless there's channels
+ * crossing 16bit boundaries).
+ */
+ for (i = 0; i < 4; ++i) {
+ if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
+ unsigned bits = desc->channel[i].size;
+ unsigned shift = desc->channel[i].shift;
+ unsigned long long mask = ((1ULL << bits) - 1) << shift;
+ scales[i] = lp_build_const_float(gallivm, 1.0 / mask);
+ masks[i] = lp_build_const_int32(gallivm, mask);
+ }
+ }
+ masked = LLVMBuildAnd(builder, packed, LLVMConstVector(masks, 4), "");
+ } else {
+ shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
+ masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
+ }
if (!needs_uitofp) {
/* UIToFP can't be expressed in SSE2 */
@@ -234,8 +336,10 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
casted = LLVMBuildUIToFP(builder, masked, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), "");
}
- /* At this point 'casted' may be a vector of floats such as
- * {255.0, 255.0, 255.0, 255.0}. Next, if the pixel values are normalized
+ /*
+ * At this point 'casted' may be a vector of floats such as
+ * {255.0, 255.0, 255.0, 255.0}. (Normalized values may be multiplied
+ * by powers of two). Next, if the pixel values are normalized
* we'll scale this to {1.0, 1.0, 1.0, 1.0}.
*/
@@ -391,9 +495,11 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
if (format_matches_type(format_desc, type) &&
format_desc->block.bits <= type.width * 4 &&
+ /* XXX this shouldn't be needed */
util_is_power_of_two(format_desc->block.bits)) {
LLVMValueRef packed;
LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, type);
+ struct lp_type fetch_type;
unsigned vec_len = type.width * type.length;
/*
@@ -401,8 +507,9 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
* scaling or converting.
*/
+ fetch_type = lp_type_uint(type.width*4);
packed = lp_build_gather(gallivm, type.length/4,
- format_desc->block.bits, type.width*4,
+ format_desc->block.bits, fetch_type,
aligned, base_ptr, offset, TRUE);
assert(format_desc->block.bits <= vec_len);
@@ -413,6 +520,86 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
}
/*
+ * Bit arithmetic for converting small_unorm to unorm8.
+ *
+ * This misses some opportunities for optimizations (like skipping the mask
+ * for the highest channel, for instance, or doing bit scaling in parallel
+ * for channels with the same bit width) but it should be passable for
+ * all arithmetic formats.
+ */
+ if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
+ format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
+ util_format_fits_8unorm(format_desc) &&
+ type.width == 8 && type.norm == 1 && type.sign == 0 &&
+ type.fixed == 0 && type.floating == 0) {
+ LLVMValueRef packed, res, chans[4], rgba[4];
+ LLVMTypeRef dst_vec_type, conv_vec_type;
+ struct lp_type fetch_type, conv_type;
+ struct lp_build_context bld_conv;
+ unsigned j;
+
+ fetch_type = lp_type_uint(type.width*4);
+ conv_type = lp_type_int_vec(type.width*4, type.width * type.length);
+ dst_vec_type = lp_build_vec_type(gallivm, type);
+ conv_vec_type = lp_build_vec_type(gallivm, conv_type);
+ lp_build_context_init(&bld_conv, gallivm, conv_type);
+
+ packed = lp_build_gather(gallivm, type.length/4,
+ format_desc->block.bits, fetch_type,
+ aligned, base_ptr, offset, TRUE);
+
+ assert(format_desc->block.bits * type.length / 4 <=
+ type.width * type.length);
+
+ packed = LLVMBuildBitCast(gallivm->builder, packed, conv_vec_type, "");
+
+ for (j = 0; j < format_desc->nr_channels; ++j) {
+ unsigned mask = 0;
+ unsigned sa = format_desc->channel[j].shift;
+
+ mask = (1 << format_desc->channel[j].size) - 1;
+
+ /* Extract bits from source */
+ chans[j] = LLVMBuildLShr(builder, packed,
+ lp_build_const_int_vec(gallivm, conv_type, sa),
+ "");
+
+ chans[j] = LLVMBuildAnd(builder, chans[j],
+ lp_build_const_int_vec(gallivm, conv_type, mask),
+ "");
+
+ /* Scale bits */
+ if (type.norm) {
+ chans[j] = scale_bits_up(gallivm, format_desc->channel[j].size,
+ type.width, chans[j], conv_type);
+ }
+ }
+ /*
+ * This is a hacked lp_build_format_swizzle_soa() since we need a
+ * normalized 1 but only 8 bits in a 32bit vector...
+ */
+ for (j = 0; j < 4; ++j) {
+ enum pipe_swizzle swizzle = format_desc->swizzle[j];
+ if (swizzle == PIPE_SWIZZLE_1) {
+ rgba[j] = lp_build_const_int_vec(gallivm, conv_type, (1 << type.width) - 1);
+ } else {
+ rgba[j] = lp_build_swizzle_soa_channel(&bld_conv, chans, swizzle);
+ }
+ if (j == 0) {
+ res = rgba[j];
+ } else {
+ rgba[j] = LLVMBuildShl(builder, rgba[j],
+ lp_build_const_int_vec(gallivm, conv_type,
+ j * type.width), "");
+ res = LLVMBuildOr(builder, res, rgba[j], "");
+ }
+ }
+ res = LLVMBuildBitCast(gallivm->builder, res, dst_vec_type, "");
+
+ return res;
+ }
+
+ /*
* Bit arithmetic
*/
@@ -421,6 +608,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
format_desc->block.width == 1 &&
format_desc->block.height == 1 &&
+ /* XXX this shouldn't be needed */
util_is_power_of_two(format_desc->block.bits) &&
format_desc->block.bits <= 32 &&
format_desc->is_bitmask &&
@@ -430,8 +618,15 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
!format_desc->channel[0].pure_integer) {
LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
- LLVMValueRef res;
- unsigned k;
+ LLVMValueRef res[LP_MAX_VECTOR_WIDTH / 128];
+ struct lp_type conv_type;
+ unsigned k, num_conv_src, num_conv_dst;
+
+ /*
+ * Note this path is generally terrible for fetching multiple pixels.
+ * We should make sure we cannot hit this code path for anything but
+ * single pixels.
+ */
/*
* Unpack a pixel at a time into a <4 x float> RGBA vector
@@ -461,12 +656,38 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
__FUNCTION__, format_desc->short_name);
}
- lp_build_conv(gallivm,
- lp_float32_vec4_type(),
- type,
- tmps, num_pixels, &res, 1);
+ conv_type = lp_float32_vec4_type();
+ num_conv_src = num_pixels;
+ num_conv_dst = 1;
+
+ if (num_pixels % 8 == 0) {
+ lp_build_concat_n(gallivm, lp_float32_vec4_type(),
+ tmps, num_pixels, tmps, num_pixels / 2);
+ conv_type.length *= num_pixels / 4;
+ num_conv_src = 4 * num_pixels / 8;
+ if (type.width == 8 && type.floating == 0 && type.fixed == 0) {
+ /*
+ * FIXME: The fast float->unorm path (which is basically
+ * skipping the MIN/MAX which are extremely pointless in any
+             * case) requires that there are 2 destinations...
+ * In any case, we really should make sure we don't hit this
+ * code with multiple pixels for unorm8 dst types, it's
+ * completely hopeless even if we do hit the right conversion.
+ */
+ type.length /= num_pixels / 4;
+ num_conv_dst = num_pixels / 4;
+ }
+ }
+
+ lp_build_conv(gallivm, conv_type, type,
+ tmps, num_conv_src, res, num_conv_dst);
+
+ if (num_pixels % 8 == 0 &&
+ (type.width == 8 && type.floating == 0 && type.fixed == 0)) {
+ lp_build_concat_n(gallivm, type, res, num_conv_dst, res, 1);
+ }
- return lp_build_format_swizzle_aos(format_desc, &bld, res);
+ return lp_build_format_swizzle_aos(format_desc, &bld, res[0]);
}
/* If all channels are of same type and we are not using half-floats */
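
The bit replication performed by scale_bits_up() above is easiest to verify in
scalar form. A sketch assuming plain unsigned arithmetic (scale_bits_up_scalar
is illustrative; the vector code additionally special-cases src_bits == 1 with
a compare/andnot select):

   #include <stdio.h>

   /* Widen an src_bits unorm value to dst_bits by replicating its MSBs
    * into the vacated low bits, e.g. 0x1f (5 bits) -> 0xff (8 bits). */
   static unsigned scale_bits_up_scalar(unsigned src, int src_bits, int dst_bits)
   {
      int db = dst_bits - src_bits;
      unsigned result = src << db;            /* align to the top */

      if (db <= src_bits) {
         result |= src >> (src_bits - db);    /* one partial copy suffices */
      } else {
         int n;
         for (n = src_bits; n < dst_bits; n *= 2)
            result |= result >> n;            /* double the filled width */
      }
      return result;
   }

   int main(void)
   {
      printf("%x %x %x\n",
             scale_bits_up_scalar(0x1f, 5, 8),   /* ff */
             scale_bits_up_scalar(0x20, 6, 8),   /* 82 */
             scale_bits_up_scalar(0x01, 1, 8));  /* ff */
      return 0;
   }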
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c
index 8cad3a6fc..636a4a623 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c
@@ -70,7 +70,14 @@ lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm,
src_vec_type = lp_build_vec_type(gallivm, src_type);
- /* Read whole vector from memory, unaligned */
+ /*
+ * Read whole vector from memory, unaligned.
+ * XXX: Note it's actually aligned to element type. Not sure if all
+ * callers are able to guarantee that (whereas for others, we should
+ * be able to use full alignment when there's 2 or 4 channels).
+ * (If all callers can guarantee element type alignment, we should
+ * relax alignment restrictions elsewhere.)
+ */
ptr = LLVMBuildGEP(builder, base_ptr, &offset, 1, "");
ptr = LLVMBuildPointerCast(builder, ptr, LLVMPointerType(src_vec_type, 0), "");
res = LLVMBuildLoad(builder, ptr, "");
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index 7fc4e8d24..22c19b10d 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -31,6 +31,7 @@
#include "util/u_format.h"
#include "util/u_memory.h"
#include "util/u_string.h"
+#include "util/u_math.h"
#include "lp_bld_type.h"
#include "lp_bld_const.h"
@@ -40,6 +41,39 @@
#include "lp_bld_debug.h"
#include "lp_bld_format.h"
#include "lp_bld_arit.h"
+#include "lp_bld_pack.h"
+
+
+static void
+convert_to_soa(struct gallivm_state *gallivm,
+ LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],
+ LLVMValueRef dst_soa[4],
+ const struct lp_type soa_type)
+{
+ unsigned j, k;
+ struct lp_type aos_channel_type = soa_type;
+
+ LLVMValueRef aos_channels[4];
+ unsigned pixels_per_channel = soa_type.length / 4;
+
+ debug_assert((soa_type.length % 4) == 0);
+
+ aos_channel_type.length >>= 1;
+
+ for (j = 0; j < 4; ++j) {
+ LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };
+
+ assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
+
+ for (k = 0; k < pixels_per_channel; ++k) {
+ channel[k] = src_aos[j + 4 * k];
+ }
+
+ aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
+ }
+
+ lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa);
+}
void
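
convert_to_soa() above is the transpose step for the AoS fetch fallback:
per-pixel RGBA vectors are grouped per channel and lp_build_transpose_aos()
then produces one vector per channel. For the 4-pixel case this is an ordinary
4x4 matrix transpose; a scalar sketch (illustrative only):

   /* pixels[i] = {R,G,B,A} of pixel i  ->  chan[c][i] = channel c of pixel i */
   static void convert_to_soa_scalar(const float pixels[4][4], float chan[4][4])
   {
      int c, i;
      for (c = 0; c < 4; c++)
         for (i = 0; i < 4; i++)
            chan[c][i] = pixels[i][c];
   }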
@@ -48,9 +82,6 @@ lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
const LLVMValueRef *unswizzled,
LLVMValueRef swizzled_out[4])
{
- assert(PIPE_SWIZZLE_0 == (int)PIPE_SWIZZLE_0);
- assert(PIPE_SWIZZLE_1 == (int)PIPE_SWIZZLE_1);
-
if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
enum pipe_swizzle swizzle;
LLVMValueRef depth_or_stencil;
@@ -83,6 +114,166 @@ lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
}
+
+static LLVMValueRef
+lp_build_extract_soa_chan(struct lp_build_context *bld,
+ unsigned blockbits,
+ boolean srgb_chan,
+ struct util_format_channel_description chan_desc,
+ LLVMValueRef packed)
+{
+ struct gallivm_state *gallivm = bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+ struct lp_type type = bld->type;
+ LLVMValueRef input = packed;
+ const unsigned width = chan_desc.size;
+ const unsigned start = chan_desc.shift;
+ const unsigned stop = start + width;
+
+ /* Decode the input vector component */
+
+ switch(chan_desc.type) {
+ case UTIL_FORMAT_TYPE_VOID:
+ input = bld->undef;
+ break;
+
+ case UTIL_FORMAT_TYPE_UNSIGNED:
+ /*
+ * Align the LSB
+ */
+ if (start) {
+ input = LLVMBuildLShr(builder, input,
+ lp_build_const_int_vec(gallivm, type, start), "");
+ }
+
+ /*
+ * Zero the MSBs
+ */
+ if (stop < blockbits) {
+ unsigned mask = ((unsigned long long)1 << width) - 1;
+ input = LLVMBuildAnd(builder, input,
+ lp_build_const_int_vec(gallivm, type, mask), "");
+ }
+
+ /*
+ * Type conversion
+ */
+ if (type.floating) {
+ if (srgb_chan) {
+ struct lp_type conv_type = lp_uint_type(type);
+ input = lp_build_srgb_to_linear(gallivm, conv_type, width, input);
+ }
+ else {
+ if(chan_desc.normalized)
+ input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
+ else
+ input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
+ }
+ }
+ else if (chan_desc.pure_integer) {
+ /* Nothing to do */
+ } else {
+ /* FIXME */
+ assert(0);
+ }
+ break;
+
+ case UTIL_FORMAT_TYPE_SIGNED:
+ /*
+ * Align the sign bit first.
+ */
+ if (stop < type.width) {
+ unsigned bits = type.width - stop;
+ LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
+ input = LLVMBuildShl(builder, input, bits_val, "");
+ }
+
+ /*
+ * Align the LSB (with an arithmetic shift to preserve the sign)
+ */
+ if (chan_desc.size < type.width) {
+ unsigned bits = type.width - chan_desc.size;
+ LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
+ input = LLVMBuildAShr(builder, input, bits_val, "");
+ }
+
+ /*
+ * Type conversion
+ */
+ if (type.floating) {
+ input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
+ if (chan_desc.normalized) {
+ double scale = 1.0 / ((1 << (chan_desc.size - 1)) - 1);
+ LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
+ input = LLVMBuildFMul(builder, input, scale_val, "");
+ /*
+             * The formula above will produce a value below -1.0 for the most
+             * negative value, but everything seems happy with that, hence
+             * disabled for now.
+ */
+ if (0)
+ input = lp_build_max(bld, input,
+ lp_build_const_vec(gallivm, type, -1.0f));
+ }
+ }
+ else if (chan_desc.pure_integer) {
+ /* Nothing to do */
+ } else {
+ /* FIXME */
+ assert(0);
+ }
+ break;
+
+ case UTIL_FORMAT_TYPE_FLOAT:
+ if (type.floating) {
+ if (chan_desc.size == 16) {
+ struct lp_type f16i_type = type;
+ f16i_type.width /= 2;
+ f16i_type.floating = 0;
+ if (start) {
+ input = LLVMBuildLShr(builder, input,
+ lp_build_const_int_vec(gallivm, type, start), "");
+ }
+ input = LLVMBuildTrunc(builder, input,
+ lp_build_vec_type(gallivm, f16i_type), "");
+ input = lp_build_half_to_float(gallivm, input);
+ } else {
+ assert(start == 0);
+ assert(stop == 32);
+ assert(type.width == 32);
+ }
+ input = LLVMBuildBitCast(builder, input, bld->vec_type, "");
+ }
+ else {
+ /* FIXME */
+ assert(0);
+ input = bld->undef;
+ }
+ break;
+
+ case UTIL_FORMAT_TYPE_FIXED:
+ if (type.floating) {
+ double scale = 1.0 / ((1 << (chan_desc.size/2)) - 1);
+ LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
+ input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
+ input = LLVMBuildFMul(builder, input, scale_val, "");
+ }
+ else {
+ /* FIXME */
+ assert(0);
+ input = bld->undef;
+ }
+ break;
+
+ default:
+ assert(0);
+ input = bld->undef;
+ break;
+ }
+
+ return input;
+}
+
+
/**
* Unpack several pixels in SoA.
*
@@ -113,7 +304,6 @@ lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
LLVMValueRef packed,
LLVMValueRef rgba_out[4])
{
- LLVMBuilderRef builder = gallivm->builder;
struct lp_build_context bld;
LLVMValueRef inputs[4];
unsigned chan;
@@ -129,149 +319,19 @@ lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
/* Decode the input vector components */
for (chan = 0; chan < format_desc->nr_channels; ++chan) {
- const unsigned width = format_desc->channel[chan].size;
- const unsigned start = format_desc->channel[chan].shift;
- const unsigned stop = start + width;
- LLVMValueRef input;
-
- input = packed;
-
- switch(format_desc->channel[chan].type) {
- case UTIL_FORMAT_TYPE_VOID:
- input = lp_build_undef(gallivm, type);
- break;
-
- case UTIL_FORMAT_TYPE_UNSIGNED:
- /*
- * Align the LSB
- */
-
- if (start) {
- input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(gallivm, type, start), "");
- }
-
- /*
- * Zero the MSBs
- */
-
- if (stop < format_desc->block.bits) {
- unsigned mask = ((unsigned long long)1 << width) - 1;
- input = LLVMBuildAnd(builder, input, lp_build_const_int_vec(gallivm, type, mask), "");
- }
-
- /*
- * Type conversion
- */
-
- if (type.floating) {
- if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
- if (format_desc->swizzle[3] == chan) {
- input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
- }
- else {
- struct lp_type conv_type = lp_uint_type(type);
- input = lp_build_srgb_to_linear(gallivm, conv_type, width, input);
- }
- }
- else {
- if(format_desc->channel[chan].normalized)
- input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
- else
- input = LLVMBuildSIToFP(builder, input,
- lp_build_vec_type(gallivm, type), "");
- }
- }
- else if (format_desc->channel[chan].pure_integer) {
- /* Nothing to do */
- } else {
- /* FIXME */
- assert(0);
- }
-
- break;
-
- case UTIL_FORMAT_TYPE_SIGNED:
- /*
- * Align the sign bit first.
- */
-
- if (stop < type.width) {
- unsigned bits = type.width - stop;
- LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
- input = LLVMBuildShl(builder, input, bits_val, "");
- }
-
- /*
- * Align the LSB (with an arithmetic shift to preserve the sign)
- */
-
- if (format_desc->channel[chan].size < type.width) {
- unsigned bits = type.width - format_desc->channel[chan].size;
- LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
- input = LLVMBuildAShr(builder, input, bits_val, "");
- }
-
- /*
- * Type conversion
- */
+ struct util_format_channel_description chan_desc = format_desc->channel[chan];
+ boolean srgb_chan = FALSE;
- if (type.floating) {
- input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(gallivm, type), "");
- if (format_desc->channel[chan].normalized) {
- double scale = 1.0 / ((1 << (format_desc->channel[chan].size - 1)) - 1);
- LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
- input = LLVMBuildFMul(builder, input, scale_val, "");
- /* the formula above will produce value below -1.0 for most negative
- * value but everything seems happy with that hence disable for now */
- if (0)
- input = lp_build_max(&bld, input,
- lp_build_const_vec(gallivm, type, -1.0f));
- }
- }
- else if (format_desc->channel[chan].pure_integer) {
- /* Nothing to do */
- } else {
- /* FIXME */
- assert(0);
- }
-
- break;
-
- case UTIL_FORMAT_TYPE_FLOAT:
- if (type.floating) {
- assert(start == 0);
- assert(stop == 32);
- assert(type.width == 32);
- input = LLVMBuildBitCast(builder, input, lp_build_vec_type(gallivm, type), "");
- }
- else {
- /* FIXME */
- assert(0);
- input = lp_build_undef(gallivm, type);
- }
- break;
-
- case UTIL_FORMAT_TYPE_FIXED:
- if (type.floating) {
- double scale = 1.0 / ((1 << (format_desc->channel[chan].size/2)) - 1);
- LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
- input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(gallivm, type), "");
- input = LLVMBuildFMul(builder, input, scale_val, "");
- }
- else {
- /* FIXME */
- assert(0);
- input = lp_build_undef(gallivm, type);
- }
- break;
-
- default:
- assert(0);
- input = lp_build_undef(gallivm, type);
- break;
+ if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB &&
+ format_desc->swizzle[3] != chan) {
+ srgb_chan = TRUE;
}
- inputs[chan] = input;
+ inputs[chan] = lp_build_extract_soa_chan(&bld,
+ format_desc->block.bits,
+ srgb_chan,
+ chan_desc,
+ packed);
}
lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out);
@@ -336,6 +396,7 @@ lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm,
*
* \param type the desired return type for 'rgba'. The vector length
* is the number of texels to fetch
+ * \param aligned if the offset is guaranteed to be aligned to element width
*
* \param base_ptr points to the base of the texture mip tree.
* \param offset offset to start of the texture image block. For non-
@@ -352,6 +413,7 @@ void
lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
const struct util_format_description *format_desc,
struct lp_type type,
+ boolean aligned,
LLVMValueRef base_ptr,
LLVMValueRef offset,
LLVMValueRef i,
@@ -360,6 +422,8 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
LLVMValueRef rgba_out[4])
{
LLVMBuilderRef builder = gallivm->builder;
+ enum pipe_format format = format_desc->format;
+ struct lp_type fetch_type;
if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
@@ -369,7 +433,8 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
format_desc->block.height == 1 &&
format_desc->block.bits <= type.width &&
(format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
- format_desc->channel[0].size == 32))
+ format_desc->channel[0].size == 32 ||
+ format_desc->channel[0].size == 16))
{
/*
* The packed pixel fits into an element of the destination format. Put
@@ -384,11 +449,12 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
* Ex: packed = {XYZW, XYZW, XYZW, XYZW}
*/
assert(format_desc->block.bits <= type.width);
+ fetch_type = lp_type_uint(type.width);
packed = lp_build_gather(gallivm,
type.length,
format_desc->block.bits,
- type.width,
- TRUE,
+ fetch_type,
+ aligned,
base_ptr, offset, FALSE);
/*
@@ -401,22 +467,232 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
return;
}
- if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT ||
- format_desc->format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
+
+ if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
+ (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&
+ format_desc->block.width == 1 &&
+ format_desc->block.height == 1 &&
+ format_desc->block.bits > type.width &&
+ ((format_desc->block.bits <= type.width * type.length &&
+ format_desc->channel[0].size <= type.width) ||
+ (format_desc->channel[0].size == 64 &&
+ format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
+ type.floating)))
+ {
+ /*
+ * Similar to above, but the packed pixel is larger than what fits
+ * into an element of the destination format. The packed pixels will be
+ * shuffled into SoA vectors appropriately, and then the extraction will
+ * be done in parallel as much as possible.
+ * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so
+ * the gathered vectors can be shuffled easily (even with avx).
+ * 64xn float -> 32xn float is handled too but it's a bit special as
+ * it does the conversion pre-shuffle.
+ */
+
+ LLVMValueRef packed[4], dst[4], output[4], shuffles[LP_MAX_VECTOR_WIDTH/32];
+ struct lp_type fetch_type, gather_type = type;
+ unsigned num_gather, fetch_width, i, j;
+ struct lp_build_context bld;
+ boolean fp64 = format_desc->channel[0].size == 64;
+
+ lp_build_context_init(&bld, gallivm, type);
+
+ assert(type.width == 32);
+ assert(format_desc->block.bits > type.width);
+
+ /*
+ * First, figure out fetch order.
+ */
+ fetch_width = util_next_power_of_two(format_desc->block.bits);
+ /*
+ * fp64 are treated like fp32 except we fetch twice wide values
+ * (as we shuffle after trunc). The shuffles for that work out
+ * mostly fine (slightly suboptimal for 4-wide, perfect for AVX)
+ * albeit we miss the potential opportunity for hw gather (as it
+ * only handles native size).
+ */
+ num_gather = fetch_width / type.width;
+ gather_type.width *= num_gather;
+ if (fp64) {
+ num_gather /= 2;
+ }
+ gather_type.length /= num_gather;
+
+ for (i = 0; i < num_gather; i++) {
+ LLVMValueRef offsetr, shuf_vec;
+ if(num_gather == 4) {
+ for (j = 0; j < gather_type.length; j++) {
+ unsigned idx = i + 4*j;
+ shuffles[j] = lp_build_const_int32(gallivm, idx);
+ }
+ shuf_vec = LLVMConstVector(shuffles, gather_type.length);
+ offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
+
+ }
+ else if (num_gather == 2) {
+ assert(num_gather == 2);
+ for (j = 0; j < gather_type.length; j++) {
+ unsigned idx = i*2 + (j%2) + (j/2)*4;
+ shuffles[j] = lp_build_const_int32(gallivm, idx);
+ }
+ shuf_vec = LLVMConstVector(shuffles, gather_type.length);
+ offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
+ }
+ else {
+ assert(num_gather == 1);
+ offsetr = offset;
+ }
+ if (gather_type.length == 1) {
+ LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
+ offsetr = LLVMBuildExtractElement(builder, offsetr, zero, "");
+ }
+
+ /*
+ * Determine whether to use float or int loads. This is mostly
+ * to outsmart the (stupid) llvm int/float shuffle logic, we
+ * don't really care much if the data is floats or ints...
+ * But llvm will refuse to use single float shuffle with int data
+ * and instead use 3 int shuffles instead, the code looks atrocious.
+ * (Note bitcasts often won't help, as llvm is too smart to be
+ * fooled by that.)
+ * Nobody cares about simd float<->int domain transition penalties,
+ * which usually don't even exist for shuffles anyway.
+ * With 4x32bit (and 3x32bit) fetch, we use float vec (the data is
+ * going into transpose, which is unpacks, so doesn't really matter
+ * much).
+ * With 2x32bit or 4x16bit fetch, we use float vec, since those
+ * go into the weird channel separation shuffle. With floats,
+ * this is (with 128bit vectors):
+ * - 2 movq, 2 movhpd, 2 shufps
+ * With ints it would be:
+ * - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw
+ * I've seen texture functions increase in code size by 15% just due
+ * to that (there's lots of such fetches in them...)
+       * (We could choose a different gather order to improve this somewhat
+ * for the int path, but it would basically just drop the blends,
+ * so the float path with this order really is optimal.)
+       * Albeit it is tricky: sometimes llvm doesn't ignore the float->int
+       * casts, so we must avoid them until we're done with the float shuffle...
+ * 3x16bit formats (the same is also true for 3x8) are pretty bad but
+ * there's nothing we can do about them (we could overallocate by
+ * those couple bytes and use unaligned but pot sized load).
+ * Note that this is very much x86 specific. I don't know if this
+ * affect other archs at all.
+ */
+ if (num_gather > 1) {
+ /*
+ * We always want some float type here (with x86)
+ * due to shuffles being float ones afterwards (albeit for
+ * the num_gather == 4 case int should work fine too
+ * (unless there's some problems with avx but not avx2).
+ */
+ if (format_desc->channel[0].size == 64) {
+ fetch_type = lp_type_float_vec(64, gather_type.width);
+ } else {
+ fetch_type = lp_type_int_vec(32, gather_type.width);
+ }
+ }
+ else {
+ /* type doesn't matter much */
+ if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
+ (format_desc->channel[0].size == 32 ||
+ format_desc->channel[0].size == 64)) {
+ fetch_type = lp_type_float(gather_type.width);
+ } else {
+ fetch_type = lp_type_uint(gather_type.width);
+ }
+ }
+
+ /* Now finally gather the values */
+ packed[i] = lp_build_gather(gallivm, gather_type.length,
+ format_desc->block.bits,
+ fetch_type, aligned,
+ base_ptr, offsetr, FALSE);
+ if (fp64) {
+ struct lp_type conv_type = type;
+ conv_type.width *= 2;
+ packed[i] = LLVMBuildBitCast(builder, packed[i],
+ lp_build_vec_type(gallivm, conv_type), "");
+ packed[i] = LLVMBuildFPTrunc(builder, packed[i], bld.vec_type, "");
+ }
+ }
+
+ /* shuffle the gathered values to SoA */
+ if (num_gather == 2) {
+ for (i = 0; i < num_gather; i++) {
+ for (j = 0; j < type.length; j++) {
+ unsigned idx = (j%2)*2 + (j/4)*4 + i;
+ if ((j/2)%2)
+ idx += type.length;
+ shuffles[j] = lp_build_const_int32(gallivm, idx);
+ }
+ dst[i] = LLVMBuildShuffleVector(builder, packed[0], packed[1],
+ LLVMConstVector(shuffles, type.length), "");
+ }
+ }
+ else if (num_gather == 4) {
+ lp_build_transpose_aos(gallivm, lp_int_type(type), packed, dst);
+ }
+ else {
+ assert(num_gather == 1);
+ dst[0] = packed[0];
+ }
+
+ /*
+ * And finally unpack exactly as above, except that
+ * chan shift is adjusted and the right vector selected.
+ */
+ if (!fp64) {
+ for (i = 0; i < num_gather; i++) {
+ dst[i] = LLVMBuildBitCast(builder, dst[i], bld.int_vec_type, "");
+ }
+ for (i = 0; i < format_desc->nr_channels; i++) {
+ struct util_format_channel_description chan_desc = format_desc->channel[i];
+ unsigned blockbits = type.width;
+ unsigned vec_nr;
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+ vec_nr = (format_desc->block.bits - (chan_desc.shift + chan_desc.size)) / type.width;
+#else
+ vec_nr = chan_desc.shift / type.width;
+#endif
+ chan_desc.shift %= type.width;
+
+ output[i] = lp_build_extract_soa_chan(&bld,
+ blockbits,
+ FALSE,
+ chan_desc,
+ dst[vec_nr]);
+ }
+ }
+ else {
+ for (i = 0; i < format_desc->nr_channels; i++) {
+ output[i] = dst[i];
+ }
+ }
+
+ lp_build_format_swizzle_soa(format_desc, &bld, output, rgba_out);
+ return;
+ }
+
+ if (format == PIPE_FORMAT_R11G11B10_FLOAT ||
+ format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
/*
* similar conceptually to above but requiring special
* AoS packed -> SoA float conversion code.
*/
LLVMValueRef packed;
+ struct lp_type fetch_type = lp_type_uint(type.width);
assert(type.floating);
assert(type.width == 32);
packed = lp_build_gather(gallivm, type.length,
format_desc->block.bits,
- type.width, TRUE,
+ fetch_type, aligned,
base_ptr, offset, FALSE);
- if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) {
+ if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
}
else {
@@ -432,8 +708,9 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
* 32bit (or 8bit) from each block.
*/
LLVMValueRef packed;
+ struct lp_type fetch_type = lp_type_uint(type.width);
- if (format_desc->format == PIPE_FORMAT_X32_S8X24_UINT) {
+ if (format == PIPE_FORMAT_X32_S8X24_UINT) {
/*
* for stencil simply fix up offsets - could in fact change
* base_ptr instead even outside the shader.
@@ -441,15 +718,15 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
unsigned mask = (1 << 8) - 1;
LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
offset = LLVMBuildAdd(builder, offset, s_offset, "");
- packed = lp_build_gather(gallivm, type.length, 32, type.width,
- TRUE, base_ptr, offset, FALSE);
+ packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
+ aligned, base_ptr, offset, FALSE);
packed = LLVMBuildAnd(builder, packed,
lp_build_const_int_vec(gallivm, type, mask), "");
}
else {
- assert (format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
- packed = lp_build_gather(gallivm, type.length, 32, type.width,
- TRUE, base_ptr, offset, TRUE);
+ assert (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
+ packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
+ aligned, base_ptr, offset, TRUE);
packed = LLVMBuildBitCast(builder, packed,
lp_build_vec_type(gallivm, type), "");
}
@@ -461,63 +738,69 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
/*
* Try calling lp_build_fetch_rgba_aos for all pixels.
+    * Should only really be hit for subsampled and compressed formats by now
+    * (for s3tc the srgb ones too, for rgtc the unorm ones only).
+ * (This is invalid for plain 8unorm formats because we're lazy with
+ * the swizzle since some results would arrive swizzled, some not.)
*/
- if (util_format_fits_8unorm(format_desc) &&
+ if ((format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) &&
+ (util_format_fits_8unorm(format_desc) ||
+ format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) &&
type.floating && type.width == 32 &&
(type.length == 1 || (type.length % 4 == 0))) {
struct lp_type tmp_type;
- LLVMValueRef tmp;
+ struct lp_build_context bld;
+ LLVMValueRef packed, rgba[4];
+ const struct util_format_description *flinear_desc;
+ const struct util_format_description *frgba8_desc;
+ unsigned chan;
+
+ lp_build_context_init(&bld, gallivm, type);
+ /*
+ * Make sure the conversion in aos really only does convert to rgba8
+ * and not anything more (so use linear format, adjust type).
+ */
+ flinear_desc = util_format_description(util_format_linear(format));
memset(&tmp_type, 0, sizeof tmp_type);
tmp_type.width = 8;
tmp_type.length = type.length * 4;
tmp_type.norm = TRUE;
- tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
- TRUE, base_ptr, offset, i, j, cache);
+ packed = lp_build_fetch_rgba_aos(gallivm, flinear_desc, tmp_type,
+ aligned, base_ptr, offset, i, j, cache);
+ packed = LLVMBuildBitCast(builder, packed, bld.int_vec_type, "");
- lp_build_rgba8_to_fi32_soa(gallivm,
- type,
- tmp,
- rgba_out);
-
- return;
- }
-
- if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC &&
- /* non-srgb case is already handled above */
- format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB &&
- type.floating && type.width == 32 &&
- (type.length == 1 || (type.length % 4 == 0)) &&
- cache) {
- const struct util_format_description *format_decompressed;
- const struct util_format_description *flinear_desc;
- LLVMValueRef packed;
- flinear_desc = util_format_description(util_format_linear(format_desc->format));
- packed = lp_build_fetch_cached_texels(gallivm,
- flinear_desc,
- type.length,
- base_ptr,
- offset,
- i, j,
- cache);
- packed = LLVMBuildBitCast(builder, packed,
- lp_build_int_vec_type(gallivm, type), "");
/*
- * The values are now packed so they match ordinary srgb RGBA8 format,
+ * The values are now packed so they match ordinary (srgb) RGBA8 format,
* hence need to use matching format for unpack.
*/
- format_decompressed = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB);
-
+ frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_UNORM);
+ if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
+ assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);
+ frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB);
+ }
lp_build_unpack_rgba_soa(gallivm,
- format_decompressed,
+ frgba8_desc,
type,
- packed, rgba_out);
+ packed, rgba);
+ /*
+ * We converted 4 channels. Make sure llvm can drop the unneeded ones
+ * (luckily the rgba order is fixed, only LA needs a special case).
+ */
+ for (chan = 0; chan < 4; chan++) {
+ enum pipe_swizzle swizzle = format_desc->swizzle[chan];
+ if (chan == 3 && util_format_is_luminance_alpha(format)) {
+ swizzle = PIPE_SWIZZLE_W;
+ }
+ rgba_out[chan] = lp_build_swizzle_soa_channel(&bld, rgba, swizzle);
+ }
return;
}
+
/*
* Fallback to calling lp_build_fetch_rgba_aos for each pixel.
*
@@ -525,30 +808,40 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
* miss some opportunities to do vectorization, but this is
* convenient for formats or scenarios for which there was no
* opportunity or incentive to optimize.
+ *
+ * We do NOT want to end up here; this is typically quite terrible,
+ * in particular if the formats have fewer than 4 channels.
+ *
+ * Right now, this should only be hit for:
+ * - RGTC snorm formats
+ * (those miss fast fetch functions hence they are terrible anyway)
*/
{
- unsigned k, chan;
+ unsigned k;
struct lp_type tmp_type;
+ LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];
if (gallivm_debug & GALLIVM_DEBUG_PERF) {
- debug_printf("%s: scalar unpacking of %s\n",
+ debug_printf("%s: AoS fetch fallback for %s\n",
__FUNCTION__, format_desc->short_name);
}
tmp_type = type;
tmp_type.length = 4;
- for (chan = 0; chan < 4; ++chan) {
- rgba_out[chan] = lp_build_undef(gallivm, type);
- }
+ /*
+ * Note that a vector transpose can be worse than insert/extract
+ * for aos->soa conversion (for formats with 1 or 2 channels). However,
+ * we should try to avoid getting here for just about all formats, so
+ * don't bother.
+ */
/* loop over number of pixels */
for(k = 0; k < type.length; ++k) {
LLVMValueRef index = lp_build_const_int32(gallivm, k);
LLVMValueRef offset_elem;
LLVMValueRef i_elem, j_elem;
- LLVMValueRef tmp;
offset_elem = LLVMBuildExtractElement(builder, offset,
index, "");
@@ -557,20 +850,11 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
j_elem = LLVMBuildExtractElement(builder, j, index, "");
/* Get a single float[4]={R,G,B,A} pixel */
- tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
- TRUE, base_ptr, offset_elem,
- i_elem, j_elem, cache);
+ aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
+ aligned, base_ptr, offset_elem,
+ i_elem, j_elem, cache);
- /*
- * Insert the AoS tmp value channels into the SoA result vectors at
- * position = 'index'.
- */
- for (chan = 0; chan < 4; ++chan) {
- LLVMValueRef chan_val = lp_build_const_int32(gallivm, chan),
- tmp_chan = LLVMBuildExtractElement(builder, tmp, chan_val, "");
- rgba_out[chan] = LLVMBuildInsertElement(builder, rgba_out[chan],
- tmp_chan, index, "");
- }
}
+ convert_to_soa(gallivm, aos_fetch, rgba_out, type);
}
}
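
The final fallback gathers one float[4]={R,G,B,A} pixel at a time and then transposes to SoA. As a sketch of what convert_to_soa() has to produce semantically (a scalar model for 4 pixels; the helper name is hypothetical):

/* Transpose 4 per-pixel RGBA vectors (AoS) into 4 per-channel
 * vectors of 4 pixels each (SoA). */
static void aos_to_soa4(const float aos[4][4], float soa[4][4])
{
   for (unsigned chan = 0; chan < 4; chan++)
      for (unsigned pix = 0; pix < 4; pix++)
         soa[chan][pix] = aos[pix][chan];
}
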
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
index fa0e8b656..d6d755298 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
@@ -491,13 +491,15 @@ lp_build_fetch_subsampled_rgba_aos(struct gallivm_state *gallivm,
{
LLVMValueRef packed;
LLVMValueRef rgba;
+ struct lp_type fetch_type;
assert(format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED);
assert(format_desc->block.bits == 32);
assert(format_desc->block.width == 2);
assert(format_desc->block.height == 1);
- packed = lp_build_gather(gallivm, n, 32, 32, TRUE, base_ptr, offset, FALSE);
+ fetch_type = lp_type_uint(32);
+ packed = lp_build_gather(gallivm, n, 32, fetch_type, TRUE, base_ptr, offset, FALSE);
(void)j;
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_gather.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_gather.c
index 439bbb679..7d11dcd3b 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_gather.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_gather.c
@@ -28,13 +28,16 @@
#include "util/u_debug.h"
#include "util/u_cpu_detect.h"
+#include "util/u_math.h"
#include "lp_bld_debug.h"
#include "lp_bld_const.h"
#include "lp_bld_format.h"
#include "lp_bld_gather.h"
#include "lp_bld_swizzle.h"
+#include "lp_bld_type.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
+#include "lp_bld_pack.h"
/**
@@ -113,14 +116,29 @@ lp_build_gather_elem(struct gallivm_state *gallivm,
* translation of offsets to first_elem in sampler_views it actually seems
* gallium could not do anything else except 16 no matter what...
*/
- if (!aligned) {
+ if (!aligned) {
LLVMSetAlignment(res, 1);
+ } else if (!util_is_power_of_two(src_width)) {
+ /*
+ * Full alignment is impossible, assume the caller really meant
+ * the individual elements were aligned (e.g. 3x32bit format).
+ * And yes the generated code may otherwise crash, llvm will
+ * really assume 128bit alignment with a 96bit fetch (I suppose
+ * that makes sense as it can just assume the upper 32bit to be
+ * whatever).
+ * Maybe the caller should be able to explicitly set this, but
+ * this should cover all the 3-channel formats.
+ */
+ if (((src_width / 24) * 24 == src_width) &&
+ util_is_power_of_two(src_width / 24)) {
+ LLVMSetAlignment(res, src_width / 24);
+ } else {
+ LLVMSetAlignment(res, 1);
+ }
}
assert(src_width <= dst_width);
- if (src_width > dst_width) {
- res = LLVMBuildTrunc(gallivm->builder, res, dst_elem_type, "");
- } else if (src_width < dst_width) {
+ if (src_width < dst_width) {
res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, "");
if (vector_justify) {
#ifdef PIPE_ARCH_BIG_ENDIAN
@@ -134,28 +152,162 @@ lp_build_gather_elem(struct gallivm_state *gallivm,
}
+/**
+ * Gather one element from scatter positions in memory.
+ * Nearly the same as above, however the individual elements
+ * may be vectors themselves, and fetches may be float type.
+ * Can also do pad vector instead of ZExt.
+ *
+ * @sa lp_build_gather()
+ */
+static LLVMValueRef
+lp_build_gather_elem_vec(struct gallivm_state *gallivm,
+ unsigned length,
+ unsigned src_width,
+ LLVMTypeRef src_type,
+ struct lp_type dst_type,
+ boolean aligned,
+ LLVMValueRef base_ptr,
+ LLVMValueRef offsets,
+ unsigned i,
+ boolean vector_justify)
+{
+ LLVMValueRef ptr, res;
+ LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
+ assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
+
+ ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i);
+ ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, "");
+ res = LLVMBuildLoad(gallivm->builder, ptr, "");
+
+ /* XXX
+ * On some archs we probably really want to avoid having to deal
+ * with alignments lower than 4 bytes (if fetch size is a power of
+ * two >= 32). On x86 it doesn't matter, however.
+ * We should be able to guarantee full alignment for any kind of texture
+ * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch
+ * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends
+ * but I don't think that's quite what we wanted).
+ * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT
+ * looks like a good fit, but it seems this cap bit (and OpenGL) aren't
+ * enforcing what we want (which is what d3d10 does, the offset needs to
+ * be aligned to element size, but GL has bytes regardless of element
+ * size which would only leave us with minimum alignment restriction of 16
+ * which doesn't make much sense if the type isn't 4x32bit). Due to
+ * translation of offsets to first_elem in sampler_views it actually seems
+ * gallium could not do anything else except 16 no matter what...
+ */
+ if (!aligned) {
+ LLVMSetAlignment(res, 1);
+ } else if (!util_is_power_of_two(src_width)) {
+ /*
+ * Full alignment is impossible, assume the caller really meant
+ * the individual elements were aligned (e.g. 3x32bit format).
+ * And yes the generated code may otherwise crash, llvm will
+ * really assume 128bit alignment with a 96bit fetch (I suppose
+ * that makes sense as it can just assume the upper 32bit to be
+ * whatever).
+ * Maybe the caller should be able to explicitly set this, but
+ * this should cover all the 3-channel formats.
+ */
+ if (((src_width / 24) * 24 == src_width) &&
+ util_is_power_of_two(src_width / 24)) {
+ LLVMSetAlignment(res, src_width / 24);
+ } else {
+ LLVMSetAlignment(res, 1);
+ }
+ }
+
+ assert(src_width <= dst_type.width * dst_type.length);
+ if (src_width < dst_type.width * dst_type.length) {
+ if (dst_type.length > 1) {
+ res = lp_build_pad_vector(gallivm, res, dst_type.length);
+ /*
+ * vector_justify hopefully a non-issue since we only deal
+ * with src_width >= 32 here?
+ */
+ } else {
+ LLVMTypeRef dst_elem_type = lp_build_vec_type(gallivm, dst_type);
+
+ /*
+ * Only valid if src_ptr_type is int type...
+ */
+ res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, "");
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+ if (vector_justify) {
+ res = LLVMBuildShl(gallivm->builder, res,
+ LLVMConstInt(dst_elem_type,
+ dst_type.width - src_width, 0), "");
+ }
+ if (src_width == 48) {
+ /* Load 3x16 bit vector.
+ * The sequence of loads on big-endian hardware proceeds as follows.
+ * 16-bit fields are denoted by X, Y, Z, and 0. In memory, the sequence
+ * of three fields appears in the order X, Y, Z.
+ *
+ * Load 32-bit word: 0.0.X.Y
+ * Load 16-bit halfword: 0.0.0.Z
+ * Rotate left: 0.X.Y.0
+ * Bitwise OR: 0.X.Y.Z
+ *
+ * The order in which we need the fields in the result is 0.Z.Y.X,
+ * the same as on little-endian; permute 16-bit fields accordingly
+ * within 64-bit register:
+ */
+ LLVMValueRef shuffles[4] = {
+ lp_build_const_int32(gallivm, 2),
+ lp_build_const_int32(gallivm, 1),
+ lp_build_const_int32(gallivm, 0),
+ lp_build_const_int32(gallivm, 3),
+ };
+ res = LLVMBuildBitCast(gallivm->builder, res,
+ lp_build_vec_type(gallivm, lp_type_uint_vec(16, 4*16)), "");
+ res = LLVMBuildShuffleVector(gallivm->builder, res, res, LLVMConstVector(shuffles, 4), "");
+ res = LLVMBuildBitCast(gallivm->builder, res, dst_elem_type, "");
+ }
+#endif
+ }
+ }
+ return res;
+}
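
The alignment logic above appears twice (scalar and vector gather elements); as a standalone sketch of the rule it implements, under the assumption that power-of-two widths keep llvm's natural alignment:

#include <stdbool.h>

static bool is_pot(unsigned x) { return x && !(x & (x - 1)); }

/* Byte alignment set on the load: unaligned -> 1; 3-channel widths
 * (24/48/96 bit) get the per-element size, src_width / 24 bytes
 * (e.g. 96 bit -> 4); anything else falls back to 1. */
static unsigned gather_load_alignment(bool aligned, unsigned src_width)
{
   if (!aligned)
      return 1;
   if (is_pot(src_width))
      return src_width / 8;   /* llvm's default natural alignment */
   if ((src_width / 24) * 24 == src_width && is_pot(src_width / 24))
      return src_width / 24;
   return 1;
}
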
+
+
+
+
static LLVMValueRef
lp_build_gather_avx2(struct gallivm_state *gallivm,
unsigned length,
unsigned src_width,
- unsigned dst_width,
+ struct lp_type dst_type,
LLVMValueRef base_ptr,
LLVMValueRef offsets)
{
LLVMBuilderRef builder = gallivm->builder;
- LLVMTypeRef dst_type = LLVMIntTypeInContext(gallivm->context, dst_width);
- LLVMTypeRef dst_vec_type = LLVMVectorType(dst_type, length);
- LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, src_width);
- LLVMTypeRef src_vec_type = LLVMVectorType(src_type, length);
+ LLVMTypeRef src_type, src_vec_type;
LLVMValueRef res;
+ struct lp_type res_type = dst_type;
+ res_type.length *= length;
+
+ if (dst_type.floating) {
+ src_type = src_width == 64 ? LLVMDoubleTypeInContext(gallivm->context) :
+ LLVMFloatTypeInContext(gallivm->context);
+ } else {
+ src_type = LLVMIntTypeInContext(gallivm->context, src_width);
+ }
+ src_vec_type = LLVMVectorType(src_type, length);
+ /* XXX should allow hw scaling (can handle i8, i16, i32, i64 for x86) */
assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
if (0) {
/*
* XXX: This will cause LLVM pre 3.7 to hang; it works on LLVM 3.8 but
- * will not use the AVX2 gather instrinsics. See
+ * will not use the AVX2 gather intrinsics (even with llvm 4.0), at
+ * least with Haswell. See
* http://lists.llvm.org/pipermail/llvm-dev/2016-January/094448.html
+ * And the generated code doing the emulation is quite a bit worse
+ * than what we get by doing it ourselves too.
*/
LLVMTypeRef i32_type = LLVMIntTypeInContext(gallivm->context, 32);
LLVMTypeRef i32_vec_type = LLVMVectorType(i32_type, length);
@@ -175,7 +327,8 @@ lp_build_gather_avx2(struct gallivm_state *gallivm,
src_ptr = LLVMBuildGEP(builder, base_ptr, &offsets, 1, "vector-gep");
char intrinsic[64];
- util_snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%ui%u", length, src_width);
+ util_snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%u%s%u",
+ length, dst_type.floating ? "f" : "i", src_width);
LLVMValueRef alignment = LLVMConstInt(i32_type, src_width/8, 0);
LLVMValueRef mask = LLVMConstAllOnes(i1_vec_type);
LLVMValueRef passthru = LLVMGetUndef(src_vec_type);
@@ -184,26 +337,35 @@ lp_build_gather_avx2(struct gallivm_state *gallivm,
res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 4, 0);
} else {
- assert(src_width == 32);
-
LLVMTypeRef i8_type = LLVMIntTypeInContext(gallivm->context, 8);
-
- /*
- * We should get the caller to give more type information so we can use
- * the intrinsics for the right int/float domain. Int should be the most
- * common.
- */
const char *intrinsic = NULL;
- switch (length) {
- case 4:
- intrinsic = "llvm.x86.avx2.gather.d.d";
- break;
- case 8:
- intrinsic = "llvm.x86.avx2.gather.d.d.256";
- break;
- default:
- assert(0);
+ unsigned l_idx = 0;
+
+ assert(src_width == 32 || src_width == 64);
+ if (src_width == 32) {
+ assert(length == 4 || length == 8);
+ } else {
+ assert(length == 2 || length == 4);
+ }
+
+ static const char *intrinsics[2][2][2] = {
+
+ {{"llvm.x86.avx2.gather.d.d",
+ "llvm.x86.avx2.gather.d.d.256"},
+ {"llvm.x86.avx2.gather.d.q",
+ "llvm.x86.avx2.gather.d.q.256"}},
+
+ {{"llvm.x86.avx2.gather.d.ps",
+ "llvm.x86.avx2.gather.d.ps.256"},
+ {"llvm.x86.avx2.gather.d.pd",
+ "llvm.x86.avx2.gather.d.pd.256"}},
+ };
+
+ if ((src_width == 32 && length == 8) ||
+ (src_width == 64 && length == 4)) {
+ l_idx = 1;
}
+ intrinsic = intrinsics[dst_type.floating][src_width == 64][l_idx];
LLVMValueRef passthru = LLVMGetUndef(src_vec_type);
LLVMValueRef mask = LLVMConstAllOnes(src_vec_type);
@@ -214,12 +376,7 @@ lp_build_gather_avx2(struct gallivm_state *gallivm,
res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 5, 0);
}
-
- if (src_width > dst_width) {
- res = LLVMBuildTrunc(builder, res, dst_vec_type, "");
- } else if (src_width < dst_width) {
- res = LLVMBuildZExt(builder, res, dst_vec_type, "");
- }
+ res = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, res_type), "");
return res;
}
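
The three-way table index above is easy to misread; the selection reduces to this standalone helper, mirroring the conditions in the code:

#include <stdbool.h>

/* [floating][64-bit element][256-bit vector] -> x86 AVX2 gather name. */
static const char *
avx2_gather_intrinsic(bool floating, unsigned src_width, unsigned length)
{
   static const char *names[2][2][2] = {
      {{"llvm.x86.avx2.gather.d.d",  "llvm.x86.avx2.gather.d.d.256"},
       {"llvm.x86.avx2.gather.d.q",  "llvm.x86.avx2.gather.d.q.256"}},
      {{"llvm.x86.avx2.gather.d.ps", "llvm.x86.avx2.gather.d.ps.256"},
       {"llvm.x86.avx2.gather.d.pd", "llvm.x86.avx2.gather.d.pd.256"}},
   };
   bool wide = (src_width == 32 && length == 8) ||
               (src_width == 64 && length == 4);
   return names[floating][src_width == 64][wide];
}
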
@@ -240,9 +397,11 @@ lp_build_gather_avx2(struct gallivm_state *gallivm,
*
* @param length length of the offsets
* @param src_width src element width in bits
- * @param dst_width result element width in bits (src will be expanded to fit)
+ * @param dst_type result element type (src will be expanded to fit,
+ * but truncation is not allowed)
+ * (this may be a vector, must be pot sized)
* @param aligned whether the data is guaranteed to be aligned (to src_width)
- * @param base_ptr base pointer, should be a i8 pointer type.
+ * @param base_ptr base pointer, needs to be a i8 pointer type.
* @param offsets vector with offsets
* @param vector_justify select vector rather than integer justification
*/
@@ -250,36 +409,174 @@ LLVMValueRef
lp_build_gather(struct gallivm_state *gallivm,
unsigned length,
unsigned src_width,
- unsigned dst_width,
+ struct lp_type dst_type,
boolean aligned,
LLVMValueRef base_ptr,
LLVMValueRef offsets,
boolean vector_justify)
{
LLVMValueRef res;
+ boolean need_expansion = src_width < dst_type.width * dst_type.length;
+ boolean vec_fetch;
+ struct lp_type fetch_type, fetch_dst_type;
+ LLVMTypeRef src_type;
+
+ assert(src_width <= dst_type.width * dst_type.length);
+
+ /*
+ * This is quite a mess...
+ * Figure out if the fetch should be done as:
+ * a) scalar or vector
+ * b) float or int
+ *
+ * As an example, for a 96bit fetch expanded into 4x32bit, it is better
+ * to use (3x32bit) vector type (then pad the vector). Otherwise, the
+ * zext will cause extra instructions.
+ * However, the same isn't true for 3x16bit (the codegen for that is
+ * completely worthless on x86 simd, and for 3x8bit it is way worse
+ * still, don't try that... (To get really good code out of llvm for
+ * these cases, the only way is to decompose the fetches manually
+ * into 1x32bit/1x16bit, or 1x16bit/1x8bit respectively, although the
+ * latter case requires sse41, otherwise simple scalar zext is way
+ * better. But probably not important enough, so don't bother.)
+ * Also, we try to honor the floating bit of the destination (but that
+ * isn't possible if the caller asks, for instance, for a 2x32bit
+ * dst_type with a 48bit fetch - the idea would be to use a 3x16bit
+ * fetch, pad and cast to a 2x32f type, so the fetch is always int and
+ * on top of that we avoid the vec pad and use scalar zext due to the
+ * above mentioned issue).
+ * Note this is optimized for x86 sse2 and up backend. Could be tweaked
+ * for other archs if necessary...
+ */
+ if (((src_width % 32) == 0) && ((src_width % dst_type.width) == 0) &&
+ (dst_type.length > 1)) {
+ /* use vector fetch (if dst_type is vector) */
+ vec_fetch = TRUE;
+ if (dst_type.floating) {
+ fetch_type = lp_type_float_vec(dst_type.width, src_width);
+ } else {
+ fetch_type = lp_type_int_vec(dst_type.width, src_width);
+ }
+ /* intentionally not using lp_build_vec_type here */
+ src_type = LLVMVectorType(lp_build_elem_type(gallivm, fetch_type),
+ fetch_type.length);
+ fetch_dst_type = fetch_type;
+ fetch_dst_type.length = dst_type.length;
+ } else {
+ /* use scalar fetch */
+ vec_fetch = FALSE;
+ if (dst_type.floating && ((src_width == 32) || (src_width == 64))) {
+ fetch_type = lp_type_float(src_width);
+ } else {
+ fetch_type = lp_type_int(src_width);
+ }
+ src_type = lp_build_vec_type(gallivm, fetch_type);
+ fetch_dst_type = fetch_type;
+ fetch_dst_type.width = dst_type.width * dst_type.length;
+ }
if (length == 1) {
/* Scalar */
- return lp_build_gather_elem(gallivm, length,
- src_width, dst_width, aligned,
- base_ptr, offsets, 0, vector_justify);
- } else if (util_cpu_caps.has_avx2 && src_width == 32 && (length == 4 || length == 8)) {
- return lp_build_gather_avx2(gallivm, length, src_width, dst_width, base_ptr, offsets);
+ res = lp_build_gather_elem_vec(gallivm, length,
+ src_width, src_type, fetch_dst_type,
+ aligned, base_ptr, offsets, 0,
+ vector_justify);
+ return LLVMBuildBitCast(gallivm->builder, res,
+ lp_build_vec_type(gallivm, dst_type), "");
+ /*
+ * Excluding expansion from these paths because if you need it for
+ * 32bit/64bit fetches you're doing it wrong (this is gather, not
+ * conversion) and it would be awkward for floats.
+ */
+ } else if (util_cpu_caps.has_avx2 && !need_expansion &&
+ src_width == 32 && (length == 4 || length == 8)) {
+ return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
+ base_ptr, offsets);
+ /*
+ * This looks bad on paper wrt throughput/latency on Haswell.
+ * Even on Broadwell it doesn't look stellar.
+ * No measurements were done, though (it is tested to work).
+ * Should definitely enable on Skylake.
+ * (In general, should be more of a win if the fetch is 256bit wide -
+ * this is true for the 32bit case above too.)
+ */
+ } else if (0 && util_cpu_caps.has_avx2 && !need_expansion &&
+ src_width == 64 && (length == 2 || length == 4)) {
+ return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
+ base_ptr, offsets);
} else {
/* Vector */
- LLVMTypeRef dst_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width);
- LLVMTypeRef dst_vec_type = LLVMVectorType(dst_elem_type, length);
+ LLVMValueRef elems[LP_MAX_VECTOR_WIDTH / 8];
unsigned i;
-
- res = LLVMGetUndef(dst_vec_type);
+ boolean vec_zext = FALSE;
+ struct lp_type res_type, gather_res_type;
+ LLVMTypeRef res_t, gather_res_t;
+
+ res_type = fetch_dst_type;
+ res_type.length *= length;
+ gather_res_type = res_type;
+
+ if (src_width == 16 && dst_type.width == 32 && dst_type.length == 1) {
+ /*
+ * Note that llvm is never able to optimize zext/insert combos
+ * directly (i.e. zero the simd reg, then place the elements into
+ * the appropriate place directly). (I think this has to do with
+ * scalar/vector transition.) And scalar 16->32bit zext simd loads
+ * aren't possible (instead loading to scalar reg first).
+ * No idea about other archs...
+ * We could do this manually, but instead we just use a vector
+ * zext, which is simple enough (and, in fact, llvm might optimize
+ * this away).
+ * (We're not trying that with other bit widths as that might not be
+ * easier, in particular with 8 bit values at least with only sse2.)
+ */
+ assert(vec_fetch == FALSE);
+ gather_res_type.width /= 2;
+ fetch_dst_type = fetch_type;
+ src_type = lp_build_vec_type(gallivm, fetch_type);
+ vec_zext = TRUE;
+ }
+ res_t = lp_build_vec_type(gallivm, res_type);
+ gather_res_t = lp_build_vec_type(gallivm, gather_res_type);
+ res = LLVMGetUndef(gather_res_t);
for (i = 0; i < length; ++i) {
LLVMValueRef index = lp_build_const_int32(gallivm, i);
- LLVMValueRef elem;
- elem = lp_build_gather_elem(gallivm, length,
- src_width, dst_width, aligned,
- base_ptr, offsets, i, vector_justify);
- res = LLVMBuildInsertElement(gallivm->builder, res, elem, index, "");
+ elems[i] = lp_build_gather_elem_vec(gallivm, length,
+ src_width, src_type, fetch_dst_type,
+ aligned, base_ptr, offsets, i,
+ vector_justify);
+ if (!vec_fetch) {
+ res = LLVMBuildInsertElement(gallivm->builder, res, elems[i], index, "");
+ }
+ }
+ if (vec_zext) {
+ res = LLVMBuildZExt(gallivm->builder, res, res_t, "");
+ if (vector_justify) {
+#ifdef PIPE_ARCH_BIG_ENDIAN
+ unsigned sv = dst_type.width - src_width;
+ res = LLVMBuildShl(gallivm->builder, res,
+ lp_build_const_int_vec(gallivm, res_type, sv), "");
+#endif
+ }
+ }
+ if (vec_fetch) {
+ /*
+ * Do bitcast now otherwise llvm might get some funny ideas wrt
+ * float/int types...
+ */
+ for (i = 0; i < length; i++) {
+ elems[i] = LLVMBuildBitCast(gallivm->builder, elems[i],
+ lp_build_vec_type(gallivm, dst_type), "");
+ }
+ res = lp_build_concat(gallivm, elems, dst_type, length);
+ } else {
+ struct lp_type really_final_type = dst_type;
+ assert(res_type.length * res_type.width ==
+ dst_type.length * dst_type.width * length);
+ really_final_type.length *= length;
+ res = LLVMBuildBitCast(gallivm->builder, res,
+ lp_build_vec_type(gallivm, really_final_type), "");
}
}
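
To summarize the decision tree above: a vector fetch is chosen only when the source width is a whole number of 32-bit words and a whole number of destination elements, and the destination is actually a vector; everything else goes through the scalar path. A condensed sketch of the predicate, with a couple of worked cases:

#include <stdbool.h>

/* Same condition as above: e.g. a 96-bit fetch into 4 x 32 -> vector
 * fetch (<3 x i32/float>, then pad); a 48-bit fetch into 2 x 32 ->
 * scalar path (48 is not a multiple of 32), i.e. i48 load + zext. */
static bool use_vector_fetch(unsigned src_width,
                             unsigned dst_width, unsigned dst_length)
{
   return (src_width % 32) == 0 &&
          (src_width % dst_width) == 0 &&
          dst_length > 1;
}
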
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_gather.h b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_gather.h
index 3ede4763a..7930864e6 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_gather.h
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_gather.h
@@ -55,7 +55,7 @@ LLVMValueRef
lp_build_gather(struct gallivm_state *gallivm,
unsigned length,
unsigned src_width,
- unsigned dst_width,
+ struct lp_type dst_type,
boolean aligned,
LLVMValueRef base_ptr,
LLVMValueRef offsets,
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_init.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_init.c
index fed43e99e..c456a97eb 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -48,8 +48,12 @@
# define USE_MCJIT 1
#elif defined(PIPE_ARCH_PPC_64) || defined(PIPE_ARCH_S390) || defined(PIPE_ARCH_ARM) || defined(PIPE_ARCH_AARCH64)
# define USE_MCJIT 1
+#endif
+
+#if defined(USE_MCJIT)
+static const bool use_mcjit = USE_MCJIT;
#else
-static bool USE_MCJIT = 0;
+static bool use_mcjit = FALSE;
#endif
@@ -121,19 +125,6 @@ create_pass_manager(struct gallivm_state *gallivm)
LLVMAddTargetData(gallivm->target, gallivm->passmgr);
#endif
- /* Setting the module's DataLayout to an empty string will cause the
- * ExecutionEngine to copy to the DataLayout string from its target
- * machine to the module. As of LLVM 3.8 the module and the execution
- * engine are required to have the same DataLayout.
- *
- * TODO: This is just a temporary work-around. The correct solution is
- * for gallivm_init_state() to create a TargetMachine and pull the
- * DataLayout from there. Currently, the TargetMachine used by llvmpipe
- * is being implicitly created by the EngineBuilder in
- * lp_build_create_jit_compiler_for_module()
- */
-
-#if HAVE_LLVM < 0x0308
{
char *td_str;
// New ones from the Module.
@@ -141,9 +132,6 @@ create_pass_manager(struct gallivm_state *gallivm)
LLVMSetDataLayout(gallivm->module, td_str);
free(td_str);
}
-#else
- LLVMSetDataLayout(gallivm->module, "");
-#endif
if ((gallivm_debug & GALLIVM_DEBUG_NO_OPT) == 0) {
/* These are the passes currently listed in llvm-c/Transforms/Scalar.h,
@@ -190,7 +178,7 @@ gallivm_free_ir(struct gallivm_state *gallivm)
FREE(gallivm->module_name);
- if (!USE_MCJIT) {
+ if (!use_mcjit) {
/* Don't free the TargetData, it's owned by the exec engine */
} else {
if (gallivm->target) {
@@ -248,7 +236,7 @@ init_gallivm_engine(struct gallivm_state *gallivm)
gallivm->module,
gallivm->memorymgr,
(unsigned) optlevel,
- USE_MCJIT,
+ use_mcjit,
&error);
if (ret) {
_debug_printf("%s\n", error);
@@ -257,7 +245,7 @@ init_gallivm_engine(struct gallivm_state *gallivm)
}
}
- if (!USE_MCJIT) {
+ if (!use_mcjit) {
gallivm->target = LLVMGetExecutionEngineTargetData(gallivm->engine);
if (!gallivm->target)
goto fail;
@@ -336,7 +324,7 @@ init_gallivm_state(struct gallivm_state *gallivm, const char *name,
* complete when MC-JIT is created. So defer the MC-JIT engine creation for
* now.
*/
- if (!USE_MCJIT) {
+ if (!use_mcjit) {
if (!init_gallivm_engine(gallivm)) {
goto fail;
}
@@ -395,10 +383,21 @@ lp_build_init(void)
if (gallivm_initialized)
return TRUE;
- LLVMLinkInMCJIT();
-#if !defined(USE_MCJIT)
- USE_MCJIT = debug_get_bool_option("GALLIVM_MCJIT", 0);
+
+ /* LLVMLinkIn* are no-ops at runtime. They just ensure the respective
+ * component is linked at buildtime, which is sufficient for its static
+ * constructors to be called at load time.
+ */
+#if defined(USE_MCJIT)
+# if USE_MCJIT
+ LLVMLinkInMCJIT();
+# else
+ LLVMLinkInJIT();
+# endif
+#else
+ use_mcjit = debug_get_bool_option("GALLIVM_MCJIT", FALSE);
LLVMLinkInJIT();
+ LLVMLinkInMCJIT();
#endif
#ifdef DEBUG
@@ -457,7 +456,7 @@ lp_build_init(void)
util_cpu_caps.has_f16c = 0;
util_cpu_caps.has_fma = 0;
}
- if (HAVE_LLVM < 0x0304 || !USE_MCJIT) {
+ if (HAVE_LLVM < 0x0304 || !use_mcjit) {
/* AVX2 support has only been tested with LLVM 3.4, and it requires
* MCJIT. */
util_cpu_caps.has_avx2 = 0;
@@ -607,12 +606,30 @@ gallivm_compile_module(struct gallivm_state *gallivm)
LLVMWriteBitcodeToFile(gallivm->module, filename);
debug_printf("%s written\n", filename);
debug_printf("Invoke as \"llc %s%s -o - %s\"\n",
- (HAVE_LLVM >= 0x0305) ? "[-mcpu=<-mcpu option] " : "",
+ (HAVE_LLVM >= 0x0305) ? "[-mcpu=<-mcpu option>] " : "",
"[-mattr=<-mattr option(s)>]",
filename);
}
- if (USE_MCJIT) {
+ if (use_mcjit) {
+ /* Setting the module's DataLayout to an empty string will cause the
+ * ExecutionEngine to copy to the DataLayout string from its target
+ * machine to the module. As of LLVM 3.8 the module and the execution
+ * engine are required to have the same DataLayout.
+ *
+ * We must make sure we do this after running the optimization passes,
+ * because those passes need a correct datalayout string. For example,
+ * if those optimization passes see an empty datalayout, they will assume
+ * this is a little endian target and will do optimizations that break big
+ * endian machines.
+ *
+ * TODO: This is just a temporary work-around. The correct solution is
+ * for gallivm_init_state() to create a TargetMachine and pull the
+ * DataLayout from there. Currently, the TargetMachine used by llvmpipe
+ * is being implicitly created by the EngineBuilder in
+ * lp_build_create_jit_compiler_for_module()
+ */
+ LLVMSetDataLayout(gallivm->module, "");
assert(!gallivm->engine);
if (!init_gallivm_engine(gallivm)) {
assert(0);
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_intr.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_intr.c
index f12e735b5..b92455593 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_intr.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_intr.c
@@ -46,6 +46,7 @@
#include "util/u_debug.h"
#include "util/u_string.h"
+#include "util/bitscan.h"
#include "lp_bld_const.h"
#include "lp_bld_intr.h"
@@ -120,16 +121,113 @@ lp_declare_intrinsic(LLVMModuleRef module,
}
+#if HAVE_LLVM < 0x0400
+static LLVMAttribute lp_attr_to_llvm_attr(enum lp_func_attr attr)
+{
+ switch (attr) {
+ case LP_FUNC_ATTR_ALWAYSINLINE: return LLVMAlwaysInlineAttribute;
+ case LP_FUNC_ATTR_BYVAL: return LLVMByValAttribute;
+ case LP_FUNC_ATTR_INREG: return LLVMInRegAttribute;
+ case LP_FUNC_ATTR_NOALIAS: return LLVMNoAliasAttribute;
+ case LP_FUNC_ATTR_NOUNWIND: return LLVMNoUnwindAttribute;
+ case LP_FUNC_ATTR_READNONE: return LLVMReadNoneAttribute;
+ case LP_FUNC_ATTR_READONLY: return LLVMReadOnlyAttribute;
+ default:
+ _debug_printf("Unhandled function attribute: %x\n", attr);
+ return 0;
+ }
+}
+
+#else
+
+static const char *attr_to_str(enum lp_func_attr attr)
+{
+ switch (attr) {
+ case LP_FUNC_ATTR_ALWAYSINLINE: return "alwaysinline";
+ case LP_FUNC_ATTR_BYVAL: return "byval";
+ case LP_FUNC_ATTR_INREG: return "inreg";
+ case LP_FUNC_ATTR_NOALIAS: return "noalias";
+ case LP_FUNC_ATTR_NOUNWIND: return "nounwind";
+ case LP_FUNC_ATTR_READNONE: return "readnone";
+ case LP_FUNC_ATTR_READONLY: return "readonly";
+ case LP_FUNC_ATTR_WRITEONLY: return "writeonly";
+ case LP_FUNC_ATTR_INACCESSIBLE_MEM_ONLY: return "inaccessiblememonly";
+ case LP_FUNC_ATTR_CONVERGENT: return "convergent";
+ default:
+ _debug_printf("Unhandled function attribute: %x\n", attr);
+ return 0;
+ }
+}
+
+#endif
+
+void
+lp_add_function_attr(LLVMValueRef function_or_call,
+ int attr_idx, enum lp_func_attr attr)
+{
+
+#if HAVE_LLVM < 0x0400
+ LLVMAttribute llvm_attr = lp_attr_to_llvm_attr(attr);
+ if (LLVMIsAFunction(function_or_call)) {
+ if (attr_idx == -1) {
+ LLVMAddFunctionAttr(function_or_call, llvm_attr);
+ } else {
+ LLVMAddAttribute(LLVMGetParam(function_or_call, attr_idx - 1), llvm_attr);
+ }
+ } else {
+ LLVMAddInstrAttribute(function_or_call, attr_idx, llvm_attr);
+ }
+#else
+
+ LLVMModuleRef module;
+ if (LLVMIsAFunction(function_or_call)) {
+ module = LLVMGetGlobalParent(function_or_call);
+ } else {
+ LLVMBasicBlockRef bb = LLVMGetInstructionParent(function_or_call);
+ LLVMValueRef function = LLVMGetBasicBlockParent(bb);
+ module = LLVMGetGlobalParent(function);
+ }
+ LLVMContextRef ctx = LLVMGetModuleContext(module);
+
+ const char *attr_name = attr_to_str(attr);
+ unsigned kind_id = LLVMGetEnumAttributeKindForName(attr_name,
+ strlen(attr_name));
+ LLVMAttributeRef llvm_attr = LLVMCreateEnumAttribute(ctx, kind_id, 0);
+
+ if (LLVMIsAFunction(function_or_call))
+ LLVMAddAttributeAtIndex(function_or_call, attr_idx, llvm_attr);
+ else
+ LLVMAddCallSiteAttribute(function_or_call, attr_idx, llvm_attr);
+#endif
+}
+
+static void
+lp_add_func_attributes(LLVMValueRef function, unsigned attrib_mask)
+{
+ /* NoUnwind indicates that the intrinsic never raises a C++ exception.
+ * Set it for all intrinsics.
+ */
+ attrib_mask |= LP_FUNC_ATTR_NOUNWIND;
+ attrib_mask &= ~LP_FUNC_ATTR_LEGACY;
+
+ while (attrib_mask) {
+ enum lp_func_attr attr = 1u << u_bit_scan(&attrib_mask);
+ lp_add_function_attr(function, -1, attr);
+ }
+}
+
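
u_bit_scan() pops the lowest set bit of the mask and returns its index, so the loop above visits each attribute flag exactly once. A scalar model of that primitive, using the gcc/clang builtin:

/* Model of u_bit_scan(): return the index of the lowest set bit
 * and clear it. *mask must be non-zero. */
static int bit_scan_pop(unsigned *mask)
{
   int i = __builtin_ctz(*mask);
   *mask &= *mask - 1;
   return i;
}
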
LLVMValueRef
lp_build_intrinsic(LLVMBuilderRef builder,
const char *name,
LLVMTypeRef ret_type,
LLVMValueRef *args,
unsigned num_args,
- LLVMAttribute attr)
+ unsigned attr_mask)
{
LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder)));
- LLVMValueRef function;
+ LLVMValueRef function, call;
+ bool set_callsite_attrs = HAVE_LLVM >= 0x0400 &&
+ !(attr_mask & LP_FUNC_ATTR_LEGACY);
function = LLVMGetNamedFunction(module, name);
if(!function) {
@@ -145,17 +243,18 @@ lp_build_intrinsic(LLVMBuilderRef builder,
function = lp_declare_intrinsic(module, name, ret_type, arg_types, num_args);
- /* NoUnwind indicates that the intrinsic never raises a C++ exception.
- * Set it for all intrinsics.
- */
- LLVMAddFunctionAttr(function, attr | LLVMNoUnwindAttribute);
+ if (!set_callsite_attrs)
+ lp_add_func_attributes(function, attr_mask);
if (gallivm_debug & GALLIVM_DEBUG_IR) {
lp_debug_dump_value(function);
}
}
- return LLVMBuildCall(builder, function, args, num_args, "");
+ call = LLVMBuildCall(builder, function, args, num_args, "");
+ if (set_callsite_attrs)
+ lp_add_func_attributes(call, attr_mask);
+ return call;
}
@@ -243,9 +342,9 @@ lp_build_intrinsic_binary_anylength(struct gallivm_state *gallivm,
unsigned num_vec = src_type.length / intrin_length;
LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
- /* don't support arbitrary size here as this is so yuck */
+ /* don't support arbitrary size here as this is so yuck */
if (src_type.length % intrin_length) {
- /* FIXME: This is something which should be supported
+ /* FIXME: This is something which should be supported
* but there doesn't seem to be any need for it currently
* so crash and burn.
*/
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_intr.h b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_intr.h
index 7d80ac28f..0a929c519 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_intr.h
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_intr.h
@@ -46,6 +46,24 @@
*/
#define LP_MAX_FUNC_ARGS 32
+enum lp_func_attr {
+ LP_FUNC_ATTR_ALWAYSINLINE = (1 << 0),
+ LP_FUNC_ATTR_BYVAL = (1 << 1),
+ LP_FUNC_ATTR_INREG = (1 << 2),
+ LP_FUNC_ATTR_NOALIAS = (1 << 3),
+ LP_FUNC_ATTR_NOUNWIND = (1 << 4),
+ LP_FUNC_ATTR_READNONE = (1 << 5),
+ LP_FUNC_ATTR_READONLY = (1 << 6),
+ LP_FUNC_ATTR_WRITEONLY = HAVE_LLVM >= 0x0400 ? (1 << 7) : 0,
+ LP_FUNC_ATTR_INACCESSIBLE_MEM_ONLY = HAVE_LLVM >= 0x0400 ? (1 << 8) : 0,
+ LP_FUNC_ATTR_CONVERGENT = HAVE_LLVM >= 0x0400 ? (1 << 9) : 0,
+
+ /* Legacy intrinsic that needs attributes on function declarations
+ * and they must match the internal LLVM definition exactly, otherwise
+ * intrinsic selection fails.
+ */
+ LP_FUNC_ATTR_LEGACY = (1u << 31),
+};
void
lp_format_intrinsic(char *name,
@@ -60,13 +78,17 @@ lp_declare_intrinsic(LLVMModuleRef module,
LLVMTypeRef *arg_types,
unsigned num_args);
+void
+lp_add_function_attr(LLVMValueRef function_or_call,
+ int attr_idx, enum lp_func_attr attr);
+
LLVMValueRef
lp_build_intrinsic(LLVMBuilderRef builder,
const char *name,
LLVMTypeRef ret_type,
LLVMValueRef *args,
unsigned num_args,
- LLVMAttribute attr);
+ unsigned attr_mask);
LLVMValueRef
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_limits.h b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_limits.h
index 32addec97..354e2a46b 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_limits.h
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_limits.h
@@ -49,8 +49,6 @@
#define LP_MAX_TGSI_IMMEDIATES 4096
-#define LP_MAX_TGSI_PREDS 16
-
#define LP_MAX_TGSI_CONSTS 4096
#define LP_MAX_TGSI_CONST_BUFFERS 16
@@ -109,8 +107,6 @@ gallivm_get_shader_param(enum pipe_shader_cap param)
return PIPE_MAX_CONSTANT_BUFFERS;
case PIPE_SHADER_CAP_MAX_TEMPS:
return LP_MAX_TGSI_TEMPS;
- case PIPE_SHADER_CAP_MAX_PREDS:
- return LP_MAX_TGSI_PREDS;
case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
return 1;
case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
@@ -133,13 +129,13 @@ gallivm_get_shader_param(enum pipe_shader_cap param)
case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
return 1;
- case PIPE_SHADER_CAP_DOUBLES:
- return 1;
case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
+ case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
+ case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
return 0;
case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
return 32;
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_logic.c
index 1a50e82c2..524917abe 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@@ -327,6 +327,8 @@ lp_build_select(struct lp_build_context *bld,
* supported yet for a long time, and LLVM will generate poor code when
* the mask is not the result of a comparison.
* Also, llvm 3.7 may miscompile them (bug 94972).
+ * XXX: Even if the instruction was an SExt, this may still produce
+ * terrible code. Try piglit stencil-twoside.
*/
/* Convert the mask to a vector of booleans.
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
index 3efb6a8e7..d988910a7 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
@@ -32,14 +32,6 @@
*/
-#ifndef __STDC_LIMIT_MACROS
-#define __STDC_LIMIT_MACROS
-#endif
-
-#ifndef __STDC_CONSTANT_MACROS
-#define __STDC_CONSTANT_MACROS
-#endif
-
// Undef these vars just to silence warnings
#undef PACKAGE_BUGREPORT
#undef PACKAGE_NAME
@@ -57,6 +49,9 @@
#endif
#include <llvm-c/Core.h>
+#if HAVE_LLVM >= 0x0306
+#include <llvm-c/Support.h>
+#endif
#include <llvm-c/ExecutionEngine.h>
#include <llvm/Target/TargetOptions.h>
#include <llvm/ExecutionEngine/ExecutionEngine.h>
@@ -77,6 +72,9 @@
#include <llvm/Support/TargetSelect.h>
+#if HAVE_LLVM >= 0x0305
+#include <llvm/IR/CallSite.h>
+#endif
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/Module.h>
#include <llvm/Support/CBindingWrapping.h>
@@ -127,20 +125,26 @@ static void init_native_targets()
llvm::InitializeNativeTargetAsmPrinter();
llvm::InitializeNativeTargetDisassembler();
-}
-
-/**
- * The llvm target registry is not thread-safe, so drivers and state-trackers
- * that want to initialize targets should use the gallivm_init_llvm_targets()
- * function to safely initialize targets.
- *
- * LLVM targets should be initialized before the driver or state-tracker tries
- * to access the registry.
- */
-extern "C" void
-gallivm_init_llvm_targets(void)
-{
- call_once(&init_native_targets_once_flag, init_native_targets);
+#if DEBUG && HAVE_LLVM >= 0x0306
+ {
+ char *env_llc_options = getenv("GALLIVM_LLC_OPTIONS");
+ if (env_llc_options) {
+ char *option;
+ char *options[64] = {(char *) "llc"}; // Warning without cast
+ int n;
+ for (n = 0, option = strtok(env_llc_options, " "); option; n++, option = strtok(NULL, " ")) {
+ options[n + 1] = option;
+ }
+ if (gallivm_debug & (GALLIVM_DEBUG_IR | GALLIVM_DEBUG_ASM | GALLIVM_DEBUG_DUMP_BC)) {
+ debug_printf("llc additional options (%d):\n", n);
+ for (int i = 1; i <= n; i++)
+ debug_printf("\t%s\n", options[i]);
+ debug_printf("\n");
+ }
+ LLVMParseCommandLineOptions(n + 1, options, NULL);
+ }
+ }
+#endif
}
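
The GALLIVM_LLC_OPTIONS parsing above splits the environment string on spaces into an argv-style array for LLVMParseCommandLineOptions(). A standalone model of just the tokenizing step (the option strings below are only illustrative), with an explicit bounds check the original relies on the user not to exceed:

#include <stdio.h>
#include <string.h>

int main(void)
{
   char buf[] = "-print-after-all -time-passes";  /* stand-in for getenv() */
   char *options[64] = {(char *)"llc"};           /* argv[0] placeholder */
   int n = 0;
   for (char *opt = strtok(buf, " "); opt && n < 63;
        opt = strtok(NULL, " "))
      options[++n] = opt;
   for (int i = 0; i <= n; i++)
      printf("argv[%d] = %s\n", i, options[i]);
   return 0;
}
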
extern "C" void
@@ -155,7 +159,14 @@ lp_set_target_options(void)
llvm::DisablePrettyStackTrace = true;
#endif
- gallivm_init_llvm_targets();
+ /* The llvm target registry is not thread-safe, so drivers and state-trackers
+ * that want to initialize targets should use the lp_set_target_options()
+ * function to safely initialize targets.
+ *
+ * LLVM targets should be initialized before the driver or state-tracker tries
+ * to access the registry.
+ */
+ call_once(&init_native_targets_once_flag, init_native_targets);
}
extern "C"
@@ -347,14 +358,20 @@ class DelegatingJITMemoryManager : public BaseMemoryManager {
virtual void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) {
mgr()->registerEHFrames(Addr, LoadAddr, Size);
}
- virtual void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) {
- mgr()->deregisterEHFrames(Addr, LoadAddr, Size);
- }
#else
virtual void registerEHFrames(llvm::StringRef SectionData) {
mgr()->registerEHFrames(SectionData);
}
#endif
+#if HAVE_LLVM >= 0x0500
+ virtual void deregisterEHFrames() {
+ mgr()->deregisterEHFrames();
+ }
+#elif HAVE_LLVM >= 0x0304
+ virtual void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) {
+ mgr()->deregisterEHFrames(Addr, LoadAddr, Size);
+ }
+#endif
virtual void *getPointerToNamedFunction(const std::string &Name,
bool AbortOnFailure=true) {
return mgr()->getPointerToNamedFunction(Name, AbortOnFailure);
@@ -540,6 +557,20 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
llvm::SmallVector<std::string, 16> MAttrs;
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+#if HAVE_LLVM >= 0x0400
+ /* llvm-3.7+ implements sys::getHostCPUFeatures for x86,
+ * which allows us to enable/disable code generation based
+ * on the results of cpuid.
+ */
+ llvm::StringMap<bool> features;
+ llvm::sys::getHostCPUFeatures(features);
+
+ for (StringMapIterator<bool> f = features.begin();
+ f != features.end();
+ ++f) {
+ MAttrs.push_back(((*f).second ? "+" : "-") + (*f).first().str());
+ }
+#else
/*
* We need to unset attributes because sometimes LLVM mistakenly assumes
* certain features are present given the processor name.
@@ -594,27 +625,51 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
MAttrs.push_back("-avx512vl");
#endif
#endif
+#endif
#if defined(PIPE_ARCH_PPC)
MAttrs.push_back(util_cpu_caps.has_altivec ? "+altivec" : "-altivec");
#if (HAVE_LLVM >= 0x0304)
-#if (HAVE_LLVM <= 0x0307) || (HAVE_LLVM == 0x0308 && MESA_LLVM_VERSION_PATCH == 0)
+#if (HAVE_LLVM < 0x0400)
/*
* Make sure VSX instructions are disabled
- * See LLVM bug https://llvm.org/bugs/show_bug.cgi?id=25503#c7
+ * See LLVM bugs:
+ * https://llvm.org/bugs/show_bug.cgi?id=25503#c7 (fixed in 3.8.1)
+ * https://llvm.org/bugs/show_bug.cgi?id=26775 (fixed in 3.8.1)
+ * https://llvm.org/bugs/show_bug.cgi?id=33531 (fixed in 4.0)
+ * https://llvm.org/bugs/show_bug.cgi?id=34647 (llc performance on certain unusual shader IR; intro'd in 4.0, pending as of 5.0)
*/
if (util_cpu_caps.has_altivec) {
MAttrs.push_back("-vsx");
}
#else
/*
- * However, bug 25503 is fixed, by the same fix that fixed
- * bug 26775, in versions of LLVM later than 3.8 (starting with 3.8.1):
- * Make sure VSX instructions are ENABLED
- * See LLVM bug https://llvm.org/bugs/show_bug.cgi?id=26775
+ * Bug 25503 is fixed, by the same fix that fixed
+ * bug 26775, in versions of LLVM later than 3.8 (starting with 3.8.1).
+ * BZ 33531 actually comprises more than one bug, all of
+ * which are fixed in LLVM 4.0.
+ *
+ * With LLVM 4.0 or higher:
+ * Make sure VSX instructions are ENABLED, unless
+ * a) the entire -mattr option is overridden via GALLIVM_MATTRS, or
+ * b) VSX instructions are explicitly enabled/disabled via GALLIVM_VSX=1 or 0.
*/
if (util_cpu_caps.has_altivec) {
- MAttrs.push_back("+vsx");
+ char *env_mattrs = getenv("GALLIVM_MATTRS");
+ if (env_mattrs) {
+ MAttrs.push_back(env_mattrs);
+ }
+ else {
+ boolean enable_vsx = true;
+ char *env_vsx = getenv("GALLIVM_VSX");
+ if (env_vsx && env_vsx[0] == '0') {
+ enable_vsx = false;
+ }
+ if (enable_vsx)
+ MAttrs.push_back("+vsx");
+ else
+ MAttrs.push_back("-vsx");
+ }
}
#endif
#endif
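
The VSX selection above reduces to a small precedence rule: GALLIVM_MATTRS overrides the whole attribute string, otherwise GALLIVM_VSX=0 disables VSX, and the default is enabled. A sketch of just that decision:

#include <stdlib.h>

/* Precedence: GALLIVM_MATTRS > GALLIVM_VSX=0 > default (+vsx). */
static const char *vsx_mattr_choice(void)
{
   const char *mattrs = getenv("GALLIVM_MATTRS");
   if (mattrs)
      return mattrs;
   const char *vsx = getenv("GALLIVM_VSX");
   return (vsx && vsx[0] == '0') ? "-vsx" : "+vsx";
}
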
@@ -737,13 +792,49 @@ lp_free_memory_manager(LLVMMCJITMemoryManagerRef memorymgr)
delete reinterpret_cast<BaseMemoryManager*>(memorymgr);
}
-extern "C" void
-lp_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes)
+extern "C" LLVMValueRef
+lp_get_called_value(LLVMValueRef call)
{
-#if HAVE_LLVM >= 0x0306
- llvm::Argument *A = llvm::unwrap<llvm::Argument>(val);
- llvm::AttrBuilder B;
- B.addDereferenceableAttr(bytes);
- A->addAttr(llvm::AttributeSet::get(A->getContext(), A->getArgNo() + 1, B));
+#if HAVE_LLVM >= 0x0309
+ return LLVMGetCalledValue(call);
+#elif HAVE_LLVM >= 0x0305
+ return llvm::wrap(llvm::CallSite(llvm::unwrap<llvm::Instruction>(call)).getCalledValue());
+#else
+ return NULL; /* radeonsi doesn't support so old LLVM. */
+#endif
+}
+
+extern "C" bool
+lp_is_function(LLVMValueRef v)
+{
+#if HAVE_LLVM >= 0x0309
+ return LLVMGetValueKind(v) == LLVMFunctionValueKind;
+#else
+ return llvm::isa<llvm::Function>(llvm::unwrap(v));
+#endif
+}
+
+extern "C" LLVMBuilderRef
+lp_create_builder(LLVMContextRef ctx, enum lp_float_mode float_mode)
+{
+ LLVMBuilderRef builder = LLVMCreateBuilderInContext(ctx);
+
+#if HAVE_LLVM >= 0x0308
+ llvm::FastMathFlags flags;
+
+ switch (float_mode) {
+ case LP_FLOAT_MODE_DEFAULT:
+ break;
+ case LP_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH:
+ flags.setNoSignedZeros();
+ llvm::unwrap(builder)->setFastMathFlags(flags);
+ break;
+ case LP_FLOAT_MODE_UNSAFE_FP_MATH:
+ flags.setUnsafeAlgebra();
+ llvm::unwrap(builder)->setFastMathFlags(flags);
+ break;
+ }
#endif
+
+ return builder;
}
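
A hypothetical call site for the new wrapper; on LLVM older than 3.8 the float-mode argument is silently ignored, so callers simply get a plain builder:

#include "lp_bld_misc.h"

/* ctx would be gallivm->context in real callers. */
static LLVMBuilderRef
make_relaxed_builder(LLVMContextRef ctx)
{
   return lp_create_builder(ctx, LP_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH);
}
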
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.h b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.h
index c127c480d..1b725d10d 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.h
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.h
@@ -42,9 +42,6 @@ extern "C" {
struct lp_generated_code;
-extern void
-gallivm_init_llvm_targets(void);
-
extern LLVMTargetLibraryInfoRef
gallivm_create_target_library_info(const char *triple);
@@ -73,8 +70,20 @@ lp_get_default_memory_manager();
extern void
lp_free_memory_manager(LLVMMCJITMemoryManagerRef memorymgr);
-extern void
-lp_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes);
+extern LLVMValueRef
+lp_get_called_value(LLVMValueRef call);
+
+extern bool
+lp_is_function(LLVMValueRef v);
+
+enum lp_float_mode {
+ LP_FLOAT_MODE_DEFAULT,
+ LP_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH,
+ LP_FLOAT_MODE_UNSAFE_FP_MATH,
+};
+
+extern LLVMBuilderRef
+lp_create_builder(LLVMContextRef ctx, enum lp_float_mode float_mode);
#ifdef __cplusplus
}
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index a4b3a7b83..a1dc61d40 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -1412,8 +1412,8 @@ lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
{
const unsigned dims = bld->dims;
LLVMValueRef width;
- LLVMValueRef height;
- LLVMValueRef depth;
+ LLVMValueRef height = NULL;
+ LLVMValueRef depth = NULL;
lp_build_extract_image_sizes(bld,
&bld->float_size_bld,
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
index f91b761dc..c46749dba 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
@@ -579,10 +579,12 @@ lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
LLVMValueRef rgba8;
struct lp_build_context u8n;
LLVMTypeRef u8n_vec_type;
+ struct lp_type fetch_type;
lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
+ fetch_type = lp_type_uint(bld->texel_type.width);
if (util_format_is_rgba8_variant(bld->format_desc)) {
/*
* Given the format is a rgba8, just read the pixels as is,
@@ -591,7 +593,7 @@ lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
rgba8 = lp_build_gather(bld->gallivm,
bld->texel_type.length,
bld->format_desc->block.bits,
- bld->texel_type.width,
+ fetch_type,
TRUE,
data_ptr, offset, TRUE);
@@ -925,14 +927,16 @@ lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld,
LLVMValueRef rgba8;
if (util_format_is_rgba8_variant(bld->format_desc)) {
+ struct lp_type fetch_type;
/*
* Given the format is a rgba8, just read the pixels as is,
* without any swizzling. Swizzling will be done later.
*/
+ fetch_type = lp_type_uint(bld->texel_type.width);
rgba8 = lp_build_gather(bld->gallivm,
bld->texel_type.length,
bld->format_desc->block.bits,
- bld->texel_type.width,
+ fetch_type,
TRUE,
data_ptr, offset[k][j][i], TRUE);
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 1477a72d6..cb4660e42 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -60,6 +60,7 @@
#include "lp_bld_struct.h"
#include "lp_bld_quad.h"
#include "lp_bld_pack.h"
+#include "lp_bld_intr.h"
/**
@@ -158,7 +159,7 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
lp_build_fetch_rgba_soa(bld->gallivm,
bld->format_desc,
- bld->texel_type,
+ bld->texel_type, TRUE,
data_ptr, offset,
i, j,
bld->cache,
@@ -2405,7 +2406,7 @@ lp_build_fetch_texel(struct lp_build_sample_context *bld,
lp_build_fetch_rgba_soa(bld->gallivm,
bld->format_desc,
- bld->texel_type,
+ bld->texel_type, TRUE,
bld->base_ptr, offset,
i, j,
bld->cache,
@@ -3316,7 +3317,8 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
for (i = 0; i < num_param; ++i) {
if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) {
- LLVMAddAttribute(LLVMGetParam(function, i), LLVMNoAliasAttribute);
+
+ lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);
}
}
@@ -3460,7 +3462,7 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
struct lp_sampler_dynamic_state *dynamic_state,
const struct lp_sampler_size_query_params *params)
{
- LLVMValueRef lod, level, size;
+ LLVMValueRef lod, level = 0, size;
LLVMValueRef first_level = NULL;
int dims, i;
boolean has_array;
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
index 68ac69538..69863ab93 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
@@ -323,16 +323,14 @@ lp_build_tgsi_inst_llvm(
LLVMValueRef
-lp_build_emit_fetch(
+lp_build_emit_fetch_src(
struct lp_build_tgsi_context *bld_base,
- const struct tgsi_full_instruction *inst,
- unsigned src_op,
+ const struct tgsi_full_src_register *reg,
+ enum tgsi_opcode_type stype,
const unsigned chan_index)
{
- const struct tgsi_full_src_register *reg = &inst->Src[src_op];
unsigned swizzle;
LLVMValueRef res;
- enum tgsi_opcode_type stype = tgsi_opcode_infer_src_type(inst->Instruction.Opcode);
if (chan_index == LP_CHAN_ALL) {
swizzle = ~0u;
@@ -360,7 +358,7 @@ lp_build_emit_fetch(
case TGSI_TYPE_DOUBLE:
case TGSI_TYPE_UNTYPED:
/* modifiers on movs assume data is float */
- res = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_ABS, res);
+ res = lp_build_abs(&bld_base->base, res);
break;
case TGSI_TYPE_UNSIGNED:
case TGSI_TYPE_SIGNED:
@@ -413,7 +411,21 @@ lp_build_emit_fetch(
}
return res;
+}
+
+
+LLVMValueRef
+lp_build_emit_fetch(
+ struct lp_build_tgsi_context *bld_base,
+ const struct tgsi_full_instruction *inst,
+ unsigned src_op,
+ const unsigned chan_index)
+{
+ const struct tgsi_full_src_register *reg = &inst->Src[src_op];
+ enum tgsi_opcode_type stype =
+ tgsi_opcode_infer_src_type(inst->Instruction.Opcode);
+ return lp_build_emit_fetch_src(bld_base, reg, stype, chan_index);
}
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
index b6b3fe369..eb632b700 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -458,7 +458,6 @@ struct lp_build_tgsi_soa_context
LLVMValueRef immediates[LP_MAX_INLINED_IMMEDIATES][TGSI_NUM_CHANNELS];
LLVMValueRef temps[LP_MAX_INLINED_TEMPS][TGSI_NUM_CHANNELS];
LLVMValueRef addr[LP_MAX_TGSI_ADDRS][TGSI_NUM_CHANNELS];
- LLVMValueRef preds[LP_MAX_TGSI_PREDS][TGSI_NUM_CHANNELS];
/* We allocate/use this array of temps if (1 << TGSI_FILE_TEMPORARY) is
* set in the indirect_files field.
@@ -552,7 +551,6 @@ struct lp_build_tgsi_aos_context
LLVMValueRef immediates[LP_MAX_INLINED_IMMEDIATES];
LLVMValueRef temps[LP_MAX_INLINED_TEMPS];
LLVMValueRef addr[LP_MAX_TGSI_ADDRS];
- LLVMValueRef preds[LP_MAX_TGSI_PREDS];
/* We allocate/use this array of temps if (1 << TGSI_FILE_TEMPORARY) is
* set in the indirect_files field.
@@ -645,6 +643,13 @@ lp_build_tgsi_inst_llvm(
const struct tgsi_full_instruction *inst);
LLVMValueRef
+lp_build_emit_fetch_src(
+ struct lp_build_tgsi_context *bld_base,
+ const struct tgsi_full_src_register *reg,
+ enum tgsi_opcode_type stype,
+ const unsigned chan_index);
+
+LLVMValueRef
lp_build_emit_fetch(
struct lp_build_tgsi_context *bld_base,
const struct tgsi_full_instruction *inst,
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
index 2e837afe2..dc6568a2d 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
@@ -110,21 +110,6 @@ arr_emit(
bld_base->uint_bld.vec_type, "");
}
-/* TGSI_OPCODE_CLAMP */
-static void
-clamp_emit(
- const struct lp_build_tgsi_action * action,
- struct lp_build_tgsi_context * bld_base,
- struct lp_build_emit_data * emit_data)
-{
- LLVMValueRef tmp;
- tmp = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
- emit_data->args[0],
- emit_data->args[1]);
- emit_data->output[emit_data->chan] = lp_build_emit_llvm_binary(bld_base,
- TGSI_OPCODE_MIN, tmp, emit_data->args[2]);
-}
-
/* DP* Helper */
static void
@@ -368,8 +353,8 @@ exp_emit(
TGSI_OPCODE_EX2, floor_x);
/* src0.x - floor( src0.x ) */
- emit_data->output[TGSI_CHAN_Y] = lp_build_emit_llvm_binary(bld_base,
- TGSI_OPCODE_SUB, emit_data->args[0] /* src0.x */, floor_x);
+ emit_data->output[TGSI_CHAN_Y] =
+ lp_build_sub(&bld_base->base, emit_data->args[0] /* src0.x */, floor_x);
/* 2 ^ src0.x */
emit_data->output[TGSI_CHAN_Z] = lp_build_emit_llvm_unary(bld_base,
@@ -394,8 +379,8 @@ frc_emit(
LLVMValueRef tmp;
tmp = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_FLR,
emit_data->args[0]);
- emit_data->output[emit_data->chan] = lp_build_emit_llvm_binary(bld_base,
- TGSI_OPCODE_SUB, emit_data->args[0], tmp);
+ emit_data->output[emit_data->chan] =
+ lp_build_sub(&bld_base->base, emit_data->args[0], tmp);
}
/* TGSI_OPCODE_KILL_IF */
@@ -499,8 +484,7 @@ log_emit(
LLVMValueRef abs_x, log_abs_x, flr_log_abs_x, ex2_flr_log_abs_x;
/* abs( src0.x) */
- abs_x = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_ABS,
- emit_data->args[0] /* src0.x */);
+ abs_x = lp_build_abs(&bld_base->base, emit_data->args[0] /* src0.x */);
/* log( abs( src0.x ) ) */
log_abs_x = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_LG2,
@@ -771,19 +755,6 @@ const struct lp_build_tgsi_action scs_action = {
scs_emit /* emit */
};
-/* TGSI_OPCODE_SUB */
-static void
-sub_emit(
- const struct lp_build_tgsi_action * action,
- struct lp_build_tgsi_context * bld_base,
- struct lp_build_emit_data * emit_data)
-{
- emit_data->output[emit_data->chan] =
- LLVMBuildFSub(bld_base->base.gallivm->builder,
- emit_data->args[0],
- emit_data->args[1], "");
-}
-
/* TGSI_OPCODE_F2U */
static void
f2u_emit(
@@ -842,26 +813,32 @@ imul_hi_emit(
struct lp_build_tgsi_context * bld_base,
struct lp_build_emit_data * emit_data)
{
- LLVMBuilderRef builder = bld_base->base.gallivm->builder;
struct lp_build_context *int_bld = &bld_base->int_bld;
- struct lp_type type = int_bld->type;
- LLVMValueRef src0, src1;
- LLVMValueRef dst64;
- LLVMTypeRef typeRef;
-
- assert(type.width == 32);
- type.width = 64;
- typeRef = lp_build_vec_type(bld_base->base.gallivm, type);
- src0 = LLVMBuildSExt(builder, emit_data->args[0], typeRef, "");
- src1 = LLVMBuildSExt(builder, emit_data->args[1], typeRef, "");
- dst64 = LLVMBuildMul(builder, src0, src1, "");
- dst64 = LLVMBuildAShr(
- builder, dst64,
- lp_build_const_vec(bld_base->base.gallivm, type, 32), "");
- type.width = 32;
- typeRef = lp_build_vec_type(bld_base->base.gallivm, type);
- emit_data->output[emit_data->chan] =
- LLVMBuildTrunc(builder, dst64, typeRef, "");
+ LLVMValueRef hi_bits;
+
+ assert(int_bld->type.width == 32);
+
+ /* low result bits are tossed away */
+ lp_build_mul_32_lohi(int_bld, emit_data->args[0],
+ emit_data->args[1], &hi_bits);
+ emit_data->output[emit_data->chan] = hi_bits;
+}
+
+static void
+imul_hi_emit_cpu(
+ const struct lp_build_tgsi_action * action,
+ struct lp_build_tgsi_context * bld_base,
+ struct lp_build_emit_data * emit_data)
+{
+ struct lp_build_context *int_bld = &bld_base->int_bld;
+ LLVMValueRef hi_bits;
+
+ assert(int_bld->type.width == 32);
+
+ /* low result bits are tossed away */
+ lp_build_mul_32_lohi_cpu(int_bld, emit_data->args[0],
+ emit_data->args[1], &hi_bits);
+ emit_data->output[emit_data->chan] = hi_bits;
}
/* TGSI_OPCODE_UMUL_HI */
@@ -871,26 +848,32 @@ umul_hi_emit(
struct lp_build_tgsi_context * bld_base,
struct lp_build_emit_data * emit_data)
{
- LLVMBuilderRef builder = bld_base->base.gallivm->builder;
struct lp_build_context *uint_bld = &bld_base->uint_bld;
- struct lp_type type = uint_bld->type;
- LLVMValueRef src0, src1;
- LLVMValueRef dst64;
- LLVMTypeRef typeRef;
-
- assert(type.width == 32);
- type.width = 64;
- typeRef = lp_build_vec_type(bld_base->base.gallivm, type);
- src0 = LLVMBuildZExt(builder, emit_data->args[0], typeRef, "");
- src1 = LLVMBuildZExt(builder, emit_data->args[1], typeRef, "");
- dst64 = LLVMBuildMul(builder, src0, src1, "");
- dst64 = LLVMBuildLShr(
- builder, dst64,
- lp_build_const_vec(bld_base->base.gallivm, type, 32), "");
- type.width = 32;
- typeRef = lp_build_vec_type(bld_base->base.gallivm, type);
- emit_data->output[emit_data->chan] =
- LLVMBuildTrunc(builder, dst64, typeRef, "");
+ LLVMValueRef hi_bits;
+
+ assert(uint_bld->type.width == 32);
+
+ /* low result bits are tossed away */
+ lp_build_mul_32_lohi(uint_bld, emit_data->args[0],
+ emit_data->args[1], &hi_bits);
+ emit_data->output[emit_data->chan] = hi_bits;
+}
+
+static void
+umul_hi_emit_cpu(
+ const struct lp_build_tgsi_action * action,
+ struct lp_build_tgsi_context * bld_base,
+ struct lp_build_emit_data * emit_data)
+{
+ struct lp_build_context *uint_bld = &bld_base->uint_bld;
+ LLVMValueRef hi_bits;
+
+ assert(uint_bld->type.width == 32);
+
+ /* low result bits are tossed away */
+ lp_build_mul_32_lohi_cpu(uint_bld, emit_data->args[0],
+ emit_data->args[1], &hi_bits);
+ emit_data->output[emit_data->chan] = hi_bits;
}
/* TGSI_OPCODE_MAX */
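Per lane, lp_build_mul_32_lohi and its _cpu variant used in the two hunks
above compute the full 32x32 -> 64 bit product and hand back both halves; a
scalar C model of those semantics (helper names hypothetical):

   #include <stdint.h>

   /* Signed variant: full 64-bit product, high half returned,
    * low half (tossed away by IMUL_HI) written to *lo. */
   static int32_t imul_hi(int32_t a, int32_t b, int32_t *lo)
   {
      int64_t prod = (int64_t)a * (int64_t)b;
      *lo = (int32_t)prod;
      return (int32_t)(prod >> 32);
   }

   /* Unsigned variant, as used by UMUL_HI. */
   static uint32_t umul_hi(uint32_t a, uint32_t b, uint32_t *lo)
   {
      uint64_t prod = (uint64_t)a * (uint64_t)b;
      *lo = (uint32_t)prod;
      return (uint32_t)(prod >> 32);
   }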
@@ -945,7 +928,7 @@ xpd_helper(
tmp0 = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MUL, a, b);
tmp1 = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MUL, c, d);
- return lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_SUB, tmp0, tmp1);
+ return lp_build_sub(&bld_base->base, tmp0, tmp1);
}
static void
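xpd_helper is the per-channel building block of the XPD cross product: it
computes a*b - c*d. A scalar sketch of that arithmetic (standalone form is
hypothetical):

   /* One channel of a cross product is one call of this helper,
    * e.g. dst.x = src0.y*src1.z - src0.z*src1.y. */
   static float xpd_helper(float a, float b, float c, float d)
   {
      return a * b - c * d;
   }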
@@ -1332,7 +1315,6 @@ lp_set_default_actions(struct lp_build_tgsi_context * bld_base)
bld_base->op_actions[TGSI_OPCODE_ADD].emit = add_emit;
bld_base->op_actions[TGSI_OPCODE_ARR].emit = arr_emit;
- bld_base->op_actions[TGSI_OPCODE_CLAMP].emit = clamp_emit;
bld_base->op_actions[TGSI_OPCODE_END].emit = end_emit;
bld_base->op_actions[TGSI_OPCODE_FRC].emit = frc_emit;
bld_base->op_actions[TGSI_OPCODE_LRP].emit = lrp_emit;
@@ -1341,7 +1323,6 @@ lp_set_default_actions(struct lp_build_tgsi_context * bld_base)
bld_base->op_actions[TGSI_OPCODE_MUL].emit = mul_emit;
bld_base->op_actions[TGSI_OPCODE_DIV].emit = fdiv_emit;
bld_base->op_actions[TGSI_OPCODE_RCP].emit = rcp_emit;
- bld_base->op_actions[TGSI_OPCODE_SUB].emit = sub_emit;
bld_base->op_actions[TGSI_OPCODE_UARL].emit = mov_emit;
bld_base->op_actions[TGSI_OPCODE_F2U].emit = f2u_emit;
@@ -1358,6 +1339,7 @@ lp_set_default_actions(struct lp_build_tgsi_context * bld_base)
bld_base->op_actions[TGSI_OPCODE_DMAX].emit = fmax_emit;
bld_base->op_actions[TGSI_OPCODE_DMIN].emit = fmin_emit;
bld_base->op_actions[TGSI_OPCODE_DMUL].emit = mul_emit;
+ bld_base->op_actions[TGSI_OPCODE_DDIV].emit = fdiv_emit;
bld_base->op_actions[TGSI_OPCODE_D2F].emit = d2f_emit;
bld_base->op_actions[TGSI_OPCODE_D2I].emit = d2i_emit;
@@ -1400,18 +1382,6 @@ lp_set_default_actions(struct lp_build_tgsi_context * bld_base)
* intrinsics.
*/
-/* TGSI_OPCODE_ABS (CPU Only)*/
-
-static void
-abs_emit_cpu(
- const struct lp_build_tgsi_action * action,
- struct lp_build_tgsi_context * bld_base,
- struct lp_build_emit_data * emit_data)
-{
- emit_data->output[emit_data->chan] = lp_build_abs(&bld_base->base,
- emit_data->args[0]);
-}
-
/* TGSI_OPCODE_ADD (CPU Only) */
static void
add_emit_cpu(
@@ -2072,19 +2042,6 @@ ssg_emit_cpu(
emit_data->args[0]);
}
-/* TGSI_OPCODE_SUB (CPU Only) */
-
-static void
-sub_emit_cpu(
- const struct lp_build_tgsi_action * action,
- struct lp_build_tgsi_context * bld_base,
- struct lp_build_emit_data * emit_data)
-{
- emit_data->output[emit_data->chan] = lp_build_sub(&bld_base->base,
- emit_data->args[0],
- emit_data->args[1]);
-}
-
/* TGSI_OPCODE_TRUNC (CPU Only) */
static void
@@ -2576,7 +2533,6 @@ lp_set_default_actions_cpu(
struct lp_build_tgsi_context * bld_base)
{
lp_set_default_actions(bld_base);
- bld_base->op_actions[TGSI_OPCODE_ABS].emit = abs_emit_cpu;
bld_base->op_actions[TGSI_OPCODE_ADD].emit = add_emit_cpu;
bld_base->op_actions[TGSI_OPCODE_AND].emit = and_emit_cpu;
bld_base->op_actions[TGSI_OPCODE_ARL].emit = arl_emit_cpu;
@@ -2603,6 +2559,8 @@ lp_set_default_actions_cpu(
bld_base->op_actions[TGSI_OPCODE_ISHR].emit = ishr_emit_cpu;
bld_base->op_actions[TGSI_OPCODE_ISLT].emit = islt_emit_cpu;
bld_base->op_actions[TGSI_OPCODE_ISSG].emit = issg_emit_cpu;
+ bld_base->op_actions[TGSI_OPCODE_IMUL_HI].emit = imul_hi_emit_cpu;
+ bld_base->op_actions[TGSI_OPCODE_UMUL_HI].emit = umul_hi_emit_cpu;
bld_base->op_actions[TGSI_OPCODE_LG2].emit = lg2_emit_cpu;
bld_base->op_actions[TGSI_OPCODE_LOG].emit = log_emit_cpu;
@@ -2624,7 +2582,6 @@ lp_set_default_actions_cpu(
bld_base->op_actions[TGSI_OPCODE_SLT].emit = slt_emit_cpu;
bld_base->op_actions[TGSI_OPCODE_SNE].emit = sne_emit_cpu;
bld_base->op_actions[TGSI_OPCODE_SSG].emit = ssg_emit_cpu;
- bld_base->op_actions[TGSI_OPCODE_SUB].emit = sub_emit_cpu;
bld_base->op_actions[TGSI_OPCODE_TRUNC].emit = trunc_emit_cpu;
bld_base->rsq_action.emit = recip_sqrt_emit_cpu;
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
index 610283d79..58c39facf 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
@@ -256,10 +256,6 @@ lp_emit_store_aos(
ptr = bld->addr[reg->Indirect.Index];
break;
- case TGSI_FILE_PREDICATE:
- ptr = bld->preds[reg->Register.Index];
- break;
-
default:
assert(0);
return;
@@ -267,43 +263,6 @@ lp_emit_store_aos(
if (!ptr)
return;
- /*
- * Predicate
- */
-
- if (inst->Instruction.Predicate) {
- LLVMValueRef pred;
-
- assert(inst->Predicate.Index < LP_MAX_TGSI_PREDS);
-
- pred = LLVMBuildLoad(builder,
- bld->preds[inst->Predicate.Index], "");
-
- /*
- * Convert the value to an integer mask.
- */
- pred = lp_build_compare(bld->bld_base.base.gallivm,
- bld->bld_base.base.type,
- PIPE_FUNC_NOTEQUAL,
- pred,
- bld->bld_base.base.zero);
-
- if (inst->Predicate.Negate) {
- pred = LLVMBuildNot(builder, pred, "");
- }
-
- pred = bld->bld_base.emit_swizzle(&bld->bld_base, pred,
- inst->Predicate.SwizzleX,
- inst->Predicate.SwizzleY,
- inst->Predicate.SwizzleZ,
- inst->Predicate.SwizzleW);
-
- if (mask) {
- mask = LLVMBuildAnd(builder, mask, pred, "");
- } else {
- mask = pred;
- }
- }
/*
* Writemask
@@ -442,11 +401,6 @@ lp_emit_declaration_aos(
bld->addr[idx] = lp_build_alloca(gallivm, vec_type, "");
break;
- case TGSI_FILE_PREDICATE:
- assert(idx < LP_MAX_TGSI_PREDS);
- bld->preds[idx] = lp_build_alloca(gallivm, vec_type, "");
- break;
-
case TGSI_FILE_SAMPLER_VIEW:
/*
* The target stored here MUST match whatever there actually
@@ -521,7 +475,7 @@ lp_emit_instruction_aos(
case TGSI_OPCODE_RSQ:
/* TGSI_OPCODE_RECIPSQRT */
src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
- tmp0 = lp_build_emit_llvm_unary(&bld->bld_base, TGSI_OPCODE_ABS, src0);
+ tmp0 = lp_build_abs(&bld->bld_base.base, src0);
dst0 = lp_build_rsqrt(&bld->bld_base.base, tmp0);
break;
@@ -591,12 +545,6 @@ lp_emit_instruction_aos(
dst0 = lp_build_add(&bld->bld_base.base, tmp0, src2);
break;
- case TGSI_OPCODE_SUB:
- src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
- src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1, LP_CHAN_ALL);
- dst0 = lp_build_sub(&bld->bld_base.base, src0, src1);
- break;
-
case TGSI_OPCODE_LRP:
src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1, LP_CHAN_ALL);
@@ -615,14 +563,6 @@ lp_emit_instruction_aos(
dst0 = lp_build_sub(&bld->bld_base.base, src0, tmp0);
break;
- case TGSI_OPCODE_CLAMP:
- src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
- src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1, LP_CHAN_ALL);
- src2 = lp_build_emit_fetch(&bld->bld_base, inst, 2, LP_CHAN_ALL);
- tmp0 = lp_build_max(&bld->bld_base.base, src0, src1);
- dst0 = lp_build_min(&bld->bld_base.base, tmp0, src2);
- break;
-
case TGSI_OPCODE_FLR:
src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
dst0 = lp_build_floor(&bld->bld_base.base, src0);
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
index f8f43a561..e0cc0af27 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
@@ -305,8 +305,7 @@ analyse_instruction(struct analysis_context *ctx,
} else if (dst->File == TGSI_FILE_OUTPUT) {
regs = info->output;
max_regs = ARRAY_SIZE(info->output);
- } else if (dst->File == TGSI_FILE_ADDRESS ||
- dst->File == TGSI_FILE_PREDICATE) {
+ } else if (dst->File == TGSI_FILE_ADDRESS) {
continue;
} else {
assert(0);
@@ -389,8 +388,7 @@ analyse_instruction(struct analysis_context *ctx,
memset(res, 0, sizeof res);
- if (!inst->Instruction.Predicate &&
- !inst->Instruction.Saturate) {
+ if (!inst->Instruction.Saturate) {
for (chan = 0; chan < 4; ++chan) {
if (dst->WriteMask & (1 << chan)) {
if (inst->Instruction.Opcode == TGSI_OPCODE_MOV) {
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 6871795b4..bfa32b9ad 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -753,30 +753,21 @@ static void lp_exec_default(struct lp_exec_mask *mask,
*/
static void lp_exec_mask_store(struct lp_exec_mask *mask,
struct lp_build_context *bld_store,
- LLVMValueRef pred,
LLVMValueRef val,
LLVMValueRef dst_ptr)
{
LLVMBuilderRef builder = mask->bld->gallivm->builder;
+ LLVMValueRef exec_mask = mask->has_mask ? mask->exec_mask : NULL;
assert(lp_check_value(bld_store->type, val));
assert(LLVMGetTypeKind(LLVMTypeOf(dst_ptr)) == LLVMPointerTypeKind);
assert(LLVMGetElementType(LLVMTypeOf(dst_ptr)) == LLVMTypeOf(val));
- /* Mix the predicate and execution mask */
- if (mask->has_mask) {
- if (pred) {
- pred = LLVMBuildAnd(builder, pred, mask->exec_mask, "");
- } else {
- pred = mask->exec_mask;
- }
- }
-
- if (pred) {
+ if (exec_mask) {
LLVMValueRef res, dst;
dst = LLVMBuildLoad(builder, dst_ptr, "");
- res = lp_build_select(bld_store, pred, val, dst);
+ res = lp_build_select(bld_store, exec_mask, val, dst);
LLVMBuildStore(builder, res, dst_ptr);
} else
LLVMBuildStore(builder, val, dst_ptr);
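A per-lane model of the simplified masked store above, assuming an
all-ones/all-zeros lane mask; the array-parameter shape is an assumption of
this sketch:

   #include <stdint.h>

   /* Load destination, select new vs. old per lane, store back;
    * lanes with a zero mask keep their previous contents. */
   static void exec_mask_store(const int32_t mask[], const float val[],
                               float dst[], int n)
   {
      for (int i = 0; i < n; i++)
         dst[i] = mask[i] ? val[i] : dst[i];
   }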
@@ -1036,22 +1027,12 @@ emit_mask_scatter(struct lp_build_tgsi_soa_context *bld,
LLVMValueRef base_ptr,
LLVMValueRef indexes,
LLVMValueRef values,
- struct lp_exec_mask *mask,
- LLVMValueRef pred)
+ struct lp_exec_mask *mask)
{
struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
LLVMBuilderRef builder = gallivm->builder;
unsigned i;
-
- /* Mix the predicate and execution mask */
- if (mask->has_mask) {
- if (pred) {
- pred = LLVMBuildAnd(builder, pred, mask->exec_mask, "");
- }
- else {
- pred = mask->exec_mask;
- }
- }
+ LLVMValueRef pred = mask->has_mask ? mask->exec_mask : NULL;
/*
* Loop over elements of index_vec, store scalar value.
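The scatter itself stays a scalar loop; a C model of what the generated code
does per element (the array-parameter shape is an assumption of this sketch):

   #include <stdint.h>

   /* base_ptr[index[i]] = value[i] for every lane whose exec mask
    * lane is set; with no mask, all lanes are stored. */
   static void mask_scatter(float *base_ptr, const int32_t index[],
                            const float value[], const int32_t mask[],
                            int n)
   {
      for (int i = 0; i < n; i++)
         if (!mask || mask[i])
            base_ptr[index[i]] = value[i];
   }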
@@ -1733,74 +1714,6 @@ emit_fetch_deriv(
*ddy = lp_build_ddy(&bld->bld_base.base, src);
}
-
-/**
- * Predicate.
- */
-static void
-emit_fetch_predicate(
- struct lp_build_tgsi_soa_context *bld,
- const struct tgsi_full_instruction *inst,
- LLVMValueRef *pred)
-{
- LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
- unsigned index;
- unsigned char swizzles[4];
- LLVMValueRef unswizzled[4] = {NULL, NULL, NULL, NULL};
- LLVMValueRef value;
- unsigned chan;
-
- if (!inst->Instruction.Predicate) {
- TGSI_FOR_EACH_CHANNEL( chan ) {
- pred[chan] = NULL;
- }
- return;
- }
-
- swizzles[0] = inst->Predicate.SwizzleX;
- swizzles[1] = inst->Predicate.SwizzleY;
- swizzles[2] = inst->Predicate.SwizzleZ;
- swizzles[3] = inst->Predicate.SwizzleW;
-
- index = inst->Predicate.Index;
- assert(index < LP_MAX_TGSI_PREDS);
-
- TGSI_FOR_EACH_CHANNEL( chan ) {
- unsigned swizzle = swizzles[chan];
-
- /*
- * Only fetch the predicate register channels that are actually listed
- * in the swizzles
- */
- if (!unswizzled[swizzle]) {
- value = LLVMBuildLoad(builder,
- bld->preds[index][swizzle], "");
-
- /*
- * Convert the value to an integer mask.
- *
- * TODO: Short-circuit this comparison -- a D3D setp_xx instruction
- * is needlessly causing two comparisons due to storing the intermediate
- * result as float vector instead of an integer mask vector.
- */
- value = lp_build_compare(bld->bld_base.base.gallivm,
- bld->bld_base.base.type,
- PIPE_FUNC_NOTEQUAL,
- value,
- bld->bld_base.base.zero);
- if (inst->Predicate.Negate) {
- value = LLVMBuildNot(builder, value, "");
- }
-
- unswizzled[swizzle] = value;
- } else {
- value = unswizzled[swizzle];
- }
-
- pred[chan] = value;
- }
-}
-
/**
* store an array of 8 64-bit into two arrays of 8 floats
* i.e.
@@ -1813,7 +1726,6 @@ emit_fetch_predicate(
static void
emit_store_64bit_chan(struct lp_build_tgsi_context *bld_base,
LLVMValueRef chan_ptr, LLVMValueRef chan_ptr2,
- LLVMValueRef pred,
LLVMValueRef value)
{
struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
@@ -1841,8 +1753,8 @@ emit_store_64bit_chan(struct lp_build_tgsi_context *bld_base,
bld_base->base.type.length),
"");
- lp_exec_mask_store(&bld->exec_mask, float_bld, pred, temp, chan_ptr);
- lp_exec_mask_store(&bld->exec_mask, float_bld, pred, temp2, chan_ptr2);
+ lp_exec_mask_store(&bld->exec_mask, float_bld, temp, chan_ptr);
+ lp_exec_mask_store(&bld->exec_mask, float_bld, temp2, chan_ptr2);
}
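Conceptually, each 64-bit value is split into two 32-bit halves that land in
two separate per-channel arrays (chan_ptr and chan_ptr2); a scalar model of
the shuffle above, with the half ordering an assumption noted in the
comments:

   #include <stdint.h>
   #include <string.h>

   /* Assumed layout: low 32 bits of the double's bit pattern go to the
    * first channel slot, high 32 bits to the second. */
   static void store_64bit_chan(double value, float *chan_lo, float *chan_hi)
   {
      uint64_t bits;
      uint32_t lo, hi;

      memcpy(&bits, &value, sizeof bits);
      lo = (uint32_t)bits;
      hi = (uint32_t)(bits >> 32);
      memcpy(chan_lo, &lo, sizeof lo);
      memcpy(chan_hi, &hi, sizeof hi);
   }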
/**
@@ -1854,7 +1766,6 @@ emit_store_chan(
const struct tgsi_full_instruction *inst,
unsigned index,
unsigned chan_index,
- LLVMValueRef pred,
LLVMValueRef value)
{
struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
@@ -1917,7 +1828,7 @@ emit_store_chan(
/* Scatter store values into output registers */
emit_mask_scatter(bld, outputs_array, index_vec, value,
- &bld->exec_mask, pred);
+ &bld->exec_mask);
}
else {
LLVMValueRef out_ptr = lp_get_output_ptr(bld, reg->Register.Index,
@@ -1927,9 +1838,9 @@ emit_store_chan(
LLVMValueRef out_ptr2 = lp_get_output_ptr(bld, reg->Register.Index,
chan_index + 1);
emit_store_64bit_chan(bld_base, out_ptr, out_ptr2,
- pred, value);
+ value);
} else
- lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, out_ptr);
+ lp_exec_mask_store(&bld->exec_mask, float_bld, value, out_ptr);
}
break;
@@ -1955,7 +1866,7 @@ emit_store_chan(
/* Scatter store values into temp registers */
emit_mask_scatter(bld, temps_array, index_vec, value,
- &bld->exec_mask, pred);
+ &bld->exec_mask);
}
else {
LLVMValueRef temp_ptr;
@@ -1966,10 +1877,10 @@ emit_store_chan(
reg->Register.Index,
chan_index + 1);
emit_store_64bit_chan(bld_base, temp_ptr, temp_ptr2,
- pred, value);
+ value);
}
else
- lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, temp_ptr);
+ lp_exec_mask_store(&bld->exec_mask, float_bld, value, temp_ptr);
}
break;
@@ -1977,17 +1888,10 @@ emit_store_chan(
assert(dtype == TGSI_TYPE_SIGNED);
assert(LLVMTypeOf(value) == int_bld->vec_type);
value = LLVMBuildBitCast(builder, value, int_bld->vec_type, "");
- lp_exec_mask_store(&bld->exec_mask, int_bld, pred, value,
+ lp_exec_mask_store(&bld->exec_mask, int_bld, value,
bld->addr[reg->Register.Index][chan_index]);
break;
- case TGSI_FILE_PREDICATE:
- assert(LLVMTypeOf(value) == float_bld->vec_type);
- value = LLVMBuildBitCast(builder, value, float_bld->vec_type, "");
- lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value,
- bld->preds[reg->Register.Index][chan_index]);
- break;
-
default:
assert( 0 );
}
@@ -2037,18 +1941,14 @@ emit_store(
{
unsigned chan_index;
- struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
enum tgsi_opcode_type dtype = tgsi_opcode_infer_dst_type(inst->Instruction.Opcode);
- if(info->num_dst) {
- LLVMValueRef pred[TGSI_NUM_CHANNELS];
-
- emit_fetch_predicate( bld, inst, pred );
+ if(info->num_dst) {
TGSI_FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
if (tgsi_type_is_64bit(dtype) && (chan_index == 1 || chan_index == 3))
continue;
- emit_store_chan(bld_base, inst, 0, chan_index, pred[chan_index], dst[chan_index]);
+ emit_store_chan(bld_base, inst, 0, chan_index, dst[chan_index]);
}
}
}
@@ -2998,15 +2898,6 @@ lp_emit_declaration_soa(
}
break;
- case TGSI_FILE_PREDICATE:
- assert(last < LP_MAX_TGSI_PREDS);
- for (idx = first; idx <= last; ++idx) {
- for (i = 0; i < TGSI_NUM_CHANNELS; i++)
- bld->preds[idx][i] = lp_build_alloca(gallivm, vec_type,
- "predicate");
- }
- break;
-
case TGSI_FILE_SAMPLER_VIEW:
/*
* The target stored here MUST match whatever there actually
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_type.h b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_type.h
index 7fb449fd0..afe8722b0 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_type.h
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_type.h
@@ -41,6 +41,10 @@
#include "pipe/p_compiler.h"
#include "gallivm/lp_bld.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
/**
* Native SIMD architecture width available at runtime.
*
@@ -449,5 +453,8 @@ lp_build_context_init(struct lp_build_context *bld,
unsigned
lp_build_count_ir_module(LLVMModuleRef module);
+#ifdef __cplusplus
+}
+#endif
#endif /* !LP_BLD_TYPE_H */