| author | Jonathan Gray <jsg@cvs.openbsd.org> | 2017-12-31 07:12:27 +0000 |
|---|---|---|
| committer | Jonathan Gray <jsg@cvs.openbsd.org> | 2017-12-31 07:12:27 +0000 |
| commit | 051645c92924bf915d82bf219f2ed67309b5577a (patch) | |
| tree | 4aae126dd8e5a18c6a9926a5468d1561e6038a07 /lib/mesa/src/gallium/auxiliary/gallivm | |
| parent | 2dae6fe6f74cf7fb9fd65285302c0331d9786b00 (diff) | |
Merge Mesa 17.2.8
Diffstat (limited to 'lib/mesa/src/gallium/auxiliary/gallivm')
27 files changed, 1871 insertions, 740 deletions
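Before the hunks themselves, a note on the headline addition in lp_bld_arit.c: the new lp_build_mul_32_lohi/lp_build_mul_32_lohi_cpu helpers perform a widening 32x32 -> 64 bit multiply, returning the low 32 bits and passing the high 32 bits back through *res_hi. A minimal scalar sketch of those semantics (plain C for reference, not gallivm IR; the function names below are illustrative, not Mesa's):

```c
#include <stdint.h>
#include <stdio.h>

/* Reference semantics of the widening 32x32 -> 64 bit multiply:
 * the low 32 bits are returned, the high 32 bits go to *res_hi.
 * This models what the generic lp_build_mul_32_lohi path emits:
 * zero/sign-extend to 64 bit, multiply, truncate for the low half,
 * shift-then-truncate for the high half. */
static uint32_t mul_32_lohi_u(uint32_t a, uint32_t b, uint32_t *res_hi)
{
   uint64_t wide = (uint64_t)a * (uint64_t)b;  /* zero-extended multiply */
   *res_hi = (uint32_t)(wide >> 32);           /* high bits */
   return (uint32_t)wide;                      /* low bits (truncation) */
}

static int32_t mul_32_lohi_s(int32_t a, int32_t b, int32_t *res_hi)
{
   int64_t wide = (int64_t)a * (int64_t)b;     /* sign-extended multiply */
   /* logical vs. arithmetic shift is moot here: truncation keeps only
    * bits 32..63 of the product either way */
   *res_hi = (int32_t)(wide >> 32);
   return (int32_t)wide;
}

int main(void)
{
   uint32_t hi;
   uint32_t lo = mul_32_lohi_u(0xffffffffu, 0xffffffffu, &hi);
   printf("hi=%08x lo=%08x\n", hi, lo);  /* hi=fffffffe lo=00000001 */
   return 0;
}
```

The separate _cpu variant exists because, per the comment in the hunk below, LLVM does not recognize this zext/mul/trunc pattern as a widening multiply on x86 SIMD and produces poor code, so the patch hand-picks pmuludq/pmuldq-style intrinsics for vector lengths 4 and 8 instead.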
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_arit.c index 3ea073433..04f86bef2 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -1091,6 +1091,180 @@ lp_build_mul(struct lp_build_context *bld, return res; } +/* + * Widening mul, valid for 32x32 bit -> 64bit only. + * Result is low 32bits, high bits returned in res_hi. + * + * Emits code that is meant to be compiled for the host CPU. + */ +LLVMValueRef +lp_build_mul_32_lohi_cpu(struct lp_build_context *bld, + LLVMValueRef a, + LLVMValueRef b, + LLVMValueRef *res_hi) +{ + struct gallivm_state *gallivm = bld->gallivm; + LLVMBuilderRef builder = gallivm->builder; + + assert(bld->type.width == 32); + assert(bld->type.floating == 0); + assert(bld->type.fixed == 0); + assert(bld->type.norm == 0); + + /* + * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces + * for x86 simd is atrocious (even if the high bits weren't required), + * trying to handle real 64bit inputs (which of course can't happen due + * to using 64bit umul with 32bit numbers zero-extended to 64bit, but + * apparently llvm does not recognize this widening mul). This includes 6 + * (instead of 2) pmuludq plus extra adds and shifts + * The same story applies to signed mul, albeit fixing this requires sse41. + * https://llvm.org/bugs/show_bug.cgi?id=30845 + * So, whip up our own code, albeit only for length 4 and 8 (which + * should be good enough)... + */ + if ((bld->type.length == 4 || bld->type.length == 8) && + ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) || + util_cpu_caps.has_sse4_1)) { + const char *intrinsic = NULL; + LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd; + LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec; + struct lp_type type_wide = lp_wider_type(bld->type); + LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide); + unsigned i; + for (i = 0; i < bld->type.length; i += 2) { + shuf[i] = lp_build_const_int32(gallivm, i+1); + shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); + } + shuf_vec = LLVMConstVector(shuf, bld->type.length); + aeven = a; + beven = b; + aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, ""); + bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, ""); + + if (util_cpu_caps.has_avx2 && bld->type.length == 8) { + if (bld->type.sign) { + intrinsic = "llvm.x86.avx2.pmul.dq"; + } else { + intrinsic = "llvm.x86.avx2.pmulu.dq"; + } + muleven = lp_build_intrinsic_binary(builder, intrinsic, + wider_type, aeven, beven); + mulodd = lp_build_intrinsic_binary(builder, intrinsic, + wider_type, aodd, bodd); + } + else { + /* for consistent naming look elsewhere... */ + if (bld->type.sign) { + intrinsic = "llvm.x86.sse41.pmuldq"; + } else { + intrinsic = "llvm.x86.sse2.pmulu.dq"; + } + /* + * XXX If we only have AVX but not AVX2 this is a pain. + * lp_build_intrinsic_binary_anylength() can't handle it + * (due to src and dst type not being identical). 
+ */ + if (bld->type.length == 8) { + LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi; + LLVMValueRef aoddlo, aoddhi, boddlo, boddhi; + LLVMValueRef muleven2[2], mulodd2[2]; + struct lp_type type_wide_half = type_wide; + LLVMTypeRef wtype_half; + type_wide_half.length = 2; + wtype_half = lp_build_vec_type(gallivm, type_wide_half); + aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4); + aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4); + bevenlo = lp_build_extract_range(gallivm, beven, 0, 4); + bevenhi = lp_build_extract_range(gallivm, beven, 4, 4); + aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4); + aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4); + boddlo = lp_build_extract_range(gallivm, bodd, 0, 4); + boddhi = lp_build_extract_range(gallivm, bodd, 4, 4); + muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic, + wtype_half, aevenlo, bevenlo); + mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic, + wtype_half, aoddlo, boddlo); + muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic, + wtype_half, aevenhi, bevenhi); + mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic, + wtype_half, aoddhi, boddhi); + muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2); + mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2); + + } + else { + muleven = lp_build_intrinsic_binary(builder, intrinsic, + wider_type, aeven, beven); + mulodd = lp_build_intrinsic_binary(builder, intrinsic, + wider_type, aodd, bodd); + } + } + muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, ""); + mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, ""); + + for (i = 0; i < bld->type.length; i += 2) { + shuf[i] = lp_build_const_int32(gallivm, i + 1); + shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length); + } + shuf_vec = LLVMConstVector(shuf, bld->type.length); + *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, ""); + + for (i = 0; i < bld->type.length; i += 2) { + shuf[i] = lp_build_const_int32(gallivm, i); + shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length); + } + shuf_vec = LLVMConstVector(shuf, bld->type.length); + return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, ""); + } + else { + return lp_build_mul_32_lohi(bld, a, b, res_hi); + } +} + + +/* + * Widening mul, valid for 32x32 bit -> 64bit only. + * Result is low 32bits, high bits returned in res_hi. + * + * Emits generic code. + */ +LLVMValueRef +lp_build_mul_32_lohi(struct lp_build_context *bld, + LLVMValueRef a, + LLVMValueRef b, + LLVMValueRef *res_hi) +{ + struct gallivm_state *gallivm = bld->gallivm; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef tmp, shift, res_lo; + struct lp_type type_tmp; + LLVMTypeRef wide_type, narrow_type; + + type_tmp = bld->type; + narrow_type = lp_build_vec_type(gallivm, type_tmp); + type_tmp.width *= 2; + wide_type = lp_build_vec_type(gallivm, type_tmp); + shift = lp_build_const_vec(gallivm, type_tmp, 32); + + if (bld->type.sign) { + a = LLVMBuildSExt(builder, a, wide_type, ""); + b = LLVMBuildSExt(builder, b, wide_type, ""); + } else { + a = LLVMBuildZExt(builder, a, wide_type, ""); + b = LLVMBuildZExt(builder, b, wide_type, ""); + } + tmp = LLVMBuildMul(builder, a, b, ""); + + res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, ""); + + /* Since we truncate anyway, LShr and AShr are equivalent. 
*/ + tmp = LLVMBuildLShr(builder, tmp, shift, ""); + *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, ""); + + return res_lo; +} + /* a * b + c */ LLVMValueRef @@ -1198,7 +1372,9 @@ lp_build_div(struct lp_build_context *bld, return LLVMConstUDiv(a, b); } - if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || + /* fast rcp is disabled (just uses div), so makes no sense to try that */ + if(FALSE && + ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) && type.floating) return lp_build_mul(bld, a, lp_build_rcp(bld, b)); diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_arit.h index 622b930a9..2a4137a67 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_arit.h +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_arit.h @@ -77,6 +77,18 @@ lp_build_mul(struct lp_build_context *bld, LLVMValueRef b); LLVMValueRef +lp_build_mul_32_lohi_cpu(struct lp_build_context *bld, + LLVMValueRef a, + LLVMValueRef b, + LLVMValueRef *res_hi); + +LLVMValueRef +lp_build_mul_32_lohi(struct lp_build_context *bld, + LLVMValueRef a, + LLVMValueRef b, + LLVMValueRef *res_hi); + +LLVMValueRef lp_build_mul_imm(struct lp_build_context *bld, LLVMValueRef a, int b); diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_conv.c index 69d24a55b..c688965a7 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_conv.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_conv.c @@ -456,21 +456,21 @@ int lp_build_conv_auto(struct gallivm_state *gallivm, src_type.sign == dst_type->sign) return num_dsts; - /* Special case 4x4f -> 1x16ub or 2x8f -> 1x16ub + /* Special case 4x4x32 -> 1x16x8 or 2x8x32 -> 1x16x8 */ - if (src_type.floating == 1 && - src_type.fixed == 0 && - src_type.sign == 1 && - src_type.norm == 0 && + if (src_type.norm == 0 && src_type.width == 32 && + src_type.fixed == 0 && dst_type->floating == 0 && dst_type->fixed == 0 && - dst_type->sign == 0 && - dst_type->norm == 1 && - dst_type->width == 8) - { - /* Special case 4x4f --> 1x16ub */ + dst_type->width == 8 && + + ((src_type.floating == 1 && src_type.sign == 1 && dst_type->norm == 1) || + (src_type.floating == 0 && dst_type->floating == 0 && + src_type.sign == dst_type->sign && dst_type->norm == 0))) { + + /* Special case 4x4x32 --> 1x16x8 */ if (src_type.length == 4 && (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec)) { @@ -481,7 +481,7 @@ int lp_build_conv_auto(struct gallivm_state *gallivm, return num_dsts; } - /* Special case 2x8f --> 1x16ub */ + /* Special case 2x8x32 --> 1x16x8 */ if (src_type.length == 8 && util_cpu_caps.has_avx) { @@ -497,8 +497,25 @@ int lp_build_conv_auto(struct gallivm_state *gallivm, if (src_type.width == dst_type->width) { lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts); } else { - for (i = 0; i < num_srcs; ++i) { - lp_build_conv(gallivm, src_type, *dst_type, &src[i], 1, &dst[i], 1); + /* + * If dst_width is 16 bits and src_width 32 and the dst vector size + * 64bit, try feeding 2 vectors at once so pack intrinsics can be used. + * (For AVX, this isn't needed, since we usually get 256bit src and + * 128bit dst vectors which works ok. If we do AVX2 pack this should + * be extended but need to be able to tell conversion code about pack + * ordering first.) 
+ */ + unsigned ratio = 1; + if (src_type.width == 2 * dst_type->width && + src_type.length == dst_type->length && + dst_type->floating == 0 && (num_srcs % 2 == 0) && + dst_type->width * dst_type->length == 64) { + ratio = 2; + num_dsts /= 2; + dst_type->length *= 2; + } + for (i = 0; i < num_dsts; i++) { + lp_build_conv(gallivm, src_type, *dst_type, &src[i*ratio], ratio, &dst[i], 1); } } @@ -541,21 +558,25 @@ lp_build_conv(struct gallivm_state *gallivm, num_tmps = num_srcs; - /* Special case 4x4f --> 1x16ub, 2x4f -> 1x8ub, 1x4f -> 1x4ub + /* + * Special case 4x4x32 --> 1x16x8, 2x4x32 -> 1x8x8, 1x4x32 -> 1x4x8 + * Only float -> s/unorm8 and (u)int32->(u)int8. + * XXX: This should cover all interesting backend cases for 8 bit, + * but should use same strategy if dst is 16 bit. */ - if (src_type.floating == 1 && - src_type.fixed == 0 && - src_type.sign == 1 && - src_type.norm == 0 && + if (src_type.norm == 0 && src_type.width == 32 && src_type.length == 4 && + src_type.fixed == 0 && dst_type.floating == 0 && dst_type.fixed == 0 && - dst_type.sign == 0 && - dst_type.norm == 1 && dst_type.width == 8 && + ((src_type.floating == 1 && src_type.sign == 1 && dst_type.norm == 1) || + (src_type.floating == 0 && dst_type.floating == 0 && + src_type.sign == dst_type.sign && dst_type.norm == 0)) && + ((dst_type.length == 16 && 4 * num_dsts == num_srcs) || (num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) && @@ -564,7 +585,7 @@ lp_build_conv(struct gallivm_state *gallivm, struct lp_build_context bld; struct lp_type int16_type, int32_type; struct lp_type dst_type_ext = dst_type; - LLVMValueRef const_255f; + LLVMValueRef const_scale; unsigned i, j; lp_build_context_init(&bld, gallivm, src_type); @@ -580,14 +601,54 @@ lp_build_conv(struct gallivm_state *gallivm, int32_type.length /= 4; int32_type.sign = 1; - const_255f = lp_build_const_vec(gallivm, src_type, 255.0f); + const_scale = lp_build_const_vec(gallivm, src_type, lp_const_scale(dst_type)); for (i = 0; i < num_dsts; ++i, src += 4) { LLVMValueRef lo, hi; - for (j = 0; j < dst_type.length / 4; ++j) { - tmp[j] = LLVMBuildFMul(builder, src[j], const_255f, ""); - tmp[j] = lp_build_iround(&bld, tmp[j]); + if (src_type.floating) { + for (j = 0; j < dst_type.length / 4; ++j) { + /* + * XXX This is not actually fully correct. The float to int + * conversion will produce 0x80000000 value for everything + * out of range and NaNs (on x86, llvm.x86.sse2.cvtps2dq). + * Hence, NaNs and negatives will get clamped just fine to zero + * (relying on clamping pack behavior) when converting to unorm, + * however too large values (both finite and infinite) will also + * end up as zero, not 255. + * For snorm, for now we'll keep bug compatibility with generic + * conversion path (meaning too large values are fine, but + * NaNs get converted to -128 (purely by luck, as we don't + * specify nan behavior for the max there) instead of 0). + */ + if (dst_type.sign) { + tmp[j] = lp_build_min(&bld, bld.one, src[j]); + + } + else { + if (0) { + tmp[j] = lp_build_min_ext(&bld, bld.one, src[j], + GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN); + } + tmp[j] = src[j]; + } + tmp[j] = LLVMBuildFMul(builder, tmp[j], const_scale, ""); + tmp[j] = lp_build_iround(&bld, tmp[j]); + } + } else { + for (j = 0; j < dst_type.length / 4; ++j) { + if (!dst_type.sign) { + /* + * Pack clamp is always signed->unsigned (or signed->signed). + * Hence need min. 
+ */ + LLVMValueRef const_max; + const_max = lp_build_const_int_vec(gallivm, src_type, 255); + tmp[j] = lp_build_min(&bld, src[j], const_max); + } else { + tmp[j] = src[j]; + } + } } if (num_srcs == 1) { @@ -612,20 +673,20 @@ lp_build_conv(struct gallivm_state *gallivm, return; } - /* Special case 2x8f --> 1x16ub, 1x8f ->1x8ub + /* Special case 2x8x32 --> 1x16x8, 1x8x32 ->1x8x8 */ - else if (src_type.floating == 1 && - src_type.fixed == 0 && - src_type.sign == 1 && - src_type.norm == 0 && - src_type.width == 32 && - src_type.length == 8 && - - dst_type.floating == 0 && - dst_type.fixed == 0 && - dst_type.sign == 0 && - dst_type.norm == 1 && - dst_type.width == 8 && + else if (src_type.norm == 0 && + src_type.width == 32 && + src_type.length == 8 && + src_type.fixed == 0 && + + dst_type.floating == 0 && + dst_type.fixed == 0 && + dst_type.width == 8 && + + ((src_type.floating == 1 && src_type.sign == 1 && dst_type.norm == 1) || + (src_type.floating == 0 && dst_type.floating == 0 && + src_type.sign == dst_type.sign && dst_type.norm == 0)) && ((dst_type.length == 16 && 2 * num_dsts == num_srcs) || (num_dsts == 1 && dst_type.length * num_srcs == 8)) && @@ -635,7 +696,7 @@ lp_build_conv(struct gallivm_state *gallivm, struct lp_build_context bld; struct lp_type int16_type, int32_type; struct lp_type dst_type_ext = dst_type; - LLVMValueRef const_255f; + LLVMValueRef const_scale; unsigned i; lp_build_context_init(&bld, gallivm, src_type); @@ -651,30 +712,44 @@ lp_build_conv(struct gallivm_state *gallivm, int32_type.length /= 4; int32_type.sign = 1; - const_255f = lp_build_const_vec(gallivm, src_type, 255.0f); + const_scale = lp_build_const_vec(gallivm, src_type, lp_const_scale(dst_type)); for (i = 0; i < num_dsts; ++i, src += 2) { - LLVMValueRef lo, hi, a, b; - - a = LLVMBuildFMul(builder, src[0], const_255f, ""); - a = lp_build_iround(&bld, a); - tmp[0] = lp_build_extract_range(gallivm, a, 0, 4); - tmp[1] = lp_build_extract_range(gallivm, a, 4, 4); - /* relying on clamping behavior of sse2 intrinsics here */ - lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]); - - if (num_srcs == 1) { - hi = lo; + unsigned j; + for (j = 0; j < (num_srcs == 1 ? 
1 : 2); j++) { + LLVMValueRef lo, hi, a; + + a = src[j]; + if (src_type.floating) { + if (dst_type.sign) { + a = lp_build_min(&bld, bld.one, a); + + } + else { + if (0) { + a = lp_build_min_ext(&bld, bld.one, a, + GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN); + } + } + a = LLVMBuildFMul(builder, a, const_scale, ""); + a = lp_build_iround(&bld, a); + } else { + if (!dst_type.sign) { + LLVMValueRef const_max; + const_max = lp_build_const_int_vec(gallivm, src_type, 255); + a = lp_build_min(&bld, a, const_max); + } + } + lo = lp_build_extract_range(gallivm, a, 0, 4); + hi = lp_build_extract_range(gallivm, a, 4, 4); + /* relying on clamping behavior of sse2 intrinsics here */ + tmp[j] = lp_build_pack2(gallivm, int32_type, int16_type, lo, hi); } - else { - b = LLVMBuildFMul(builder, src[1], const_255f, ""); - b = lp_build_iround(&bld, b); - tmp[2] = lp_build_extract_range(gallivm, b, 0, 4); - tmp[3] = lp_build_extract_range(gallivm, b, 4, 4); - hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]); + if (num_srcs == 1) { + tmp[1] = tmp[0]; } - dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi); + dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, tmp[0], tmp[1]); } if (num_srcs == 1) { @@ -841,6 +916,10 @@ lp_build_conv(struct gallivm_state *gallivm, new_type.width = dst_type.width; new_type.length = dst_type.length; + /* + * Note that resize when using packs can sometimes get min/max + * clamping for free. Should be able to exploit this... + */ lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts); tmp_type = new_type; diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format.h b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format.h index 5c866f420..6540caaa2 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format.h +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format.h @@ -143,6 +143,7 @@ void lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, const struct util_format_description *format_desc, struct lp_type type, + boolean aligned, LLVMValueRef base_ptr, LLVMValueRef offsets, LLVMValueRef i, diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c index 9f6b9e9fb..2f723857f 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c @@ -38,6 +38,7 @@ #include "util/u_math.h" #include "util/u_pointer.h" #include "util/u_string.h" +#include "util/u_cpu_detect.h" #include "lp_bld_arit.h" #include "lp_bld_init.h" @@ -49,7 +50,10 @@ #include "lp_bld_gather.h" #include "lp_bld_debug.h" #include "lp_bld_format.h" +#include "lp_bld_pack.h" #include "lp_bld_intr.h" +#include "lp_bld_logic.h" +#include "lp_bld_bitarit.h" /** @@ -137,6 +141,73 @@ format_matches_type(const struct util_format_description *desc, return TRUE; } +/* + * Do rounding when converting small unorm values to larger ones. + * Not quite 100% accurate, as it's done by appending MSBs, but + * should be good enough. + */ + +static inline LLVMValueRef +scale_bits_up(struct gallivm_state *gallivm, + int src_bits, + int dst_bits, + LLVMValueRef src, + struct lp_type src_type) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef result = src; + + if (src_bits == 1 && dst_bits > 1) { + /* + * Useful for a1 - we'd need quite some repeated copies otherwise. 
+ */ + struct lp_build_context bld; + LLVMValueRef dst_mask; + lp_build_context_init(&bld, gallivm, src_type); + dst_mask = lp_build_const_int_vec(gallivm, src_type, + (1 << dst_bits) - 1), + result = lp_build_cmp(&bld, PIPE_FUNC_EQUAL, src, + lp_build_const_int_vec(gallivm, src_type, 0)); + result = lp_build_andnot(&bld, dst_mask, result); + } + else if (dst_bits > src_bits) { + /* Scale up bits */ + int db = dst_bits - src_bits; + + /* Shift left by difference in bits */ + result = LLVMBuildShl(builder, + src, + lp_build_const_int_vec(gallivm, src_type, db), + ""); + + if (db <= src_bits) { + /* Enough bits in src to fill the remainder */ + LLVMValueRef lower = LLVMBuildLShr(builder, + src, + lp_build_const_int_vec(gallivm, src_type, + src_bits - db), + ""); + + result = LLVMBuildOr(builder, result, lower, ""); + } else if (db > src_bits) { + /* Need to repeatedly copy src bits to fill remainder in dst */ + unsigned n; + + for (n = src_bits; n < dst_bits; n *= 2) { + LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n); + + result = LLVMBuildOr(builder, + result, + LLVMBuildLShr(builder, result, shuv, ""), + ""); + } + } + } else { + assert (dst_bits == src_bits); + } + + return result; +} /** * Unpack a single pixel into its XYZW components. @@ -156,6 +227,7 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm, LLVMValueRef shifts[4]; LLVMValueRef masks[4]; LLVMValueRef scales[4]; + LLVMTypeRef vec32_type; boolean normalized; boolean needs_uitofp; @@ -171,19 +243,17 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm, * matches floating point size */ assert (LLVMTypeOf(packed) == LLVMInt32TypeInContext(gallivm->context)); + vec32_type = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4); + /* Broadcast the packed value to all four channels * before: packed = BGRA * after: packed = {BGRA, BGRA, BGRA, BGRA} */ - packed = LLVMBuildInsertElement(builder, - LLVMGetUndef(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)), - packed, + packed = LLVMBuildInsertElement(builder, LLVMGetUndef(vec32_type), packed, LLVMConstNull(LLVMInt32TypeInContext(gallivm->context)), ""); - packed = LLVMBuildShuffleVector(builder, - packed, - LLVMGetUndef(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)), - LLVMConstNull(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)), + packed = LLVMBuildShuffleVector(builder, packed, LLVMGetUndef(vec32_type), + LLVMConstNull(vec32_type), ""); /* Initialize vector constants */ @@ -224,8 +294,40 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm, /* Ex: convert packed = {XYZW, XYZW, XYZW, XYZW} * into masked = {X, Y, Z, W} */ - shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), ""); - masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), ""); + if (desc->block.bits < 32 && normalized) { + /* + * Note: we cannot do the shift below on x86 natively until AVX2. + * + * Old llvm versions will resort to scalar extract/shift insert, + * which is definitely terrible, new versions will just do + * several vector shifts and shuffle/blend results together. + * We could turn this into a variable left shift plus a constant + * right shift, and llvm would then turn the variable left shift + * into a mul for us (albeit without sse41 the mul needs emulation + * too...). However, since we're going to do a float mul + * anyway, we just adjust that mul instead (plus the mask), skipping + * the shift completely. 
+ * We could also use a extra mul when the format isn't normalized and + * we don't have AVX2 support, but don't bother for now. Unfortunately, + * this strategy doesn't work for 32bit formats (such as rgb10a2 or even + * rgba8 if it ends up here), as that would require UIToFP, albeit that + * would be fixable with easy 16bit shuffle (unless there's channels + * crossing 16bit boundaries). + */ + for (i = 0; i < 4; ++i) { + if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) { + unsigned bits = desc->channel[i].size; + unsigned shift = desc->channel[i].shift; + unsigned long long mask = ((1ULL << bits) - 1) << shift; + scales[i] = lp_build_const_float(gallivm, 1.0 / mask); + masks[i] = lp_build_const_int32(gallivm, mask); + } + } + masked = LLVMBuildAnd(builder, packed, LLVMConstVector(masks, 4), ""); + } else { + shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), ""); + masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), ""); + } if (!needs_uitofp) { /* UIToFP can't be expressed in SSE2 */ @@ -234,8 +336,10 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm, casted = LLVMBuildUIToFP(builder, masked, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), ""); } - /* At this point 'casted' may be a vector of floats such as - * {255.0, 255.0, 255.0, 255.0}. Next, if the pixel values are normalized + /* + * At this point 'casted' may be a vector of floats such as + * {255.0, 255.0, 255.0, 255.0}. (Normalized values may be multiplied + * by powers of two). Next, if the pixel values are normalized * we'll scale this to {1.0, 1.0, 1.0, 1.0}. */ @@ -391,9 +495,11 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, if (format_matches_type(format_desc, type) && format_desc->block.bits <= type.width * 4 && + /* XXX this shouldn't be needed */ util_is_power_of_two(format_desc->block.bits)) { LLVMValueRef packed; LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, type); + struct lp_type fetch_type; unsigned vec_len = type.width * type.length; /* @@ -401,8 +507,9 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, * scaling or converting. */ + fetch_type = lp_type_uint(type.width*4); packed = lp_build_gather(gallivm, type.length/4, - format_desc->block.bits, type.width*4, + format_desc->block.bits, fetch_type, aligned, base_ptr, offset, TRUE); assert(format_desc->block.bits <= vec_len); @@ -413,6 +520,86 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, } /* + * Bit arithmetic for converting small_unorm to unorm8. + * + * This misses some opportunities for optimizations (like skipping mask + * for the highest channel for instance, or doing bit scaling in parallel + * for channels with the same bit width) but it should be passable for + * all arithmetic formats. 
+ */ + if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && + format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB && + util_format_fits_8unorm(format_desc) && + type.width == 8 && type.norm == 1 && type.sign == 0 && + type.fixed == 0 && type.floating == 0) { + LLVMValueRef packed, res, chans[4], rgba[4]; + LLVMTypeRef dst_vec_type, conv_vec_type; + struct lp_type fetch_type, conv_type; + struct lp_build_context bld_conv; + unsigned j; + + fetch_type = lp_type_uint(type.width*4); + conv_type = lp_type_int_vec(type.width*4, type.width * type.length); + dst_vec_type = lp_build_vec_type(gallivm, type); + conv_vec_type = lp_build_vec_type(gallivm, conv_type); + lp_build_context_init(&bld_conv, gallivm, conv_type); + + packed = lp_build_gather(gallivm, type.length/4, + format_desc->block.bits, fetch_type, + aligned, base_ptr, offset, TRUE); + + assert(format_desc->block.bits * type.length / 4 <= + type.width * type.length); + + packed = LLVMBuildBitCast(gallivm->builder, packed, conv_vec_type, ""); + + for (j = 0; j < format_desc->nr_channels; ++j) { + unsigned mask = 0; + unsigned sa = format_desc->channel[j].shift; + + mask = (1 << format_desc->channel[j].size) - 1; + + /* Extract bits from source */ + chans[j] = LLVMBuildLShr(builder, packed, + lp_build_const_int_vec(gallivm, conv_type, sa), + ""); + + chans[j] = LLVMBuildAnd(builder, chans[j], + lp_build_const_int_vec(gallivm, conv_type, mask), + ""); + + /* Scale bits */ + if (type.norm) { + chans[j] = scale_bits_up(gallivm, format_desc->channel[j].size, + type.width, chans[j], conv_type); + } + } + /* + * This is a hacked lp_build_format_swizzle_soa() since we need a + * normalized 1 but only 8 bits in a 32bit vector... + */ + for (j = 0; j < 4; ++j) { + enum pipe_swizzle swizzle = format_desc->swizzle[j]; + if (swizzle == PIPE_SWIZZLE_1) { + rgba[j] = lp_build_const_int_vec(gallivm, conv_type, (1 << type.width) - 1); + } else { + rgba[j] = lp_build_swizzle_soa_channel(&bld_conv, chans, swizzle); + } + if (j == 0) { + res = rgba[j]; + } else { + rgba[j] = LLVMBuildShl(builder, rgba[j], + lp_build_const_int_vec(gallivm, conv_type, + j * type.width), ""); + res = LLVMBuildOr(builder, res, rgba[j], ""); + } + } + res = LLVMBuildBitCast(gallivm->builder, res, dst_vec_type, ""); + + return res; + } + + /* * Bit arithmetic */ @@ -421,6 +608,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) && format_desc->block.width == 1 && format_desc->block.height == 1 && + /* XXX this shouldn't be needed */ util_is_power_of_two(format_desc->block.bits) && format_desc->block.bits <= 32 && format_desc->is_bitmask && @@ -430,8 +618,15 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, !format_desc->channel[0].pure_integer) { LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4]; - LLVMValueRef res; - unsigned k; + LLVMValueRef res[LP_MAX_VECTOR_WIDTH / 128]; + struct lp_type conv_type; + unsigned k, num_conv_src, num_conv_dst; + + /* + * Note this path is generally terrible for fetching multiple pixels. + * We should make sure we cannot hit this code path for anything but + * single pixels. 
+ */ /* * Unpack a pixel at a time into a <4 x float> RGBA vector @@ -461,12 +656,38 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, __FUNCTION__, format_desc->short_name); } - lp_build_conv(gallivm, - lp_float32_vec4_type(), - type, - tmps, num_pixels, &res, 1); + conv_type = lp_float32_vec4_type(); + num_conv_src = num_pixels; + num_conv_dst = 1; + + if (num_pixels % 8 == 0) { + lp_build_concat_n(gallivm, lp_float32_vec4_type(), + tmps, num_pixels, tmps, num_pixels / 2); + conv_type.length *= num_pixels / 4; + num_conv_src = 4 * num_pixels / 8; + if (type.width == 8 && type.floating == 0 && type.fixed == 0) { + /* + * FIXME: The fast float->unorm path (which is basically + * skipping the MIN/MAX which are extremely pointless in any + * case) requires that there's 2 destinations... + * In any case, we really should make sure we don't hit this + * code with multiple pixels for unorm8 dst types, it's + * completely hopeless even if we do hit the right conversion. + */ + type.length /= num_pixels / 4; + num_conv_dst = num_pixels / 4; + } + } + + lp_build_conv(gallivm, conv_type, type, + tmps, num_conv_src, res, num_conv_dst); + + if (num_pixels % 8 == 0 && + (type.width == 8 && type.floating == 0 && type.fixed == 0)) { + lp_build_concat_n(gallivm, type, res, num_conv_dst, res, 1); + } - return lp_build_format_swizzle_aos(format_desc, &bld, res); + return lp_build_format_swizzle_aos(format_desc, &bld, res[0]); } /* If all channels are of same type and we are not using half-floats */ diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c index 8cad3a6fc..636a4a623 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c @@ -70,7 +70,14 @@ lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm, src_vec_type = lp_build_vec_type(gallivm, src_type); - /* Read whole vector from memory, unaligned */ + /* + * Read whole vector from memory, unaligned. + * XXX: Note it's actually aligned to element type. Not sure if all + * callers are able to guarantee that (whereas for others, we should + * be able to use full alignment when there's 2 or 4 channels). + * (If all callers can guarantee element type alignment, we should + * relax alignment restrictions elsewhere.) 
+ */ ptr = LLVMBuildGEP(builder, base_ptr, &offset, 1, ""); ptr = LLVMBuildPointerCast(builder, ptr, LLVMPointerType(src_vec_type, 0), ""); res = LLVMBuildLoad(builder, ptr, ""); diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c index 7fc4e8d24..22c19b10d 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c @@ -31,6 +31,7 @@ #include "util/u_format.h" #include "util/u_memory.h" #include "util/u_string.h" +#include "util/u_math.h" #include "lp_bld_type.h" #include "lp_bld_const.h" @@ -40,6 +41,39 @@ #include "lp_bld_debug.h" #include "lp_bld_format.h" #include "lp_bld_arit.h" +#include "lp_bld_pack.h" + + +static void +convert_to_soa(struct gallivm_state *gallivm, + LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32], + LLVMValueRef dst_soa[4], + const struct lp_type soa_type) +{ + unsigned j, k; + struct lp_type aos_channel_type = soa_type; + + LLVMValueRef aos_channels[4]; + unsigned pixels_per_channel = soa_type.length / 4; + + debug_assert((soa_type.length % 4) == 0); + + aos_channel_type.length >>= 1; + + for (j = 0; j < 4; ++j) { + LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 }; + + assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH); + + for (k = 0; k < pixels_per_channel; ++k) { + channel[k] = src_aos[j + 4 * k]; + } + + aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel); + } + + lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa); +} void @@ -48,9 +82,6 @@ lp_build_format_swizzle_soa(const struct util_format_description *format_desc, const LLVMValueRef *unswizzled, LLVMValueRef swizzled_out[4]) { - assert(PIPE_SWIZZLE_0 == (int)PIPE_SWIZZLE_0); - assert(PIPE_SWIZZLE_1 == (int)PIPE_SWIZZLE_1); - if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { enum pipe_swizzle swizzle; LLVMValueRef depth_or_stencil; @@ -83,6 +114,166 @@ lp_build_format_swizzle_soa(const struct util_format_description *format_desc, } + +static LLVMValueRef +lp_build_extract_soa_chan(struct lp_build_context *bld, + unsigned blockbits, + boolean srgb_chan, + struct util_format_channel_description chan_desc, + LLVMValueRef packed) +{ + struct gallivm_state *gallivm = bld->gallivm; + LLVMBuilderRef builder = gallivm->builder; + struct lp_type type = bld->type; + LLVMValueRef input = packed; + const unsigned width = chan_desc.size; + const unsigned start = chan_desc.shift; + const unsigned stop = start + width; + + /* Decode the input vector component */ + + switch(chan_desc.type) { + case UTIL_FORMAT_TYPE_VOID: + input = bld->undef; + break; + + case UTIL_FORMAT_TYPE_UNSIGNED: + /* + * Align the LSB + */ + if (start) { + input = LLVMBuildLShr(builder, input, + lp_build_const_int_vec(gallivm, type, start), ""); + } + + /* + * Zero the MSBs + */ + if (stop < blockbits) { + unsigned mask = ((unsigned long long)1 << width) - 1; + input = LLVMBuildAnd(builder, input, + lp_build_const_int_vec(gallivm, type, mask), ""); + } + + /* + * Type conversion + */ + if (type.floating) { + if (srgb_chan) { + struct lp_type conv_type = lp_uint_type(type); + input = lp_build_srgb_to_linear(gallivm, conv_type, width, input); + } + else { + if(chan_desc.normalized) + input = lp_build_unsigned_norm_to_float(gallivm, width, type, input); + else + input = LLVMBuildSIToFP(builder, input, bld->vec_type, ""); + } + } + else if (chan_desc.pure_integer) { + /* Nothing to do */ + } else { + /* FIXME */ + assert(0); + } + 
break; + + case UTIL_FORMAT_TYPE_SIGNED: + /* + * Align the sign bit first. + */ + if (stop < type.width) { + unsigned bits = type.width - stop; + LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits); + input = LLVMBuildShl(builder, input, bits_val, ""); + } + + /* + * Align the LSB (with an arithmetic shift to preserve the sign) + */ + if (chan_desc.size < type.width) { + unsigned bits = type.width - chan_desc.size; + LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits); + input = LLVMBuildAShr(builder, input, bits_val, ""); + } + + /* + * Type conversion + */ + if (type.floating) { + input = LLVMBuildSIToFP(builder, input, bld->vec_type, ""); + if (chan_desc.normalized) { + double scale = 1.0 / ((1 << (chan_desc.size - 1)) - 1); + LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale); + input = LLVMBuildFMul(builder, input, scale_val, ""); + /* + * The formula above will produce value below -1.0 for most negative + * value but everything seems happy with that hence disable for now. + */ + if (0) + input = lp_build_max(bld, input, + lp_build_const_vec(gallivm, type, -1.0f)); + } + } + else if (chan_desc.pure_integer) { + /* Nothing to do */ + } else { + /* FIXME */ + assert(0); + } + break; + + case UTIL_FORMAT_TYPE_FLOAT: + if (type.floating) { + if (chan_desc.size == 16) { + struct lp_type f16i_type = type; + f16i_type.width /= 2; + f16i_type.floating = 0; + if (start) { + input = LLVMBuildLShr(builder, input, + lp_build_const_int_vec(gallivm, type, start), ""); + } + input = LLVMBuildTrunc(builder, input, + lp_build_vec_type(gallivm, f16i_type), ""); + input = lp_build_half_to_float(gallivm, input); + } else { + assert(start == 0); + assert(stop == 32); + assert(type.width == 32); + } + input = LLVMBuildBitCast(builder, input, bld->vec_type, ""); + } + else { + /* FIXME */ + assert(0); + input = bld->undef; + } + break; + + case UTIL_FORMAT_TYPE_FIXED: + if (type.floating) { + double scale = 1.0 / ((1 << (chan_desc.size/2)) - 1); + LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale); + input = LLVMBuildSIToFP(builder, input, bld->vec_type, ""); + input = LLVMBuildFMul(builder, input, scale_val, ""); + } + else { + /* FIXME */ + assert(0); + input = bld->undef; + } + break; + + default: + assert(0); + input = bld->undef; + break; + } + + return input; +} + + /** * Unpack several pixels in SoA. 
* @@ -113,7 +304,6 @@ lp_build_unpack_rgba_soa(struct gallivm_state *gallivm, LLVMValueRef packed, LLVMValueRef rgba_out[4]) { - LLVMBuilderRef builder = gallivm->builder; struct lp_build_context bld; LLVMValueRef inputs[4]; unsigned chan; @@ -129,149 +319,19 @@ lp_build_unpack_rgba_soa(struct gallivm_state *gallivm, /* Decode the input vector components */ for (chan = 0; chan < format_desc->nr_channels; ++chan) { - const unsigned width = format_desc->channel[chan].size; - const unsigned start = format_desc->channel[chan].shift; - const unsigned stop = start + width; - LLVMValueRef input; - - input = packed; - - switch(format_desc->channel[chan].type) { - case UTIL_FORMAT_TYPE_VOID: - input = lp_build_undef(gallivm, type); - break; - - case UTIL_FORMAT_TYPE_UNSIGNED: - /* - * Align the LSB - */ - - if (start) { - input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(gallivm, type, start), ""); - } - - /* - * Zero the MSBs - */ - - if (stop < format_desc->block.bits) { - unsigned mask = ((unsigned long long)1 << width) - 1; - input = LLVMBuildAnd(builder, input, lp_build_const_int_vec(gallivm, type, mask), ""); - } - - /* - * Type conversion - */ - - if (type.floating) { - if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) { - if (format_desc->swizzle[3] == chan) { - input = lp_build_unsigned_norm_to_float(gallivm, width, type, input); - } - else { - struct lp_type conv_type = lp_uint_type(type); - input = lp_build_srgb_to_linear(gallivm, conv_type, width, input); - } - } - else { - if(format_desc->channel[chan].normalized) - input = lp_build_unsigned_norm_to_float(gallivm, width, type, input); - else - input = LLVMBuildSIToFP(builder, input, - lp_build_vec_type(gallivm, type), ""); - } - } - else if (format_desc->channel[chan].pure_integer) { - /* Nothing to do */ - } else { - /* FIXME */ - assert(0); - } - - break; - - case UTIL_FORMAT_TYPE_SIGNED: - /* - * Align the sign bit first. 
- */ - - if (stop < type.width) { - unsigned bits = type.width - stop; - LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits); - input = LLVMBuildShl(builder, input, bits_val, ""); - } - - /* - * Align the LSB (with an arithmetic shift to preserve the sign) - */ - - if (format_desc->channel[chan].size < type.width) { - unsigned bits = type.width - format_desc->channel[chan].size; - LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits); - input = LLVMBuildAShr(builder, input, bits_val, ""); - } - - /* - * Type conversion - */ + struct util_format_channel_description chan_desc = format_desc->channel[chan]; + boolean srgb_chan = FALSE; - if (type.floating) { - input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(gallivm, type), ""); - if (format_desc->channel[chan].normalized) { - double scale = 1.0 / ((1 << (format_desc->channel[chan].size - 1)) - 1); - LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale); - input = LLVMBuildFMul(builder, input, scale_val, ""); - /* the formula above will produce value below -1.0 for most negative - * value but everything seems happy with that hence disable for now */ - if (0) - input = lp_build_max(&bld, input, - lp_build_const_vec(gallivm, type, -1.0f)); - } - } - else if (format_desc->channel[chan].pure_integer) { - /* Nothing to do */ - } else { - /* FIXME */ - assert(0); - } - - break; - - case UTIL_FORMAT_TYPE_FLOAT: - if (type.floating) { - assert(start == 0); - assert(stop == 32); - assert(type.width == 32); - input = LLVMBuildBitCast(builder, input, lp_build_vec_type(gallivm, type), ""); - } - else { - /* FIXME */ - assert(0); - input = lp_build_undef(gallivm, type); - } - break; - - case UTIL_FORMAT_TYPE_FIXED: - if (type.floating) { - double scale = 1.0 / ((1 << (format_desc->channel[chan].size/2)) - 1); - LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale); - input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(gallivm, type), ""); - input = LLVMBuildFMul(builder, input, scale_val, ""); - } - else { - /* FIXME */ - assert(0); - input = lp_build_undef(gallivm, type); - } - break; - - default: - assert(0); - input = lp_build_undef(gallivm, type); - break; + if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && + format_desc->swizzle[3] != chan) { + srgb_chan = TRUE; } - inputs[chan] = input; + inputs[chan] = lp_build_extract_soa_chan(&bld, + format_desc->block.bits, + srgb_chan, + chan_desc, + packed); } lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out); @@ -336,6 +396,7 @@ lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm, * * \param type the desired return type for 'rgba'. The vector length * is the number of texels to fetch + * \param aligned if the offset is guaranteed to be aligned to element width * * \param base_ptr points to the base of the texture mip tree. * \param offset offset to start of the texture image block. 
For non- @@ -352,6 +413,7 @@ void lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, const struct util_format_description *format_desc, struct lp_type type, + boolean aligned, LLVMValueRef base_ptr, LLVMValueRef offset, LLVMValueRef i, @@ -360,6 +422,8 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, LLVMValueRef rgba_out[4]) { LLVMBuilderRef builder = gallivm->builder; + enum pipe_format format = format_desc->format; + struct lp_type fetch_type; if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB || @@ -369,7 +433,8 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, format_desc->block.height == 1 && format_desc->block.bits <= type.width && (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT || - format_desc->channel[0].size == 32)) + format_desc->channel[0].size == 32 || + format_desc->channel[0].size == 16)) { /* * The packed pixel fits into an element of the destination format. Put @@ -384,11 +449,12 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, * Ex: packed = {XYZW, XYZW, XYZW, XYZW} */ assert(format_desc->block.bits <= type.width); + fetch_type = lp_type_uint(type.width); packed = lp_build_gather(gallivm, type.length, format_desc->block.bits, - type.width, - TRUE, + fetch_type, + aligned, base_ptr, offset, FALSE); /* @@ -401,22 +467,232 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, return; } - if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT || - format_desc->format == PIPE_FORMAT_R9G9B9E5_FLOAT) { + + if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && + (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) && + format_desc->block.width == 1 && + format_desc->block.height == 1 && + format_desc->block.bits > type.width && + ((format_desc->block.bits <= type.width * type.length && + format_desc->channel[0].size <= type.width) || + (format_desc->channel[0].size == 64 && + format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT && + type.floating))) + { + /* + * Similar to above, but the packed pixel is larger than what fits + * into an element of the destination format. The packed pixels will be + * shuffled into SoA vectors appropriately, and then the extraction will + * be done in parallel as much as possible. + * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so + * the gathered vectors can be shuffled easily (even with avx). + * 64xn float -> 32xn float is handled too but it's a bit special as + * it does the conversion pre-shuffle. + */ + + LLVMValueRef packed[4], dst[4], output[4], shuffles[LP_MAX_VECTOR_WIDTH/32]; + struct lp_type fetch_type, gather_type = type; + unsigned num_gather, fetch_width, i, j; + struct lp_build_context bld; + boolean fp64 = format_desc->channel[0].size == 64; + + lp_build_context_init(&bld, gallivm, type); + + assert(type.width == 32); + assert(format_desc->block.bits > type.width); + + /* + * First, figure out fetch order. + */ + fetch_width = util_next_power_of_two(format_desc->block.bits); + /* + * fp64 are treated like fp32 except we fetch twice wide values + * (as we shuffle after trunc). The shuffles for that work out + * mostly fine (slightly suboptimal for 4-wide, perfect for AVX) + * albeit we miss the potential opportunity for hw gather (as it + * only handles native size). 
+ */ + num_gather = fetch_width / type.width; + gather_type.width *= num_gather; + if (fp64) { + num_gather /= 2; + } + gather_type.length /= num_gather; + + for (i = 0; i < num_gather; i++) { + LLVMValueRef offsetr, shuf_vec; + if(num_gather == 4) { + for (j = 0; j < gather_type.length; j++) { + unsigned idx = i + 4*j; + shuffles[j] = lp_build_const_int32(gallivm, idx); + } + shuf_vec = LLVMConstVector(shuffles, gather_type.length); + offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, ""); + + } + else if (num_gather == 2) { + assert(num_gather == 2); + for (j = 0; j < gather_type.length; j++) { + unsigned idx = i*2 + (j%2) + (j/2)*4; + shuffles[j] = lp_build_const_int32(gallivm, idx); + } + shuf_vec = LLVMConstVector(shuffles, gather_type.length); + offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, ""); + } + else { + assert(num_gather == 1); + offsetr = offset; + } + if (gather_type.length == 1) { + LLVMValueRef zero = lp_build_const_int32(gallivm, 0); + offsetr = LLVMBuildExtractElement(builder, offsetr, zero, ""); + } + + /* + * Determine whether to use float or int loads. This is mostly + * to outsmart the (stupid) llvm int/float shuffle logic, we + * don't really care much if the data is floats or ints... + * But llvm will refuse to use single float shuffle with int data + * and instead use 3 int shuffles instead, the code looks atrocious. + * (Note bitcasts often won't help, as llvm is too smart to be + * fooled by that.) + * Nobody cares about simd float<->int domain transition penalties, + * which usually don't even exist for shuffles anyway. + * With 4x32bit (and 3x32bit) fetch, we use float vec (the data is + * going into transpose, which is unpacks, so doesn't really matter + * much). + * With 2x32bit or 4x16bit fetch, we use float vec, since those + * go into the weird channel separation shuffle. With floats, + * this is (with 128bit vectors): + * - 2 movq, 2 movhpd, 2 shufps + * With ints it would be: + * - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw + * I've seen texture functions increase in code size by 15% just due + * to that (there's lots of such fetches in them...) + * (We could chose a different gather order to improve this somewhat + * for the int path, but it would basically just drop the blends, + * so the float path with this order really is optimal.) + * Albeit it is tricky sometimes llvm doesn't ignore the float->int + * casts so must avoid them until we're done with the float shuffle... + * 3x16bit formats (the same is also true for 3x8) are pretty bad but + * there's nothing we can do about them (we could overallocate by + * those couple bytes and use unaligned but pot sized load). + * Note that this is very much x86 specific. I don't know if this + * affect other archs at all. + */ + if (num_gather > 1) { + /* + * We always want some float type here (with x86) + * due to shuffles being float ones afterwards (albeit for + * the num_gather == 4 case int should work fine too + * (unless there's some problems with avx but not avx2). 
+ */ + if (format_desc->channel[0].size == 64) { + fetch_type = lp_type_float_vec(64, gather_type.width); + } else { + fetch_type = lp_type_int_vec(32, gather_type.width); + } + } + else { + /* type doesn't matter much */ + if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT && + (format_desc->channel[0].size == 32 || + format_desc->channel[0].size == 64)) { + fetch_type = lp_type_float(gather_type.width); + } else { + fetch_type = lp_type_uint(gather_type.width); + } + } + + /* Now finally gather the values */ + packed[i] = lp_build_gather(gallivm, gather_type.length, + format_desc->block.bits, + fetch_type, aligned, + base_ptr, offsetr, FALSE); + if (fp64) { + struct lp_type conv_type = type; + conv_type.width *= 2; + packed[i] = LLVMBuildBitCast(builder, packed[i], + lp_build_vec_type(gallivm, conv_type), ""); + packed[i] = LLVMBuildFPTrunc(builder, packed[i], bld.vec_type, ""); + } + } + + /* shuffle the gathered values to SoA */ + if (num_gather == 2) { + for (i = 0; i < num_gather; i++) { + for (j = 0; j < type.length; j++) { + unsigned idx = (j%2)*2 + (j/4)*4 + i; + if ((j/2)%2) + idx += type.length; + shuffles[j] = lp_build_const_int32(gallivm, idx); + } + dst[i] = LLVMBuildShuffleVector(builder, packed[0], packed[1], + LLVMConstVector(shuffles, type.length), ""); + } + } + else if (num_gather == 4) { + lp_build_transpose_aos(gallivm, lp_int_type(type), packed, dst); + } + else { + assert(num_gather == 1); + dst[0] = packed[0]; + } + + /* + * And finally unpack exactly as above, except that + * chan shift is adjusted and the right vector selected. + */ + if (!fp64) { + for (i = 0; i < num_gather; i++) { + dst[i] = LLVMBuildBitCast(builder, dst[i], bld.int_vec_type, ""); + } + for (i = 0; i < format_desc->nr_channels; i++) { + struct util_format_channel_description chan_desc = format_desc->channel[i]; + unsigned blockbits = type.width; + unsigned vec_nr; + +#ifdef PIPE_ARCH_BIG_ENDIAN + vec_nr = (format_desc->block.bits - (chan_desc.shift + chan_desc.size)) / type.width; +#else + vec_nr = chan_desc.shift / type.width; +#endif + chan_desc.shift %= type.width; + + output[i] = lp_build_extract_soa_chan(&bld, + blockbits, + FALSE, + chan_desc, + dst[vec_nr]); + } + } + else { + for (i = 0; i < format_desc->nr_channels; i++) { + output[i] = dst[i]; + } + } + + lp_build_format_swizzle_soa(format_desc, &bld, output, rgba_out); + return; + } + + if (format == PIPE_FORMAT_R11G11B10_FLOAT || + format == PIPE_FORMAT_R9G9B9E5_FLOAT) { /* * similar conceptually to above but requiring special * AoS packed -> SoA float conversion code. */ LLVMValueRef packed; + struct lp_type fetch_type = lp_type_uint(type.width); assert(type.floating); assert(type.width == 32); packed = lp_build_gather(gallivm, type.length, format_desc->block.bits, - type.width, TRUE, + fetch_type, aligned, base_ptr, offset, FALSE); - if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) { + if (format == PIPE_FORMAT_R11G11B10_FLOAT) { lp_build_r11g11b10_to_float(gallivm, packed, rgba_out); } else { @@ -432,8 +708,9 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, * 32bit (or 8bit) from each block. */ LLVMValueRef packed; + struct lp_type fetch_type = lp_type_uint(type.width); - if (format_desc->format == PIPE_FORMAT_X32_S8X24_UINT) { + if (format == PIPE_FORMAT_X32_S8X24_UINT) { /* * for stencil simply fix up offsets - could in fact change * base_ptr instead even outside the shader. 
@@ -441,15 +718,15 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, unsigned mask = (1 << 8) - 1; LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4); offset = LLVMBuildAdd(builder, offset, s_offset, ""); - packed = lp_build_gather(gallivm, type.length, 32, type.width, - TRUE, base_ptr, offset, FALSE); + packed = lp_build_gather(gallivm, type.length, 32, fetch_type, + aligned, base_ptr, offset, FALSE); packed = LLVMBuildAnd(builder, packed, lp_build_const_int_vec(gallivm, type, mask), ""); } else { - assert (format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT); - packed = lp_build_gather(gallivm, type.length, 32, type.width, - TRUE, base_ptr, offset, TRUE); + assert (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT); + packed = lp_build_gather(gallivm, type.length, 32, fetch_type, + aligned, base_ptr, offset, TRUE); packed = LLVMBuildBitCast(builder, packed, lp_build_vec_type(gallivm, type), ""); } @@ -461,63 +738,69 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, /* * Try calling lp_build_fetch_rgba_aos for all pixels. + * Should only really hit subsampled, compressed + * (for s3tc srgb too, for rgtc the unorm ones only) by now. + * (This is invalid for plain 8unorm formats because we're lazy with + * the swizzle since some results would arrive swizzled, some not.) */ - if (util_format_fits_8unorm(format_desc) && + if ((format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) && + (util_format_fits_8unorm(format_desc) || + format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) && type.floating && type.width == 32 && (type.length == 1 || (type.length % 4 == 0))) { struct lp_type tmp_type; - LLVMValueRef tmp; + struct lp_build_context bld; + LLVMValueRef packed, rgba[4]; + const struct util_format_description *flinear_desc; + const struct util_format_description *frgba8_desc; + unsigned chan; + + lp_build_context_init(&bld, gallivm, type); + /* + * Make sure the conversion in aos really only does convert to rgba8 + * and not anything more (so use linear format, adjust type). + */ + flinear_desc = util_format_description(util_format_linear(format)); memset(&tmp_type, 0, sizeof tmp_type); tmp_type.width = 8; tmp_type.length = type.length * 4; tmp_type.norm = TRUE; - tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type, - TRUE, base_ptr, offset, i, j, cache); + packed = lp_build_fetch_rgba_aos(gallivm, flinear_desc, tmp_type, + aligned, base_ptr, offset, i, j, cache); + packed = LLVMBuildBitCast(builder, packed, bld.int_vec_type, ""); - lp_build_rgba8_to_fi32_soa(gallivm, - type, - tmp, - rgba_out); - - return; - } - - if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC && - /* non-srgb case is already handled above */ - format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && - type.floating && type.width == 32 && - (type.length == 1 || (type.length % 4 == 0)) && - cache) { - const struct util_format_description *format_decompressed; - const struct util_format_description *flinear_desc; - LLVMValueRef packed; - flinear_desc = util_format_description(util_format_linear(format_desc->format)); - packed = lp_build_fetch_cached_texels(gallivm, - flinear_desc, - type.length, - base_ptr, - offset, - i, j, - cache); - packed = LLVMBuildBitCast(builder, packed, - lp_build_int_vec_type(gallivm, type), ""); /* - * The values are now packed so they match ordinary srgb RGBA8 format, + * The values are now packed so they match ordinary (srgb) RGBA8 format, * hence need to use matching format for unpack. 
*/ - format_decompressed = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB); - + frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_UNORM); + if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) { + assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC); + frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB); + } lp_build_unpack_rgba_soa(gallivm, - format_decompressed, + frgba8_desc, type, - packed, rgba_out); + packed, rgba); + /* + * We converted 4 channels. Make sure llvm can drop unneeded ones + * (luckily the rgba order is fixed, only LA needs special case). + */ + for (chan = 0; chan < 4; chan++) { + enum pipe_swizzle swizzle = format_desc->swizzle[chan]; + if (chan == 3 && util_format_is_luminance_alpha(format)) { + swizzle = PIPE_SWIZZLE_W; + } + rgba_out[chan] = lp_build_swizzle_soa_channel(&bld, rgba, swizzle); + } return; } + /* * Fallback to calling lp_build_fetch_rgba_aos for each pixel. * @@ -525,30 +808,40 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, * miss some opportunities to do vectorization, but this is * convenient for formats or scenarios for which there was no * opportunity or incentive to optimize. + * + * We do NOT want to end up here, this typically is quite terrible, + * in particular if the formats have less than 4 channels. + * + * Right now, this should only be hit for: + * - RGTC snorm formats + * (those miss fast fetch functions hence they are terrible anyway) */ { - unsigned k, chan; + unsigned k; struct lp_type tmp_type; + LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32]; if (gallivm_debug & GALLIVM_DEBUG_PERF) { - debug_printf("%s: scalar unpacking of %s\n", + debug_printf("%s: AoS fetch fallback for %s\n", __FUNCTION__, format_desc->short_name); } tmp_type = type; tmp_type.length = 4; - for (chan = 0; chan < 4; ++chan) { - rgba_out[chan] = lp_build_undef(gallivm, type); - } + /* + * Note that vector transpose can be worse compared to insert/extract + * for aos->soa conversion (for formats with 1 or 2 channels). However, + * we should try to avoid getting here for just about all formats, so + * don't bother. + */ /* loop over number of pixels */ for(k = 0; k < type.length; ++k) { LLVMValueRef index = lp_build_const_int32(gallivm, k); LLVMValueRef offset_elem; LLVMValueRef i_elem, j_elem; - LLVMValueRef tmp; offset_elem = LLVMBuildExtractElement(builder, offset, index, ""); @@ -557,20 +850,11 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, j_elem = LLVMBuildExtractElement(builder, j, index, ""); /* Get a single float[4]={R,G,B,A} pixel */ - tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type, - TRUE, base_ptr, offset_elem, - i_elem, j_elem, cache); + aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type, + aligned, base_ptr, offset_elem, + i_elem, j_elem, cache); - /* - * Insert the AoS tmp value channels into the SoA result vectors at - * position = 'index'. 
- */ - for (chan = 0; chan < 4; ++chan) { - LLVMValueRef chan_val = lp_build_const_int32(gallivm, chan), - tmp_chan = LLVMBuildExtractElement(builder, tmp, chan_val, ""); - rgba_out[chan] = LLVMBuildInsertElement(builder, rgba_out[chan], - tmp_chan, index, ""); - } } + convert_to_soa(gallivm, aos_fetch, rgba_out, type); } } diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c index fa0e8b656..d6d755298 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c @@ -491,13 +491,15 @@ lp_build_fetch_subsampled_rgba_aos(struct gallivm_state *gallivm, { LLVMValueRef packed; LLVMValueRef rgba; + struct lp_type fetch_type; assert(format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED); assert(format_desc->block.bits == 32); assert(format_desc->block.width == 2); assert(format_desc->block.height == 1); - packed = lp_build_gather(gallivm, n, 32, 32, TRUE, base_ptr, offset, FALSE); + fetch_type = lp_type_uint(32); + packed = lp_build_gather(gallivm, n, 32, fetch_type, TRUE, base_ptr, offset, FALSE); (void)j; diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_gather.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_gather.c index 439bbb679..7d11dcd3b 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_gather.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_gather.c @@ -28,13 +28,16 @@ #include "util/u_debug.h" #include "util/u_cpu_detect.h" +#include "util/u_math.h" #include "lp_bld_debug.h" #include "lp_bld_const.h" #include "lp_bld_format.h" #include "lp_bld_gather.h" #include "lp_bld_swizzle.h" +#include "lp_bld_type.h" #include "lp_bld_init.h" #include "lp_bld_intr.h" +#include "lp_bld_pack.h" /** @@ -113,14 +116,29 @@ lp_build_gather_elem(struct gallivm_state *gallivm, * translation of offsets to first_elem in sampler_views it actually seems * gallium could not do anything else except 16 no matter what... */ - if (!aligned) { + if (!aligned) { LLVMSetAlignment(res, 1); + } else if (!util_is_power_of_two(src_width)) { + /* + * Full alignment is impossible, assume the caller really meant + * the individual elements were aligned (e.g. 3x32bit format). + * And yes the generated code may otherwise crash, llvm will + * really assume 128bit alignment with a 96bit fetch (I suppose + * that makes sense as it can just assume the upper 32bit to be + * whatever). + * Maybe the caller should be able to explicitly set this, but + * this should cover all the 3-channel formats. + */ + if (((src_width / 24) * 24 == src_width) && + util_is_power_of_two(src_width / 24)) { + LLVMSetAlignment(res, src_width / 24); + } else { + LLVMSetAlignment(res, 1); + } } assert(src_width <= dst_width); - if (src_width > dst_width) { - res = LLVMBuildTrunc(gallivm->builder, res, dst_elem_type, ""); - } else if (src_width < dst_width) { + if (src_width < dst_width) { res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, ""); if (vector_justify) { #ifdef PIPE_ARCH_BIG_ENDIAN @@ -134,28 +152,162 @@ lp_build_gather_elem(struct gallivm_state *gallivm, } +/** + * Gather one element from scatter positions in memory. + * Nearly the same as above, however the individual elements + * may be vectors themselves, and fetches may be float type. + * Can also do pad vector instead of ZExt. 
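Note the explicit-alignment fallback above only triggers for non-power-of-two fetch widths; power-of-two loads keep LLVM's natural alignment. A sketch of the byte alignment it derives (util_is_power_of_two is the real helper used in the patch; the wrapper function is illustrative):

   static unsigned
   npot_fetch_alignment(unsigned src_width_bits)
   {
      /* 3-channel widths are multiples of 24 bits; element alignment is
       * width/24 bytes: 96 -> 4, 48 -> 2, 24 -> 1 */
      if ((src_width_bits / 24) * 24 == src_width_bits &&
          util_is_power_of_two(src_width_bits / 24))
         return src_width_bits / 24;
      return 1;  /* unknown layout, assume unaligned */
   }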
+ * + * @sa lp_build_gather() + */ +static LLVMValueRef +lp_build_gather_elem_vec(struct gallivm_state *gallivm, + unsigned length, + unsigned src_width, + LLVMTypeRef src_type, + struct lp_type dst_type, + boolean aligned, + LLVMValueRef base_ptr, + LLVMValueRef offsets, + unsigned i, + boolean vector_justify) +{ + LLVMValueRef ptr, res; + LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0); + assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0)); + + ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i); + ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, ""); + res = LLVMBuildLoad(gallivm->builder, ptr, ""); + + /* XXX + * On some archs we probably really want to avoid having to deal + * with alignments lower than 4 bytes (if fetch size is a power of + * two >= 32). On x86 it doesn't matter, however. + * We should be able to guarantee full alignment for any kind of texture + * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch + * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends + * but I don't think that's quite what we wanted). + * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT + * looks like a good fit, but it seems this cap bit (and OpenGL) aren't + * enforcing what we want (which is what d3d10 does, the offset needs to + * be aligned to element size, but GL has bytes regardless of element + * size which would only leave us with minimum alignment restriction of 16 + * which doesn't make much sense if the type isn't 4x32bit). Due to + * translation of offsets to first_elem in sampler_views it actually seems + * gallium could not do anything else except 16 no matter what... + */ + if (!aligned) { + LLVMSetAlignment(res, 1); + } else if (!util_is_power_of_two(src_width)) { + /* + * Full alignment is impossible, assume the caller really meant + * the individual elements were aligned (e.g. 3x32bit format). + * And yes the generated code may otherwise crash, llvm will + * really assume 128bit alignment with a 96bit fetch (I suppose + * that makes sense as it can just assume the upper 32bit to be + * whatever). + * Maybe the caller should be able to explicitly set this, but + * this should cover all the 3-channel formats. + */ + if (((src_width / 24) * 24 == src_width) && + util_is_power_of_two(src_width / 24)) { + LLVMSetAlignment(res, src_width / 24); + } else { + LLVMSetAlignment(res, 1); + } + } + + assert(src_width <= dst_type.width * dst_type.length); + if (src_width < dst_type.width * dst_type.length) { + if (dst_type.length > 1) { + res = lp_build_pad_vector(gallivm, res, dst_type.length); + /* + * vector_justify hopefully a non-issue since we only deal + * with src_width >= 32 here? + */ + } else { + LLVMTypeRef dst_elem_type = lp_build_vec_type(gallivm, dst_type); + + /* + * Only valid if src_ptr_type is int type... + */ + res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, ""); + +#ifdef PIPE_ARCH_BIG_ENDIAN + if (vector_justify) { + res = LLVMBuildShl(gallivm->builder, res, + LLVMConstInt(dst_elem_type, + dst_type.width - src_width, 0), ""); + } + if (src_width == 48) { + /* Load 3x16 bit vector. + * The sequence of loads on big-endian hardware proceeds as follows. + * 16-bit fields are denoted by X, Y, Z, and 0. In memory, the sequence + * of three fields appears in the order X, Y, Z. 
+ * + * Load 32-bit word: 0.0.X.Y + * Load 16-bit halfword: 0.0.0.Z + * Rotate left: 0.X.Y.0 + * Bitwise OR: 0.X.Y.Z + * + * The order in which we need the fields in the result is 0.Z.Y.X, + * the same as on little-endian; permute 16-bit fields accordingly + * within 64-bit register: + */ + LLVMValueRef shuffles[4] = { + lp_build_const_int32(gallivm, 2), + lp_build_const_int32(gallivm, 1), + lp_build_const_int32(gallivm, 0), + lp_build_const_int32(gallivm, 3), + }; + res = LLVMBuildBitCast(gallivm->builder, res, + lp_build_vec_type(gallivm, lp_type_uint_vec(16, 4*16)), ""); + res = LLVMBuildShuffleVector(gallivm->builder, res, res, LLVMConstVector(shuffles, 4), ""); + res = LLVMBuildBitCast(gallivm->builder, res, dst_elem_type, ""); + } +#endif + } + } + return res; +} + + + + static LLVMValueRef lp_build_gather_avx2(struct gallivm_state *gallivm, unsigned length, unsigned src_width, - unsigned dst_width, + struct lp_type dst_type, LLVMValueRef base_ptr, LLVMValueRef offsets) { LLVMBuilderRef builder = gallivm->builder; - LLVMTypeRef dst_type = LLVMIntTypeInContext(gallivm->context, dst_width); - LLVMTypeRef dst_vec_type = LLVMVectorType(dst_type, length); - LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, src_width); - LLVMTypeRef src_vec_type = LLVMVectorType(src_type, length); + LLVMTypeRef src_type, src_vec_type; LLVMValueRef res; + struct lp_type res_type = dst_type; + res_type.length *= length; + + if (dst_type.floating) { + src_type = src_width == 64 ? LLVMDoubleTypeInContext(gallivm->context) : + LLVMFloatTypeInContext(gallivm->context); + } else { + src_type = LLVMIntTypeInContext(gallivm->context, src_width); + } + src_vec_type = LLVMVectorType(src_type, length); + /* XXX should allow hw scaling (can handle i8, i16, i32, i64 for x86) */ assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0)); if (0) { /* * XXX: This will cause LLVM pre 3.7 to hang; it works on LLVM 3.8 but - * will not use the AVX2 gather instrinsics. See + * will not use the AVX2 gather instrinsics (even with llvm 4.0), at + * least with Haswell. See * http://lists.llvm.org/pipermail/llvm-dev/2016-January/094448.html + * And the generated code doing the emulation is quite a bit worse + * than what we get by doing it ourselves too. */ LLVMTypeRef i32_type = LLVMIntTypeInContext(gallivm->context, 32); LLVMTypeRef i32_vec_type = LLVMVectorType(i32_type, length); @@ -175,7 +327,8 @@ lp_build_gather_avx2(struct gallivm_state *gallivm, src_ptr = LLVMBuildGEP(builder, base_ptr, &offsets, 1, "vector-gep"); char intrinsic[64]; - util_snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%ui%u", length, src_width); + util_snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%u%s%u", + length, dst_type.floating ? "f" : "i", src_width); LLVMValueRef alignment = LLVMConstInt(i32_type, src_width/8, 0); LLVMValueRef mask = LLVMConstAllOnes(i1_vec_type); LLVMValueRef passthru = LLVMGetUndef(src_vec_type); @@ -184,26 +337,35 @@ lp_build_gather_avx2(struct gallivm_state *gallivm, res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 4, 0); } else { - assert(src_width == 32); - LLVMTypeRef i8_type = LLVMIntTypeInContext(gallivm->context, 8); - - /* - * We should get the caller to give more type information so we can use - * the intrinsics for the right int/float domain. Int should be the most - * common. 
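The generic llvm.masked.gather path above builds the intrinsic name from the result shape; with float results the mangling switches from iN to fN. An illustrative expansion (dst_is_float stands in for dst_type.floating and is not a name from the patch):

   char name[64];
   /* e.g. length 4, floating, 32-bit elements -> "llvm.masked.gather.v4f32" */
   util_snprintf(name, sizeof name, "llvm.masked.gather.v%u%s%u",
                 4, dst_is_float ? "f" : "i", 32);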
- */ const char *intrinsic = NULL; - switch (length) { - case 4: - intrinsic = "llvm.x86.avx2.gather.d.d"; - break; - case 8: - intrinsic = "llvm.x86.avx2.gather.d.d.256"; - break; - default: - assert(0); + unsigned l_idx = 0; + + assert(src_width == 32 || src_width == 64); + if (src_width == 32) { + assert(length == 4 || length == 8); + } else { + assert(length == 2 || length == 4); + } + + static const char *intrinsics[2][2][2] = { + + {{"llvm.x86.avx2.gather.d.d", + "llvm.x86.avx2.gather.d.d.256"}, + {"llvm.x86.avx2.gather.d.q", + "llvm.x86.avx2.gather.d.q.256"}}, + + {{"llvm.x86.avx2.gather.d.ps", + "llvm.x86.avx2.gather.d.ps.256"}, + {"llvm.x86.avx2.gather.d.pd", + "llvm.x86.avx2.gather.d.pd.256"}}, + }; + + if ((src_width == 32 && length == 8) || + (src_width == 64 && length == 4)) { + l_idx = 1; } + intrinsic = intrinsics[dst_type.floating][src_width == 64][l_idx]; LLVMValueRef passthru = LLVMGetUndef(src_vec_type); LLVMValueRef mask = LLVMConstAllOnes(src_vec_type); @@ -214,12 +376,7 @@ lp_build_gather_avx2(struct gallivm_state *gallivm, res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 5, 0); } - - if (src_width > dst_width) { - res = LLVMBuildTrunc(builder, res, dst_vec_type, ""); - } else if (src_width < dst_width) { - res = LLVMBuildZExt(builder, res, dst_vec_type, ""); - } + res = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, res_type), ""); return res; } @@ -240,9 +397,11 @@ lp_build_gather_avx2(struct gallivm_state *gallivm, * * @param length length of the offsets * @param src_width src element width in bits - * @param dst_width result element width in bits (src will be expanded to fit) + * @param dst_type result element type (src will be expanded to fit, + * but truncation is not allowed) + * (this may be a vector, must be pot sized) * @param aligned whether the data is guaranteed to be aligned (to src_width) - * @param base_ptr base pointer, should be a i8 pointer type. + * @param base_ptr base pointer, needs to be a i8 pointer type. * @param offsets vector with offsets * @param vector_justify select vector rather than integer justification */ @@ -250,36 +409,174 @@ LLVMValueRef lp_build_gather(struct gallivm_state *gallivm, unsigned length, unsigned src_width, - unsigned dst_width, + struct lp_type dst_type, boolean aligned, LLVMValueRef base_ptr, LLVMValueRef offsets, boolean vector_justify) { LLVMValueRef res; + boolean need_expansion = src_width < dst_type.width * dst_type.length; + boolean vec_fetch; + struct lp_type fetch_type, fetch_dst_type; + LLVMTypeRef src_type; + + assert(src_width <= dst_type.width * dst_type.length); + + /* + * This is quite a mess... + * Figure out if the fetch should be done as: + * a) scalar or vector + * b) float or int + * + * As an example, for a 96bit fetch expanded into 4x32bit, it is better + * to use (3x32bit) vector type (then pad the vector). Otherwise, the + * zext will cause extra instructions. + * However, the same isn't true for 3x16bit (the codegen for that is + * completely worthless on x86 simd, and for 3x8bit is is way worse + * still, don't try that... (To get really good code out of llvm for + * these cases, the only way is to decompose the fetches manually + * into 1x32bit/1x16bit, or 1x16/1x8bit respectively, although the latter + * case requires sse41, otherwise simple scalar zext is way better. + * But probably not important enough, so don't bother.) 
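The heuristic sketched in the comment above boils down to a simple shape test; a condensed model with illustrative parameter names:

   static bool
   use_vector_fetch(unsigned src_width, unsigned dst_width, unsigned dst_len)
   {
      /* e.g. a 96-bit fetch into 4x32: 96 % 32 == 0, so load <3 x i32>
       * and pad the vector, instead of loading i96 and zext'ing */
      return (src_width % 32) == 0 &&
             (src_width % dst_width) == 0 &&
             dst_len > 1;
   }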
+ * Also, we try to honor the floating bit of destination (but isn't + * possible if caller asks for instance for 2x32bit dst_type with + * 48bit fetch - the idea would be to use 3x16bit fetch, pad and + * cast to 2x32f type, so the fetch is always int and on top of that + * we avoid the vec pad and use scalar zext due the above mentioned + * issue). + * Note this is optimized for x86 sse2 and up backend. Could be tweaked + * for other archs if necessary... + */ + if (((src_width % 32) == 0) && ((src_width % dst_type.width) == 0) && + (dst_type.length > 1)) { + /* use vector fetch (if dst_type is vector) */ + vec_fetch = TRUE; + if (dst_type.floating) { + fetch_type = lp_type_float_vec(dst_type.width, src_width); + } else { + fetch_type = lp_type_int_vec(dst_type.width, src_width); + } + /* intentionally not using lp_build_vec_type here */ + src_type = LLVMVectorType(lp_build_elem_type(gallivm, fetch_type), + fetch_type.length); + fetch_dst_type = fetch_type; + fetch_dst_type.length = dst_type.length; + } else { + /* use scalar fetch */ + vec_fetch = FALSE; + if (dst_type.floating && ((src_width == 32) || (src_width == 64))) { + fetch_type = lp_type_float(src_width); + } else { + fetch_type = lp_type_int(src_width); + } + src_type = lp_build_vec_type(gallivm, fetch_type); + fetch_dst_type = fetch_type; + fetch_dst_type.width = dst_type.width * dst_type.length; + } if (length == 1) { /* Scalar */ - return lp_build_gather_elem(gallivm, length, - src_width, dst_width, aligned, - base_ptr, offsets, 0, vector_justify); - } else if (util_cpu_caps.has_avx2 && src_width == 32 && (length == 4 || length == 8)) { - return lp_build_gather_avx2(gallivm, length, src_width, dst_width, base_ptr, offsets); + res = lp_build_gather_elem_vec(gallivm, length, + src_width, src_type, fetch_dst_type, + aligned, base_ptr, offsets, 0, + vector_justify); + return LLVMBuildBitCast(gallivm->builder, res, + lp_build_vec_type(gallivm, dst_type), ""); + /* + * Excluding expansion from these paths because if you need it for + * 32bit/64bit fetches you're doing it wrong (this is gather, not + * conversion) and it would be awkward for floats. + */ + } else if (util_cpu_caps.has_avx2 && !need_expansion && + src_width == 32 && (length == 4 || length == 8)) { + return lp_build_gather_avx2(gallivm, length, src_width, dst_type, + base_ptr, offsets); + /* + * This looks bad on paper wrt throughtput/latency on Haswell. + * Even on Broadwell it doesn't look stellar. + * Albeit no measurements were done (but tested to work). + * Should definitely enable on Skylake. + * (In general, should be more of a win if the fetch is 256bit wide - + * this is true for the 32bit case above too.) + */ + } else if (0 && util_cpu_caps.has_avx2 && !need_expansion && + src_width == 64 && (length == 2 || length == 4)) { + return lp_build_gather_avx2(gallivm, length, src_width, dst_type, + base_ptr, offsets); } else { /* Vector */ - LLVMTypeRef dst_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width); - LLVMTypeRef dst_vec_type = LLVMVectorType(dst_elem_type, length); + LLVMValueRef elems[LP_MAX_VECTOR_WIDTH / 8]; unsigned i; - - res = LLVMGetUndef(dst_vec_type); + boolean vec_zext = FALSE; + struct lp_type res_type, gather_res_type; + LLVMTypeRef res_t, gather_res_t; + + res_type = fetch_dst_type; + res_type.length *= length; + gather_res_type = res_type; + + if (src_width == 16 && dst_type.width == 32 && dst_type.length == 1) { + /* + * Note that llvm is never able to optimize zext/insert combos + * directly (i.e. 
zero the simd reg, then place the elements into + * the appropriate place directly). (I think this has to do with + * scalar/vector transition.) And scalar 16->32bit zext simd loads + * aren't possible (instead loading to scalar reg first). + * No idea about other archs... + * We could do this manually, but instead we just use a vector + * zext, which is simple enough (and, in fact, llvm might optimize + * this away). + * (We're not trying that with other bit widths as that might not be + * easier, in particular with 8 bit values at least with only sse2.) + */ + assert(vec_fetch == FALSE); + gather_res_type.width /= 2; + fetch_dst_type = fetch_type; + src_type = lp_build_vec_type(gallivm, fetch_type); + vec_zext = TRUE; + } + res_t = lp_build_vec_type(gallivm, res_type); + gather_res_t = lp_build_vec_type(gallivm, gather_res_type); + res = LLVMGetUndef(gather_res_t); for (i = 0; i < length; ++i) { LLVMValueRef index = lp_build_const_int32(gallivm, i); - LLVMValueRef elem; - elem = lp_build_gather_elem(gallivm, length, - src_width, dst_width, aligned, - base_ptr, offsets, i, vector_justify); - res = LLVMBuildInsertElement(gallivm->builder, res, elem, index, ""); + elems[i] = lp_build_gather_elem_vec(gallivm, length, + src_width, src_type, fetch_dst_type, + aligned, base_ptr, offsets, i, + vector_justify); + if (!vec_fetch) { + res = LLVMBuildInsertElement(gallivm->builder, res, elems[i], index, ""); + } + } + if (vec_zext) { + res = LLVMBuildZExt(gallivm->builder, res, res_t, ""); + if (vector_justify) { +#ifdef PIPE_ARCH_BIG_ENDIAN + unsigned sv = dst_type.width - src_width; + res = LLVMBuildShl(gallivm->builder, res, + lp_build_const_int_vec(gallivm, res_type, sv), ""); +#endif + } + } + if (vec_fetch) { + /* + * Do bitcast now otherwise llvm might get some funny ideas wrt + * float/int types... 
+ */ + for (i = 0; i < length; i++) { + elems[i] = LLVMBuildBitCast(gallivm->builder, elems[i], + lp_build_vec_type(gallivm, dst_type), ""); + } + res = lp_build_concat(gallivm, elems, dst_type, length); + } else { + struct lp_type really_final_type = dst_type; + assert(res_type.length * res_type.width == + dst_type.length * dst_type.width * length); + really_final_type.length *= length; + res = LLVMBuildBitCast(gallivm->builder, res, + lp_build_vec_type(gallivm, really_final_type), ""); } } diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_gather.h b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_gather.h index 3ede4763a..7930864e6 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_gather.h +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_gather.h @@ -55,7 +55,7 @@ LLVMValueRef lp_build_gather(struct gallivm_state *gallivm, unsigned length, unsigned src_width, - unsigned dst_width, + struct lp_type dst_type, boolean aligned, LLVMValueRef base_ptr, LLVMValueRef offsets, diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_init.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_init.c index fed43e99e..c456a97eb 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_init.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_init.c @@ -48,8 +48,12 @@ # define USE_MCJIT 1 #elif defined(PIPE_ARCH_PPC_64) || defined(PIPE_ARCH_S390) || defined(PIPE_ARCH_ARM) || defined(PIPE_ARCH_AARCH64) # define USE_MCJIT 1 +#endif + +#if defined(USE_MCJIT) +static const bool use_mcjit = USE_MCJIT; #else -static bool USE_MCJIT = 0; +static bool use_mcjit = FALSE; #endif @@ -121,19 +125,6 @@ create_pass_manager(struct gallivm_state *gallivm) LLVMAddTargetData(gallivm->target, gallivm->passmgr); #endif - /* Setting the module's DataLayout to an empty string will cause the - * ExecutionEngine to copy to the DataLayout string from its target - * machine to the module. As of LLVM 3.8 the module and the execution - * engine are required to have the same DataLayout. - * - * TODO: This is just a temporary work-around. The correct solution is - * for gallivm_init_state() to create a TargetMachine and pull the - * DataLayout from there. Currently, the TargetMachine used by llvmpipe - * is being implicitly created by the EngineBuilder in - * lp_build_create_jit_compiler_for_module() - */ - -#if HAVE_LLVM < 0x0308 { char *td_str; // New ones from the Module. @@ -141,9 +132,6 @@ create_pass_manager(struct gallivm_state *gallivm) LLVMSetDataLayout(gallivm->module, td_str); free(td_str); } -#else - LLVMSetDataLayout(gallivm->module, ""); -#endif if ((gallivm_debug & GALLIVM_DEBUG_NO_OPT) == 0) { /* These are the passes currently listed in llvm-c/Transforms/Scalar.h, @@ -190,7 +178,7 @@ gallivm_free_ir(struct gallivm_state *gallivm) FREE(gallivm->module_name); - if (!USE_MCJIT) { + if (!use_mcjit) { /* Don't free the TargetData, it's owned by the exec engine */ } else { if (gallivm->target) { @@ -248,7 +236,7 @@ init_gallivm_engine(struct gallivm_state *gallivm) gallivm->module, gallivm->memorymgr, (unsigned) optlevel, - USE_MCJIT, + use_mcjit, &error); if (ret) { _debug_printf("%s\n", error); @@ -257,7 +245,7 @@ init_gallivm_engine(struct gallivm_state *gallivm) } } - if (!USE_MCJIT) { + if (!use_mcjit) { gallivm->target = LLVMGetExecutionEngineTargetData(gallivm->engine); if (!gallivm->target) goto fail; @@ -336,7 +324,7 @@ init_gallivm_state(struct gallivm_state *gallivm, const char *name, * complete when MC-JIT is created. So defer the MC-JIT engine creation for * now. 
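With the interface change above, gather callers now pass a full lp_type instead of a destination bit width. A usage sketch mirroring the subsampled-format caller earlier in this patch:

   struct lp_type fetch_type = lp_type_uint(32);
   packed = lp_build_gather(gallivm, n, 32, fetch_type,
                            TRUE /*aligned*/, base_ptr, offset,
                            FALSE /*vector_justify*/);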
*/ - if (!USE_MCJIT) { + if (!use_mcjit) { if (!init_gallivm_engine(gallivm)) { goto fail; } @@ -395,10 +383,21 @@ lp_build_init(void) if (gallivm_initialized) return TRUE; - LLVMLinkInMCJIT(); -#if !defined(USE_MCJIT) - USE_MCJIT = debug_get_bool_option("GALLIVM_MCJIT", 0); + + /* LLVMLinkIn* are no-ops at runtime. They just ensure the respective + * component is linked at buildtime, which is sufficient for its static + * constructors to be called at load time. + */ +#if defined(USE_MCJIT) +# if USE_MCJIT + LLVMLinkInMCJIT(); +# else + LLVMLinkInJIT(); +# endif +#else + use_mcjit = debug_get_bool_option("GALLIVM_MCJIT", FALSE); LLVMLinkInJIT(); + LLVMLinkInMCJIT(); #endif #ifdef DEBUG @@ -457,7 +456,7 @@ lp_build_init(void) util_cpu_caps.has_f16c = 0; util_cpu_caps.has_fma = 0; } - if (HAVE_LLVM < 0x0304 || !USE_MCJIT) { + if (HAVE_LLVM < 0x0304 || !use_mcjit) { /* AVX2 support has only been tested with LLVM 3.4, and it requires * MCJIT. */ util_cpu_caps.has_avx2 = 0; @@ -607,12 +606,30 @@ gallivm_compile_module(struct gallivm_state *gallivm) LLVMWriteBitcodeToFile(gallivm->module, filename); debug_printf("%s written\n", filename); debug_printf("Invoke as \"llc %s%s -o - %s\"\n", - (HAVE_LLVM >= 0x0305) ? "[-mcpu=<-mcpu option] " : "", + (HAVE_LLVM >= 0x0305) ? "[-mcpu=<-mcpu option>] " : "", "[-mattr=<-mattr option(s)>]", filename); } - if (USE_MCJIT) { + if (use_mcjit) { + /* Setting the module's DataLayout to an empty string will cause the + * ExecutionEngine to copy to the DataLayout string from its target + * machine to the module. As of LLVM 3.8 the module and the execution + * engine are required to have the same DataLayout. + * + * We must make sure we do this after running the optimization passes, + * because those passes need a correct datalayout string. For example, + * if those optimization passes see an empty datalayout, they will assume + * this is a little endian target and will do optimizations that break big + * endian machines. + * + * TODO: This is just a temporary work-around. The correct solution is + * for gallivm_init_state() to create a TargetMachine and pull the + * DataLayout from there. 
Currently, the TargetMachine used by llvmpipe + * is being implicitly created by the EngineBuilder in + * lp_build_create_jit_compiler_for_module() + */ + LLVMSetDataLayout(gallivm->module, ""); assert(!gallivm->engine); if (!init_gallivm_engine(gallivm)) { assert(0); diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_intr.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_intr.c index f12e735b5..b92455593 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_intr.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_intr.c @@ -46,6 +46,7 @@ #include "util/u_debug.h" #include "util/u_string.h" +#include "util/bitscan.h" #include "lp_bld_const.h" #include "lp_bld_intr.h" @@ -120,16 +121,113 @@ lp_declare_intrinsic(LLVMModuleRef module, } +#if HAVE_LLVM < 0x0400 +static LLVMAttribute lp_attr_to_llvm_attr(enum lp_func_attr attr) +{ + switch (attr) { + case LP_FUNC_ATTR_ALWAYSINLINE: return LLVMAlwaysInlineAttribute; + case LP_FUNC_ATTR_BYVAL: return LLVMByValAttribute; + case LP_FUNC_ATTR_INREG: return LLVMInRegAttribute; + case LP_FUNC_ATTR_NOALIAS: return LLVMNoAliasAttribute; + case LP_FUNC_ATTR_NOUNWIND: return LLVMNoUnwindAttribute; + case LP_FUNC_ATTR_READNONE: return LLVMReadNoneAttribute; + case LP_FUNC_ATTR_READONLY: return LLVMReadOnlyAttribute; + default: + _debug_printf("Unhandled function attribute: %x\n", attr); + return 0; + } +} + +#else + +static const char *attr_to_str(enum lp_func_attr attr) +{ + switch (attr) { + case LP_FUNC_ATTR_ALWAYSINLINE: return "alwaysinline"; + case LP_FUNC_ATTR_BYVAL: return "byval"; + case LP_FUNC_ATTR_INREG: return "inreg"; + case LP_FUNC_ATTR_NOALIAS: return "noalias"; + case LP_FUNC_ATTR_NOUNWIND: return "nounwind"; + case LP_FUNC_ATTR_READNONE: return "readnone"; + case LP_FUNC_ATTR_READONLY: return "readonly"; + case LP_FUNC_ATTR_WRITEONLY: return "writeonly"; + case LP_FUNC_ATTR_INACCESSIBLE_MEM_ONLY: return "inaccessiblememonly"; + case LP_FUNC_ATTR_CONVERGENT: return "convergent"; + default: + _debug_printf("Unhandled function attribute: %x\n", attr); + return 0; + } +} + +#endif + +void +lp_add_function_attr(LLVMValueRef function_or_call, + int attr_idx, enum lp_func_attr attr) +{ + +#if HAVE_LLVM < 0x0400 + LLVMAttribute llvm_attr = lp_attr_to_llvm_attr(attr); + if (LLVMIsAFunction(function_or_call)) { + if (attr_idx == -1) { + LLVMAddFunctionAttr(function_or_call, llvm_attr); + } else { + LLVMAddAttribute(LLVMGetParam(function_or_call, attr_idx - 1), llvm_attr); + } + } else { + LLVMAddInstrAttribute(function_or_call, attr_idx, llvm_attr); + } +#else + + LLVMModuleRef module; + if (LLVMIsAFunction(function_or_call)) { + module = LLVMGetGlobalParent(function_or_call); + } else { + LLVMBasicBlockRef bb = LLVMGetInstructionParent(function_or_call); + LLVMValueRef function = LLVMGetBasicBlockParent(bb); + module = LLVMGetGlobalParent(function); + } + LLVMContextRef ctx = LLVMGetModuleContext(module); + + const char *attr_name = attr_to_str(attr); + unsigned kind_id = LLVMGetEnumAttributeKindForName(attr_name, + strlen(attr_name)); + LLVMAttributeRef llvm_attr = LLVMCreateEnumAttribute(ctx, kind_id, 0); + + if (LLVMIsAFunction(function_or_call)) + LLVMAddAttributeAtIndex(function_or_call, attr_idx, llvm_attr); + else + LLVMAddCallSiteAttribute(function_or_call, attr_idx, llvm_attr); +#endif +} + +static void +lp_add_func_attributes(LLVMValueRef function, unsigned attrib_mask) +{ + /* NoUnwind indicates that the intrinsic never raises a C++ exception. + * Set it for all intrinsics. 
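lp_add_function_attr unifies the pre-4.0 LLVMAttribute path and the newer string-attribute path behind one call. A usage sketch (attr_idx is 1-based for parameters; -1 targets the function itself, matching the sampler-function change later in this patch):

   /* mark pointer parameter i of 'function' noalias */
   lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);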
+ */ + attrib_mask |= LP_FUNC_ATTR_NOUNWIND; + attrib_mask &= ~LP_FUNC_ATTR_LEGACY; + + while (attrib_mask) { + enum lp_func_attr attr = 1u << u_bit_scan(&attrib_mask); + lp_add_function_attr(function, -1, attr); + } +} + LLVMValueRef lp_build_intrinsic(LLVMBuilderRef builder, const char *name, LLVMTypeRef ret_type, LLVMValueRef *args, unsigned num_args, - LLVMAttribute attr) + unsigned attr_mask) { LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder))); - LLVMValueRef function; + LLVMValueRef function, call; + bool set_callsite_attrs = HAVE_LLVM >= 0x0400 && + !(attr_mask & LP_FUNC_ATTR_LEGACY); function = LLVMGetNamedFunction(module, name); if(!function) { @@ -145,17 +243,18 @@ lp_build_intrinsic(LLVMBuilderRef builder, function = lp_declare_intrinsic(module, name, ret_type, arg_types, num_args); - /* NoUnwind indicates that the intrinsic never raises a C++ exception. - * Set it for all intrinsics. - */ - LLVMAddFunctionAttr(function, attr | LLVMNoUnwindAttribute); + if (!set_callsite_attrs) + lp_add_func_attributes(function, attr_mask); if (gallivm_debug & GALLIVM_DEBUG_IR) { lp_debug_dump_value(function); } } - return LLVMBuildCall(builder, function, args, num_args, ""); + call = LLVMBuildCall(builder, function, args, num_args, ""); + if (set_callsite_attrs) + lp_add_func_attributes(call, attr_mask); + return call; } @@ -243,9 +342,9 @@ lp_build_intrinsic_binary_anylength(struct gallivm_state *gallivm, unsigned num_vec = src_type.length / intrin_length; LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH]; - /*Â don't support arbitrary size here as this is so yuck */ + /* don't support arbitrary size here as this is so yuck */ if (src_type.length % intrin_length) { - /*Â FIXME: This is something which should be supported + /* FIXME: This is something which should be supported * but there doesn't seem to be any need for it currently * so crash and burn. */ diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_intr.h b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_intr.h index 7d80ac28f..0a929c519 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_intr.h +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_intr.h @@ -46,6 +46,24 @@ */ #define LP_MAX_FUNC_ARGS 32 +enum lp_func_attr { + LP_FUNC_ATTR_ALWAYSINLINE = (1 << 0), + LP_FUNC_ATTR_BYVAL = (1 << 1), + LP_FUNC_ATTR_INREG = (1 << 2), + LP_FUNC_ATTR_NOALIAS = (1 << 3), + LP_FUNC_ATTR_NOUNWIND = (1 << 4), + LP_FUNC_ATTR_READNONE = (1 << 5), + LP_FUNC_ATTR_READONLY = (1 << 6), + LP_FUNC_ATTR_WRITEONLY = HAVE_LLVM >= 0x0400 ? (1 << 7) : 0, + LP_FUNC_ATTR_INACCESSIBLE_MEM_ONLY = HAVE_LLVM >= 0x0400 ? (1 << 8) : 0, + LP_FUNC_ATTR_CONVERGENT = HAVE_LLVM >= 0x0400 ? (1 << 9) : 0, + + /* Legacy intrinsic that needs attributes on function declarations + * and they must match the internal LLVM definition exactly, otherwise + * intrinsic selection fails. 
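With the attribute-mask parameter, callers OR together lp_func_attr flags instead of passing a raw LLVMAttribute; on LLVM >= 4.0 (non-legacy intrinsics) the attributes land on the call site rather than the declaration. An illustrative call, using a real LLVM intrinsic purely as an example:

   LLVMValueRef res =
      lp_build_intrinsic(builder, "llvm.sqrt.f32", f32_type,
                         &arg, 1, LP_FUNC_ATTR_READNONE);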
+ */ + LP_FUNC_ATTR_LEGACY = (1u << 31), +}; void lp_format_intrinsic(char *name, @@ -60,13 +78,17 @@ lp_declare_intrinsic(LLVMModuleRef module, LLVMTypeRef *arg_types, unsigned num_args); +void +lp_add_function_attr(LLVMValueRef function_or_call, + int attr_idx, enum lp_func_attr attr); + LLVMValueRef lp_build_intrinsic(LLVMBuilderRef builder, const char *name, LLVMTypeRef ret_type, LLVMValueRef *args, unsigned num_args, - LLVMAttribute attr); + unsigned attr_mask); LLVMValueRef diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_limits.h b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_limits.h index 32addec97..354e2a46b 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_limits.h +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_limits.h @@ -49,8 +49,6 @@ #define LP_MAX_TGSI_IMMEDIATES 4096 -#define LP_MAX_TGSI_PREDS 16 - #define LP_MAX_TGSI_CONSTS 4096 #define LP_MAX_TGSI_CONST_BUFFERS 16 @@ -109,8 +107,6 @@ gallivm_get_shader_param(enum pipe_shader_cap param) return PIPE_MAX_CONSTANT_BUFFERS; case PIPE_SHADER_CAP_MAX_TEMPS: return LP_MAX_TGSI_TEMPS; - case PIPE_SHADER_CAP_MAX_PREDS: - return LP_MAX_TGSI_PREDS; case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: return 1; case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: @@ -133,13 +129,13 @@ gallivm_get_shader_param(enum pipe_shader_cap param) case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED: case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: return 1; - case PIPE_SHADER_CAP_DOUBLES: - return 1; case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: + case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD: + case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS: return 0; case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: return 32; diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_logic.c index 1a50e82c2..524917abe 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_logic.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_logic.c @@ -327,6 +327,8 @@ lp_build_select(struct lp_build_context *bld, * supported yet for a long time, and LLVM will generate poor code when * the mask is not the result of a comparison. * Also, llvm 3.7 may miscompile them (bug 94972). + * XXX: Even if the instruction was an SExt, this may still produce + * terrible code. Try piglit stencil-twoside. */ /* Convert the mask to a vector of booleans. 
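The select caveat above comes down to mask shape: unless each lane is all-ones or all-zeros (a comparison result, or a sign-extended boolean), a bitwise blend is meaningless. A scalar model of the per-lane select this code depends on:

   static inline uint32_t
   select_by_mask(uint32_t mask, uint32_t a, uint32_t b)
   {
      /* mask is expected to be 0xffffffff or 0x00000000 per lane */
      return (a & mask) | (b & ~mask);
   }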
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp index 3efb6a8e7..d988910a7 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp @@ -32,14 +32,6 @@ */ -#ifndef __STDC_LIMIT_MACROS -#define __STDC_LIMIT_MACROS -#endif - -#ifndef __STDC_CONSTANT_MACROS -#define __STDC_CONSTANT_MACROS -#endif - // Undef these vars just to silence warnings #undef PACKAGE_BUGREPORT #undef PACKAGE_NAME @@ -57,6 +49,9 @@ #endif #include <llvm-c/Core.h> +#if HAVE_LLVM >= 0x0306 +#include <llvm-c/Support.h> +#endif #include <llvm-c/ExecutionEngine.h> #include <llvm/Target/TargetOptions.h> #include <llvm/ExecutionEngine/ExecutionEngine.h> @@ -77,6 +72,9 @@ #include <llvm/Support/TargetSelect.h> +#if HAVE_LLVM >= 0x0305 +#include <llvm/IR/CallSite.h> +#endif #include <llvm/IR/IRBuilder.h> #include <llvm/IR/Module.h> #include <llvm/Support/CBindingWrapping.h> @@ -127,20 +125,26 @@ static void init_native_targets() llvm::InitializeNativeTargetAsmPrinter(); llvm::InitializeNativeTargetDisassembler(); -} - -/** - * The llvm target registry is not thread-safe, so drivers and state-trackers - * that want to initialize targets should use the gallivm_init_llvm_targets() - * function to safely initialize targets. - * - * LLVM targets should be initialized before the driver or state-tracker tries - * to access the registry. - */ -extern "C" void -gallivm_init_llvm_targets(void) -{ - call_once(&init_native_targets_once_flag, init_native_targets); +#if DEBUG && HAVE_LLVM >= 0x0306 + { + char *env_llc_options = getenv("GALLIVM_LLC_OPTIONS"); + if (env_llc_options) { + char *option; + char *options[64] = {(char *) "llc"}; // Warning without cast + int n; + for (n = 0, option = strtok(env_llc_options, " "); option; n++, option = strtok(NULL, " ")) { + options[n + 1] = option; + } + if (gallivm_debug & (GALLIVM_DEBUG_IR | GALLIVM_DEBUG_ASM | GALLIVM_DEBUG_DUMP_BC)) { + debug_printf("llc additional options (%d):\n", n); + for (int i = 1; i <= n; i++) + debug_printf("\t%s\n", options[i]); + debug_printf("\n"); + } + LLVMParseCommandLineOptions(n + 1, options, NULL); + } + } +#endif } extern "C" void @@ -155,7 +159,14 @@ lp_set_target_options(void) llvm::DisablePrettyStackTrace = true; #endif - gallivm_init_llvm_targets(); + /* The llvm target registry is not thread-safe, so drivers and state-trackers + * that want to initialize targets should use the lp_set_target_options() + * function to safely initialize targets. + * + * LLVM targets should be initialized before the driver or state-tracker tries + * to access the registry. 
+ */ + call_once(&init_native_targets_once_flag, init_native_targets); } extern "C" @@ -347,14 +358,20 @@ class DelegatingJITMemoryManager : public BaseMemoryManager { virtual void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) { mgr()->registerEHFrames(Addr, LoadAddr, Size); } - virtual void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) { - mgr()->deregisterEHFrames(Addr, LoadAddr, Size); - } #else virtual void registerEHFrames(llvm::StringRef SectionData) { mgr()->registerEHFrames(SectionData); } #endif +#if HAVE_LLVM >= 0x0500 + virtual void deregisterEHFrames() { + mgr()->deregisterEHFrames(); + } +#elif HAVE_LLVM >= 0x0304 + virtual void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) { + mgr()->deregisterEHFrames(Addr, LoadAddr, Size); + } +#endif virtual void *getPointerToNamedFunction(const std::string &Name, bool AbortOnFailure=true) { return mgr()->getPointerToNamedFunction(Name, AbortOnFailure); @@ -540,6 +557,20 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT, llvm::SmallVector<std::string, 16> MAttrs; #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) +#if HAVE_LLVM >= 0x0400 + /* llvm-3.7+ implements sys::getHostCPUFeatures for x86, + * which allows us to enable/disable code generation based + * on the results of cpuid. + */ + llvm::StringMap<bool> features; + llvm::sys::getHostCPUFeatures(features); + + for (StringMapIterator<bool> f = features.begin(); + f != features.end(); + ++f) { + MAttrs.push_back(((*f).second ? "+" : "-") + (*f).first().str()); + } +#else /* * We need to unset attributes because sometimes LLVM mistakenly assumes * certain features are present given the processor name. @@ -594,27 +625,51 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT, MAttrs.push_back("-avx512vl"); #endif #endif +#endif #if defined(PIPE_ARCH_PPC) MAttrs.push_back(util_cpu_caps.has_altivec ? "+altivec" : "-altivec"); #if (HAVE_LLVM >= 0x0304) -#if (HAVE_LLVM <= 0x0307) || (HAVE_LLVM == 0x0308 && MESA_LLVM_VERSION_PATCH == 0) +#if (HAVE_LLVM < 0x0400) /* * Make sure VSX instructions are disabled - * See LLVM bug https://llvm.org/bugs/show_bug.cgi?id=25503#c7 + * See LLVM bugs: + * https://llvm.org/bugs/show_bug.cgi?id=25503#c7 (fixed in 3.8.1) + * https://llvm.org/bugs/show_bug.cgi?id=26775 (fixed in 3.8.1) + * https://llvm.org/bugs/show_bug.cgi?id=33531 (fixed in 4.0) + * https://llvm.org/bugs/show_bug.cgi?id=34647 (llc performance on certain unusual shader IR; intro'd in 4.0, pending as of 5.0) */ if (util_cpu_caps.has_altivec) { MAttrs.push_back("-vsx"); } #else /* - * However, bug 25503 is fixed, by the same fix that fixed - * bug 26775, in versions of LLVM later than 3.8 (starting with 3.8.1): - * Make sure VSX instructions are ENABLED - * See LLVM bug https://llvm.org/bugs/show_bug.cgi?id=26775 + * Bug 25503 is fixed, by the same fix that fixed + * bug 26775, in versions of LLVM later than 3.8 (starting with 3.8.1). + * BZ 33531 actually comprises more than one bug, all of + * which are fixed in LLVM 4.0. + * + * With LLVM 4.0 or higher: + * Make sure VSX instructions are ENABLED, unless + * a) the entire -mattr option is overridden via GALLIVM_MATTRS, or + * b) VSX instructions are explicitly enabled/disabled via GALLIVM_VSX=1 or 0. 
*/ if (util_cpu_caps.has_altivec) { - MAttrs.push_back("+vsx"); + char *env_mattrs = getenv("GALLIVM_MATTRS"); + if (env_mattrs) { + MAttrs.push_back(env_mattrs); + } + else { + boolean enable_vsx = true; + char *env_vsx = getenv("GALLIVM_VSX"); + if (env_vsx && env_vsx[0] == '0') { + enable_vsx = false; + } + if (enable_vsx) + MAttrs.push_back("+vsx"); + else + MAttrs.push_back("-vsx"); + } } #endif #endif @@ -737,13 +792,49 @@ lp_free_memory_manager(LLVMMCJITMemoryManagerRef memorymgr) delete reinterpret_cast<BaseMemoryManager*>(memorymgr); } -extern "C" void -lp_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes) +extern "C" LLVMValueRef +lp_get_called_value(LLVMValueRef call) { -#if HAVE_LLVM >= 0x0306 - llvm::Argument *A = llvm::unwrap<llvm::Argument>(val); - llvm::AttrBuilder B; - B.addDereferenceableAttr(bytes); - A->addAttr(llvm::AttributeSet::get(A->getContext(), A->getArgNo() + 1, B)); +#if HAVE_LLVM >= 0x0309 + return LLVMGetCalledValue(call); +#elif HAVE_LLVM >= 0x0305 + return llvm::wrap(llvm::CallSite(llvm::unwrap<llvm::Instruction>(call)).getCalledValue()); +#else + return NULL; /* radeonsi doesn't support so old LLVM. */ +#endif +} + +extern "C" bool +lp_is_function(LLVMValueRef v) +{ +#if HAVE_LLVM >= 0x0309 + return LLVMGetValueKind(v) == LLVMFunctionValueKind; +#else + return llvm::isa<llvm::Function>(llvm::unwrap(v)); +#endif +} + +extern "C" LLVMBuilderRef +lp_create_builder(LLVMContextRef ctx, enum lp_float_mode float_mode) +{ + LLVMBuilderRef builder = LLVMCreateBuilderInContext(ctx); + +#if HAVE_LLVM >= 0x0308 + llvm::FastMathFlags flags; + + switch (float_mode) { + case LP_FLOAT_MODE_DEFAULT: + break; + case LP_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH: + flags.setNoSignedZeros(); + llvm::unwrap(builder)->setFastMathFlags(flags); + break; + case LP_FLOAT_MODE_UNSAFE_FP_MATH: + flags.setUnsafeAlgebra(); + llvm::unwrap(builder)->setFastMathFlags(flags); + break; + } #endif + + return builder; } diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.h b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.h index c127c480d..1b725d10d 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.h +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.h @@ -42,9 +42,6 @@ extern "C" { struct lp_generated_code; -extern void -gallivm_init_llvm_targets(void); - extern LLVMTargetLibraryInfoRef gallivm_create_target_library_info(const char *triple); @@ -73,8 +70,20 @@ lp_get_default_memory_manager(); extern void lp_free_memory_manager(LLVMMCJITMemoryManagerRef memorymgr); -extern void -lp_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes); +extern LLVMValueRef +lp_get_called_value(LLVMValueRef call); + +extern bool +lp_is_function(LLVMValueRef v); + +enum lp_float_mode { + LP_FLOAT_MODE_DEFAULT, + LP_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH, + LP_FLOAT_MODE_UNSAFE_FP_MATH, +}; + +extern LLVMBuilderRef +lp_create_builder(LLVMContextRef ctx, enum lp_float_mode float_mode); #ifdef __cplusplus } diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample.c index a4b3a7b83..a1dc61d40 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample.c @@ -1412,8 +1412,8 @@ lp_build_unnormalized_coords(struct lp_build_sample_context *bld, { const unsigned dims = bld->dims; LLVMValueRef width; - LLVMValueRef height; - LLVMValueRef depth; + LLVMValueRef height = NULL; + LLVMValueRef depth = NULL; lp_build_extract_image_sizes(bld, 
&bld->float_size_bld, diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c index f91b761dc..c46749dba 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c @@ -579,10 +579,12 @@ lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld, LLVMValueRef rgba8; struct lp_build_context u8n; LLVMTypeRef u8n_vec_type; + struct lp_type fetch_type; lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width)); u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type); + fetch_type = lp_type_uint(bld->texel_type.width); if (util_format_is_rgba8_variant(bld->format_desc)) { /* * Given the format is a rgba8, just read the pixels as is, @@ -591,7 +593,7 @@ lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld, rgba8 = lp_build_gather(bld->gallivm, bld->texel_type.length, bld->format_desc->block.bits, - bld->texel_type.width, + fetch_type, TRUE, data_ptr, offset, TRUE); @@ -925,14 +927,16 @@ lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld, LLVMValueRef rgba8; if (util_format_is_rgba8_variant(bld->format_desc)) { + struct lp_type fetch_type; /* * Given the format is a rgba8, just read the pixels as is, * without any swizzling. Swizzling will be done later. */ + fetch_type = lp_type_uint(bld->texel_type.width); rgba8 = lp_build_gather(bld->gallivm, bld->texel_type.length, bld->format_desc->block.bits, - bld->texel_type.width, + fetch_type, TRUE, data_ptr, offset[k][j][i], TRUE); diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c index 1477a72d6..cb4660e42 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c @@ -60,6 +60,7 @@ #include "lp_bld_struct.h" #include "lp_bld_quad.h" #include "lp_bld_pack.h" +#include "lp_bld_intr.h" /** @@ -158,7 +159,7 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld, lp_build_fetch_rgba_soa(bld->gallivm, bld->format_desc, - bld->texel_type, + bld->texel_type, TRUE, data_ptr, offset, i, j, bld->cache, @@ -2405,7 +2406,7 @@ lp_build_fetch_texel(struct lp_build_sample_context *bld, lp_build_fetch_rgba_soa(bld->gallivm, bld->format_desc, - bld->texel_type, + bld->texel_type, TRUE, bld->base_ptr, offset, i, j, bld->cache, @@ -3316,7 +3317,8 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm, for (i = 0; i < num_param; ++i) { if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) { - LLVMAddAttribute(LLVMGetParam(function, i), LLVMNoAliasAttribute); + + lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS); } } @@ -3460,7 +3462,7 @@ lp_build_size_query_soa(struct gallivm_state *gallivm, struct lp_sampler_dynamic_state *dynamic_state, const struct lp_sampler_size_query_params *params) { - LLVMValueRef lod, level, size; + LLVMValueRef lod, level = 0, size; LLVMValueRef first_level = NULL; int dims, i; boolean has_array; diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c index 68ac69538..69863ab93 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c @@ -323,16 +323,14 @@ lp_build_tgsi_inst_llvm( LLVMValueRef -lp_build_emit_fetch( +lp_build_emit_fetch_src( struct lp_build_tgsi_context *bld_base, - const struct 
tgsi_full_instruction *inst, - unsigned src_op, + const struct tgsi_full_src_register *reg, + enum tgsi_opcode_type stype, const unsigned chan_index) { - const struct tgsi_full_src_register *reg = &inst->Src[src_op]; unsigned swizzle; LLVMValueRef res; - enum tgsi_opcode_type stype = tgsi_opcode_infer_src_type(inst->Instruction.Opcode); if (chan_index == LP_CHAN_ALL) { swizzle = ~0u; @@ -360,7 +358,7 @@ lp_build_emit_fetch( case TGSI_TYPE_DOUBLE: case TGSI_TYPE_UNTYPED: /* modifiers on movs assume data is float */ - res = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_ABS, res); + res = lp_build_abs(&bld_base->base, res); break; case TGSI_TYPE_UNSIGNED: case TGSI_TYPE_SIGNED: @@ -413,7 +411,21 @@ lp_build_emit_fetch( } return res; +} + + +LLVMValueRef +lp_build_emit_fetch( + struct lp_build_tgsi_context *bld_base, + const struct tgsi_full_instruction *inst, + unsigned src_op, + const unsigned chan_index) +{ + const struct tgsi_full_src_register *reg = &inst->Src[src_op]; + enum tgsi_opcode_type stype = + tgsi_opcode_infer_src_type(inst->Instruction.Opcode); + return lp_build_emit_fetch_src(bld_base, reg, stype, chan_index); } diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h index b6b3fe369..eb632b700 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h @@ -458,7 +458,6 @@ struct lp_build_tgsi_soa_context LLVMValueRef immediates[LP_MAX_INLINED_IMMEDIATES][TGSI_NUM_CHANNELS]; LLVMValueRef temps[LP_MAX_INLINED_TEMPS][TGSI_NUM_CHANNELS]; LLVMValueRef addr[LP_MAX_TGSI_ADDRS][TGSI_NUM_CHANNELS]; - LLVMValueRef preds[LP_MAX_TGSI_PREDS][TGSI_NUM_CHANNELS]; /* We allocate/use this array of temps if (1 << TGSI_FILE_TEMPORARY) is * set in the indirect_files field. @@ -552,7 +551,6 @@ struct lp_build_tgsi_aos_context LLVMValueRef immediates[LP_MAX_INLINED_IMMEDIATES]; LLVMValueRef temps[LP_MAX_INLINED_TEMPS]; LLVMValueRef addr[LP_MAX_TGSI_ADDRS]; - LLVMValueRef preds[LP_MAX_TGSI_PREDS]; /* We allocate/use this array of temps if (1 << TGSI_FILE_TEMPORARY) is * set in the indirect_files field. 
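After this split, lp_build_emit_fetch is a thin wrapper, and code holding a bare source register can fetch directly. A usage sketch applying the same type inference the wrapper performs:

   enum tgsi_opcode_type stype =
      tgsi_opcode_infer_src_type(inst->Instruction.Opcode);
   LLVMValueRef val =
      lp_build_emit_fetch_src(bld_base, &inst->Src[0], stype, TGSI_CHAN_X);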
@@ -645,6 +643,13 @@ lp_build_tgsi_inst_llvm( const struct tgsi_full_instruction *inst); LLVMValueRef +lp_build_emit_fetch_src( + struct lp_build_tgsi_context *bld_base, + const struct tgsi_full_src_register *reg, + enum tgsi_opcode_type stype, + const unsigned chan_index); + +LLVMValueRef lp_build_emit_fetch( struct lp_build_tgsi_context *bld_base, const struct tgsi_full_instruction *inst, diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c index 2e837afe2..dc6568a2d 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c @@ -110,21 +110,6 @@ arr_emit( bld_base->uint_bld.vec_type, ""); } -/* TGSI_OPCODE_CLAMP */ -static void -clamp_emit( - const struct lp_build_tgsi_action * action, - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) -{ - LLVMValueRef tmp; - tmp = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX, - emit_data->args[0], - emit_data->args[1]); - emit_data->output[emit_data->chan] = lp_build_emit_llvm_binary(bld_base, - TGSI_OPCODE_MIN, tmp, emit_data->args[2]); -} - /* DP* Helper */ static void @@ -368,8 +353,8 @@ exp_emit( TGSI_OPCODE_EX2, floor_x); /* src0.x - floor( src0.x ) */ - emit_data->output[TGSI_CHAN_Y] = lp_build_emit_llvm_binary(bld_base, - TGSI_OPCODE_SUB, emit_data->args[0] /* src0.x */, floor_x); + emit_data->output[TGSI_CHAN_Y] = + lp_build_sub(&bld_base->base, emit_data->args[0] /* src0.x */, floor_x); /* 2 ^ src0.x */ emit_data->output[TGSI_CHAN_Z] = lp_build_emit_llvm_unary(bld_base, @@ -394,8 +379,8 @@ frc_emit( LLVMValueRef tmp; tmp = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_FLR, emit_data->args[0]); - emit_data->output[emit_data->chan] = lp_build_emit_llvm_binary(bld_base, - TGSI_OPCODE_SUB, emit_data->args[0], tmp); + emit_data->output[emit_data->chan] = + lp_build_sub(&bld_base->base, emit_data->args[0], tmp); } /* TGSI_OPCODE_KILL_IF */ @@ -499,8 +484,7 @@ log_emit( LLVMValueRef abs_x, log_abs_x, flr_log_abs_x, ex2_flr_log_abs_x; /* abs( src0.x) */ - abs_x = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_ABS, - emit_data->args[0] /* src0.x */); + abs_x = lp_build_abs(&bld_base->base, emit_data->args[0] /* src0.x */); /* log( abs( src0.x ) ) */ log_abs_x = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_LG2, @@ -771,19 +755,6 @@ const struct lp_build_tgsi_action scs_action = { scs_emit /* emit */ }; -/* TGSI_OPCODE_SUB */ -static void -sub_emit( - const struct lp_build_tgsi_action * action, - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) -{ - emit_data->output[emit_data->chan] = - LLVMBuildFSub(bld_base->base.gallivm->builder, - emit_data->args[0], - emit_data->args[1], ""); -} - /* TGSI_OPCODE_F2U */ static void f2u_emit( @@ -842,26 +813,32 @@ imul_hi_emit( struct lp_build_tgsi_context * bld_base, struct lp_build_emit_data * emit_data) { - LLVMBuilderRef builder = bld_base->base.gallivm->builder; struct lp_build_context *int_bld = &bld_base->int_bld; - struct lp_type type = int_bld->type; - LLVMValueRef src0, src1; - LLVMValueRef dst64; - LLVMTypeRef typeRef; - - assert(type.width == 32); - type.width = 64; - typeRef = lp_build_vec_type(bld_base->base.gallivm, type); - src0 = LLVMBuildSExt(builder, emit_data->args[0], typeRef, ""); - src1 = LLVMBuildSExt(builder, emit_data->args[1], typeRef, ""); - dst64 = LLVMBuildMul(builder, src0, src1, ""); - dst64 = LLVMBuildAShr( - builder, dst64, - 
lp_build_const_vec(bld_base->base.gallivm, type, 32), ""); - type.width = 32; - typeRef = lp_build_vec_type(bld_base->base.gallivm, type); - emit_data->output[emit_data->chan] = - LLVMBuildTrunc(builder, dst64, typeRef, ""); + LLVMValueRef hi_bits; + + assert(int_bld->type.width == 32); + + /* low result bits are tossed away */ + lp_build_mul_32_lohi(int_bld, emit_data->args[0], + emit_data->args[1], &hi_bits); + emit_data->output[emit_data->chan] = hi_bits; +} + +static void +imul_hi_emit_cpu( + const struct lp_build_tgsi_action * action, + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + struct lp_build_context *int_bld = &bld_base->int_bld; + LLVMValueRef hi_bits; + + assert(int_bld->type.width == 32); + + /* low result bits are tossed away */ + lp_build_mul_32_lohi_cpu(int_bld, emit_data->args[0], + emit_data->args[1], &hi_bits); + emit_data->output[emit_data->chan] = hi_bits; } /* TGSI_OPCODE_UMUL_HI */ @@ -871,26 +848,32 @@ umul_hi_emit( struct lp_build_tgsi_context * bld_base, struct lp_build_emit_data * emit_data) { - LLVMBuilderRef builder = bld_base->base.gallivm->builder; struct lp_build_context *uint_bld = &bld_base->uint_bld; - struct lp_type type = uint_bld->type; - LLVMValueRef src0, src1; - LLVMValueRef dst64; - LLVMTypeRef typeRef; - - assert(type.width == 32); - type.width = 64; - typeRef = lp_build_vec_type(bld_base->base.gallivm, type); - src0 = LLVMBuildZExt(builder, emit_data->args[0], typeRef, ""); - src1 = LLVMBuildZExt(builder, emit_data->args[1], typeRef, ""); - dst64 = LLVMBuildMul(builder, src0, src1, ""); - dst64 = LLVMBuildLShr( - builder, dst64, - lp_build_const_vec(bld_base->base.gallivm, type, 32), ""); - type.width = 32; - typeRef = lp_build_vec_type(bld_base->base.gallivm, type); - emit_data->output[emit_data->chan] = - LLVMBuildTrunc(builder, dst64, typeRef, ""); + LLVMValueRef hi_bits; + + assert(uint_bld->type.width == 32); + + /* low result bits are tossed away */ + lp_build_mul_32_lohi(uint_bld, emit_data->args[0], + emit_data->args[1], &hi_bits); + emit_data->output[emit_data->chan] = hi_bits; +} + +static void +umul_hi_emit_cpu( + const struct lp_build_tgsi_action * action, + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + struct lp_build_context *uint_bld = &bld_base->uint_bld; + LLVMValueRef hi_bits; + + assert(uint_bld->type.width == 32); + + /* low result bits are tossed away */ + lp_build_mul_32_lohi_cpu(uint_bld, emit_data->args[0], + emit_data->args[1], &hi_bits); + emit_data->output[emit_data->chan] = hi_bits; } /* TGSI_OPCODE_MAX */ @@ -945,7 +928,7 @@ xpd_helper( tmp0 = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MUL, a, b); tmp1 = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MUL, c, d); - return lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_SUB, tmp0, tmp1); + return lp_build_sub(&bld_base->base, tmp0, tmp1); } static void @@ -1332,7 +1315,6 @@ lp_set_default_actions(struct lp_build_tgsi_context * bld_base) bld_base->op_actions[TGSI_OPCODE_ADD].emit = add_emit; bld_base->op_actions[TGSI_OPCODE_ARR].emit = arr_emit; - bld_base->op_actions[TGSI_OPCODE_CLAMP].emit = clamp_emit; bld_base->op_actions[TGSI_OPCODE_END].emit = end_emit; bld_base->op_actions[TGSI_OPCODE_FRC].emit = frc_emit; bld_base->op_actions[TGSI_OPCODE_LRP].emit = lrp_emit; @@ -1341,7 +1323,6 @@ lp_set_default_actions(struct lp_build_tgsi_context * bld_base) bld_base->op_actions[TGSI_OPCODE_MUL].emit = mul_emit; bld_base->op_actions[TGSI_OPCODE_DIV].emit = fdiv_emit; 
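The rewritten IMUL_HI/UMUL_HI emitters defer the widening multiply to lp_build_mul_32_lohi (or its _cpu variant from this patch) and keep only the high half. For reference, the scalar arithmetic being computed:

   static inline uint32_t
   umul_hi32(uint32_t a, uint32_t b)
   {
      return (uint32_t)(((uint64_t)a * b) >> 32);
   }

   static inline int32_t
   imul_hi32(int32_t a, int32_t b)
   {
      /* signed variant: widen with sign extension, arithmetic shift */
      return (int32_t)(((int64_t)a * b) >> 32);
   }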
bld_base->op_actions[TGSI_OPCODE_RCP].emit = rcp_emit; - bld_base->op_actions[TGSI_OPCODE_SUB].emit = sub_emit; bld_base->op_actions[TGSI_OPCODE_UARL].emit = mov_emit; bld_base->op_actions[TGSI_OPCODE_F2U].emit = f2u_emit; @@ -1358,6 +1339,7 @@ lp_set_default_actions(struct lp_build_tgsi_context * bld_base) bld_base->op_actions[TGSI_OPCODE_DMAX].emit = fmax_emit; bld_base->op_actions[TGSI_OPCODE_DMIN].emit = fmin_emit; bld_base->op_actions[TGSI_OPCODE_DMUL].emit = mul_emit; + bld_base->op_actions[TGSI_OPCODE_DDIV].emit = fdiv_emit; bld_base->op_actions[TGSI_OPCODE_D2F].emit = d2f_emit; bld_base->op_actions[TGSI_OPCODE_D2I].emit = d2i_emit; @@ -1400,18 +1382,6 @@ lp_set_default_actions(struct lp_build_tgsi_context * bld_base) * intrinsics. */ -/* TGSI_OPCODE_ABS (CPU Only)*/ - -static void -abs_emit_cpu( - const struct lp_build_tgsi_action * action, - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) -{ - emit_data->output[emit_data->chan] = lp_build_abs(&bld_base->base, - emit_data->args[0]); -} - /* TGSI_OPCODE_ADD (CPU Only) */ static void add_emit_cpu( @@ -2072,19 +2042,6 @@ ssg_emit_cpu( emit_data->args[0]); } -/* TGSI_OPCODE_SUB (CPU Only) */ - -static void -sub_emit_cpu( - const struct lp_build_tgsi_action * action, - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) -{ - emit_data->output[emit_data->chan] = lp_build_sub(&bld_base->base, - emit_data->args[0], - emit_data->args[1]); -} - /* TGSI_OPCODE_TRUNC (CPU Only) */ static void @@ -2576,7 +2533,6 @@ lp_set_default_actions_cpu( struct lp_build_tgsi_context * bld_base) { lp_set_default_actions(bld_base); - bld_base->op_actions[TGSI_OPCODE_ABS].emit = abs_emit_cpu; bld_base->op_actions[TGSI_OPCODE_ADD].emit = add_emit_cpu; bld_base->op_actions[TGSI_OPCODE_AND].emit = and_emit_cpu; bld_base->op_actions[TGSI_OPCODE_ARL].emit = arl_emit_cpu; @@ -2603,6 +2559,8 @@ lp_set_default_actions_cpu( bld_base->op_actions[TGSI_OPCODE_ISHR].emit = ishr_emit_cpu; bld_base->op_actions[TGSI_OPCODE_ISLT].emit = islt_emit_cpu; bld_base->op_actions[TGSI_OPCODE_ISSG].emit = issg_emit_cpu; + bld_base->op_actions[TGSI_OPCODE_IMUL_HI].emit = imul_hi_emit_cpu; + bld_base->op_actions[TGSI_OPCODE_UMUL_HI].emit = umul_hi_emit_cpu; bld_base->op_actions[TGSI_OPCODE_LG2].emit = lg2_emit_cpu; bld_base->op_actions[TGSI_OPCODE_LOG].emit = log_emit_cpu; @@ -2624,7 +2582,6 @@ lp_set_default_actions_cpu( bld_base->op_actions[TGSI_OPCODE_SLT].emit = slt_emit_cpu; bld_base->op_actions[TGSI_OPCODE_SNE].emit = sne_emit_cpu; bld_base->op_actions[TGSI_OPCODE_SSG].emit = ssg_emit_cpu; - bld_base->op_actions[TGSI_OPCODE_SUB].emit = sub_emit_cpu; bld_base->op_actions[TGSI_OPCODE_TRUNC].emit = trunc_emit_cpu; bld_base->rsq_action.emit = recip_sqrt_emit_cpu; diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c index 610283d79..58c39facf 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c @@ -256,10 +256,6 @@ lp_emit_store_aos( ptr = bld->addr[reg->Indirect.Index]; break; - case TGSI_FILE_PREDICATE: - ptr = bld->preds[reg->Register.Index]; - break; - default: assert(0); return; @@ -267,43 +263,6 @@ lp_emit_store_aos( if (!ptr) return; - /* - * Predicate - */ - - if (inst->Instruction.Predicate) { - LLVMValueRef pred; - - assert(inst->Predicate.Index < LP_MAX_TGSI_PREDS); - - pred = LLVMBuildLoad(builder, - bld->preds[inst->Predicate.Index], ""); - - /* - * Convert 
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
index 610283d79..58c39facf 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
@@ -256,10 +256,6 @@ lp_emit_store_aos(
       ptr = bld->addr[reg->Indirect.Index];
       break;
 
-   case TGSI_FILE_PREDICATE:
-      ptr = bld->preds[reg->Register.Index];
-      break;
-
    default:
       assert(0);
       return;
@@ -267,43 +263,6 @@ lp_emit_store_aos(
    if (!ptr)
       return;
 
-   /*
-    * Predicate
-    */
-
-   if (inst->Instruction.Predicate) {
-      LLVMValueRef pred;
-
-      assert(inst->Predicate.Index < LP_MAX_TGSI_PREDS);
-
-      pred = LLVMBuildLoad(builder,
-                           bld->preds[inst->Predicate.Index], "");
-
-      /*
-       * Convert the value to an integer mask.
-       */
-      pred = lp_build_compare(bld->bld_base.base.gallivm,
-                              bld->bld_base.base.type,
-                              PIPE_FUNC_NOTEQUAL,
-                              pred,
-                              bld->bld_base.base.zero);
-
-      if (inst->Predicate.Negate) {
-         pred = LLVMBuildNot(builder, pred, "");
-      }
-
-      pred = bld->bld_base.emit_swizzle(&bld->bld_base, pred,
-                                        inst->Predicate.SwizzleX,
-                                        inst->Predicate.SwizzleY,
-                                        inst->Predicate.SwizzleZ,
-                                        inst->Predicate.SwizzleW);
-
-      if (mask) {
-         mask = LLVMBuildAnd(builder, mask, pred, "");
-      } else {
-         mask = pred;
-      }
-   }
 
    /*
     * Writemask
@@ -442,11 +401,6 @@ lp_emit_declaration_aos(
          bld->addr[idx] = lp_build_alloca(gallivm, vec_type, "");
          break;
 
-      case TGSI_FILE_PREDICATE:
-         assert(idx < LP_MAX_TGSI_PREDS);
-         bld->preds[idx] = lp_build_alloca(gallivm, vec_type, "");
-         break;
-
       case TGSI_FILE_SAMPLER_VIEW:
          /*
          * The target stored here MUST match whatever there actually
@@ -521,7 +475,7 @@ lp_emit_instruction_aos(
 
    case TGSI_OPCODE_RSQ:
    /* TGSI_OPCODE_RECIPSQRT */
      src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
-     tmp0 = lp_build_emit_llvm_unary(&bld->bld_base, TGSI_OPCODE_ABS, src0);
+     tmp0 = lp_build_abs(&bld->bld_base.base, src0);
      dst0 = lp_build_rsqrt(&bld->bld_base.base, tmp0);
      break;
@@ -591,12 +545,6 @@ lp_emit_instruction_aos(
       dst0 = lp_build_add(&bld->bld_base.base, tmp0, src2);
       break;
 
-   case TGSI_OPCODE_SUB:
-      src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
-      src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1, LP_CHAN_ALL);
-      dst0 = lp_build_sub(&bld->bld_base.base, src0, src1);
-      break;
-
    case TGSI_OPCODE_LRP:
       src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
       src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1, LP_CHAN_ALL);
@@ -615,14 +563,6 @@ lp_emit_instruction_aos(
       dst0 = lp_build_sub(&bld->bld_base.base, src0, tmp0);
       break;
 
-   case TGSI_OPCODE_CLAMP:
-      src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
-      src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1, LP_CHAN_ALL);
-      src2 = lp_build_emit_fetch(&bld->bld_base, inst, 2, LP_CHAN_ALL);
-      tmp0 = lp_build_max(&bld->bld_base.base, src0, src1);
-      dst0 = lp_build_min(&bld->bld_base.base, tmp0, src2);
-      break;
-
    case TGSI_OPCODE_FLR:
       src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
       dst0 = lp_build_floor(&bld->bld_base.base, src0);
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
index f8f43a561..e0cc0af27 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
@@ -305,8 +305,7 @@ analyse_instruction(struct analysis_context *ctx,
       } else if (dst->File == TGSI_FILE_OUTPUT) {
          regs = info->output;
          max_regs = ARRAY_SIZE(info->output);
-      } else if (dst->File == TGSI_FILE_ADDRESS ||
-                 dst->File == TGSI_FILE_PREDICATE) {
+      } else if (dst->File == TGSI_FILE_ADDRESS) {
          continue;
       } else {
          assert(0);
@@ -389,8 +388,7 @@ analyse_instruction(struct analysis_context *ctx,
 
       memset(res, 0, sizeof res);
 
-      if (!inst->Instruction.Predicate &&
-          !inst->Instruction.Saturate) {
+      if (!inst->Instruction.Saturate) {
          for (chan = 0; chan < 4; ++chan) {
             if (dst->WriteMask & (1 << chan)) {
                if (inst->Instruction.Opcode == TGSI_OPCODE_MOV) {
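The AOS interpreter above simply drops its TGSI_OPCODE_SUB and TGSI_OPCODE_CLAMP cases; both opcodes were retired from TGSI, so frontends now express them through the remaining opcodes. For illustration only, here is how the two operations decompose onto the surviving lp_build helpers (hypothetical standalone helpers, not part of the patch):

/* Illustration: sub(a, b) == add(a, -b), and
 * clamp(x, lo, hi) == min(max(x, lo), hi). */
static LLVMValueRef
lowered_sub(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b)
{
   return lp_build_add(bld, a, lp_build_negate(bld, b));
}

static LLVMValueRef
lowered_clamp(struct lp_build_context *bld,
              LLVMValueRef x, LLVMValueRef lo, LLVMValueRef hi)
{
   return lp_build_min(bld, lp_build_max(bld, x, lo), hi);
}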
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 6871795b4..bfa32b9ad 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -753,30 +753,21 @@ static void lp_exec_default(struct lp_exec_mask *mask,
  */
 static void lp_exec_mask_store(struct lp_exec_mask *mask,
                                struct lp_build_context *bld_store,
-                               LLVMValueRef pred,
                                LLVMValueRef val,
                                LLVMValueRef dst_ptr)
 {
    LLVMBuilderRef builder = mask->bld->gallivm->builder;
+   LLVMValueRef exec_mask = mask->has_mask ? mask->exec_mask : NULL;
 
    assert(lp_check_value(bld_store->type, val));
    assert(LLVMGetTypeKind(LLVMTypeOf(dst_ptr)) == LLVMPointerTypeKind);
    assert(LLVMGetElementType(LLVMTypeOf(dst_ptr)) == LLVMTypeOf(val));
 
-   /* Mix the predicate and execution mask */
-   if (mask->has_mask) {
-      if (pred) {
-         pred = LLVMBuildAnd(builder, pred, mask->exec_mask, "");
-      } else {
-         pred = mask->exec_mask;
-      }
-   }
-
-   if (pred) {
+   if (exec_mask) {
       LLVMValueRef res, dst;
 
       dst = LLVMBuildLoad(builder, dst_ptr, "");
-      res = lp_build_select(bld_store, pred, val, dst);
+      res = lp_build_select(bld_store, exec_mask, val, dst);
       LLVMBuildStore(builder, res, dst_ptr);
    } else
       LLVMBuildStore(builder, val, dst_ptr);
@@ -1036,22 +1027,12 @@ emit_mask_scatter(struct lp_build_tgsi_soa_context *bld,
                   LLVMValueRef base_ptr,
                   LLVMValueRef indexes,
                   LLVMValueRef values,
-                  struct lp_exec_mask *mask,
-                  LLVMValueRef pred)
+                  struct lp_exec_mask *mask)
 {
    struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
    LLVMBuilderRef builder = gallivm->builder;
    unsigned i;
-
-   /* Mix the predicate and execution mask */
-   if (mask->has_mask) {
-      if (pred) {
-         pred = LLVMBuildAnd(builder, pred, mask->exec_mask, "");
-      }
-      else {
-         pred = mask->exec_mask;
-      }
-   }
+   LLVMValueRef pred = mask->has_mask ? mask->exec_mask : NULL;
 
    /*
    * Loop over elements of index_vec, store scalar value.
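The trailing comment introduces a scalar loop: with no native scatter, each lane of index_vec is stored one element at a time. A sketch of that idiom under the new mask-only scheme (a reconstruction under assumptions, reusing the surrounding function's locals; not the verbatim loop body):

   /* For each lane i: base_ptr[indexes[i]] = values[i]; masked-off lanes
    * keep their old contents via a scalar load/select/store. */
   for (i = 0; i < bld->bld_base.base.type.length; i++) {
      LLVMValueRef ii = lp_build_const_int32(gallivm, i);
      LLVMValueRef index = LLVMBuildExtractElement(builder, indexes, ii, "");
      LLVMValueRef val = LLVMBuildExtractElement(builder, values, ii, "");
      LLVMValueRef scalar_ptr = LLVMBuildGEP(builder, base_ptr, &index, 1, "");

      if (pred) {
         /* Inactive lanes preserve the old destination value. */
         LLVMValueRef pred_i = LLVMBuildExtractElement(builder, pred, ii, "");
         LLVMValueRef dst_val = LLVMBuildLoad(builder, scalar_ptr, "");
         val = lp_build_select(&bld->elem_bld, pred_i, val, dst_val);
      }
      LLVMBuildStore(builder, val, scalar_ptr);
   }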
@@ -1733,74 +1714,6 @@ emit_fetch_deriv(
    *ddy = lp_build_ddy(&bld->bld_base.base, src);
 }
 
-
-/**
- * Predicate.
- */
-static void
-emit_fetch_predicate(
-   struct lp_build_tgsi_soa_context *bld,
-   const struct tgsi_full_instruction *inst,
-   LLVMValueRef *pred)
-{
-   LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
-   unsigned index;
-   unsigned char swizzles[4];
-   LLVMValueRef unswizzled[4] = {NULL, NULL, NULL, NULL};
-   LLVMValueRef value;
-   unsigned chan;
-
-   if (!inst->Instruction.Predicate) {
-      TGSI_FOR_EACH_CHANNEL( chan ) {
-         pred[chan] = NULL;
-      }
-      return;
-   }
-
-   swizzles[0] = inst->Predicate.SwizzleX;
-   swizzles[1] = inst->Predicate.SwizzleY;
-   swizzles[2] = inst->Predicate.SwizzleZ;
-   swizzles[3] = inst->Predicate.SwizzleW;
-
-   index = inst->Predicate.Index;
-   assert(index < LP_MAX_TGSI_PREDS);
-
-   TGSI_FOR_EACH_CHANNEL( chan ) {
-      unsigned swizzle = swizzles[chan];
-
-      /*
-       * Only fetch the predicate register channels that are actually listed
-       * in the swizzles
-       */
-      if (!unswizzled[swizzle]) {
-         value = LLVMBuildLoad(builder,
-                               bld->preds[index][swizzle], "");
-
-         /*
-          * Convert the value to an integer mask.
-          *
-          * TODO: Short-circuit this comparison -- a D3D setp_xx instructions
-          * is needlessly causing two comparisons due to storing the intermediate
-          * result as float vector instead of an integer mask vector.
-          */
-         value = lp_build_compare(bld->bld_base.base.gallivm,
-                                  bld->bld_base.base.type,
-                                  PIPE_FUNC_NOTEQUAL,
-                                  value,
-                                  bld->bld_base.base.zero);
-         if (inst->Predicate.Negate) {
-            value = LLVMBuildNot(builder, value, "");
-         }
-
-         unswizzled[swizzle] = value;
-      } else {
-         value = unswizzled[swizzle];
-      }
-
-      pred[chan] = value;
-   }
-}
-
 /**
  * store an array of 8 64-bit into two arrays of 8 floats
  * i.e.
@@ -1813,7 +1726,6 @@ emit_fetch_predicate(
 static void
 emit_store_64bit_chan(struct lp_build_tgsi_context *bld_base,
                       LLVMValueRef chan_ptr, LLVMValueRef chan_ptr2,
-                      LLVMValueRef pred,
                       LLVMValueRef value)
 {
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
@@ -1841,8 +1753,8 @@ emit_store_64bit_chan(struct lp_build_tgsi_context *bld_base,
                                            bld_base->base.type.length),
                           "");
 
-   lp_exec_mask_store(&bld->exec_mask, float_bld, pred, temp, chan_ptr);
-   lp_exec_mask_store(&bld->exec_mask, float_bld, pred, temp2, chan_ptr2);
+   lp_exec_mask_store(&bld->exec_mask, float_bld, temp, chan_ptr);
+   lp_exec_mask_store(&bld->exec_mask, float_bld, temp2, chan_ptr2);
 }
 
 /**
@@ -1854,7 +1766,6 @@ emit_store_chan(
    const struct tgsi_full_instruction *inst,
    unsigned index,
    unsigned chan_index,
-   LLVMValueRef pred,
    LLVMValueRef value)
 {
    struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
@@ -1917,7 +1828,7 @@ emit_store_chan(
 
          /* Scatter store values into output registers */
          emit_mask_scatter(bld, outputs_array, index_vec, value,
-                           &bld->exec_mask, pred);
+                           &bld->exec_mask);
       } else {
          LLVMValueRef out_ptr = lp_get_output_ptr(bld, reg->Register.Index,
                                                   chan_index);
         if (tgsi_type_is_64bit(dtype)) {
            LLVMValueRef out_ptr2 = lp_get_output_ptr(bld, reg->Register.Index,
                                                      chan_index + 1);
            emit_store_64bit_chan(bld_base, out_ptr, out_ptr2,
-                                 pred, value);
+                                 value);
         } else
-           lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, out_ptr);
+           lp_exec_mask_store(&bld->exec_mask, float_bld, value, out_ptr);
      }
      break;
@@ -1955,7 +1866,7 @@ emit_store_chan(
 
         /* Scatter store values into temp registers */
        emit_mask_scatter(bld, temps_array, index_vec, value,
-                         &bld->exec_mask, pred);
+                         &bld->exec_mask);
     } else {
        LLVMValueRef temp_ptr;
        if (tgsi_type_is_64bit(dtype)) {
           LLVMValueRef temp_ptr2 = lp_get_temp_ptr_soa(bld,
                                                        reg->Register.Index,
                                                        chan_index + 1);
           emit_store_64bit_chan(bld_base, temp_ptr, temp_ptr2,
-                                pred, value);
+                                value);
       } else
-         lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value, temp_ptr);
+         lp_exec_mask_store(&bld->exec_mask, float_bld, value, temp_ptr);
     }
     break;
@@ -1977,17 +1888,10 @@ emit_store_chan(
       assert(dtype == TGSI_TYPE_SIGNED);
       assert(LLVMTypeOf(value) == int_bld->vec_type);
       value = LLVMBuildBitCast(builder, value, int_bld->vec_type, "");
-      lp_exec_mask_store(&bld->exec_mask, int_bld, pred, value,
+      lp_exec_mask_store(&bld->exec_mask, int_bld, value,
                          bld->addr[reg->Register.Index][chan_index]);
       break;
 
-   case TGSI_FILE_PREDICATE:
-      assert(LLVMTypeOf(value) == float_bld->vec_type);
-      value = LLVMBuildBitCast(builder, value, float_bld->vec_type, "");
-      lp_exec_mask_store(&bld->exec_mask, float_bld, pred, value,
-                         bld->preds[reg->Register.Index][chan_index]);
-      break;
-
    default:
      assert( 0 );
    }
@@ -2037,18 +1941,14 @@ emit_store(
 {
    unsigned chan_index;
-   struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
    enum tgsi_opcode_type dtype = tgsi_opcode_infer_dst_type(inst->Instruction.Opcode);
 
-   if(info->num_dst) {
-      LLVMValueRef pred[TGSI_NUM_CHANNELS];
-
-      emit_fetch_predicate( bld, inst, pred );
+   if(info->num_dst) {
       TGSI_FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
         if (tgsi_type_is_64bit(dtype) && (chan_index == 1 || chan_index == 3))
            continue;
-        emit_store_chan(bld_base, inst, 0, chan_index, pred[chan_index], dst[chan_index]);
+        emit_store_chan(bld_base, inst, 0, chan_index, dst[chan_index]);
      }
   }
 }
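In the emit_store_64bit_chan hunks above, only the final two masked stores are visible; by that point the 64-bit value has already been split into its low and high 32-bit halves. A sketch of the even/odd shuffle split that plausibly precedes those stores (a reconstruction under assumptions; builder, gallivm and the temporaries come from the surrounding function):

   /* View the 64-bit SoA vector as 2*length 32-bit lanes, then peel off
    * the even lanes (low halves) and the odd lanes (high halves). */
   value = LLVMBuildBitCast(builder, value,
                            LLVMVectorType(LLVMFloatTypeInContext(gallivm->context),
                                           bld_base->base.type.length * 2), "");
   for (i = 0; i < bld_base->base.type.length; i++) {
      shuffles[i] = lp_build_const_int32(gallivm, i * 2);        /* low halves  */
      shuffles2[i] = lp_build_const_int32(gallivm, (i * 2) + 1); /* high halves */
   }
   temp = LLVMBuildShuffleVector(builder, value,
                                 LLVMGetUndef(LLVMTypeOf(value)),
                                 LLVMConstVector(shuffles,
                                                 bld_base->base.type.length), "");
   temp2 = LLVMBuildShuffleVector(builder, value,
                                  LLVMGetUndef(LLVMTypeOf(value)),
                                  LLVMConstVector(shuffles2,
                                                  bld_base->base.type.length), "");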
@@ -2998,15 +2898,6 @@ lp_emit_declaration_soa(
          }
          break;
 
-      case TGSI_FILE_PREDICATE:
-         assert(last < LP_MAX_TGSI_PREDS);
-         for (idx = first; idx <= last; ++idx) {
-            for (i = 0; i < TGSI_NUM_CHANNELS; i++)
-               bld->preds[idx][i] = lp_build_alloca(gallivm, vec_type,
-                                                    "predicate");
-         }
-         break;
-
       case TGSI_FILE_SAMPLER_VIEW:
          /*
          * The target stored here MUST match whatever there actually
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_type.h b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_type.h
index 7fb449fd0..afe8722b0 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_type.h
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_type.h
@@ -41,6 +41,10 @@
 #include "pipe/p_compiler.h"
 #include "gallivm/lp_bld.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /**
  * Native SIMD architecture width available at runtime.
  *
@@ -449,5 +453,8 @@ lp_build_context_init(struct lp_build_context *bld,
 unsigned
 lp_build_count_ir_module(LLVMModuleRef module);
 
+#ifdef __cplusplus
+}
+#endif
 
 #endif /* !LP_BLD_TYPE_H */
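The new guards make lp_bld_type.h safe to include from C++ translation units: its declarations keep C linkage instead of being C++ name-mangled. A minimal illustration (hypothetical consumer file, not part of the tree):

/* hypothetical_user.cpp -- compiled as C++. Thanks to the in-header
 * #ifdef __cplusplus / extern "C" guards, the call below resolves to the
 * unmangled C symbol at link time; without them the C++ compiler would
 * emit a mangled reference and linking against the C object would fail. */
#include "gallivm/lp_bld_type.h"

unsigned
count_ir(LLVMModuleRef module)
{
   return lp_build_count_ir_module(module);
}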