author	Chris Wilson <chris@chris-wilson.co.uk>	2016-07-28 18:33:41 +0100
committer	Chris Wilson <chris@chris-wilson.co.uk>	2016-07-29 17:41:55 +0100
commit	49daf5df124b5ae6c7508e934768c292f4143040 (patch)
tree	ad4e06f55ebf164b19b4632e053fea7300e7063f
parent	24f613cae4147e0e1e770ee22932b6e2fb7064a2 (diff)
sna: Use GCC pragma to enable SSE2 blt routines
Rather than using per-function attributes, setting the target for the whole block with a pragma lets us compile the SSE2 routines on a 32-bit ISA as well.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
-rw-r--r--	src/sna/blt.c	644
1 files changed, 287 insertions, 357 deletions
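
For reference, the technique the patch adopts looks roughly like the sketch below (assumed GCC; copy16_sse2 is an illustrative helper, not a function from the driver). Bracketing a block with push_options/target/pop_options enables SSE2 code generation for just that block, so the rest of the file can still be built for a plain 32-bit ISA, whereas the previous approach tagged every helper individually with a per-function target attribute. Callers are still expected to check for SSE2 at runtime before using such a helper.

/* Minimal sketch: compile only this block with SSE2 enabled. */
#pragma GCC push_options
#pragma GCC target("sse2")
#include <emmintrin.h>	/* SSE2 intrinsics */

static void copy16_sse2(void *dst, const void *src)
{
	/* Valid here only because the target pragma enables SSE2 for
	 * this block; the caller must verify SSE2 support at runtime. */
	_mm_storeu_si128((__m128i *)dst,
			 _mm_loadu_si128((const __m128i *)src));
}

#pragma GCC pop_options	/* restore the original target options */
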
diff --git a/src/sna/blt.c b/src/sna/blt.c
index ab7bd22c..cb90437a 100644
--- a/src/sna/blt.c
+++ b/src/sna/blt.c
@@ -32,89 +32,21 @@
#include "sna.h"
#include <pixman.h>
-#if __x86_64__
-#define USE_SSE2 1
-#endif
-
-#if USE_SSE2
+#if defined(sse2)
+#pragma GCC push_options
+#pragma GCC target("sse2,inline-all-stringops,fpmath=sse")
+#pragma GCC optimize("Ofast")
#include <xmmintrin.h>
#if __x86_64__
#define have_sse2() 1
#else
-enum {
- MMX = 0x1,
- MMX_EXTENSIONS = 0x2,
- SSE = 0x6,
- SSE2 = 0x8,
- CMOV = 0x10
-};
-
-#ifdef __GNUC__
-static unsigned int
-detect_cpu_features(void)
-{
- unsigned int features;
- unsigned int result = 0;
-
- char vendor[13];
- vendor[0] = 0;
- vendor[12] = 0;
-
- asm (
- "pushf\n"
- "pop %%eax\n"
- "mov %%eax, %%ecx\n"
- "xor $0x00200000, %%eax\n"
- "push %%eax\n"
- "popf\n"
- "pushf\n"
- "pop %%eax\n"
- "mov $0x0, %%edx\n"
- "xor %%ecx, %%eax\n"
- "jz 1f\n"
-
- "mov $0x00000000, %%eax\n"
- "push %%ebx\n"
- "cpuid\n"
- "mov %%ebx, %%eax\n"
- "pop %%ebx\n"
- "mov %%eax, %1\n"
- "mov %%edx, %2\n"
- "mov %%ecx, %3\n"
- "mov $0x00000001, %%eax\n"
- "push %%ebx\n"
- "cpuid\n"
- "pop %%ebx\n"
- "1:\n"
- "mov %%edx, %0\n"
- : "=r" (result), "=m" (vendor[0]), "=m" (vendor[4]), "=m" (vendor[8])
- :: "%eax", "%ecx", "%edx");
-
- features = 0;
- if (result) {
- /* result now contains the standard feature bits */
- if (result & (1 << 15))
- features |= CMOV;
- if (result & (1 << 23))
- features |= MMX;
- if (result & (1 << 25))
- features |= SSE;
- if (result & (1 << 26))
- features |= SSE2;
- }
- return features;
-}
-#else
-static unsigned int detect_cpu_features(void) { return 0; }
-#endif
-
static bool have_sse2(void)
{
static int sse2_present = -1;
if (sse2_present == -1)
- sse2_present = detect_cpu_features() & SSE2;
+ sse2_present = sna_cpu_detect() & SSE2;
return sse2_present;
}
@@ -149,274 +81,8 @@ xmm_save_128u(__m128i *dst, __m128i data)
{
_mm_storeu_si128(dst, data);
}
-#endif
-
-fast void
-memcpy_blt(const void *src, void *dst, int bpp,
- int32_t src_stride, int32_t dst_stride,
- int16_t src_x, int16_t src_y,
- int16_t dst_x, int16_t dst_y,
- uint16_t width, uint16_t height)
-{
- const uint8_t *src_bytes;
- uint8_t *dst_bytes;
- int byte_width;
-
- assert(src);
- assert(dst);
- assert(width && height);
- assert(bpp >= 8);
- assert(width*bpp <= 8*src_stride);
- assert(width*bpp <= 8*dst_stride);
-
- DBG(("%s: src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
- __FUNCTION__, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
-
- bpp /= 8;
-
- src_bytes = (const uint8_t *)src + src_stride * src_y + src_x * bpp;
- dst_bytes = (uint8_t *)dst + dst_stride * dst_y + dst_x * bpp;
-
- byte_width = width * bpp;
- if (byte_width == src_stride && byte_width == dst_stride) {
- byte_width *= height;
- height = 1;
- }
-
- switch (byte_width) {
- case 1:
- do {
- *dst_bytes = *src_bytes;
- src_bytes += src_stride;
- dst_bytes += dst_stride;
- } while (--height);
- break;
-
- case 2:
- do {
- *(uint16_t *)dst_bytes = *(const uint16_t *)src_bytes;
- src_bytes += src_stride;
- dst_bytes += dst_stride;
- } while (--height);
- break;
-
- case 4:
- do {
- *(uint32_t *)dst_bytes = *(const uint32_t *)src_bytes;
- src_bytes += src_stride;
- dst_bytes += dst_stride;
- } while (--height);
- break;
-
- case 8:
- do {
- *(uint64_t *)dst_bytes = *(const uint64_t *)src_bytes;
- src_bytes += src_stride;
- dst_bytes += dst_stride;
- } while (--height);
- break;
- case 16:
- do {
- ((uint64_t *)dst_bytes)[0] = ((const uint64_t *)src_bytes)[0];
- ((uint64_t *)dst_bytes)[1] = ((const uint64_t *)src_bytes)[1];
- src_bytes += src_stride;
- dst_bytes += dst_stride;
- } while (--height);
- break;
-
- default:
- do {
- memcpy(dst_bytes, src_bytes, byte_width);
- src_bytes += src_stride;
- dst_bytes += dst_stride;
- } while (--height);
- break;
- }
-}
-
-static fast_memcpy void
-memcpy_to_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
- int32_t src_stride, int32_t dst_stride,
- int16_t src_x, int16_t src_y,
- int16_t dst_x, int16_t dst_y,
- uint16_t width, uint16_t height)
-{
- const unsigned tile_width = 512;
- const unsigned tile_height = 8;
- const unsigned tile_size = 4096;
- const unsigned cpp = bpp / 8;
- const unsigned tile_pixels = tile_width / cpp;
- const unsigned tile_shift = ffs(tile_pixels) - 1;
- const unsigned tile_mask = tile_pixels - 1;
-
- DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
- __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
- assert(src != dst);
-
- if (src_x | src_y)
- src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
- assert(src_stride >= width * cpp);
- src_stride -= width * cpp;
-
- while (height--) {
- unsigned w = width * cpp;
- uint8_t *tile_row = dst;
-
- tile_row += dst_y / tile_height * dst_stride * tile_height;
- tile_row += (dst_y & (tile_height-1)) * tile_width;
- if (dst_x) {
- tile_row += (dst_x >> tile_shift) * tile_size;
- if (dst_x & tile_mask) {
- const unsigned x = (dst_x & tile_mask) * cpp;
- const unsigned len = min(tile_width - x, w);
- memcpy(assume_misaligned(tile_row + x, tile_width, x),
- src, len);
-
- tile_row += tile_size;
- src = (const uint8_t *)src + len;
- w -= len;
- }
- }
- while (w >= tile_width) {
- memcpy(assume_aligned(tile_row, tile_width),
- src, tile_width);
- tile_row += tile_size;
- src = (const uint8_t *)src + tile_width;
- w -= tile_width;
- }
- memcpy(assume_aligned(tile_row, tile_width), src, w);
- src = (const uint8_t *)src + src_stride + w;
- dst_y++;
- }
-}
-
-static fast_memcpy void
-memcpy_from_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
- int32_t src_stride, int32_t dst_stride,
- int16_t src_x, int16_t src_y,
- int16_t dst_x, int16_t dst_y,
- uint16_t width, uint16_t height)
-{
- const unsigned tile_width = 512;
- const unsigned tile_height = 8;
- const unsigned tile_size = 4096;
-
- const unsigned cpp = bpp / 8;
- const unsigned tile_pixels = tile_width / cpp;
- const unsigned tile_shift = ffs(tile_pixels) - 1;
- const unsigned tile_mask = tile_pixels - 1;
-
- DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
- __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
- assert(src != dst);
-
- if (dst_x | dst_y)
- dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
- assert(dst_stride >= width * cpp);
- dst_stride -= width * cpp;
-
- while (height--) {
- unsigned w = width * cpp;
- const uint8_t *tile_row = src;
-
- tile_row += src_y / tile_height * src_stride * tile_height;
- tile_row += (src_y & (tile_height-1)) * tile_width;
- if (src_x) {
- tile_row += (src_x >> tile_shift) * tile_size;
- if (src_x & tile_mask) {
- const unsigned x = (src_x & tile_mask) * cpp;
- const unsigned len = min(tile_width - x, w);
- memcpy(dst, assume_misaligned(tile_row + x, tile_width, x), len);
-
- tile_row += tile_size;
- dst = (uint8_t *)dst + len;
- w -= len;
- }
- }
- while (w >= tile_width) {
- memcpy(dst,
- assume_aligned(tile_row, tile_width),
- tile_width);
-
- tile_row += tile_size;
- dst = (uint8_t *)dst + tile_width;
- w -= tile_width;
- }
- memcpy(dst, assume_aligned(tile_row, tile_width), w);
- dst = (uint8_t *)dst + dst_stride + w;
- src_y++;
- }
-}
-
-static fast_memcpy void
-memcpy_between_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
- int32_t src_stride, int32_t dst_stride,
- int16_t src_x, int16_t src_y,
- int16_t dst_x, int16_t dst_y,
- uint16_t width, uint16_t height)
-{
- const unsigned tile_width = 512;
- const unsigned tile_height = 8;
- const unsigned tile_size = 4096;
-
- const unsigned cpp = bpp / 8;
- const unsigned tile_pixels = tile_width / cpp;
- const unsigned tile_shift = ffs(tile_pixels) - 1;
- const unsigned tile_mask = tile_pixels - 1;
-
- DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
- __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
- assert(src != dst);
- assert((dst_x & tile_mask) == (src_x & tile_mask));
-
- while (height--) {
- unsigned w = width * cpp;
- uint8_t *dst_row = dst;
- const uint8_t *src_row = src;
-
- dst_row += dst_y / tile_height * dst_stride * tile_height;
- dst_row += (dst_y & (tile_height-1)) * tile_width;
- if (dst_x)
- dst_row += (dst_x >> tile_shift) * tile_size;
- dst_y++;
-
- src_row += src_y / tile_height * src_stride * tile_height;
- src_row += (src_y & (tile_height-1)) * tile_width;
- if (src_x)
- src_row += (src_x >> tile_shift) * tile_size;
- src_y++;
-
- if (dst_x & tile_mask) {
- const unsigned x = (dst_x & tile_mask) * cpp;
- const unsigned len = min(tile_width - x, w);
-
- memcpy(assume_misaligned(dst_row + x, tile_width, x),
- assume_misaligned(src_row + x, tile_width, x),
- len);
-
- dst_row += tile_size;
- src_row += tile_size;
- w -= len;
- }
-
- while (w >= tile_width) {
- memcpy(assume_aligned(dst_row, tile_width),
- assume_aligned(src_row, tile_width),
- tile_width);
- dst_row += tile_size;
- src_row += tile_size;
- w -= tile_width;
- }
- memcpy(assume_aligned(dst_row, tile_width),
- assume_aligned(src_row, tile_width),
- w);
- }
-}
-
-#if defined(sse2) && defined(__x86_64__)
-
-sse2 static force_inline void
+static force_inline void
to_sse128xN(uint8_t *dst, const uint8_t *src, int bytes)
{
int i;
@@ -448,7 +114,7 @@ to_sse128xN(uint8_t *dst, const uint8_t *src, int bytes)
}
}
-sse2 static force_inline void
+static force_inline void
to_sse64(uint8_t *dst, const uint8_t *src)
{
__m128i xmm1, xmm2, xmm3, xmm4;
@@ -464,7 +130,7 @@ to_sse64(uint8_t *dst, const uint8_t *src)
xmm_save_128((__m128i*)dst + 3, xmm4);
}
-sse2 static force_inline void
+static force_inline void
to_sse32(uint8_t *dst, const uint8_t *src)
{
__m128i xmm1, xmm2;
@@ -476,13 +142,13 @@ to_sse32(uint8_t *dst, const uint8_t *src)
xmm_save_128((__m128i*)dst + 1, xmm2);
}
-sse2 static force_inline void
+static force_inline void
to_sse16(uint8_t *dst, const uint8_t *src)
{
xmm_save_128((__m128i*)dst, xmm_load_128u((const __m128i*)src));
}
-sse2 static void to_memcpy(uint8_t *dst, const uint8_t *src, unsigned len)
+static void to_memcpy(uint8_t *dst, const uint8_t *src, unsigned len)
{
assert(len);
if ((uintptr_t)dst & 15) {
@@ -555,7 +221,7 @@ sse2 static void to_memcpy(uint8_t *dst, const uint8_t *src, unsigned len)
memcpy(dst, src, len & 3);
}
-sse2 static fast_memcpy void
+static void
memcpy_to_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
int32_t src_stride, int32_t dst_stride,
int16_t src_x, int16_t src_y,
@@ -623,7 +289,7 @@ memcpy_to_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
}
}
-sse2 static force_inline void
+static force_inline void
from_sse128xNu(uint8_t *dst, const uint8_t *src, int bytes)
{
int i;
@@ -657,7 +323,7 @@ from_sse128xNu(uint8_t *dst, const uint8_t *src, int bytes)
}
}
-sse2 static force_inline void
+static force_inline void
from_sse128xNa(uint8_t *dst, const uint8_t *src, int bytes)
{
int i;
@@ -692,7 +358,7 @@ from_sse128xNa(uint8_t *dst, const uint8_t *src, int bytes)
}
}
-sse2 static force_inline void
+static force_inline void
from_sse64u(uint8_t *dst, const uint8_t *src)
{
__m128i xmm1, xmm2, xmm3, xmm4;
@@ -710,7 +376,7 @@ from_sse64u(uint8_t *dst, const uint8_t *src)
xmm_save_128u((__m128i*)dst + 3, xmm4);
}
-sse2 static force_inline void
+static force_inline void
from_sse64a(uint8_t *dst, const uint8_t *src)
{
__m128i xmm1, xmm2, xmm3, xmm4;
@@ -729,7 +395,7 @@ from_sse64a(uint8_t *dst, const uint8_t *src)
xmm_save_128((__m128i*)dst + 3, xmm4);
}
-sse2 static force_inline void
+static force_inline void
from_sse32u(uint8_t *dst, const uint8_t *src)
{
__m128i xmm1, xmm2;
@@ -741,7 +407,7 @@ from_sse32u(uint8_t *dst, const uint8_t *src)
xmm_save_128u((__m128i*)dst + 1, xmm2);
}
-sse2 static force_inline void
+static force_inline void
from_sse32a(uint8_t *dst, const uint8_t *src)
{
__m128i xmm1, xmm2;
@@ -756,7 +422,7 @@ from_sse32a(uint8_t *dst, const uint8_t *src)
xmm_save_128((__m128i*)dst + 1, xmm2);
}
-sse2 static force_inline void
+static force_inline void
from_sse16u(uint8_t *dst, const uint8_t *src)
{
assert(((uintptr_t)src & 15) == 0);
@@ -764,7 +430,7 @@ from_sse16u(uint8_t *dst, const uint8_t *src)
xmm_save_128u((__m128i*)dst, xmm_load_128((const __m128i*)src));
}
-sse2 static force_inline void
+static force_inline void
from_sse16a(uint8_t *dst, const uint8_t *src)
{
assert(((uintptr_t)dst & 15) == 0);
@@ -773,7 +439,7 @@ from_sse16a(uint8_t *dst, const uint8_t *src)
xmm_save_128((__m128i*)dst, xmm_load_128((const __m128i*)src));
}
-sse2 static fast_memcpy void
+static void
memcpy_from_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
int32_t src_stride, int32_t dst_stride,
int16_t src_x, int16_t src_y,
@@ -885,7 +551,7 @@ memcpy_from_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
}
}
-sse2 static fast_memcpy void
+static void
memcpy_between_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
int32_t src_stride, int32_t dst_stride,
int16_t src_x, int16_t src_y,
@@ -965,8 +631,272 @@ memcpy_between_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
}
}
+#pragma GCC pop_options
#endif
+fast void
+memcpy_blt(const void *src, void *dst, int bpp,
+ int32_t src_stride, int32_t dst_stride,
+ int16_t src_x, int16_t src_y,
+ int16_t dst_x, int16_t dst_y,
+ uint16_t width, uint16_t height)
+{
+ const uint8_t *src_bytes;
+ uint8_t *dst_bytes;
+ int byte_width;
+
+ assert(src);
+ assert(dst);
+ assert(width && height);
+ assert(bpp >= 8);
+ assert(width*bpp <= 8*src_stride);
+ assert(width*bpp <= 8*dst_stride);
+
+ DBG(("%s: src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+ __FUNCTION__, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
+
+ bpp /= 8;
+
+ src_bytes = (const uint8_t *)src + src_stride * src_y + src_x * bpp;
+ dst_bytes = (uint8_t *)dst + dst_stride * dst_y + dst_x * bpp;
+
+ byte_width = width * bpp;
+ if (byte_width == src_stride && byte_width == dst_stride) {
+ byte_width *= height;
+ height = 1;
+ }
+
+ switch (byte_width) {
+ case 1:
+ do {
+ *dst_bytes = *src_bytes;
+ src_bytes += src_stride;
+ dst_bytes += dst_stride;
+ } while (--height);
+ break;
+
+ case 2:
+ do {
+ *(uint16_t *)dst_bytes = *(const uint16_t *)src_bytes;
+ src_bytes += src_stride;
+ dst_bytes += dst_stride;
+ } while (--height);
+ break;
+
+ case 4:
+ do {
+ *(uint32_t *)dst_bytes = *(const uint32_t *)src_bytes;
+ src_bytes += src_stride;
+ dst_bytes += dst_stride;
+ } while (--height);
+ break;
+
+ case 8:
+ do {
+ *(uint64_t *)dst_bytes = *(const uint64_t *)src_bytes;
+ src_bytes += src_stride;
+ dst_bytes += dst_stride;
+ } while (--height);
+ break;
+ case 16:
+ do {
+ ((uint64_t *)dst_bytes)[0] = ((const uint64_t *)src_bytes)[0];
+ ((uint64_t *)dst_bytes)[1] = ((const uint64_t *)src_bytes)[1];
+ src_bytes += src_stride;
+ dst_bytes += dst_stride;
+ } while (--height);
+ break;
+
+ default:
+ do {
+ memcpy(dst_bytes, src_bytes, byte_width);
+ src_bytes += src_stride;
+ dst_bytes += dst_stride;
+ } while (--height);
+ break;
+ }
+}
+
+static fast_memcpy void
+memcpy_to_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
+ int32_t src_stride, int32_t dst_stride,
+ int16_t src_x, int16_t src_y,
+ int16_t dst_x, int16_t dst_y,
+ uint16_t width, uint16_t height)
+{
+ const unsigned tile_width = 512;
+ const unsigned tile_height = 8;
+ const unsigned tile_size = 4096;
+
+ const unsigned cpp = bpp / 8;
+ const unsigned tile_pixels = tile_width / cpp;
+ const unsigned tile_shift = ffs(tile_pixels) - 1;
+ const unsigned tile_mask = tile_pixels - 1;
+
+ DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+ __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
+ assert(src != dst);
+
+ if (src_x | src_y)
+ src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
+ assert(src_stride >= width * cpp);
+ src_stride -= width * cpp;
+
+ while (height--) {
+ unsigned w = width * cpp;
+ uint8_t *tile_row = dst;
+
+ tile_row += dst_y / tile_height * dst_stride * tile_height;
+ tile_row += (dst_y & (tile_height-1)) * tile_width;
+ if (dst_x) {
+ tile_row += (dst_x >> tile_shift) * tile_size;
+ if (dst_x & tile_mask) {
+ const unsigned x = (dst_x & tile_mask) * cpp;
+ const unsigned len = min(tile_width - x, w);
+ memcpy(assume_misaligned(tile_row + x, tile_width, x),
+ src, len);
+
+ tile_row += tile_size;
+ src = (const uint8_t *)src + len;
+ w -= len;
+ }
+ }
+ while (w >= tile_width) {
+ memcpy(assume_aligned(tile_row, tile_width),
+ src, tile_width);
+ tile_row += tile_size;
+ src = (const uint8_t *)src + tile_width;
+ w -= tile_width;
+ }
+ memcpy(assume_aligned(tile_row, tile_width), src, w);
+ src = (const uint8_t *)src + src_stride + w;
+ dst_y++;
+ }
+}
+
+static fast_memcpy void
+memcpy_from_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
+ int32_t src_stride, int32_t dst_stride,
+ int16_t src_x, int16_t src_y,
+ int16_t dst_x, int16_t dst_y,
+ uint16_t width, uint16_t height)
+{
+ const unsigned tile_width = 512;
+ const unsigned tile_height = 8;
+ const unsigned tile_size = 4096;
+
+ const unsigned cpp = bpp / 8;
+ const unsigned tile_pixels = tile_width / cpp;
+ const unsigned tile_shift = ffs(tile_pixels) - 1;
+ const unsigned tile_mask = tile_pixels - 1;
+
+ DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+ __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
+ assert(src != dst);
+
+ if (dst_x | dst_y)
+ dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
+ assert(dst_stride >= width * cpp);
+ dst_stride -= width * cpp;
+
+ while (height--) {
+ unsigned w = width * cpp;
+ const uint8_t *tile_row = src;
+
+ tile_row += src_y / tile_height * src_stride * tile_height;
+ tile_row += (src_y & (tile_height-1)) * tile_width;
+ if (src_x) {
+ tile_row += (src_x >> tile_shift) * tile_size;
+ if (src_x & tile_mask) {
+ const unsigned x = (src_x & tile_mask) * cpp;
+ const unsigned len = min(tile_width - x, w);
+ memcpy(dst, assume_misaligned(tile_row + x, tile_width, x), len);
+
+ tile_row += tile_size;
+ dst = (uint8_t *)dst + len;
+ w -= len;
+ }
+ }
+ while (w >= tile_width) {
+ memcpy(dst,
+ assume_aligned(tile_row, tile_width),
+ tile_width);
+
+ tile_row += tile_size;
+ dst = (uint8_t *)dst + tile_width;
+ w -= tile_width;
+ }
+ memcpy(dst, assume_aligned(tile_row, tile_width), w);
+ dst = (uint8_t *)dst + dst_stride + w;
+ src_y++;
+ }
+}
+
+static fast_memcpy void
+memcpy_between_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
+ int32_t src_stride, int32_t dst_stride,
+ int16_t src_x, int16_t src_y,
+ int16_t dst_x, int16_t dst_y,
+ uint16_t width, uint16_t height)
+{
+ const unsigned tile_width = 512;
+ const unsigned tile_height = 8;
+ const unsigned tile_size = 4096;
+
+ const unsigned cpp = bpp / 8;
+ const unsigned tile_pixels = tile_width / cpp;
+ const unsigned tile_shift = ffs(tile_pixels) - 1;
+ const unsigned tile_mask = tile_pixels - 1;
+
+ DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+ __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
+ assert(src != dst);
+ assert((dst_x & tile_mask) == (src_x & tile_mask));
+
+ while (height--) {
+ unsigned w = width * cpp;
+ uint8_t *dst_row = dst;
+ const uint8_t *src_row = src;
+
+ dst_row += dst_y / tile_height * dst_stride * tile_height;
+ dst_row += (dst_y & (tile_height-1)) * tile_width;
+ if (dst_x)
+ dst_row += (dst_x >> tile_shift) * tile_size;
+ dst_y++;
+
+ src_row += src_y / tile_height * src_stride * tile_height;
+ src_row += (src_y & (tile_height-1)) * tile_width;
+ if (src_x)
+ src_row += (src_x >> tile_shift) * tile_size;
+ src_y++;
+
+ if (dst_x & tile_mask) {
+ const unsigned x = (dst_x & tile_mask) * cpp;
+ const unsigned len = min(tile_width - x, w);
+
+ memcpy(assume_misaligned(dst_row + x, tile_width, x),
+ assume_misaligned(src_row + x, tile_width, x),
+ len);
+
+ dst_row += tile_size;
+ src_row += tile_size;
+ w -= len;
+ }
+
+ while (w >= tile_width) {
+ memcpy(assume_aligned(dst_row, tile_width),
+ assume_aligned(src_row, tile_width),
+ tile_width);
+ dst_row += tile_size;
+ src_row += tile_size;
+ w -= tile_width;
+ }
+ memcpy(assume_aligned(dst_row, tile_width),
+ assume_aligned(src_row, tile_width),
+ w);
+ }
+}
+
#define memcpy_to_tiled_x(swizzle) \
fast_memcpy static void \
memcpy_to_tiled_x__##swizzle (const void *src, void *dst, int bpp, \
@@ -1241,7 +1171,7 @@ void choose_memcpy_tiled_x(struct kgem *kgem, int swizzling, unsigned cpu)
break;
case I915_BIT_6_SWIZZLE_NONE:
DBG(("%s: no swizzling\n", __FUNCTION__));
-#if defined(sse2) && defined(__x86_64__)
+#if defined(sse2)
if (cpu & SSE2) {
kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_0__sse2;
kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_0__sse2;
@@ -1498,7 +1428,7 @@ memcpy_xor(const void *src, void *dst, int bpp,
height = 1;
}
-#if USE_SSE2
+#if defined(sse2) && __x86_64__
if (have_sse2()) {
do {
uint32_t *d = (uint32_t *)dst_bytes;