diff options
author | Chris Wilson <chris@chris-wilson.co.uk> | 2016-07-28 18:33:41 +0100 |
---|---|---|
committer | Chris Wilson <chris@chris-wilson.co.uk> | 2016-07-29 17:41:55 +0100 |
commit | 49daf5df124b5ae6c7508e934768c292f4143040 (patch) | |
tree | ad4e06f55ebf164b19b4632e053fea7300e7063f | |
parent | 24f613cae4147e0e1e770ee22932b6e2fb7064a2 (diff) |
sna: Use GCC pragma to enable SSE2 blt routines
Rather than using per-function attributes, set the target for the whole
block with a GCC pragma; this lets us compile the SSE2 routines on 32-bit
ISAs as well.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
-rw-r--r-- | src/sna/blt.c | 644 |
1 files changed, 287 insertions, 357 deletions
diff --git a/src/sna/blt.c b/src/sna/blt.c index ab7bd22c..cb90437a 100644 --- a/src/sna/blt.c +++ b/src/sna/blt.c @@ -32,89 +32,21 @@ #include "sna.h" #include <pixman.h> -#if __x86_64__ -#define USE_SSE2 1 -#endif - -#if USE_SSE2 +#if defined(sse2) +#pragma GCC push_options +#pragma GCC target("sse2,inline-all-stringops,fpmath=sse") +#pragma GCC optimize("Ofast") #include <xmmintrin.h> #if __x86_64__ #define have_sse2() 1 #else -enum { - MMX = 0x1, - MMX_EXTENSIONS = 0x2, - SSE = 0x6, - SSE2 = 0x8, - CMOV = 0x10 -}; - -#ifdef __GNUC__ -static unsigned int -detect_cpu_features(void) -{ - unsigned int features; - unsigned int result = 0; - - char vendor[13]; - vendor[0] = 0; - vendor[12] = 0; - - asm ( - "pushf\n" - "pop %%eax\n" - "mov %%eax, %%ecx\n" - "xor $0x00200000, %%eax\n" - "push %%eax\n" - "popf\n" - "pushf\n" - "pop %%eax\n" - "mov $0x0, %%edx\n" - "xor %%ecx, %%eax\n" - "jz 1f\n" - - "mov $0x00000000, %%eax\n" - "push %%ebx\n" - "cpuid\n" - "mov %%ebx, %%eax\n" - "pop %%ebx\n" - "mov %%eax, %1\n" - "mov %%edx, %2\n" - "mov %%ecx, %3\n" - "mov $0x00000001, %%eax\n" - "push %%ebx\n" - "cpuid\n" - "pop %%ebx\n" - "1:\n" - "mov %%edx, %0\n" - : "=r" (result), "=m" (vendor[0]), "=m" (vendor[4]), "=m" (vendor[8]) - :: "%eax", "%ecx", "%edx"); - - features = 0; - if (result) { - /* result now contains the standard feature bits */ - if (result & (1 << 15)) - features |= CMOV; - if (result & (1 << 23)) - features |= MMX; - if (result & (1 << 25)) - features |= SSE; - if (result & (1 << 26)) - features |= SSE2; - } - return features; -} -#else -static unsigned int detect_cpu_features(void) { return 0; } -#endif - static bool have_sse2(void) { static int sse2_present = -1; if (sse2_present == -1) - sse2_present = detect_cpu_features() & SSE2; + sse2_present = sna_cpu_detect() & SSE2; return sse2_present; } @@ -149,274 +81,8 @@ xmm_save_128u(__m128i *dst, __m128i data) { _mm_storeu_si128(dst, data); } -#endif - -fast void -memcpy_blt(const void *src, void *dst, int 
bpp, - int32_t src_stride, int32_t dst_stride, - int16_t src_x, int16_t src_y, - int16_t dst_x, int16_t dst_y, - uint16_t width, uint16_t height) -{ - const uint8_t *src_bytes; - uint8_t *dst_bytes; - int byte_width; - - assert(src); - assert(dst); - assert(width && height); - assert(bpp >= 8); - assert(width*bpp <= 8*src_stride); - assert(width*bpp <= 8*dst_stride); - - DBG(("%s: src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", - __FUNCTION__, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); - - bpp /= 8; - - src_bytes = (const uint8_t *)src + src_stride * src_y + src_x * bpp; - dst_bytes = (uint8_t *)dst + dst_stride * dst_y + dst_x * bpp; - - byte_width = width * bpp; - if (byte_width == src_stride && byte_width == dst_stride) { - byte_width *= height; - height = 1; - } - - switch (byte_width) { - case 1: - do { - *dst_bytes = *src_bytes; - src_bytes += src_stride; - dst_bytes += dst_stride; - } while (--height); - break; - - case 2: - do { - *(uint16_t *)dst_bytes = *(const uint16_t *)src_bytes; - src_bytes += src_stride; - dst_bytes += dst_stride; - } while (--height); - break; - - case 4: - do { - *(uint32_t *)dst_bytes = *(const uint32_t *)src_bytes; - src_bytes += src_stride; - dst_bytes += dst_stride; - } while (--height); - break; - - case 8: - do { - *(uint64_t *)dst_bytes = *(const uint64_t *)src_bytes; - src_bytes += src_stride; - dst_bytes += dst_stride; - } while (--height); - break; - case 16: - do { - ((uint64_t *)dst_bytes)[0] = ((const uint64_t *)src_bytes)[0]; - ((uint64_t *)dst_bytes)[1] = ((const uint64_t *)src_bytes)[1]; - src_bytes += src_stride; - dst_bytes += dst_stride; - } while (--height); - break; - - default: - do { - memcpy(dst_bytes, src_bytes, byte_width); - src_bytes += src_stride; - dst_bytes += dst_stride; - } while (--height); - break; - } -} - -static fast_memcpy void -memcpy_to_tiled_x__swizzle_0(const void *src, void *dst, int bpp, - int32_t src_stride, int32_t dst_stride, - int16_t src_x, int16_t 
src_y, - int16_t dst_x, int16_t dst_y, - uint16_t width, uint16_t height) -{ - const unsigned tile_width = 512; - const unsigned tile_height = 8; - const unsigned tile_size = 4096; - const unsigned cpp = bpp / 8; - const unsigned tile_pixels = tile_width / cpp; - const unsigned tile_shift = ffs(tile_pixels) - 1; - const unsigned tile_mask = tile_pixels - 1; - - DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", - __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); - assert(src != dst); - - if (src_x | src_y) - src = (const uint8_t *)src + src_y * src_stride + src_x * cpp; - assert(src_stride >= width * cpp); - src_stride -= width * cpp; - - while (height--) { - unsigned w = width * cpp; - uint8_t *tile_row = dst; - - tile_row += dst_y / tile_height * dst_stride * tile_height; - tile_row += (dst_y & (tile_height-1)) * tile_width; - if (dst_x) { - tile_row += (dst_x >> tile_shift) * tile_size; - if (dst_x & tile_mask) { - const unsigned x = (dst_x & tile_mask) * cpp; - const unsigned len = min(tile_width - x, w); - memcpy(assume_misaligned(tile_row + x, tile_width, x), - src, len); - - tile_row += tile_size; - src = (const uint8_t *)src + len; - w -= len; - } - } - while (w >= tile_width) { - memcpy(assume_aligned(tile_row, tile_width), - src, tile_width); - tile_row += tile_size; - src = (const uint8_t *)src + tile_width; - w -= tile_width; - } - memcpy(assume_aligned(tile_row, tile_width), src, w); - src = (const uint8_t *)src + src_stride + w; - dst_y++; - } -} - -static fast_memcpy void -memcpy_from_tiled_x__swizzle_0(const void *src, void *dst, int bpp, - int32_t src_stride, int32_t dst_stride, - int16_t src_x, int16_t src_y, - int16_t dst_x, int16_t dst_y, - uint16_t width, uint16_t height) -{ - const unsigned tile_width = 512; - const unsigned tile_height = 8; - const unsigned tile_size = 4096; - - const unsigned cpp = bpp / 8; - const unsigned tile_pixels = tile_width / cpp; - const unsigned 
tile_shift = ffs(tile_pixels) - 1; - const unsigned tile_mask = tile_pixels - 1; - - DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", - __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); - assert(src != dst); - - if (dst_x | dst_y) - dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp; - assert(dst_stride >= width * cpp); - dst_stride -= width * cpp; - - while (height--) { - unsigned w = width * cpp; - const uint8_t *tile_row = src; - - tile_row += src_y / tile_height * src_stride * tile_height; - tile_row += (src_y & (tile_height-1)) * tile_width; - if (src_x) { - tile_row += (src_x >> tile_shift) * tile_size; - if (src_x & tile_mask) { - const unsigned x = (src_x & tile_mask) * cpp; - const unsigned len = min(tile_width - x, w); - memcpy(dst, assume_misaligned(tile_row + x, tile_width, x), len); - - tile_row += tile_size; - dst = (uint8_t *)dst + len; - w -= len; - } - } - while (w >= tile_width) { - memcpy(dst, - assume_aligned(tile_row, tile_width), - tile_width); - - tile_row += tile_size; - dst = (uint8_t *)dst + tile_width; - w -= tile_width; - } - memcpy(dst, assume_aligned(tile_row, tile_width), w); - dst = (uint8_t *)dst + dst_stride + w; - src_y++; - } -} - -static fast_memcpy void -memcpy_between_tiled_x__swizzle_0(const void *src, void *dst, int bpp, - int32_t src_stride, int32_t dst_stride, - int16_t src_x, int16_t src_y, - int16_t dst_x, int16_t dst_y, - uint16_t width, uint16_t height) -{ - const unsigned tile_width = 512; - const unsigned tile_height = 8; - const unsigned tile_size = 4096; - - const unsigned cpp = bpp / 8; - const unsigned tile_pixels = tile_width / cpp; - const unsigned tile_shift = ffs(tile_pixels) - 1; - const unsigned tile_mask = tile_pixels - 1; - - DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", - __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); - assert(src != dst); - assert((dst_x & tile_mask) 
== (src_x & tile_mask)); - - while (height--) { - unsigned w = width * cpp; - uint8_t *dst_row = dst; - const uint8_t *src_row = src; - - dst_row += dst_y / tile_height * dst_stride * tile_height; - dst_row += (dst_y & (tile_height-1)) * tile_width; - if (dst_x) - dst_row += (dst_x >> tile_shift) * tile_size; - dst_y++; - - src_row += src_y / tile_height * src_stride * tile_height; - src_row += (src_y & (tile_height-1)) * tile_width; - if (src_x) - src_row += (src_x >> tile_shift) * tile_size; - src_y++; - - if (dst_x & tile_mask) { - const unsigned x = (dst_x & tile_mask) * cpp; - const unsigned len = min(tile_width - x, w); - - memcpy(assume_misaligned(dst_row + x, tile_width, x), - assume_misaligned(src_row + x, tile_width, x), - len); - - dst_row += tile_size; - src_row += tile_size; - w -= len; - } - - while (w >= tile_width) { - memcpy(assume_aligned(dst_row, tile_width), - assume_aligned(src_row, tile_width), - tile_width); - dst_row += tile_size; - src_row += tile_size; - w -= tile_width; - } - memcpy(assume_aligned(dst_row, tile_width), - assume_aligned(src_row, tile_width), - w); - } -} - -#if defined(sse2) && defined(__x86_64__) - -sse2 static force_inline void +static force_inline void to_sse128xN(uint8_t *dst, const uint8_t *src, int bytes) { int i; @@ -448,7 +114,7 @@ to_sse128xN(uint8_t *dst, const uint8_t *src, int bytes) } } -sse2 static force_inline void +static force_inline void to_sse64(uint8_t *dst, const uint8_t *src) { __m128i xmm1, xmm2, xmm3, xmm4; @@ -464,7 +130,7 @@ to_sse64(uint8_t *dst, const uint8_t *src) xmm_save_128((__m128i*)dst + 3, xmm4); } -sse2 static force_inline void +static force_inline void to_sse32(uint8_t *dst, const uint8_t *src) { __m128i xmm1, xmm2; @@ -476,13 +142,13 @@ to_sse32(uint8_t *dst, const uint8_t *src) xmm_save_128((__m128i*)dst + 1, xmm2); } -sse2 static force_inline void +static force_inline void to_sse16(uint8_t *dst, const uint8_t *src) { xmm_save_128((__m128i*)dst, xmm_load_128u((const __m128i*)src)); } 
-sse2 static void to_memcpy(uint8_t *dst, const uint8_t *src, unsigned len) +static void to_memcpy(uint8_t *dst, const uint8_t *src, unsigned len) { assert(len); if ((uintptr_t)dst & 15) { @@ -555,7 +221,7 @@ sse2 static void to_memcpy(uint8_t *dst, const uint8_t *src, unsigned len) memcpy(dst, src, len & 3); } -sse2 static fast_memcpy void +static void memcpy_to_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp, int32_t src_stride, int32_t dst_stride, int16_t src_x, int16_t src_y, @@ -623,7 +289,7 @@ memcpy_to_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp, } } -sse2 static force_inline void +static force_inline void from_sse128xNu(uint8_t *dst, const uint8_t *src, int bytes) { int i; @@ -657,7 +323,7 @@ from_sse128xNu(uint8_t *dst, const uint8_t *src, int bytes) } } -sse2 static force_inline void +static force_inline void from_sse128xNa(uint8_t *dst, const uint8_t *src, int bytes) { int i; @@ -692,7 +358,7 @@ from_sse128xNa(uint8_t *dst, const uint8_t *src, int bytes) } } -sse2 static force_inline void +static force_inline void from_sse64u(uint8_t *dst, const uint8_t *src) { __m128i xmm1, xmm2, xmm3, xmm4; @@ -710,7 +376,7 @@ from_sse64u(uint8_t *dst, const uint8_t *src) xmm_save_128u((__m128i*)dst + 3, xmm4); } -sse2 static force_inline void +static force_inline void from_sse64a(uint8_t *dst, const uint8_t *src) { __m128i xmm1, xmm2, xmm3, xmm4; @@ -729,7 +395,7 @@ from_sse64a(uint8_t *dst, const uint8_t *src) xmm_save_128((__m128i*)dst + 3, xmm4); } -sse2 static force_inline void +static force_inline void from_sse32u(uint8_t *dst, const uint8_t *src) { __m128i xmm1, xmm2; @@ -741,7 +407,7 @@ from_sse32u(uint8_t *dst, const uint8_t *src) xmm_save_128u((__m128i*)dst + 1, xmm2); } -sse2 static force_inline void +static force_inline void from_sse32a(uint8_t *dst, const uint8_t *src) { __m128i xmm1, xmm2; @@ -756,7 +422,7 @@ from_sse32a(uint8_t *dst, const uint8_t *src) xmm_save_128((__m128i*)dst + 1, xmm2); } -sse2 static force_inline void 
+static force_inline void from_sse16u(uint8_t *dst, const uint8_t *src) { assert(((uintptr_t)src & 15) == 0); @@ -764,7 +430,7 @@ from_sse16u(uint8_t *dst, const uint8_t *src) xmm_save_128u((__m128i*)dst, xmm_load_128((const __m128i*)src)); } -sse2 static force_inline void +static force_inline void from_sse16a(uint8_t *dst, const uint8_t *src) { assert(((uintptr_t)dst & 15) == 0); @@ -773,7 +439,7 @@ from_sse16a(uint8_t *dst, const uint8_t *src) xmm_save_128((__m128i*)dst, xmm_load_128((const __m128i*)src)); } -sse2 static fast_memcpy void +static void memcpy_from_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp, int32_t src_stride, int32_t dst_stride, int16_t src_x, int16_t src_y, @@ -885,7 +551,7 @@ memcpy_from_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp, } } -sse2 static fast_memcpy void +static void memcpy_between_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp, int32_t src_stride, int32_t dst_stride, int16_t src_x, int16_t src_y, @@ -965,8 +631,272 @@ memcpy_between_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp, } } +#pragma GCC push_options #endif +fast void +memcpy_blt(const void *src, void *dst, int bpp, + int32_t src_stride, int32_t dst_stride, + int16_t src_x, int16_t src_y, + int16_t dst_x, int16_t dst_y, + uint16_t width, uint16_t height) +{ + const uint8_t *src_bytes; + uint8_t *dst_bytes; + int byte_width; + + assert(src); + assert(dst); + assert(width && height); + assert(bpp >= 8); + assert(width*bpp <= 8*src_stride); + assert(width*bpp <= 8*dst_stride); + + DBG(("%s: src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", + __FUNCTION__, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); + + bpp /= 8; + + src_bytes = (const uint8_t *)src + src_stride * src_y + src_x * bpp; + dst_bytes = (uint8_t *)dst + dst_stride * dst_y + dst_x * bpp; + + byte_width = width * bpp; + if (byte_width == src_stride && byte_width == dst_stride) { + byte_width *= height; + height = 1; + } + + 
switch (byte_width) { + case 1: + do { + *dst_bytes = *src_bytes; + src_bytes += src_stride; + dst_bytes += dst_stride; + } while (--height); + break; + + case 2: + do { + *(uint16_t *)dst_bytes = *(const uint16_t *)src_bytes; + src_bytes += src_stride; + dst_bytes += dst_stride; + } while (--height); + break; + + case 4: + do { + *(uint32_t *)dst_bytes = *(const uint32_t *)src_bytes; + src_bytes += src_stride; + dst_bytes += dst_stride; + } while (--height); + break; + + case 8: + do { + *(uint64_t *)dst_bytes = *(const uint64_t *)src_bytes; + src_bytes += src_stride; + dst_bytes += dst_stride; + } while (--height); + break; + case 16: + do { + ((uint64_t *)dst_bytes)[0] = ((const uint64_t *)src_bytes)[0]; + ((uint64_t *)dst_bytes)[1] = ((const uint64_t *)src_bytes)[1]; + src_bytes += src_stride; + dst_bytes += dst_stride; + } while (--height); + break; + + default: + do { + memcpy(dst_bytes, src_bytes, byte_width); + src_bytes += src_stride; + dst_bytes += dst_stride; + } while (--height); + break; + } +} + +static fast_memcpy void +memcpy_to_tiled_x__swizzle_0(const void *src, void *dst, int bpp, + int32_t src_stride, int32_t dst_stride, + int16_t src_x, int16_t src_y, + int16_t dst_x, int16_t dst_y, + uint16_t width, uint16_t height) +{ + const unsigned tile_width = 512; + const unsigned tile_height = 8; + const unsigned tile_size = 4096; + + const unsigned cpp = bpp / 8; + const unsigned tile_pixels = tile_width / cpp; + const unsigned tile_shift = ffs(tile_pixels) - 1; + const unsigned tile_mask = tile_pixels - 1; + + DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", + __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); + assert(src != dst); + + if (src_x | src_y) + src = (const uint8_t *)src + src_y * src_stride + src_x * cpp; + assert(src_stride >= width * cpp); + src_stride -= width * cpp; + + while (height--) { + unsigned w = width * cpp; + uint8_t *tile_row = dst; + + tile_row += dst_y / 
tile_height * dst_stride * tile_height; + tile_row += (dst_y & (tile_height-1)) * tile_width; + if (dst_x) { + tile_row += (dst_x >> tile_shift) * tile_size; + if (dst_x & tile_mask) { + const unsigned x = (dst_x & tile_mask) * cpp; + const unsigned len = min(tile_width - x, w); + memcpy(assume_misaligned(tile_row + x, tile_width, x), + src, len); + + tile_row += tile_size; + src = (const uint8_t *)src + len; + w -= len; + } + } + while (w >= tile_width) { + memcpy(assume_aligned(tile_row, tile_width), + src, tile_width); + tile_row += tile_size; + src = (const uint8_t *)src + tile_width; + w -= tile_width; + } + memcpy(assume_aligned(tile_row, tile_width), src, w); + src = (const uint8_t *)src + src_stride + w; + dst_y++; + } +} + +static fast_memcpy void +memcpy_from_tiled_x__swizzle_0(const void *src, void *dst, int bpp, + int32_t src_stride, int32_t dst_stride, + int16_t src_x, int16_t src_y, + int16_t dst_x, int16_t dst_y, + uint16_t width, uint16_t height) +{ + const unsigned tile_width = 512; + const unsigned tile_height = 8; + const unsigned tile_size = 4096; + + const unsigned cpp = bpp / 8; + const unsigned tile_pixels = tile_width / cpp; + const unsigned tile_shift = ffs(tile_pixels) - 1; + const unsigned tile_mask = tile_pixels - 1; + + DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", + __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); + assert(src != dst); + + if (dst_x | dst_y) + dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp; + assert(dst_stride >= width * cpp); + dst_stride -= width * cpp; + + while (height--) { + unsigned w = width * cpp; + const uint8_t *tile_row = src; + + tile_row += src_y / tile_height * src_stride * tile_height; + tile_row += (src_y & (tile_height-1)) * tile_width; + if (src_x) { + tile_row += (src_x >> tile_shift) * tile_size; + if (src_x & tile_mask) { + const unsigned x = (src_x & tile_mask) * cpp; + const unsigned len = min(tile_width - x, w); + 
memcpy(dst, assume_misaligned(tile_row + x, tile_width, x), len); + + tile_row += tile_size; + dst = (uint8_t *)dst + len; + w -= len; + } + } + while (w >= tile_width) { + memcpy(dst, + assume_aligned(tile_row, tile_width), + tile_width); + + tile_row += tile_size; + dst = (uint8_t *)dst + tile_width; + w -= tile_width; + } + memcpy(dst, assume_aligned(tile_row, tile_width), w); + dst = (uint8_t *)dst + dst_stride + w; + src_y++; + } +} + +static fast_memcpy void +memcpy_between_tiled_x__swizzle_0(const void *src, void *dst, int bpp, + int32_t src_stride, int32_t dst_stride, + int16_t src_x, int16_t src_y, + int16_t dst_x, int16_t dst_y, + uint16_t width, uint16_t height) +{ + const unsigned tile_width = 512; + const unsigned tile_height = 8; + const unsigned tile_size = 4096; + + const unsigned cpp = bpp / 8; + const unsigned tile_pixels = tile_width / cpp; + const unsigned tile_shift = ffs(tile_pixels) - 1; + const unsigned tile_mask = tile_pixels - 1; + + DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", + __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); + assert(src != dst); + assert((dst_x & tile_mask) == (src_x & tile_mask)); + + while (height--) { + unsigned w = width * cpp; + uint8_t *dst_row = dst; + const uint8_t *src_row = src; + + dst_row += dst_y / tile_height * dst_stride * tile_height; + dst_row += (dst_y & (tile_height-1)) * tile_width; + if (dst_x) + dst_row += (dst_x >> tile_shift) * tile_size; + dst_y++; + + src_row += src_y / tile_height * src_stride * tile_height; + src_row += (src_y & (tile_height-1)) * tile_width; + if (src_x) + src_row += (src_x >> tile_shift) * tile_size; + src_y++; + + if (dst_x & tile_mask) { + const unsigned x = (dst_x & tile_mask) * cpp; + const unsigned len = min(tile_width - x, w); + + memcpy(assume_misaligned(dst_row + x, tile_width, x), + assume_misaligned(src_row + x, tile_width, x), + len); + + dst_row += tile_size; + src_row += tile_size; + w -= len; 
+ } + + while (w >= tile_width) { + memcpy(assume_aligned(dst_row, tile_width), + assume_aligned(src_row, tile_width), + tile_width); + dst_row += tile_size; + src_row += tile_size; + w -= tile_width; + } + memcpy(assume_aligned(dst_row, tile_width), + assume_aligned(src_row, tile_width), + w); + } +} + #define memcpy_to_tiled_x(swizzle) \ fast_memcpy static void \ memcpy_to_tiled_x__##swizzle (const void *src, void *dst, int bpp, \ @@ -1241,7 +1171,7 @@ void choose_memcpy_tiled_x(struct kgem *kgem, int swizzling, unsigned cpu) break; case I915_BIT_6_SWIZZLE_NONE: DBG(("%s: no swizzling\n", __FUNCTION__)); -#if defined(sse2) && defined(__x86_64__) +#if defined(sse2) if (cpu & SSE2) { kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_0__sse2; kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_0__sse2; @@ -1498,7 +1428,7 @@ memcpy_xor(const void *src, void *dst, int bpp, height = 1; } -#if USE_SSE2 +#if defined(sse2) && __x86_64__ if (have_sse2()) { do { uint32_t *d = (uint32_t *)dst_bytes; |