From 49daf5df124b5ae6c7508e934768c292f4143040 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Thu, 28 Jul 2016 18:33:41 +0100
Subject: sna: Use GCC pragma to enable SSE2 blt routines

Rather than use per-function attributes, if we set the target for the
whole block using a pragma, we can compile the SSE2 routines on a
32-bit ISA as well.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
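For reference, the technique in isolation -- a minimal sketch, not code
from this patch (the function name is illustrative; <emmintrin.h> is the
standard SSE2 intrinsics header, and the include restored below is an
assumption based on the intrinsics used):

	#pragma GCC push_options
	#pragma GCC target("sse2,inline-all-stringops,fpmath=sse")
	#pragma GCC optimize("Ofast")
	#include <emmintrin.h>

	/* Compiled for the sse2 target even when the translation unit
	 * itself targets a plain 32-bit ISA without -msse2. */
	static void copy16(void *dst, const void *src)
	{
		_mm_storeu_si128((__m128i *)dst,
				 _mm_loadu_si128((const __m128i *)src));
	}

	#pragma GCC pop_options	/* restore the command-line options */

The per-function alternative would be to tag each routine with
__attribute__((target("sse2"))); the pragma applies one target to the
whole block instead. Either way the caller must still gate the SSE2
paths at runtime (here via sna_cpu_detect() & SSE2) on 32-bit builds.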
 src/sna/blt.c | 1258 +++++++++++++++++++++++++++------------------------
 1 file changed, 594 insertions(+), 664 deletions(-)

diff --git a/src/sna/blt.c b/src/sna/blt.c
index ab7bd22c..cb90437a 100644
--- a/src/sna/blt.c
+++ b/src/sna/blt.c
@@ -32,89 +32,21 @@
 #include "sna.h"
 #include
 
-#if __x86_64__
-#define USE_SSE2 1
-#endif
-
-#if USE_SSE2
+#if defined(sse2)
+#pragma GCC push_options
+#pragma GCC target("sse2,inline-all-stringops,fpmath=sse")
+#pragma GCC optimize("Ofast")
 #include <emmintrin.h>
 
 #if __x86_64__
 #define have_sse2() 1
 #else
-enum {
-	MMX = 0x1,
-	MMX_EXTENSIONS = 0x2,
-	SSE = 0x6,
-	SSE2 = 0x8,
-	CMOV = 0x10
-};
-
-#ifdef __GNUC__
-static unsigned int
-detect_cpu_features(void)
-{
-	unsigned int features;
-	unsigned int result = 0;
-
-	char vendor[13];
-	vendor[0] = 0;
-	vendor[12] = 0;
-
-	asm (
-	     "pushf\n"
-	     "pop %%eax\n"
-	     "mov %%eax, %%ecx\n"
-	     "xor $0x00200000, %%eax\n"
-	     "push %%eax\n"
-	     "popf\n"
-	     "pushf\n"
-	     "pop %%eax\n"
-	     "mov $0x0, %%edx\n"
-	     "xor %%ecx, %%eax\n"
-	     "jz 1f\n"
-
-	     "mov $0x00000000, %%eax\n"
-	     "push %%ebx\n"
-	     "cpuid\n"
-	     "mov %%ebx, %%eax\n"
-	     "pop %%ebx\n"
-	     "mov %%eax, %1\n"
-	     "mov %%edx, %2\n"
-	     "mov %%ecx, %3\n"
-	     "mov $0x00000001, %%eax\n"
-	     "push %%ebx\n"
-	     "cpuid\n"
-	     "pop %%ebx\n"
-	     "1:\n"
-	     "mov %%edx, %0\n"
-	     : "=r" (result), "=m" (vendor[0]), "=m" (vendor[4]), "=m" (vendor[8])
-	     :: "%eax", "%ecx", "%edx");
-
-	features = 0;
-	if (result) {
-		/* result now contains the standard feature bits */
-		if (result & (1 << 15))
-			features |= CMOV;
-		if (result & (1 << 23))
-			features |= MMX;
-		if (result & (1 << 25))
-			features |= SSE;
-		if (result & (1 << 26))
-			features |= SSE2;
-	}
-	return features;
-}
-#else
-static unsigned int detect_cpu_features(void) { return 0; }
-#endif
-
 static bool have_sse2(void)
 {
 	static int sse2_present = -1;
 
 	if (sse2_present == -1)
-		sse2_present = detect_cpu_features() & SSE2;
+		sse2_present = sna_cpu_detect() & SSE2;
 
 	return sse2_present;
 }
@@ -149,97 +81,152 @@ xmm_save_128u(__m128i *dst, __m128i data)
 {
 	_mm_storeu_si128(dst, data);
 }
-#endif
 
-fast void
-memcpy_blt(const void *src, void *dst, int bpp,
-	   int32_t src_stride, int32_t dst_stride,
-	   int16_t src_x, int16_t src_y,
-	   int16_t dst_x, int16_t dst_y,
-	   uint16_t width, uint16_t height)
+static force_inline void
+to_sse128xN(uint8_t *dst, const uint8_t *src, int bytes)
 {
-	const uint8_t *src_bytes;
-	uint8_t *dst_bytes;
-	int byte_width;
-
-	assert(src);
-	assert(dst);
-	assert(width && height);
-	assert(bpp >= 8);
-	assert(width*bpp <= 8*src_stride);
-	assert(width*bpp <= 8*dst_stride);
+	int i;
 
-	DBG(("%s: src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
-	     __FUNCTION__, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
+	for (i = 0; i < bytes / 128; i++) {
+		__m128i xmm0, xmm1, xmm2, xmm3;
+		__m128i xmm4, xmm5, xmm6, xmm7;
 
-	bpp /= 8;
+		xmm0 = xmm_load_128u((const __m128i*)src + 0);
+		xmm1 = xmm_load_128u((const __m128i*)src + 1);
+		xmm2 = xmm_load_128u((const __m128i*)src + 2);
+		xmm3 = xmm_load_128u((const __m128i*)src + 3);
+		xmm4 = xmm_load_128u((const __m128i*)src + 4);
+		xmm5 = xmm_load_128u((const __m128i*)src + 5);
+		xmm6 = xmm_load_128u((const __m128i*)src + 6);
+		xmm7 = xmm_load_128u((const __m128i*)src + 7);
 
-	src_bytes = (const uint8_t *)src + src_stride * src_y + src_x * bpp;
-	dst_bytes = (uint8_t *)dst + dst_stride * dst_y + dst_x * bpp;
+		xmm_save_128((__m128i*)dst + 0, xmm0);
+		xmm_save_128((__m128i*)dst + 1, xmm1);
+		xmm_save_128((__m128i*)dst + 2, xmm2);
+		xmm_save_128((__m128i*)dst + 3, xmm3);
+		xmm_save_128((__m128i*)dst + 4, xmm4);
+		xmm_save_128((__m128i*)dst + 5, xmm5);
+		xmm_save_128((__m128i*)dst + 6, xmm6);
+		xmm_save_128((__m128i*)dst + 7, xmm7);
 
-	byte_width = width * bpp;
-	if (byte_width == src_stride && byte_width == dst_stride) {
-		byte_width *= height;
-		height = 1;
+		dst += 128;
+		src += 128;
 	}
+}
 
-	switch (byte_width) {
-	case 1:
-		do {
-			*dst_bytes = *src_bytes;
-			src_bytes += src_stride;
-			dst_bytes += dst_stride;
-		} while (--height);
-		break;
+static force_inline void
+to_sse64(uint8_t *dst, const uint8_t *src)
+{
+	__m128i xmm1, xmm2, xmm3, xmm4;
 
-	case 2:
-		do {
-			*(uint16_t *)dst_bytes = *(const uint16_t *)src_bytes;
-			src_bytes += src_stride;
-			dst_bytes += dst_stride;
-		} while (--height);
-		break;
+	xmm1 = xmm_load_128u((const __m128i*)src + 0);
+	xmm2 = xmm_load_128u((const __m128i*)src + 1);
+	xmm3 = xmm_load_128u((const __m128i*)src + 2);
+	xmm4 = xmm_load_128u((const __m128i*)src + 3);
 
-	case 4:
-		do {
-			*(uint32_t *)dst_bytes = *(const uint32_t *)src_bytes;
-			src_bytes += src_stride;
-			dst_bytes += dst_stride;
-		} while (--height);
-		break;
+	xmm_save_128((__m128i*)dst + 0, xmm1);
+	xmm_save_128((__m128i*)dst + 1, xmm2);
+	xmm_save_128((__m128i*)dst + 2, xmm3);
+	xmm_save_128((__m128i*)dst + 3, xmm4);
+}
 
-	case 8:
-		do {
-			*(uint64_t *)dst_bytes = *(const uint64_t *)src_bytes;
-			src_bytes += src_stride;
-			dst_bytes += dst_stride;
-		} while (--height);
-		break;
-	case 16:
-		do {
-			((uint64_t *)dst_bytes)[0] = ((const uint64_t *)src_bytes)[0];
-			((uint64_t *)dst_bytes)[1] = ((const uint64_t *)src_bytes)[1];
-			src_bytes += src_stride;
-			dst_bytes += dst_stride;
-		} while (--height);
-		break;
+static force_inline void
+to_sse32(uint8_t *dst, const uint8_t *src)
+{
+	__m128i xmm1, xmm2;
 
-	default:
-		do {
-			memcpy(dst_bytes, src_bytes, byte_width);
-			src_bytes += src_stride;
-			dst_bytes += dst_stride;
-		} while (--height);
-		break;
+	xmm1 = xmm_load_128u((const __m128i*)src + 0);
+	xmm2 = xmm_load_128u((const __m128i*)src + 1);
+
+	xmm_save_128((__m128i*)dst + 0, xmm1);
+	xmm_save_128((__m128i*)dst + 1, xmm2);
+}
+
+static force_inline void
+to_sse16(uint8_t *dst, const uint8_t *src)
+{
+	xmm_save_128((__m128i*)dst, xmm_load_128u((const __m128i*)src));
+}
+
+static void to_memcpy(uint8_t *dst, const uint8_t *src, unsigned len)
+{
+	assert(len);
+	if ((uintptr_t)dst & 15) {
+		if (len <= 16 - ((uintptr_t)dst & 15)) {
+			memcpy(dst, src, len);
+			return;
+		}
+
+		if ((uintptr_t)dst & 1) {
+			assert(len >= 1);
+			*dst++ = *src++;
+			len--;
+		}
+		if ((uintptr_t)dst & 2) {
+			assert(((uintptr_t)dst & 1) == 0);
+			assert(len >= 2);
+			*(uint16_t *)dst = *(const uint16_t *)src;
+			dst += 2;
+			src += 2;
+			len -= 2;
+		}
+		if ((uintptr_t)dst & 4) {
+			assert(((uintptr_t)dst & 3) == 0);
+			assert(len >= 4);
+			*(uint32_t *)dst = *(const uint32_t *)src;
+			dst += 4;
+			src += 4;
+			len -= 4;
+		}
+		if ((uintptr_t)dst & 8) {
+			assert(((uintptr_t)dst & 7) == 0);
+			assert(len >= 8);
+			*(uint64_t *)dst = *(const uint64_t *)src;
+			dst += 8;
+			src += 8;
+			len -= 8;
+		}
+	}
+
+	assert(((uintptr_t)dst & 15) == 0);
+	while (len >= 64) {
+		to_sse64(dst, src);
+		dst += 64;
+		src += 64;
+		len -= 64;
 	}
+	if (len == 0)
+		return;
+
+	if (len & 32) {
+		to_sse32(dst, src);
+		dst += 32;
+		src += 32;
+	}
+	if (len & 16) {
+		to_sse16(dst, src);
+		dst += 16;
+		src += 16;
+	}
+	if (len & 8) {
+		*(uint64_t *)dst = *(uint64_t *)src;
+		dst += 8;
+		src += 8;
+	}
+	if (len & 4) {
+		*(uint32_t *)dst = *(uint32_t *)src;
+		dst += 4;
+		src += 4;
+	}
+	memcpy(dst, src, len & 3);
 }
 
-static fast_memcpy void
-memcpy_to_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
-			     int32_t src_stride, int32_t dst_stride,
-			     int16_t src_x, int16_t src_y,
-			     int16_t dst_x, int16_t dst_y,
-			     uint16_t width, uint16_t height)
+static void
+memcpy_to_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
+				   int32_t src_stride, int32_t dst_stride,
+				   int16_t src_x, int16_t src_y,
+				   int16_t dst_x, int16_t dst_y,
+				   uint16_t width, uint16_t height)
 {
 	const unsigned tile_width = 512;
 	const unsigned tile_height = 8;
@@ -250,189 +237,112 @@ memcpy_to_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
 	const unsigned tile_shift = ffs(tile_pixels) - 1;
 	const unsigned tile_mask = tile_pixels - 1;
 
+	unsigned offset_x, length_x;
+
 	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
 	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
 	assert(src != dst);
 
 	if (src_x | src_y)
 		src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
-	assert(src_stride >= width * cpp);
-	src_stride -= width * cpp;
+	width *= cpp;
+	assert(src_stride >= width);
+
+	if (dst_x & tile_mask) {
+		offset_x = (dst_x & tile_mask) * cpp;
+		length_x = min(tile_width - offset_x, width);
+	} else
+		length_x = 0;
+	dst = (uint8_t *)dst + (dst_x >> tile_shift) * tile_size;
 
 	while (height--) {
-		unsigned w = width * cpp;
+		unsigned w = width;
+		const uint8_t *src_row = src;
 		uint8_t *tile_row = dst;
 
+		src = (const uint8_t *)src + src_stride;
+
 		tile_row += dst_y / tile_height * dst_stride * tile_height;
 		tile_row += (dst_y & (tile_height-1)) * tile_width;
-		if (dst_x) {
-			tile_row += (dst_x >> tile_shift) * tile_size;
-			if (dst_x & tile_mask) {
-				const unsigned x = (dst_x & tile_mask) * cpp;
-				const unsigned len = min(tile_width - x, w);
-				memcpy(assume_misaligned(tile_row + x, tile_width, x),
-				       src, len);
+		dst_y++;
 
-				tile_row += tile_size;
-				src = (const uint8_t *)src + len;
-				w -= len;
-			}
+		if (length_x) {
+			to_memcpy(tile_row + offset_x, src_row, length_x);
+
+			tile_row += tile_size;
+			src_row = (const uint8_t *)src_row + length_x;
+			w -= length_x;
 		}
 		while (w >= tile_width) {
-			memcpy(assume_aligned(tile_row, tile_width),
-			       src, tile_width);
+			assert(((uintptr_t)tile_row & (tile_width - 1)) == 0);
+			to_sse128xN(assume_aligned(tile_row, tile_width),
+				    src_row, tile_width);
 			tile_row += tile_size;
-			src = (const uint8_t *)src + tile_width;
+			src_row = (const uint8_t *)src_row + tile_width;
 			w -= tile_width;
 		}
-		memcpy(assume_aligned(tile_row, tile_width), src, w);
-		src = (const uint8_t *)src + src_stride + w;
-		dst_y++;
-	}
-}
-
-static fast_memcpy void
-memcpy_from_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
-			       int32_t src_stride, int32_t dst_stride,
-			       int16_t src_x, int16_t src_y,
-			       int16_t dst_x, int16_t dst_y,
-			       uint16_t width, uint16_t height)
-{
-	const unsigned tile_width = 512;
-	const unsigned tile_height = 8;
-	const unsigned tile_size = 4096;
-
-	const unsigned cpp = bpp / 8;
-	const unsigned tile_pixels = tile_width / cpp;
-	const unsigned tile_shift = ffs(tile_pixels) - 1;
-	const unsigned tile_mask = tile_pixels - 1;
-
-	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
-	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
-	assert(src != dst);
-
-	if (dst_x | dst_y)
-		dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
-	assert(dst_stride >= width * cpp);
-	dst_stride -= width * cpp;
-
-	while (height--) {
-		unsigned w = width * cpp;
-		const uint8_t *tile_row = src;
-
-		tile_row += src_y / tile_height * src_stride * tile_height;
-		tile_row += (src_y & (tile_height-1)) * tile_width;
-		if (src_x) {
-			tile_row += (src_x >> tile_shift) * tile_size;
-			if (src_x & tile_mask) {
-				const unsigned x = (src_x & tile_mask) * cpp;
-				const unsigned len = min(tile_width - x, w);
-				memcpy(dst, assume_misaligned(tile_row + x, tile_width, x), len);
-
-				tile_row += tile_size;
-				dst = (uint8_t *)dst + len;
-				w -= len;
-			}
-		}
-		while (w >= tile_width) {
-			memcpy(dst,
-			       assume_aligned(tile_row, tile_width),
-			       tile_width);
-
-			tile_row += tile_size;
-			dst = (uint8_t *)dst + tile_width;
-			w -= tile_width;
+		if (w) {
+			assert(((uintptr_t)tile_row & (tile_width - 1)) == 0);
+			to_memcpy(assume_aligned(tile_row, tile_width),
+				  src_row, w);
 		}
-		memcpy(dst, assume_aligned(tile_row, tile_width), w);
-		dst = (uint8_t *)dst + dst_stride + w;
-		src_y++;
 	}
 }
 
-static fast_memcpy void
-memcpy_between_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
-				  int32_t src_stride, int32_t dst_stride,
-				  int16_t src_x, int16_t src_y,
-				  int16_t dst_x, int16_t dst_y,
-				  uint16_t width, uint16_t height)
+static force_inline void
+from_sse128xNu(uint8_t *dst, const uint8_t *src, int bytes)
 {
-	const unsigned tile_width = 512;
-	const unsigned tile_height = 8;
-	const unsigned tile_size = 4096;
-
-	const unsigned cpp = bpp / 8;
-	const unsigned tile_pixels = tile_width / cpp;
-	const unsigned tile_shift = ffs(tile_pixels) - 1;
-	const unsigned tile_mask = tile_pixels - 1;
+	int i;
 
-	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
-	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
-	assert(src != dst);
-	assert((dst_x & tile_mask) == (src_x & tile_mask));
-
-	while (height--) {
-		unsigned w = width * cpp;
-		uint8_t *dst_row = dst;
-		const uint8_t *src_row = src;
-
-		dst_row += dst_y / tile_height * dst_stride * tile_height;
-		dst_row += (dst_y & (tile_height-1)) * tile_width;
-		if (dst_x)
-			dst_row += (dst_x >> tile_shift) * tile_size;
-		dst_y++;
+	assert(((uintptr_t)src & 15) == 0);
 
-		src_row += src_y / tile_height * src_stride * tile_height;
-		src_row += (src_y & (tile_height-1)) * tile_width;
-		if (src_x)
-			src_row += (src_x >> tile_shift) * tile_size;
-		src_y++;
+	for (i = 0; i < bytes / 128; i++) {
+		__m128i xmm0, xmm1, xmm2, xmm3;
+		__m128i xmm4, xmm5, xmm6, xmm7;
 
-		if (dst_x & tile_mask) {
-			const unsigned x = (dst_x & tile_mask) * cpp;
-			const unsigned len = min(tile_width - x, w);
+		xmm0 = xmm_load_128((const __m128i*)src + 0);
+		xmm1 = xmm_load_128((const __m128i*)src + 1);
+		xmm2 = xmm_load_128((const __m128i*)src + 2);
+		xmm3 = xmm_load_128((const __m128i*)src + 3);
+		xmm4 = xmm_load_128((const __m128i*)src + 4);
+		xmm5 = xmm_load_128((const __m128i*)src + 5);
+		xmm6 = xmm_load_128((const __m128i*)src + 6);
+		xmm7 = xmm_load_128((const __m128i*)src + 7);
 
-			memcpy(assume_misaligned(dst_row + x, tile_width, x),
-			       assume_misaligned(src_row + x, tile_width, x),
-			       len);
+		xmm_save_128u((__m128i*)dst + 0, xmm0);
+		xmm_save_128u((__m128i*)dst + 1, xmm1);
+		xmm_save_128u((__m128i*)dst + 2, xmm2);
+		xmm_save_128u((__m128i*)dst + 3, xmm3);
+		xmm_save_128u((__m128i*)dst + 4, xmm4);
+		xmm_save_128u((__m128i*)dst + 5, xmm5);
+		xmm_save_128u((__m128i*)dst + 6, xmm6);
+		xmm_save_128u((__m128i*)dst + 7, xmm7);
 
-			dst_row += tile_size;
-			src_row += tile_size;
-			w -= len;
-		}
-
-		while (w >= tile_width) {
-			memcpy(assume_aligned(dst_row, tile_width),
-			       assume_aligned(src_row, tile_width),
-			       tile_width);
-			dst_row += tile_size;
-			src_row += tile_size;
-			w -= tile_width;
-		}
-		memcpy(assume_aligned(dst_row, tile_width),
-		       assume_aligned(src_row, tile_width),
-		       w);
+		dst += 128;
+		src += 128;
 	}
 }
 
-#if defined(sse2) && defined(__x86_64__)
-
-sse2 static force_inline void
-to_sse128xN(uint8_t *dst, const uint8_t *src, int bytes)
+static force_inline void
+from_sse128xNa(uint8_t *dst, const uint8_t *src, int bytes)
 {
 	int i;
 
+	assert(((uintptr_t)dst & 15) == 0);
+	assert(((uintptr_t)src & 15) == 0);
+
 	for (i = 0; i < bytes / 128; i++) {
 		__m128i xmm0, xmm1, xmm2, xmm3;
 		__m128i xmm4, xmm5, xmm6, xmm7;
 
-		xmm0 = xmm_load_128u((const __m128i*)src + 0);
-		xmm1 = xmm_load_128u((const __m128i*)src + 1);
-		xmm2 = xmm_load_128u((const __m128i*)src + 2);
-		xmm3 = xmm_load_128u((const __m128i*)src + 3);
-		xmm4 = xmm_load_128u((const __m128i*)src + 4);
-		xmm5 = xmm_load_128u((const __m128i*)src + 5);
-		xmm6 = xmm_load_128u((const __m128i*)src + 6);
-		xmm7 = xmm_load_128u((const __m128i*)src + 7);
+		xmm0 = xmm_load_128((const __m128i*)src + 0);
+		xmm1 = xmm_load_128((const __m128i*)src + 1);
+		xmm2 = xmm_load_128((const __m128i*)src + 2);
+		xmm3 = xmm_load_128((const __m128i*)src + 3);
+		xmm4 = xmm_load_128((const __m128i*)src + 4);
+		xmm5 = xmm_load_128((const __m128i*)src + 5);
+		xmm6 = xmm_load_128((const __m128i*)src + 6);
+		xmm7 = xmm_load_128((const __m128i*)src + 7);
 
 		xmm_save_128((__m128i*)dst + 0, xmm0);
 		xmm_save_128((__m128i*)dst + 1, xmm1);
@@ -448,15 +358,36 @@ to_sse128xN(uint8_t *dst, const uint8_t *src, int bytes)
 	}
 }
 
-sse2 static force_inline void
-to_sse64(uint8_t *dst, const uint8_t *src)
+static force_inline void
+from_sse64u(uint8_t *dst, const uint8_t *src)
 {
 	__m128i xmm1, xmm2, xmm3, xmm4;
 
-	xmm1 = xmm_load_128u((const __m128i*)src + 0);
-	xmm2 = xmm_load_128u((const __m128i*)src + 1);
-	xmm3 = xmm_load_128u((const __m128i*)src + 2);
-	xmm4 = xmm_load_128u((const __m128i*)src + 3);
+	assert(((uintptr_t)src & 15) == 0);
+
+	xmm1 = xmm_load_128((const __m128i*)src + 0);
+	xmm2 = xmm_load_128((const __m128i*)src + 1);
+	xmm3 = xmm_load_128((const __m128i*)src + 2);
+	xmm4 = xmm_load_128((const __m128i*)src + 3);
+
+	xmm_save_128u((__m128i*)dst + 0, xmm1);
+	xmm_save_128u((__m128i*)dst + 1, xmm2);
+	xmm_save_128u((__m128i*)dst + 2, xmm3);
+	xmm_save_128u((__m128i*)dst + 3, xmm4);
+}
+
+static force_inline void
+from_sse64a(uint8_t *dst, const uint8_t *src)
+{
+	__m128i xmm1, xmm2, xmm3, xmm4;
+
+	assert(((uintptr_t)dst & 15) == 0);
+	assert(((uintptr_t)src & 15) == 0);
+
+	xmm1 = xmm_load_128((const __m128i*)src + 0);
+	xmm2 = xmm_load_128((const __m128i*)src + 1);
+	xmm3 = xmm_load_128((const __m128i*)src + 2);
+	xmm4 = xmm_load_128((const __m128i*)src + 3);
 
 	xmm_save_128((__m128i*)dst + 0, xmm1);
 	xmm_save_128((__m128i*)dst + 1, xmm2);
@@ -464,103 +395,56 @@ to_sse64(uint8_t *dst, const uint8_t *src)
 	xmm_save_128((__m128i*)dst + 3, xmm4);
 }
 
-sse2 static force_inline void
-to_sse32(uint8_t *dst, const uint8_t *src)
+static force_inline void
+from_sse32u(uint8_t *dst, const uint8_t *src)
 {
 	__m128i xmm1, xmm2;
 
-	xmm1 = xmm_load_128u((const __m128i*)src + 0);
-	xmm2 = xmm_load_128u((const __m128i*)src + 1);
+	xmm1 = xmm_load_128((const __m128i*)src + 0);
+	xmm2 = xmm_load_128((const __m128i*)src + 1);
+
+	xmm_save_128u((__m128i*)dst + 0, xmm1);
+	xmm_save_128u((__m128i*)dst + 1, xmm2);
+}
+
+static force_inline void
+from_sse32a(uint8_t *dst, const uint8_t *src)
+{
+	__m128i xmm1, xmm2;
+
+	assert(((uintptr_t)dst & 15) == 0);
+	assert(((uintptr_t)src & 15) == 0);
+
+	xmm1 = xmm_load_128((const __m128i*)src + 0);
+	xmm2 = xmm_load_128((const __m128i*)src + 1);
 
 	xmm_save_128((__m128i*)dst + 0, xmm1);
 	xmm_save_128((__m128i*)dst + 1, xmm2);
 }
 
-sse2 static force_inline void
-to_sse16(uint8_t *dst, const uint8_t *src)
+static force_inline void
+from_sse16u(uint8_t *dst, const uint8_t *src)
 {
-	xmm_save_128((__m128i*)dst, xmm_load_128u((const __m128i*)src));
+	assert(((uintptr_t)src & 15) == 0);
+
+	xmm_save_128u((__m128i*)dst, xmm_load_128((const __m128i*)src));
 }
 
-sse2 static void to_memcpy(uint8_t *dst, const uint8_t *src, unsigned len)
+static force_inline void
+from_sse16a(uint8_t *dst, const uint8_t *src)
 {
-	assert(len);
-	if ((uintptr_t)dst & 15) {
-		if (len <= 16 - ((uintptr_t)dst & 15)) {
-			memcpy(dst, src, len);
-			return;
-		}
+	assert(((uintptr_t)dst & 15) == 0);
+	assert(((uintptr_t)src & 15) == 0);
 
-		if ((uintptr_t)dst & 1) {
-			assert(len >= 1);
-			*dst++ = *src++;
-			len--;
-		}
-		if ((uintptr_t)dst & 2) {
-			assert(((uintptr_t)dst & 1) == 0);
-			assert(len >= 2);
-			*(uint16_t *)dst = *(const uint16_t *)src;
-			dst += 2;
-			src += 2;
-			len -= 2;
-		}
-		if ((uintptr_t)dst & 4) {
-			assert(((uintptr_t)dst & 3) == 0);
-			assert(len >= 4);
-			*(uint32_t *)dst = *(const uint32_t *)src;
-			dst += 4;
-			src += 4;
-			len -= 4;
-		}
-		if ((uintptr_t)dst & 8) {
-			assert(((uintptr_t)dst & 7) == 0);
-			assert(len >= 8);
-			*(uint64_t *)dst = *(const uint64_t *)src;
-			dst += 8;
-			src += 8;
-			len -= 8;
-		}
-	}
-
-	assert(((uintptr_t)dst & 15) == 0);
-	while (len >= 64) {
-		to_sse64(dst, src);
-		dst += 64;
-		src += 64;
-		len -= 64;
-	}
-	if (len == 0)
-		return;
-
-	if (len & 32) {
-		to_sse32(dst, src);
-		dst += 32;
-		src += 32;
-	}
-	if (len & 16) {
-		to_sse16(dst, src);
-		dst += 16;
-		src += 16;
-	}
-	if (len & 8) {
-		*(uint64_t *)dst = *(uint64_t *)src;
-		dst += 8;
-		src += 8;
-	}
-	if (len & 4) {
-		*(uint32_t *)dst = *(uint32_t *)src;
-		dst += 4;
-		src += 4;
-	}
-	memcpy(dst, src, len & 3);
+	xmm_save_128((__m128i*)dst, xmm_load_128((const __m128i*)src));
 }
 
-sse2 static fast_memcpy void
-memcpy_to_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
-				   int32_t src_stride, int32_t dst_stride,
-				   int16_t src_x, int16_t src_y,
-				   int16_t dst_x, int16_t dst_y,
-				   uint16_t width, uint16_t height)
+static void
+memcpy_from_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
+				     int32_t src_stride, int32_t dst_stride,
+				     int16_t src_x, int16_t src_y,
+				     int16_t dst_x, int16_t dst_y,
+				     uint16_t width, uint16_t height)
 {
 	const unsigned tile_width = 512;
 	const unsigned tile_height = 8;
@@ -571,214 +455,331 @@ memcpy_to_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
 	const unsigned tile_shift = ffs(tile_pixels) - 1;
 	const unsigned tile_mask = tile_pixels - 1;
 
-	unsigned offset_x, length_x;
+	unsigned length_x, offset_x;
 
 	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
 	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
 	assert(src != dst);
 
-	if (src_x | src_y)
-		src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
+	if (dst_x | dst_y)
+		dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
 	width *= cpp;
-	assert(src_stride >= width);
-
-	if (dst_x & tile_mask) {
-		offset_x = (dst_x & tile_mask) * cpp;
+	assert(dst_stride >= width);
+	if (src_x & tile_mask) {
+		offset_x = (src_x & tile_mask) * cpp;
 		length_x = min(tile_width - offset_x, width);
-	} else
-		length_x = 0;
-	dst = (uint8_t *)dst + (dst_x >> tile_shift) * tile_size;
+		dst_stride -= width;
+		dst_stride += (width - length_x) & 15;
+	} else {
+		offset_x = 0;
+		dst_stride -= width & ~15;
+	}
+	assert(dst_stride >= 0);
+	src = (const uint8_t *)src + (src_x >> tile_shift) * tile_size;
 
 	while (height--) {
 		unsigned w = width;
-		const uint8_t *src_row = src;
-		uint8_t *tile_row = dst;
-
-		src = (const uint8_t *)src + src_stride;
-
-		tile_row += dst_y / tile_height * dst_stride * tile_height;
-		tile_row += (dst_y & (tile_height-1)) * tile_width;
-		dst_y++;
+		const uint8_t *tile_row = src;
 
-		if (length_x) {
-			to_memcpy(tile_row + offset_x, src_row, length_x);
+		tile_row += src_y / tile_height * src_stride * tile_height;
+		tile_row += (src_y & (tile_height-1)) * tile_width;
+		src_y++;
 
+		if (offset_x) {
+			memcpy(dst, tile_row + offset_x, length_x);
 			tile_row += tile_size;
-			src_row = (const uint8_t *)src_row + length_x;
+			dst = (uint8_t *)dst + length_x;
 			w -= length_x;
 		}
-		while (w >= tile_width) {
-			assert(((uintptr_t)tile_row & (tile_width - 1)) == 0);
-			to_sse128xN(assume_aligned(tile_row, tile_width),
-				    src_row, tile_width);
-			tile_row += tile_size;
-			src_row = (const uint8_t *)src_row + tile_width;
-			w -= tile_width;
-		}
-		if (w) {
-			assert(((uintptr_t)tile_row & (tile_width - 1)) == 0);
-			to_memcpy(assume_aligned(tile_row, tile_width),
-				  src_row, w);
+
+		if ((uintptr_t)dst & 15) {
+			while (w >= tile_width) {
+				from_sse128xNu(dst,
+					       assume_aligned(tile_row, tile_width),
+					       tile_width);
+				tile_row += tile_size;
+				dst = (uint8_t *)dst + tile_width;
+				w -= tile_width;
+			}
+			while (w >= 64) {
+				from_sse64u(dst, tile_row);
+				tile_row += 64;
+				dst = (uint8_t *)dst + 64;
+				w -= 64;
+			}
+			if (w & 32) {
+				from_sse32u(dst, tile_row);
+				tile_row += 32;
+				dst = (uint8_t *)dst + 32;
+			}
+			if (w & 16) {
+				from_sse16u(dst, tile_row);
+				tile_row += 16;
+				dst = (uint8_t *)dst + 16;
+			}
+			memcpy(dst, assume_aligned(tile_row, 16), w & 15);
+		} else {
+			while (w >= tile_width) {
+				from_sse128xNa(assume_aligned(dst, 16),
+					       assume_aligned(tile_row, tile_width),
+					       tile_width);
+				tile_row += tile_size;
+				dst = (uint8_t *)dst + tile_width;
+				w -= tile_width;
+			}
+			while (w >= 64) {
+				from_sse64a(dst, tile_row);
+				tile_row += 64;
+				dst = (uint8_t *)dst + 64;
+				w -= 64;
+			}
+			if (w & 32) {
+				from_sse32a(dst, tile_row);
+				tile_row += 32;
+				dst = (uint8_t *)dst + 32;
+			}
+			if (w & 16) {
+				from_sse16a(dst, tile_row);
+				tile_row += 16;
+				dst = (uint8_t *)dst + 16;
+			}
+			memcpy(assume_aligned(dst, 16),
+			       assume_aligned(tile_row, 16),
+			       w & 15);
 		}
+		dst = (uint8_t *)dst + dst_stride;
 	}
 }
 
-sse2 static force_inline void
-from_sse128xNu(uint8_t *dst, const uint8_t *src, int bytes)
+static void
+memcpy_between_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
+					int32_t src_stride, int32_t dst_stride,
+					int16_t src_x, int16_t src_y,
+					int16_t dst_x, int16_t dst_y,
+					uint16_t width, uint16_t height)
 {
-	int i;
-
-	assert(((uintptr_t)src & 15) == 0);
+	const unsigned tile_width = 512;
+	const unsigned tile_height = 8;
+	const unsigned tile_size = 4096;
 
-	for (i = 0; i < bytes / 128; i++) {
-		__m128i xmm0, xmm1, xmm2, xmm3;
-		__m128i xmm4, xmm5, xmm6, xmm7;
+	const unsigned cpp = bpp / 8;
+	const unsigned tile_pixels = tile_width / cpp;
+	const unsigned tile_shift = ffs(tile_pixels) - 1;
+	const unsigned tile_mask = tile_pixels - 1;
 
-		xmm0 = xmm_load_128((const __m128i*)src + 0);
-		xmm1 = xmm_load_128((const __m128i*)src + 1);
-		xmm2 = xmm_load_128((const __m128i*)src + 2);
-		xmm3 = xmm_load_128((const __m128i*)src + 3);
-		xmm4 = xmm_load_128((const __m128i*)src + 4);
-		xmm5 = xmm_load_128((const __m128i*)src + 5);
-		xmm6 = xmm_load_128((const __m128i*)src + 6);
-		xmm7 = xmm_load_128((const __m128i*)src + 7);
+	unsigned ox, lx;
 
-		xmm_save_128u((__m128i*)dst + 0, xmm0);
-		xmm_save_128u((__m128i*)dst + 1, xmm1);
-		xmm_save_128u((__m128i*)dst + 2, xmm2);
-		xmm_save_128u((__m128i*)dst + 3, xmm3);
-		xmm_save_128u((__m128i*)dst + 4, xmm4);
-		xmm_save_128u((__m128i*)dst + 5, xmm5);
-		xmm_save_128u((__m128i*)dst + 6, xmm6);
-		xmm_save_128u((__m128i*)dst + 7, xmm7);
+	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
+	assert(src != dst);
 
-		dst += 128;
-		src += 128;
-	}
-}
+	width *= cpp;
+	dst_stride *= tile_height;
+	src_stride *= tile_height;
 
-sse2 static force_inline void
-from_sse128xNa(uint8_t *dst, const uint8_t *src, int bytes)
-{
-	int i;
+	assert((dst_x & tile_mask) == (src_x & tile_mask));
+	if (dst_x & tile_mask) {
+		ox = (dst_x & tile_mask) * cpp;
+		lx = min(tile_width - ox, width);
+		assert(lx != 0);
+	} else
+		lx = 0;
 
-	assert(((uintptr_t)dst & 15) == 0);
-	assert(((uintptr_t)src & 15) == 0);
+	if (dst_x)
+		dst = (uint8_t *)dst + (dst_x >> tile_shift) * tile_size;
+	if (src_x)
+		src = (const uint8_t *)src + (src_x >> tile_shift) * tile_size;
 
-	for (i = 0; i < bytes / 128; i++) {
-		__m128i xmm0, xmm1, xmm2, xmm3;
-		__m128i xmm4, xmm5, xmm6, xmm7;
+	while (height--) {
+		const uint8_t *src_row;
+		uint8_t *dst_row;
+		unsigned w = width;
 
-		xmm0 = xmm_load_128((const __m128i*)src + 0);
-		xmm1 = xmm_load_128((const __m128i*)src + 1);
-		xmm2 = xmm_load_128((const __m128i*)src + 2);
-		xmm3 = xmm_load_128((const __m128i*)src + 3);
-		xmm4 = xmm_load_128((const __m128i*)src + 4);
-		xmm5 = xmm_load_128((const __m128i*)src + 5);
-		xmm6 = xmm_load_128((const __m128i*)src + 6);
-		xmm7 = xmm_load_128((const __m128i*)src + 7);
+		dst_row = dst;
+		dst_row += dst_y / tile_height * dst_stride;
+		dst_row += (dst_y & (tile_height-1)) * tile_width;
+		dst_y++;
 
-		xmm_save_128((__m128i*)dst + 0, xmm0);
-		xmm_save_128((__m128i*)dst + 1, xmm1);
-		xmm_save_128((__m128i*)dst + 2, xmm2);
-		xmm_save_128((__m128i*)dst + 3, xmm3);
-		xmm_save_128((__m128i*)dst + 4, xmm4);
-		xmm_save_128((__m128i*)dst + 5, xmm5);
-		xmm_save_128((__m128i*)dst + 6, xmm6);
-		xmm_save_128((__m128i*)dst + 7, xmm7);
+		src_row = src;
+		src_row += src_y / tile_height * src_stride;
+		src_row += (src_y & (tile_height-1)) * tile_width;
+		src_y++;
 
-		dst += 128;
-		src += 128;
+		if (lx) {
+			to_memcpy(dst_row + ox, src_row + ox, lx);
+			dst_row += tile_size;
+			src_row += tile_size;
+			w -= lx;
+		}
+		while (w >= tile_width) {
+			assert(((uintptr_t)dst_row & (tile_width - 1)) == 0);
+			assert(((uintptr_t)src_row & (tile_width - 1)) == 0);
+			to_sse128xN(assume_aligned(dst_row, tile_width),
+				    assume_aligned(src_row, tile_width),
+				    tile_width);
+			dst_row += tile_size;
+			src_row += tile_size;
+			w -= tile_width;
+		}
+		if (w) {
+			assert(((uintptr_t)dst_row & (tile_width - 1)) == 0);
+			assert(((uintptr_t)src_row & (tile_width - 1)) == 0);
+			to_memcpy(assume_aligned(dst_row, tile_width),
+				  assume_aligned(src_row, tile_width),
+				  w);
+		}
 	}
 }
 
-sse2 static force_inline void
-from_sse64u(uint8_t *dst, const uint8_t *src)
-{
-	__m128i xmm1, xmm2, xmm3, xmm4;
+#pragma GCC pop_options
+#endif
 
-	assert(((uintptr_t)src & 15) == 0);
+fast void
+memcpy_blt(const void *src, void *dst, int bpp,
+	   int32_t src_stride, int32_t dst_stride,
+	   int16_t src_x, int16_t src_y,
+	   int16_t dst_x, int16_t dst_y,
+	   uint16_t width, uint16_t height)
+{
+	const uint8_t *src_bytes;
+	uint8_t *dst_bytes;
+	int byte_width;
 
-	xmm1 = xmm_load_128((const __m128i*)src + 0);
-	xmm2 = xmm_load_128((const __m128i*)src + 1);
-	xmm3 = xmm_load_128((const __m128i*)src + 2);
-	xmm4 = xmm_load_128((const __m128i*)src + 3);
+	assert(src);
+	assert(dst);
+	assert(width && height);
+	assert(bpp >= 8);
+	assert(width*bpp <= 8*src_stride);
+	assert(width*bpp <= 8*dst_stride);
 
-	xmm_save_128u((__m128i*)dst + 0, xmm1);
-	xmm_save_128u((__m128i*)dst + 1, xmm2);
-	xmm_save_128u((__m128i*)dst + 2, xmm3);
-	xmm_save_128u((__m128i*)dst + 3, xmm4);
-}
+	DBG(("%s: src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+	     __FUNCTION__, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
 
-sse2 static force_inline void
-from_sse64a(uint8_t *dst, const uint8_t *src)
-{
-	__m128i xmm1, xmm2, xmm3, xmm4;
+	bpp /= 8;
 
-	assert(((uintptr_t)dst & 15) == 0);
-	assert(((uintptr_t)src & 15) == 0);
+	src_bytes = (const uint8_t *)src + src_stride * src_y + src_x * bpp;
+	dst_bytes = (uint8_t *)dst + dst_stride * dst_y + dst_x * bpp;
 
-	xmm1 = xmm_load_128((const __m128i*)src + 0);
-	xmm2 = xmm_load_128((const __m128i*)src + 1);
-	xmm3 = xmm_load_128((const __m128i*)src + 2);
-	xmm4 = xmm_load_128((const __m128i*)src + 3);
+	byte_width = width * bpp;
+	if (byte_width == src_stride && byte_width == dst_stride) {
+		byte_width *= height;
+		height = 1;
+	}
 
-	xmm_save_128((__m128i*)dst + 0, xmm1);
-	xmm_save_128((__m128i*)dst + 1, xmm2);
-	xmm_save_128((__m128i*)dst + 2, xmm3);
-	xmm_save_128((__m128i*)dst + 3, xmm4);
-}
+	switch (byte_width) {
+	case 1:
+		do {
+			*dst_bytes = *src_bytes;
+			src_bytes += src_stride;
+			dst_bytes += dst_stride;
+		} while (--height);
+		break;
 
-sse2 static force_inline void
-from_sse32u(uint8_t *dst, const uint8_t *src)
-{
-	__m128i xmm1, xmm2;
+	case 2:
+		do {
+			*(uint16_t *)dst_bytes = *(const uint16_t *)src_bytes;
+			src_bytes += src_stride;
+			dst_bytes += dst_stride;
+		} while (--height);
+		break;
 
-	xmm1 = xmm_load_128((const __m128i*)src + 0);
-	xmm2 = xmm_load_128((const __m128i*)src + 1);
+	case 4:
+		do {
+			*(uint32_t *)dst_bytes = *(const uint32_t *)src_bytes;
+			src_bytes += src_stride;
+			dst_bytes += dst_stride;
+		} while (--height);
+		break;
 
-	xmm_save_128u((__m128i*)dst + 0, xmm1);
-	xmm_save_128u((__m128i*)dst + 1, xmm2);
-}
+	case 8:
+		do {
+			*(uint64_t *)dst_bytes = *(const uint64_t *)src_bytes;
+			src_bytes += src_stride;
+			dst_bytes += dst_stride;
+		} while (--height);
+		break;
+	case 16:
+		do {
+			((uint64_t *)dst_bytes)[0] = ((const uint64_t *)src_bytes)[0];
+			((uint64_t *)dst_bytes)[1] = ((const uint64_t *)src_bytes)[1];
+			src_bytes += src_stride;
+			dst_bytes += dst_stride;
+		} while (--height);
+		break;
 
-sse2 static force_inline void
-from_sse32a(uint8_t *dst, const uint8_t *src)
-{
-	__m128i xmm1, xmm2;
+	default:
+		do {
+			memcpy(dst_bytes, src_bytes, byte_width);
+			src_bytes += src_stride;
+			dst_bytes += dst_stride;
+		} while (--height);
+		break;
+	}
+}
 
-	assert(((uintptr_t)dst & 15) == 0);
-	assert(((uintptr_t)src & 15) == 0);
+static fast_memcpy void
+memcpy_to_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
+			     int32_t src_stride, int32_t dst_stride,
+			     int16_t src_x, int16_t src_y,
+			     int16_t dst_x, int16_t dst_y,
+			     uint16_t width, uint16_t height)
+{
+	const unsigned tile_width = 512;
+	const unsigned tile_height = 8;
+	const unsigned tile_size = 4096;
 
-	xmm1 = xmm_load_128((const __m128i*)src + 0);
+	const unsigned cpp = bpp / 8;
+	const unsigned tile_pixels = tile_width / cpp;
+	const unsigned tile_shift = ffs(tile_pixels) - 1;
+	const unsigned tile_mask = tile_pixels - 1;
 
-	xmm2 = xmm_load_128((const __m128i*)src + 1);
+	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
+	assert(src != dst);
 
-	xmm_save_128((__m128i*)dst + 0, xmm1);
-	xmm_save_128((__m128i*)dst + 1, xmm2);
-}
+	if (src_x | src_y)
+		src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
+	assert(src_stride >= width * cpp);
+	src_stride -= width * cpp;
 
-sse2 static force_inline void
-from_sse16u(uint8_t *dst, const uint8_t *src)
-{
-	assert(((uintptr_t)src & 15) == 0);
+	while (height--) {
+		unsigned w = width * cpp;
+		uint8_t *tile_row = dst;
 
-	xmm_save_128u((__m128i*)dst, xmm_load_128((const __m128i*)src));
-}
+		tile_row += dst_y / tile_height * dst_stride * tile_height;
+		tile_row += (dst_y & (tile_height-1)) * tile_width;
+		if (dst_x) {
+			tile_row += (dst_x >> tile_shift) * tile_size;
+			if (dst_x & tile_mask) {
+				const unsigned x = (dst_x & tile_mask) * cpp;
+				const unsigned len = min(tile_width - x, w);
+				memcpy(assume_misaligned(tile_row + x, tile_width, x),
+				       src, len);
 
-sse2 static force_inline void
-from_sse16a(uint8_t *dst, const uint8_t *src)
-{
-	assert(((uintptr_t)dst & 15) == 0);
-	assert(((uintptr_t)src & 15) == 0);
+				tile_row += tile_size;
+				src = (const uint8_t *)src + len;
+				w -= len;
+			}
+		}
+		while (w >= tile_width) {
+			memcpy(assume_aligned(tile_row, tile_width),
+			       src, tile_width);
+			tile_row += tile_size;
+			src = (const uint8_t *)src + tile_width;
+			w -= tile_width;
+		}
+		memcpy(assume_aligned(tile_row, tile_width), src, w);
+		src = (const uint8_t *)src + src_stride + w;
+		dst_y++;
+	}
 
-	xmm_save_128((__m128i*)dst, xmm_load_128((const __m128i*)src));
 }
 
-sse2 static fast_memcpy void
-memcpy_from_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
-				     int32_t src_stride, int32_t dst_stride,
-				     int16_t src_x, int16_t src_y,
-				     int16_t dst_x, int16_t dst_y,
-				     uint16_t width, uint16_t height)
+static fast_memcpy void
+memcpy_from_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
+			       int32_t src_stride, int32_t dst_stride,
+			       int16_t src_x, int16_t src_y,
+			       int16_t dst_x, int16_t dst_y,
+			       uint16_t width, uint16_t height)
 {
 	const unsigned tile_width = 512;
 	const unsigned tile_height = 8;
@@ -789,108 +790,54 @@ memcpy_from_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
 	const unsigned tile_shift = ffs(tile_pixels) - 1;
 	const unsigned tile_mask = tile_pixels - 1;
 
-	unsigned length_x, offset_x;
-
 	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
 	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
 	assert(src != dst);
 
 	if (dst_x | dst_y)
 		dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
-	width *= cpp;
-	assert(dst_stride >= width);
-	if (src_x & tile_mask) {
-		offset_x = (src_x & tile_mask) * cpp;
-		length_x = min(tile_width - offset_x, width);
-		dst_stride -= width;
-		dst_stride += (width - length_x) & 15;
-	} else {
-		offset_x = 0;
-		dst_stride -= width & ~15;
-	}
-	assert(dst_stride >= 0);
-	src = (const uint8_t *)src + (src_x >> tile_shift) * tile_size;
+	assert(dst_stride >= width * cpp);
+	dst_stride -= width * cpp;
 
 	while (height--) {
-		unsigned w = width;
+		unsigned w = width * cpp;
 		const uint8_t *tile_row = src;
 
 		tile_row += src_y / tile_height * src_stride * tile_height;
 		tile_row += (src_y & (tile_height-1)) * tile_width;
-		src_y++;
-
-		if (offset_x) {
-			memcpy(dst, tile_row + offset_x, length_x);
-			tile_row += tile_size;
-			dst = (uint8_t *)dst + length_x;
-			w -= length_x;
-		}
+		if (src_x) {
+			tile_row += (src_x >> tile_shift) * tile_size;
+			if (src_x & tile_mask) {
+				const unsigned x = (src_x & tile_mask) * cpp;
+				const unsigned len = min(tile_width - x, w);
+				memcpy(dst, assume_misaligned(tile_row + x, tile_width, x), len);
 
-		if ((uintptr_t)dst & 15) {
-			while (w >= tile_width) {
-				from_sse128xNu(dst,
-					       assume_aligned(tile_row, tile_width),
-					       tile_width);
-				tile_row += tile_size;
-				dst = (uint8_t *)dst + tile_width;
-				w -= tile_width;
-			}
-			while (w >= 64) {
-				from_sse64u(dst, tile_row);
-				tile_row += 64;
-				dst = (uint8_t *)dst + 64;
-				w -= 64;
-			}
-			if (w & 32) {
-				from_sse32u(dst, tile_row);
-				tile_row += 32;
-				dst = (uint8_t *)dst + 32;
-			}
-			if (w & 16) {
-				from_sse16u(dst, tile_row);
-				tile_row += 16;
-				dst = (uint8_t *)dst + 16;
-			}
-			memcpy(dst, assume_aligned(tile_row, 16), w & 15);
-		} else {
-			while (w >= tile_width) {
-				from_sse128xNa(assume_aligned(dst, 16),
-					       assume_aligned(tile_row, tile_width),
-					       tile_width);
 				tile_row += tile_size;
-				dst = (uint8_t *)dst + tile_width;
-				w -= tile_width;
-			}
-			while (w >= 64) {
-				from_sse64a(dst, tile_row);
-				tile_row += 64;
-				dst = (uint8_t *)dst + 64;
-				w -= 64;
-			}
-			if (w & 32) {
-				from_sse32a(dst, tile_row);
-				tile_row += 32;
-				dst = (uint8_t *)dst + 32;
-			}
-			if (w & 16) {
-				from_sse16a(dst, tile_row);
-				tile_row += 16;
-				dst = (uint8_t *)dst + 16;
+				dst = (uint8_t *)dst + len;
+				w -= len;
 			}
-			memcpy(assume_aligned(dst, 16),
-			       assume_aligned(tile_row, 16),
-			       w & 15);
 		}
-		dst = (uint8_t *)dst + dst_stride;
+		while (w >= tile_width) {
+			memcpy(dst,
+			       assume_aligned(tile_row, tile_width),
+			       tile_width);
+
+			tile_row += tile_size;
+			dst = (uint8_t *)dst + tile_width;
+			w -= tile_width;
+		}
+		memcpy(dst, assume_aligned(tile_row, tile_width), w);
+		dst = (uint8_t *)dst + dst_stride + w;
+		src_y++;
 	}
 }
 
-sse2 static fast_memcpy void
-memcpy_between_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
-					int32_t src_stride, int32_t dst_stride,
-					int16_t src_x, int16_t src_y,
-					int16_t dst_x, int16_t dst_y,
-					uint16_t width, uint16_t height)
+static fast_memcpy void
+memcpy_between_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
				  int32_t src_stride, int32_t dst_stride,
+				  int16_t src_x, int16_t src_y,
+				  int16_t dst_x, int16_t dst_y,
+				  uint16_t width, uint16_t height)
 {
 	const unsigned tile_width = 512;
 	const unsigned tile_height = 8;
@@ -901,72 +848,55 @@ memcpy_between_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
 	const unsigned tile_shift = ffs(tile_pixels) - 1;
 	const unsigned tile_mask = tile_pixels - 1;
 
-	unsigned ox, lx;
-
 	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
 	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
 	assert(src != dst);
-
-	width *= cpp;
-	dst_stride *= tile_height;
-	src_stride *= tile_height;
-
 	assert((dst_x & tile_mask) == (src_x & tile_mask));
-	if (dst_x & tile_mask) {
-		ox = (dst_x & tile_mask) * cpp;
-		lx = min(tile_width - ox, width);
-		assert(lx != 0);
-	} else
-		lx = 0;
-
-	if (dst_x)
-		dst = (uint8_t *)dst + (dst_x >> tile_shift) * tile_size;
-	if (src_x)
-		src = (const uint8_t *)src + (src_x >> tile_shift) * tile_size;
 
 	while (height--) {
-		const uint8_t *src_row;
-		uint8_t *dst_row;
-		unsigned w = width;
+		unsigned w = width * cpp;
+		uint8_t *dst_row = dst;
+		const uint8_t *src_row = src;
 
-		dst_row = dst;
-		dst_row += dst_y / tile_height * dst_stride;
+		dst_row += dst_y / tile_height * dst_stride * tile_height;
 		dst_row += (dst_y & (tile_height-1)) * tile_width;
+		if (dst_x)
+			dst_row += (dst_x >> tile_shift) * tile_size;
 		dst_y++;
 
-		src_row = src;
-		src_row += src_y / tile_height * src_stride;
+		src_row += src_y / tile_height * src_stride * tile_height;
 		src_row += (src_y & (tile_height-1)) * tile_width;
+		if (src_x)
+			src_row += (src_x >> tile_shift) * tile_size;
 		src_y++;
 
-		if (lx) {
-			to_memcpy(dst_row + ox, src_row + ox, lx);
+		if (dst_x & tile_mask) {
+			const unsigned x = (dst_x & tile_mask) * cpp;
+			const unsigned len = min(tile_width - x, w);
+
+			memcpy(assume_misaligned(dst_row + x, tile_width, x),
+			       assume_misaligned(src_row + x, tile_width, x),
+			       len);
+
 			dst_row += tile_size;
 			src_row += tile_size;
-			w -= lx;
+			w -= len;
 		}
+
 		while (w >= tile_width) {
-			assert(((uintptr_t)dst_row & (tile_width - 1)) == 0);
-			assert(((uintptr_t)src_row & (tile_width - 1)) == 0);
-			to_sse128xN(assume_aligned(dst_row, tile_width),
-				    assume_aligned(src_row, tile_width),
-				    tile_width);
+			memcpy(assume_aligned(dst_row, tile_width),
+			       assume_aligned(src_row, tile_width),
+			       tile_width);
 			dst_row += tile_size;
 			src_row += tile_size;
 			w -= tile_width;
 		}
-		if (w) {
-			assert(((uintptr_t)dst_row & (tile_width - 1)) == 0);
-			assert(((uintptr_t)src_row & (tile_width - 1)) == 0);
-			to_memcpy(assume_aligned(dst_row, tile_width),
-				  assume_aligned(src_row, tile_width),
-				  w);
-		}
+		memcpy(assume_aligned(dst_row, tile_width),
+		       assume_aligned(src_row, tile_width),
+		       w);
 	}
 }
 
-#endif
-
 #define memcpy_to_tiled_x(swizzle) \
 fast_memcpy static void \
 memcpy_to_tiled_x__##swizzle (const void *src, void *dst, int bpp, \
@@ -1241,7 +1171,7 @@ void choose_memcpy_tiled_x(struct kgem *kgem, int swizzling, unsigned cpu)
 		break;
 	case I915_BIT_6_SWIZZLE_NONE:
 		DBG(("%s: no swizzling\n", __FUNCTION__));
-#if defined(sse2) && defined(__x86_64__)
+#if defined(sse2)
 		if (cpu & SSE2) {
 			kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_0__sse2;
 			kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_0__sse2;
@@ -1498,7 +1428,7 @@ memcpy_xor(const void *src, void *dst, int bpp,
 		height = 1;
 	}
 
-#if USE_SSE2
+#if defined(sse2) && __x86_64__
 	if (have_sse2()) {
 		do {
 			uint32_t *d = (uint32_t *)dst_bytes;
-- 
cgit v1.2.3