diff options
Diffstat (limited to 'src/sna/blt.c')
-rw-r--r-- | src/sna/blt.c | 253 |
1 files changed, 222 insertions, 31 deletions
diff --git a/src/sna/blt.c b/src/sna/blt.c index d28ad985..fb3dd35c 100644 --- a/src/sna/blt.c +++ b/src/sna/blt.c @@ -31,11 +31,118 @@ #include "sna.h" +#if __x86_64__ +#define USE_SSE2 1 +#endif + #if DEBUG_BLT #undef DBG #define DBG(x) ErrorF x #endif +#if USE_SSE2 +#include <xmmintrin.h> + +#if __x86_64__ +#define have_sse2() 1 +#else +enum { + MMX = 0x1, + MMX_EXTENSIONS = 0x2, + SSE = 0x6, + SSE2 = 0x8, + CMOV = 0x10 +}; + +#ifdef __GNUC__ +static unsigned int +detect_cpu_features(void) +{ + unsigned int features; + unsigned int result = 0; + + char vendor[13]; + vendor[0] = 0; + vendor[12] = 0; + + asm ( + "pushf\n" + "pop %%eax\n" + "mov %%eax, %%ecx\n" + "xor $0x00200000, %%eax\n" + "push %%eax\n" + "popf\n" + "pushf\n" + "pop %%eax\n" + "mov $0x0, %%edx\n" + "xor %%ecx, %%eax\n" + "jz 1f\n" + + "mov $0x00000000, %%eax\n" + "push %%ebx\n" + "cpuid\n" + "mov %%ebx, %%eax\n" + "pop %%ebx\n" + "mov %%eax, %1\n" + "mov %%edx, %2\n" + "mov %%ecx, %3\n" + "mov $0x00000001, %%eax\n" + "push %%ebx\n" + "cpuid\n" + "pop %%ebx\n" + "1:\n" + "mov %%edx, %0\n" + : "=r" (result), "=m" (vendor[0]), "=m" (vendor[4]), "=m" (vendor[8]) + :: "%eax", "%ecx", "%edx"); + + features = 0; + if (result) { + /* result now contains the standard feature bits */ + if (result & (1 << 15)) + features |= CMOV; + if (result & (1 << 23)) + features |= MMX; + if (result & (1 << 25)) + features |= SSE; + if (result & (1 << 26)) + features |= SSE2; + } + return features; +} +#else +static unsigned int detect_cpu_features(void) { return 0; } +#endif + +static bool have_sse2(void) +{ + static int sse2_present = -1; + + if (sse2_present == -1) + sse2_present = detect_cpu_features() & SSE2; + + return sse2_present; +} +#endif + +static inline __m128i +xmm_create_mask_32(uint32_t mask) +{ + return _mm_set_epi32(mask, mask, mask, mask); +} + +static inline __m128i +xmm_load_128u(const __m128i *src) +{ + return _mm_loadu_si128(src); +} + +static inline void +xmm_save_128(__m128i *dst, __m128i data) +{ + _mm_store_si128(dst, data); +} +#endif + void memcpy_blt(const void *src, void *dst, int bpp, int32_t src_stride, int32_t dst_stride, @@ -136,39 +243,123 @@ memcpy_xor(const void *src, void *dst, int bpp, if (and == 0xffffffff) { switch (bpp) { case 1: - do { - for (i = 0; i < width; i++) - dst_bytes[i] = src_bytes[i] | or; - - src_bytes += src_stride; - dst_bytes += dst_stride; - } while (--height); - break; - + if (width & 1) { + do { + for (i = 0; i < width; i++) + dst_bytes[i] = src_bytes[i] | or; + + src_bytes += src_stride; + dst_bytes += dst_stride; + } while (--height); + break; + } else { + width /= 2; + or |= or << 8; + } case 2: - do { - uint16_t *d = (uint16_t *)dst_bytes; - uint16_t *s = (uint16_t *)src_bytes; - - for (i = 0; i < width; i++) - d[i] = s[i] | or; - - src_bytes += src_stride; - dst_bytes += dst_stride; - } while (--height); - break; - + if (width & 1) { + do { + uint16_t *d = (uint16_t *)dst_bytes; + uint16_t *s = (uint16_t *)src_bytes; + + for (i = 0; i < width; i++) + d[i] = s[i] | or; + + src_bytes += src_stride; + dst_bytes += dst_stride; + } while (--height); + break; + } else { + width /= 2; + or |= or << 16; + } case 4: - do { - uint32_t *d = (uint32_t *)dst_bytes; - uint32_t *s = (uint32_t *)src_bytes; - - for (i = 0; i < width; i++) - d[i] = s[i] | or; - - src_bytes += src_stride; - dst_bytes += dst_stride; - } while (--height); +#if USE_SSE2 + if (width * 4 == dst_stride && dst_stride == src_stride) { + width *= height; + height = 1; + } + + if (have_sse2()) { + do { + uint32_t *d = (uint32_t *)dst_bytes; + uint32_t *s = (uint32_t *)src_bytes; + __m128i mask = xmm_create_mask_32(or); + + i = width; + while (i && (uintptr_t)d & 15) { + *d++ = *s++ | or; + i--; + } + + while (i >= 16) { + __m128i xmm1, xmm2, xmm3, xmm4; + + xmm1 = xmm_load_128u((__m128i*)s + 0); + xmm2 = xmm_load_128u((__m128i*)s + 1); + xmm3 = xmm_load_128u((__m128i*)s + 2); + xmm4 = xmm_load_128u((__m128i*)s + 3); + + xmm_save_128((__m128i*)d + 0, + _mm_or_si128(xmm1, mask)); + xmm_save_128((__m128i*)d + 1, + _mm_or_si128(xmm2, mask)); + xmm_save_128((__m128i*)d + 2, + _mm_or_si128(xmm3, mask)); + xmm_save_128((__m128i*)d + 3, + _mm_or_si128(xmm4, mask)); + + d += 16; + s += 16; + i -= 16; + } + + if (i & 8) { + __m128i xmm1, xmm2; + + xmm1 = xmm_load_128u((__m128i*)s + 0); + xmm2 = xmm_load_128u((__m128i*)s + 1); + + xmm_save_128((__m128i*)d + 0, + _mm_or_si128(xmm1, mask)); + xmm_save_128((__m128i*)d + 1, + _mm_or_si128(xmm2, mask)); + d += 8; + s += 8; + i -= 8; + } + + if (i & 4) { + xmm_save_128((__m128i*)d, + _mm_or_si128(xmm_load_128u((__m128i*)s), + mask)); + + d += 4; + s += 4; + i -= 4; + } + + while (i) { + *d++ = *s++ | or; + i--; + } + + src_bytes += src_stride; + dst_bytes += dst_stride; + } while (--height); + } else +#else + do { + uint32_t *d = (uint32_t *)dst_bytes; + uint32_t *s = (uint32_t *)src_bytes; + + for (i = 0; i < width; i++) + d[i] = s[i] | or; + + src_bytes += src_stride; + dst_bytes += dst_stride; + } while (--height); +#endif break; } } else { |