-rw-r--r--  src/sna/blt.c  253
1 file changed, 222 insertions(+), 31 deletions(-)
diff --git a/src/sna/blt.c b/src/sna/blt.c
index d28ad985..fb3dd35c 100644
--- a/src/sna/blt.c
+++ b/src/sna/blt.c
@@ -31,11 +31,118 @@
#include "sna.h"
+#if __x86_64__ || defined(__SSE2__)
+#define USE_SSE2 1
+#endif
+
#if DEBUG_BLT
#undef DBG
#define DBG(x) ErrorF x
#endif
+#if USE_SSE2
+#include <emmintrin.h> /* SSE2 integer intrinsics: __m128i, _mm_*_si128 */
+
+#if __x86_64__
+#define have_sse2() 1
+#else
+enum {
+ MMX = 0x1,
+ MMX_EXTENSIONS = 0x2,
+ SSE = 0x6,
+ SSE2 = 0x8,
+ CMOV = 0x10
+};
+
+#ifdef __GNUC__
+static unsigned int
+detect_cpu_features(void)
+{
+ unsigned int features;
+ unsigned int result = 0;
+
+ char vendor[13];
+ vendor[0] = 0;
+ vendor[12] = 0;
+
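+ /* Flip the ID bit (bit 21, 0x00200000) in EFLAGS; if the change
+  * sticks, the CPUID instruction is available: leaf 0 fills in the
+  * vendor string, leaf 1 returns the feature flags in %edx.
+  */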
+ asm (
+ "pushf\n"
+ "pop %%eax\n"
+ "mov %%eax, %%ecx\n"
+ "xor $0x00200000, %%eax\n"
+ "push %%eax\n"
+ "popf\n"
+ "pushf\n"
+ "pop %%eax\n"
+ "mov $0x0, %%edx\n"
+ "xor %%ecx, %%eax\n"
+ "jz 1f\n"
+
+ "mov $0x00000000, %%eax\n"
+ "push %%ebx\n"
+ "cpuid\n"
+ "mov %%ebx, %%eax\n"
+ "pop %%ebx\n"
+ "mov %%eax, %1\n"
+ "mov %%edx, %2\n"
+ "mov %%ecx, %3\n"
+ "mov $0x00000001, %%eax\n"
+ "push %%ebx\n"
+ "cpuid\n"
+ "pop %%ebx\n"
+ "1:\n"
+ "mov %%edx, %0\n"
+ : "=r" (result), "=m" (vendor[0]), "=m" (vendor[4]), "=m" (vendor[8])
+ :: "%eax", "%ecx", "%edx");
+
+ features = 0;
+ if (result) {
+ /* result now contains the standard feature bits */
+ if (result & (1 << 15))
+ features |= CMOV;
+ if (result & (1 << 23))
+ features |= MMX;
+ if (result & (1 << 25))
+ features |= SSE;
+ if (result & (1 << 26))
+ features |= SSE2;
+ }
+ return features;
+}
+#else
+static unsigned int detect_cpu_features(void) { return 0; }
+#endif
+
+static bool have_sse2(void)
+{
+ static int sse2_present = -1;
+
+ if (sse2_present == -1)
+ sse2_present = detect_cpu_features() & SSE2;
+
+ return sse2_present;
+}
+#endif
+
+static inline __m128i
+xmm_create_mask_32(uint32_t mask)
+{
+ return _mm_set_epi32(mask, mask, mask, mask);
+}
+
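+/* 16-byte load from a source with no alignment guarantee */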
+static inline __m128i
+xmm_load_128u(const __m128i *src)
+{
+ return _mm_loadu_si128(src);
+}
+
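+/* 16-byte store; the destination must be 16-byte aligned */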
+static inline void
+xmm_save_128(__m128i *dst, __m128i data)
+{
+ _mm_store_si128(dst, data);
+}
+#endif
+
void
memcpy_blt(const void *src, void *dst, int bpp,
int32_t src_stride, int32_t dst_stride,
@@ -136,39 +243,123 @@ memcpy_xor(const void *src, void *dst, int bpp,
if (and == 0xffffffff) {
switch (bpp) {
case 1:
- do {
- for (i = 0; i < width; i++)
- dst_bytes[i] = src_bytes[i] | or;
-
- src_bytes += src_stride;
- dst_bytes += dst_stride;
- } while (--height);
- break;
-
+ if (width & 1) {
+ do {
+ for (i = 0; i < width; i++)
+ dst_bytes[i] = src_bytes[i] | or;
+
+ src_bytes += src_stride;
+ dst_bytes += dst_stride;
+ } while (--height);
+ break;
+ } else {
+ width /= 2;
+ or |= or << 8;
+ }
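+ /* fall through: width is even, so copy 16-bit units instead */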
case 2:
- do {
- uint16_t *d = (uint16_t *)dst_bytes;
- uint16_t *s = (uint16_t *)src_bytes;
-
- for (i = 0; i < width; i++)
- d[i] = s[i] | or;
-
- src_bytes += src_stride;
- dst_bytes += dst_stride;
- } while (--height);
- break;
-
+ if (width & 1) {
+ do {
+ uint16_t *d = (uint16_t *)dst_bytes;
+ uint16_t *s = (uint16_t *)src_bytes;
+
+ for (i = 0; i < width; i++)
+ d[i] = s[i] | or;
+
+ src_bytes += src_stride;
+ dst_bytes += dst_stride;
+ } while (--height);
+ break;
+ } else {
+ width /= 2;
+ or |= or << 16;
+ }
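+ /* fall through: width is even again, so copy 32-bit units */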
case 4:
- do {
- uint32_t *d = (uint32_t *)dst_bytes;
- uint32_t *s = (uint32_t *)src_bytes;
-
- for (i = 0; i < width; i++)
- d[i] = s[i] | or;
-
- src_bytes += src_stride;
- dst_bytes += dst_stride;
- } while (--height);
+#if USE_SSE2
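+ /* if each row is contiguous with the next, merge all rows into one */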
+ if (width * 4 == dst_stride && dst_stride == src_stride) {
+ width *= height;
+ height = 1;
+ }
+
+ if (have_sse2()) {
+ do {
+ uint32_t *d = (uint32_t *)dst_bytes;
+ uint32_t *s = (uint32_t *)src_bytes;
+ __m128i mask = xmm_create_mask_32(or);
+
+ i = width;
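+ /* copy single pixels until the destination is 16-byte aligned */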
+ while (i && (uintptr_t)d & 15) {
+ *d++ = *s++ | or;
+ i--;
+ }
+
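+ /* bulk loop: 16 pixels (four XMM registers) per iteration */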
+ while (i >= 16) {
+ __m128i xmm1, xmm2, xmm3, xmm4;
+
+ xmm1 = xmm_load_128u((__m128i*)s + 0);
+ xmm2 = xmm_load_128u((__m128i*)s + 1);
+ xmm3 = xmm_load_128u((__m128i*)s + 2);
+ xmm4 = xmm_load_128u((__m128i*)s + 3);
+
+ xmm_save_128((__m128i*)d + 0,
+ _mm_or_si128(xmm1, mask));
+ xmm_save_128((__m128i*)d + 1,
+ _mm_or_si128(xmm2, mask));
+ xmm_save_128((__m128i*)d + 2,
+ _mm_or_si128(xmm3, mask));
+ xmm_save_128((__m128i*)d + 3,
+ _mm_or_si128(xmm4, mask));
+
+ d += 16;
+ s += 16;
+ i -= 16;
+ }
+
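+ /* tails: 8 pixels, then 4, then whatever is left */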
+ if (i & 8) {
+ __m128i xmm1, xmm2;
+
+ xmm1 = xmm_load_128u((__m128i*)s + 0);
+ xmm2 = xmm_load_128u((__m128i*)s + 1);
+
+ xmm_save_128((__m128i*)d + 0,
+ _mm_or_si128(xmm1, mask));
+ xmm_save_128((__m128i*)d + 1,
+ _mm_or_si128(xmm2, mask));
+ d += 8;
+ s += 8;
+ i -= 8;
+ }
+
+ if (i & 4) {
+ xmm_save_128((__m128i*)d,
+ _mm_or_si128(xmm_load_128u((__m128i*)s),
+ mask));
+
+ d += 4;
+ s += 4;
+ i -= 4;
+ }
+
+ while (i) {
+ *d++ = *s++ | or;
+ i--;
+ }
+
+ src_bytes += src_stride;
+ dst_bytes += dst_stride;
+ } while (--height);
+ } else
+#endif
+ /* scalar fallback: SSE2 unavailable at compile time or at run time */
+ {
+ do {
+ uint32_t *d = (uint32_t *)dst_bytes;
+ uint32_t *s = (uint32_t *)src_bytes;
+
+ for (i = 0; i < width; i++)
+ d[i] = s[i] | or;
+
+ src_bytes += src_stride;
+ dst_bytes += dst_stride;
+ } while (--height);
+ }
break;
}
} else {