diff options
author | Chris Wilson <chris@chris-wilson.co.uk> | 2016-04-08 10:59:36 +0100 |
---|---|---|
committer | Chris Wilson <chris@chris-wilson.co.uk> | 2016-04-08 11:00:22 +0100 |
commit | 15903e7c687705d084d8eadfd7b11c35dc0b247d (patch) | |
tree | 5dd4bfd17fafaef9cd59a59533ba358da2e8aa96 /src | |
parent | ab041b9b91b9bd65861b8a4c30ea8d776041e56d (diff) |
sna: Avoid rep mov (builtin memcpy) for WC writes
Lesson learnt: rep mov is terrible when applied to WC (write-combining) memory.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Diffstat (limited to 'src')
-rw-r--r-- | src/sna/blt.c | 100 |
1 files changed, 76 insertions, 24 deletions
diff --git a/src/sna/blt.c b/src/sna/blt.c
index a4738f56..38771d1c 100644
--- a/src/sna/blt.c
+++ b/src/sna/blt.c
@@ -419,6 +419,74 @@ to_sse16(uint8_t *dst, const uint8_t *src)
 	xmm_save_128((__m128i*)dst, xmm_load_128u((const __m128i*)src));
 }
 
+sse2 static void to_memcpy(uint8_t *dst, const uint8_t *src, int len)
+{
+	if ((uintptr_t)dst & 15) {
+		if (len <= ((uintptr_t)dst & 15)) {
+			memcpy(dst, src, len);
+			return;
+		}
+
+		if ((uintptr_t)dst & 1) {
+			*dst++ = *src++;
+			len--;
+		}
+		if ((uintptr_t)dst & 2) {
+			*(uint16_t *)dst = *(const uint16_t *)src;
+			dst += 2;
+			src += 2;
+			len -= 2;
+		}
+		if ((uintptr_t)dst & 4) {
+			*(uint32_t *)dst = *(const uint32_t *)src;
+			dst += 4;
+			src += 4;
+			len -= 4;
+		}
+		if ((uintptr_t)dst & 8) {
+			*(uint64_t *)dst = *(const uint64_t *)src;
+			dst += 8;
+			src += 8;
+			len -= 8;
+		}
+
+		if (len == 0)
+			return;
+	}
+
+	assert(((uintptr_t)dst & 15) == 0);
+	while (len >= 64) {
+		to_sse64(dst, src);
+		dst += 64;
+		src = (const uint8_t *)src + 64;
+		len -= 64;
+	}
+	if (len == 0)
+		return;
+
+	if (len & 32) {
+		to_sse32(dst, src);
+		dst += 32;
+		src = (const uint8_t *)src + 32;
+	}
+	if (len & 16) {
+		to_sse16(dst, src);
+		dst += 16;
+		src = (const uint8_t *)src + 16;
+	}
+	if (len & 8) {
+		*(uint64_t *)dst = *(uint64_t *)src;
+		dst += 8;
+		src = (const uint8_t *)src + 8;
+	}
+	if (len & 4) {
+		*(uint32_t *)dst = *(uint32_t *)src;
+		dst += 4;
+		src = (const uint8_t *)src + 4;
+	}
+	memcpy(dst, src, len & 3);
+}
+
 sse2 static fast_memcpy void
 memcpy_to_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
 				   int32_t src_stride, int32_t dst_stride,
@@ -443,12 +511,14 @@ memcpy_to_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
 	src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
 	width *= cpp;
 	assert(src_stride >= width);
-	src_stride -= width;
 
 	while (height--) {
 		unsigned w = width;
+		const uint8_t *src_row = src;
 		uint8_t *tile_row = dst;
 
+		src = (const uint8_t *)src + src_stride;
+
 		tile_row += dst_y / tile_height * dst_stride * tile_height;
 		tile_row += (dst_y & (tile_height-1)) * tile_width;
 		dst_y++;
@@ -458,39 +528,21 @@ memcpy_to_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
 			if (dst_x & tile_mask) {
 				const unsigned x = (dst_x & tile_mask) * cpp;
 				const unsigned len = min(tile_width - x, w);
-				memcpy(assume_misaligned(tile_row + x, tile_width, x),
-				       src, len);
+				to_memcpy(tile_row + x, src_row, len);
 
 				tile_row += tile_size;
-				src = (const uint8_t *)src + len;
+				src_row = (const uint8_t *)src_row + len;
 				w -= len;
 			}
 		}
 		while (w >= tile_width) {
 			to_sse128xN(assume_aligned(tile_row, tile_width),
-				    src, tile_width);
+				    src_row, tile_width);
 			tile_row += tile_size;
-			src = (const uint8_t *)src + tile_width;
+			src_row = (const uint8_t *)src_row + tile_width;
 			w -= tile_width;
 		}
-		while (w >= 64) {
-			to_sse64(tile_row, src);
-			tile_row += 64;
-			src = (const uint8_t *)src + 64;
-			w -= 64;
-		}
-		if (w & 32) {
-			to_sse32(tile_row, src);
-			tile_row += 32;
-			src = (const uint8_t *)src + 32;
-		}
-		if (w & 16) {
-			to_sse16(tile_row, src);
-			tile_row += 16;
-			src = (const uint8_t *)src + 16;
-		}
-		memcpy(assume_aligned(tile_row, 16), src, w & 15);
-		src = (const uint8_t *)src + src_stride + (w & 15);
+		to_memcpy(tile_row, src_row, w);
 	}
 }
 