diff options
author | Chris Wilson <chris@chris-wilson.co.uk> | 2016-04-08 08:34:20 +0100 |
---|---|---|
committer | Chris Wilson <chris@chris-wilson.co.uk> | 2016-04-08 10:01:29 +0100 |
commit | ab041b9b91b9bd65861b8a4c30ea8d776041e56d (patch) | |
tree | b032ac78168e778f9519b5284e8429bc8e26dc1a | |
parent | e62010374c0ffd1b0103285a4bf5572ce0359f51 (diff) |
sna: Specialise alignment paths for storing
Switch between aligned/unaligned stores for the bulk copy inner loops.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
-rw-r--r-- | src/sna/blt.c | 234 |
1 file changed, 176 insertions, 58 deletions
diff --git a/src/sna/blt.c b/src/sna/blt.c index c2461404..a4738f56 100644 --- a/src/sna/blt.c +++ b/src/sna/blt.c @@ -444,7 +444,6 @@ memcpy_to_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp, width *= cpp; assert(src_stride >= width); src_stride -= width; - src_stride += width & 15; while (height--) { unsigned w = width; @@ -452,6 +451,8 @@ memcpy_to_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp, tile_row += dst_y / tile_height * dst_stride * tile_height; tile_row += (dst_y & (tile_height-1)) * tile_width; + dst_y++; + if (dst_x) { tile_row += (dst_x >> tile_shift) * tile_size; if (dst_x & tile_mask) { @@ -488,33 +489,18 @@ memcpy_to_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp, tile_row += 16; src = (const uint8_t *)src + 16; } - memcpy(tile_row, src, w & 15); - src = (const uint8_t *)src + src_stride; - dst_y++; + memcpy(assume_aligned(tile_row, 16), src, w & 15); + src = (const uint8_t *)src + src_stride + (w & 15); } } sse2 static force_inline void -from_sse64(uint8_t *dst, const uint8_t *src) -{ - __m128i xmm1, xmm2, xmm3, xmm4; - - xmm1 = xmm_load_128((const __m128i*)src + 0); - xmm2 = xmm_load_128((const __m128i*)src + 1); - xmm3 = xmm_load_128((const __m128i*)src + 2); - xmm4 = xmm_load_128((const __m128i*)src + 3); - - xmm_save_128u((__m128i*)dst + 0, xmm1); - xmm_save_128u((__m128i*)dst + 1, xmm2); - xmm_save_128u((__m128i*)dst + 2, xmm3); - xmm_save_128u((__m128i*)dst + 3, xmm4); -} - -sse2 static force_inline void -from_sse128xN(uint8_t *dst, const uint8_t *src, int bytes) +from_sse128xNu(uint8_t *dst, const uint8_t *src, int bytes) { int i; + assert(((uintptr_t)src & 15) == 0); + for (i = 0; i < bytes / 128; i++) { __m128i xmm0, xmm1, xmm2, xmm3; __m128i xmm4, xmm5, xmm6, xmm7; @@ -543,7 +529,79 @@ from_sse128xN(uint8_t *dst, const uint8_t *src, int bytes) } sse2 static force_inline void -from_sse32(uint8_t *dst, const uint8_t *src) +from_sse128xNa(uint8_t *dst, const uint8_t *src, int bytes) +{ + int i; + 
+ assert(((uintptr_t)dst & 15) == 0); + assert(((uintptr_t)src & 15) == 0); + + for (i = 0; i < bytes / 128; i++) { + __m128i xmm0, xmm1, xmm2, xmm3; + __m128i xmm4, xmm5, xmm6, xmm7; + + xmm0 = xmm_load_128((const __m128i*)src + 0); + xmm1 = xmm_load_128((const __m128i*)src + 1); + xmm2 = xmm_load_128((const __m128i*)src + 2); + xmm3 = xmm_load_128((const __m128i*)src + 3); + xmm4 = xmm_load_128((const __m128i*)src + 4); + xmm5 = xmm_load_128((const __m128i*)src + 5); + xmm6 = xmm_load_128((const __m128i*)src + 6); + xmm7 = xmm_load_128((const __m128i*)src + 7); + + xmm_save_128((__m128i*)dst + 0, xmm0); + xmm_save_128((__m128i*)dst + 1, xmm1); + xmm_save_128((__m128i*)dst + 2, xmm2); + xmm_save_128((__m128i*)dst + 3, xmm3); + xmm_save_128((__m128i*)dst + 4, xmm4); + xmm_save_128((__m128i*)dst + 5, xmm5); + xmm_save_128((__m128i*)dst + 6, xmm6); + xmm_save_128((__m128i*)dst + 7, xmm7); + + dst += 128; + src += 128; + } +} + +sse2 static force_inline void +from_sse64u(uint8_t *dst, const uint8_t *src) +{ + __m128i xmm1, xmm2, xmm3, xmm4; + + assert(((uintptr_t)src & 15) == 0); + + xmm1 = xmm_load_128((const __m128i*)src + 0); + xmm2 = xmm_load_128((const __m128i*)src + 1); + xmm3 = xmm_load_128((const __m128i*)src + 2); + xmm4 = xmm_load_128((const __m128i*)src + 3); + + xmm_save_128u((__m128i*)dst + 0, xmm1); + xmm_save_128u((__m128i*)dst + 1, xmm2); + xmm_save_128u((__m128i*)dst + 2, xmm3); + xmm_save_128u((__m128i*)dst + 3, xmm4); +} + +sse2 static force_inline void +from_sse64a(uint8_t *dst, const uint8_t *src) +{ + __m128i xmm1, xmm2, xmm3, xmm4; + + assert(((uintptr_t)dst & 15) == 0); + assert(((uintptr_t)src & 15) == 0); + + xmm1 = xmm_load_128((const __m128i*)src + 0); + xmm2 = xmm_load_128((const __m128i*)src + 1); + xmm3 = xmm_load_128((const __m128i*)src + 2); + xmm4 = xmm_load_128((const __m128i*)src + 3); + + xmm_save_128((__m128i*)dst + 0, xmm1); + xmm_save_128((__m128i*)dst + 1, xmm2); + xmm_save_128((__m128i*)dst + 2, xmm3); + 
xmm_save_128((__m128i*)dst + 3, xmm4); +} + +sse2 static force_inline void +from_sse32u(uint8_t *dst, const uint8_t *src) { __m128i xmm1, xmm2; @@ -555,11 +613,37 @@ from_sse32(uint8_t *dst, const uint8_t *src) } sse2 static force_inline void -from_sse16(uint8_t *dst, const uint8_t *src) +from_sse32a(uint8_t *dst, const uint8_t *src) +{ + __m128i xmm1, xmm2; + + assert(((uintptr_t)dst & 15) == 0); + assert(((uintptr_t)src & 15) == 0); + + xmm1 = xmm_load_128((const __m128i*)src + 0); + xmm2 = xmm_load_128((const __m128i*)src + 1); + + xmm_save_128((__m128i*)dst + 0, xmm1); + xmm_save_128((__m128i*)dst + 1, xmm2); +} + +sse2 static force_inline void +from_sse16u(uint8_t *dst, const uint8_t *src) { + assert(((uintptr_t)src & 15) == 0); + xmm_save_128u((__m128i*)dst, xmm_load_128((const __m128i*)src)); } +sse2 static force_inline void +from_sse16a(uint8_t *dst, const uint8_t *src) +{ + assert(((uintptr_t)dst & 15) == 0); + assert(((uintptr_t)src & 15) == 0); + + xmm_save_128((__m128i*)dst, xmm_load_128((const __m128i*)src)); +} + sse2 static fast_memcpy void memcpy_from_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp, int32_t src_stride, int32_t dst_stride, @@ -576,6 +660,8 @@ memcpy_from_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp, const unsigned tile_shift = ffs(tile_pixels) - 1; const unsigned tile_mask = tile_pixels - 1; + unsigned offset_x; + DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); assert(src != dst); @@ -584,8 +670,14 @@ memcpy_from_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp, dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp; width *= cpp; assert(dst_stride >= width); - dst_stride -= width; - dst_stride += width & 15; + if (src_x & tile_mask) { + const unsigned x = (src_x & tile_mask) * cpp; + dst_stride -= width; + offset_x = min(tile_width - x, width); + dst_stride += (width - offset_x) 
& 15; + } else + dst_stride -= width & ~15; + assert(dst_stride >= 0); while (height--) { unsigned w = width; @@ -593,47 +685,73 @@ memcpy_from_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp, tile_row += src_y / tile_height * src_stride * tile_height; tile_row += (src_y & (tile_height-1)) * tile_width; + src_y++; + if (src_x) { tile_row += (src_x >> tile_shift) * tile_size; if (src_x & tile_mask) { - const unsigned x = (src_x & tile_mask) * cpp; - const unsigned len = min(tile_width - x, w); - memcpy(dst, - assume_misaligned(tile_row, tile_width, x), - len); - + memcpy(dst, tile_row, offset_x); tile_row += tile_size; - dst = (uint8_t *)dst + len; - w -= len; + dst = (uint8_t *)dst + offset_x; + w -= offset_x; } } - while (w >= tile_width) { - from_sse128xN(dst, - assume_aligned(tile_row, tile_width), - tile_width); - tile_row += tile_size; - dst = (uint8_t *)dst + tile_width; - w -= tile_width; - } - while (w >= 64) { - from_sse64(dst, tile_row); - tile_row += 64; - dst = (uint8_t *)dst + 64; - w -= 64; - } - if (w & 32) { - from_sse32(dst, tile_row); - tile_row += 32; - dst = (uint8_t *)dst + 32; - } - if (w & 16) { - from_sse16(dst, tile_row); - tile_row += 16; - dst = (uint8_t *)dst + 16; + if ((uintptr_t)dst & 15) { + while (w >= tile_width) { + from_sse128xNu(dst, + assume_aligned(tile_row, tile_width), + tile_width); + tile_row += tile_size; + dst = (uint8_t *)dst + tile_width; + w -= tile_width; + } + while (w >= 64) { + from_sse64u(dst, tile_row); + tile_row += 64; + dst = (uint8_t *)dst + 64; + w -= 64; + } + if (w & 32) { + from_sse32u(dst, tile_row); + tile_row += 32; + dst = (uint8_t *)dst + 32; + } + if (w & 16) { + from_sse16u(dst, tile_row); + tile_row += 16; + dst = (uint8_t *)dst + 16; + } + memcpy(dst, assume_aligned(tile_row, 16), w & 15); + } else { + while (w >= tile_width) { + from_sse128xNa(assume_aligned(dst, 16), + assume_aligned(tile_row, tile_width), + tile_width); + tile_row += tile_size; + dst = (uint8_t *)dst + 
tile_width; + w -= tile_width; + } + while (w >= 64) { + from_sse64a(dst, tile_row); + tile_row += 64; + dst = (uint8_t *)dst + 64; + w -= 64; + } + if (w & 32) { + from_sse32a(dst, tile_row); + tile_row += 32; + dst = (uint8_t *)dst + 32; + } + if (w & 16) { + from_sse16a(dst, tile_row); + tile_row += 16; + dst = (uint8_t *)dst + 16; + } + memcpy(assume_aligned(dst, 16), + assume_aligned(tile_row, 16), + w & 15); } - memcpy(dst, tile_row, w & 15); dst = (uint8_t *)dst + dst_stride; - src_y++; } } |