diff options
author | Chris Wilson <chris@chris-wilson.co.uk> | 2016-05-07 15:24:28 +0100 |
---|---|---|
committer | Chris Wilson <chris@chris-wilson.co.uk> | 2016-05-07 15:58:43 +0100 |
commit | 08865b0af288e0460c38c2e3ca20a7f9d0311f27 (patch) | |
tree | 2a4fc95d29416fa507d916eddd38501e9ad6a085 /src | |
parent | b89f203b0d65b607bc906b9a1ac184ebef7b41df (diff) |
sna: Add a special case for fast DRI2CopyRegion and NoAccel
Enable copying onto a scanout buffer using a WC mmap - so long as it is
X-tiled and has no swizzling.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Diffstat (limited to 'src')
-rw-r--r-- | src/sna/blt.c | 157 | ||||
-rw-r--r-- | src/sna/kgem.h | 19 | ||||
-rw-r--r-- | src/sna/sna_render.c | 73 |
3 files changed, 219 insertions, 30 deletions
diff --git a/src/sna/blt.c b/src/sna/blt.c index eced9715..ab7bd22c 100644 --- a/src/sna/blt.c +++ b/src/sna/blt.c @@ -349,6 +349,71 @@ memcpy_from_tiled_x__swizzle_0(const void *src, void *dst, int bpp, } } +static fast_memcpy void +memcpy_between_tiled_x__swizzle_0(const void *src, void *dst, int bpp, + int32_t src_stride, int32_t dst_stride, + int16_t src_x, int16_t src_y, + int16_t dst_x, int16_t dst_y, + uint16_t width, uint16_t height) +{ + const unsigned tile_width = 512; + const unsigned tile_height = 8; + const unsigned tile_size = 4096; + + const unsigned cpp = bpp / 8; + const unsigned tile_pixels = tile_width / cpp; + const unsigned tile_shift = ffs(tile_pixels) - 1; + const unsigned tile_mask = tile_pixels - 1; + + DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", + __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); + assert(src != dst); + assert((dst_x & tile_mask) == (src_x & tile_mask)); + + while (height--) { + unsigned w = width * cpp; + uint8_t *dst_row = dst; + const uint8_t *src_row = src; + + dst_row += dst_y / tile_height * dst_stride * tile_height; + dst_row += (dst_y & (tile_height-1)) * tile_width; + if (dst_x) + dst_row += (dst_x >> tile_shift) * tile_size; + dst_y++; + + src_row += src_y / tile_height * src_stride * tile_height; + src_row += (src_y & (tile_height-1)) * tile_width; + if (src_x) + src_row += (src_x >> tile_shift) * tile_size; + src_y++; + + if (dst_x & tile_mask) { + const unsigned x = (dst_x & tile_mask) * cpp; + const unsigned len = min(tile_width - x, w); + + memcpy(assume_misaligned(dst_row + x, tile_width, x), + assume_misaligned(src_row + x, tile_width, x), + len); + + dst_row += tile_size; + src_row += tile_size; + w -= len; + } + + while (w >= tile_width) { + memcpy(assume_aligned(dst_row, tile_width), + assume_aligned(src_row, tile_width), + tile_width); + dst_row += tile_size; + src_row += tile_size; + w -= tile_width; + } + 
memcpy(assume_aligned(dst_row, tile_width), + assume_aligned(src_row, tile_width), + w); + } +} + #if defined(sse2) && defined(__x86_64__) sse2 static force_inline void @@ -461,7 +526,7 @@ sse2 static void to_memcpy(uint8_t *dst, const uint8_t *src, unsigned len) while (len >= 64) { to_sse64(dst, src); dst += 64; - src = (const uint8_t *)src + 64; + src += 64; len -= 64; } if (len == 0) @@ -470,22 +535,22 @@ sse2 static void to_memcpy(uint8_t *dst, const uint8_t *src, unsigned len) if (len & 32) { to_sse32(dst, src); dst += 32; - src = (const uint8_t *)src + 32; + src += 32; } if (len & 16) { to_sse16(dst, src); dst += 16; - src = (const uint8_t *)src + 16; + src += 16; } if (len & 8) { *(uint64_t *)dst = *(uint64_t *)src; dst += 8; - src = (const uint8_t *)src + 8; + src += 8; } if (len & 4) { *(uint32_t *)dst = *(uint32_t *)src; dst += 4; - src = (const uint8_t *)src + 4; + src += 4; } memcpy(dst, src, len & 3); } @@ -820,6 +885,86 @@ memcpy_from_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp, } } +sse2 static fast_memcpy void +memcpy_between_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp, + int32_t src_stride, int32_t dst_stride, + int16_t src_x, int16_t src_y, + int16_t dst_x, int16_t dst_y, + uint16_t width, uint16_t height) +{ + const unsigned tile_width = 512; + const unsigned tile_height = 8; + const unsigned tile_size = 4096; + + const unsigned cpp = bpp / 8; + const unsigned tile_pixels = tile_width / cpp; + const unsigned tile_shift = ffs(tile_pixels) - 1; + const unsigned tile_mask = tile_pixels - 1; + + unsigned ox, lx; + + DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", + __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); + assert(src != dst); + + width *= cpp; + dst_stride *= tile_height; + src_stride *= tile_height; + + assert((dst_x & tile_mask) == (src_x & tile_mask)); + if (dst_x & tile_mask) { + ox = (dst_x & tile_mask) * cpp; + lx = min(tile_width - ox, 
width); + assert(lx != 0); + } else + lx = 0; + + if (dst_x) + dst = (uint8_t *)dst + (dst_x >> tile_shift) * tile_size; + if (src_x) + src = (const uint8_t *)src + (src_x >> tile_shift) * tile_size; + + while (height--) { + const uint8_t *src_row; + uint8_t *dst_row; + unsigned w = width; + + dst_row = dst; + dst_row += dst_y / tile_height * dst_stride; + dst_row += (dst_y & (tile_height-1)) * tile_width; + dst_y++; + + src_row = src; + src_row += src_y / tile_height * src_stride; + src_row += (src_y & (tile_height-1)) * tile_width; + src_y++; + + if (lx) { + to_memcpy(dst_row + ox, src_row + ox, lx); + dst_row += tile_size; + src_row += tile_size; + w -= lx; + } + while (w >= tile_width) { + assert(((uintptr_t)dst_row & (tile_width - 1)) == 0); + assert(((uintptr_t)src_row & (tile_width - 1)) == 0); + to_sse128xN(assume_aligned(dst_row, tile_width), + assume_aligned(src_row, tile_width), + tile_width); + dst_row += tile_size; + src_row += tile_size; + w -= tile_width; + } + if (w) { + assert(((uintptr_t)dst_row & (tile_width - 1)) == 0); + assert(((uintptr_t)src_row & (tile_width - 1)) == 0); + to_memcpy(assume_aligned(dst_row, tile_width), + assume_aligned(src_row, tile_width), + w); + } + } +} + #endif #define memcpy_to_tiled_x(swizzle) \ @@ -1100,11 +1245,13 @@ void choose_memcpy_tiled_x(struct kgem *kgem, int swizzling, unsigned cpu) if (cpu & SSE2) { kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_0__sse2; kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_0__sse2; + kgem->memcpy_between_tiled_x = memcpy_between_tiled_x__swizzle_0__sse2; } else #endif { kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_0; kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_0; + kgem->memcpy_between_tiled_x = memcpy_between_tiled_x__swizzle_0; } break; case I915_BIT_6_SWIZZLE_9: diff --git a/src/sna/kgem.h b/src/sna/kgem.h index cd077561..ded8f78f 100644 --- a/src/sna/kgem.h +++ b/src/sna/kgem.h @@ -113,6 +113,12 @@ enum { NUM_MAP_TYPES, }; +typedef void 
(*memcpy_box_func)(const void *src, void *dst, int bpp, + int32_t src_stride, int32_t dst_stride, + int16_t src_x, int16_t src_y, + int16_t dst_x, int16_t dst_y, + uint16_t width, uint16_t height); + struct kgem { unsigned wedged; int fd; @@ -212,16 +218,9 @@ struct kgem { void (*retire)(struct kgem *kgem); void (*expire)(struct kgem *kgem); - void (*memcpy_to_tiled_x)(const void *src, void *dst, int bpp, - int32_t src_stride, int32_t dst_stride, - int16_t src_x, int16_t src_y, - int16_t dst_x, int16_t dst_y, - uint16_t width, uint16_t height); - void (*memcpy_from_tiled_x)(const void *src, void *dst, int bpp, - int32_t src_stride, int32_t dst_stride, - int16_t src_x, int16_t src_y, - int16_t dst_x, int16_t dst_y, - uint16_t width, uint16_t height); + memcpy_box_func memcpy_to_tiled_x; + memcpy_box_func memcpy_from_tiled_x; + memcpy_box_func memcpy_between_tiled_x; struct kgem_bo *batch_bo; diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c index 5a8df06d..f8281e99 100644 --- a/src/sna/sna_render.c +++ b/src/sna/sna_render.c @@ -2298,16 +2298,22 @@ static bool can_copy_cpu(struct sna *sna, struct kgem_bo *src, struct kgem_bo *dst) { - if (src->tiling != dst->tiling) - return false; + DBG(("%s: tiling=%d:%d, pitch=%d:%d, can_map=%d:%d[%d]\n", + __FUNCTION__, + src->tiling, dst->tiling, + src->pitch, dst->pitch, + kgem_bo_can_map__cpu(&sna->kgem, src, false), + kgem_bo_can_map__cpu(&sna->kgem, dst, true), + sna->kgem.has_wc_mmap)); - if (src->pitch != dst->pitch) + if (src->tiling != dst->tiling) return false; if (!kgem_bo_can_map__cpu(&sna->kgem, src, false)) return false; - if (!kgem_bo_can_map__cpu(&sna->kgem, dst, true)) + if (!kgem_bo_can_map__cpu(&sna->kgem, dst, true) && + !sna->kgem.has_wc_mmap) return false; DBG(("%s -- yes, src handle=%d, dst handle=%d\n", __FUNCTION__, src->handle, dst->handle)); @@ -2320,8 +2326,8 @@ memcpy_copy_boxes(struct sna *sna, uint8_t op, const DrawableRec *dst_draw, struct kgem_bo *dst_bo, int16_t dx, int16_t dy, const 
BoxRec *box, int n, unsigned flags) { + memcpy_box_func detile = NULL; void *dst, *src; - bool clipped; if (op != GXcopy) return false; @@ -2329,25 +2335,53 @@ memcpy_copy_boxes(struct sna *sna, uint8_t op, if (src_draw->depth != dst_draw->depth) return false; - clipped = (n > 1 || - box->x1 + dx > 0 || - box->y1 + dy > 0 || - box->x2 + dx < dst_draw->width || - box->y2 + dy < dst_draw->height); - dst = src = NULL; - if (!clipped && can_copy_cpu(sna, src_bo, dst_bo)) { - dst = kgem_bo_map__cpu(&sna->kgem, dst_bo); + if (can_copy_cpu(sna, src_bo, dst_bo)) { + if (src_bo->pitch != dst_bo->pitch || + dx != sx || dy != sy || n > 1 || + box->x1 + dx > 0 || + box->y1 + dy > 0 || + box->x2 + dx < dst_draw->width || + box->y2 + dy < dst_draw->height) { + if (dx != sx) /* not implemented in memcpy yet */ + goto use_gtt; + + switch (dst_bo->tiling) { + default: + case I915_TILING_Y: + goto use_gtt; + + case I915_TILING_X: + detile = sna->kgem.memcpy_between_tiled_x; + if (detile == NULL) + goto use_gtt; + break; + + case I915_TILING_NONE: + break; + } + } + + if (kgem_bo_can_map__cpu(&sna->kgem, dst_bo, true)) + dst = kgem_bo_map__cpu(&sna->kgem, dst_bo); + else + dst = kgem_bo_map__wc(&sna->kgem, dst_bo); src = kgem_bo_map__cpu(&sna->kgem, src_bo); } if (dst == NULL || src == NULL) { +use_gtt: dst = kgem_bo_map__gtt(&sna->kgem, dst_bo); src = kgem_bo_map__gtt(&sna->kgem, src_bo); if (dst == NULL || src == NULL) return false; + + detile = NULL; } else { - kgem_bo_sync__cpu_full(&sna->kgem, dst_bo, true); + if (dst == dst_bo->map__wc) + kgem_bo_sync__gtt(&sna->kgem, dst_bo); + else + kgem_bo_sync__cpu_full(&sna->kgem, dst_bo, true); kgem_bo_sync__cpu_full(&sna->kgem, src_bo, false); } @@ -2355,7 +2389,16 @@ memcpy_copy_boxes(struct sna *sna, uint8_t op, __FUNCTION__, sx, sy, dx, dy, n)); if (sigtrap_get() == 0) { - do { + if (detile) { + do { + detile(src, dst, dst_draw->bitsPerPixel, + src_bo->pitch, dst_bo->pitch, + box->x1 + sx, box->y1 + sy, + box->x1 + dx, box->y1 + dy, 
+ box->x2 - box->x1, box->y2 - box->y1); + box++; + } while (--n); + } else do { memcpy_blt(src, dst, dst_draw->bitsPerPixel, src_bo->pitch, dst_bo->pitch, box->x1 + sx, box->y1 + sy, |