summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorChris Wilson <chris@chris-wilson.co.uk>2016-05-07 15:24:28 +0100
committerChris Wilson <chris@chris-wilson.co.uk>2016-05-07 15:58:43 +0100
commit08865b0af288e0460c38c2e3ca20a7f9d0311f27 (patch)
tree2a4fc95d29416fa507d916eddd38501e9ad6a085 /src
parentb89f203b0d65b607bc906b9a1ac184ebef7b41df (diff)
sna: Add a special case for fast DRI2CopyRegion and NoAccel
Enable copying onto a scanout buffer using a WC mmap - so long as it is X-tiled and no swizzling. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Diffstat (limited to 'src')
-rw-r--r--src/sna/blt.c157
-rw-r--r--src/sna/kgem.h19
-rw-r--r--src/sna/sna_render.c73
3 files changed, 219 insertions, 30 deletions
diff --git a/src/sna/blt.c b/src/sna/blt.c
index eced9715..ab7bd22c 100644
--- a/src/sna/blt.c
+++ b/src/sna/blt.c
@@ -349,6 +349,71 @@ memcpy_from_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
}
}
+static fast_memcpy void /*
+ * Row-by-row copy of a width x height pixel rectangle from (src_x, src_y) in
+ * src to (dst_x, dst_y) in dst. Both buffers are addressed as 4096-byte tiles
+ * made of 8 rows of 512 bytes; a band of 8 rows spans stride * 8 bytes.
+ * Going by the function name this is the X-tiled, unswizzled layout.
+ *
+ * bpp is bits per pixel, the strides are byte pitches, and the coordinates
+ * and size are in pixels. src and dst must have the same x offset within a
+ * tile (asserted), so both pointers cross tile boundaries at the same time.
+ */
+memcpy_between_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
+ int32_t src_stride, int32_t dst_stride,
+ int16_t src_x, int16_t src_y,
+ int16_t dst_x, int16_t dst_y,
+ uint16_t width, uint16_t height)
+{
+ const unsigned tile_width = 512; /* bytes in one row of a tile */
+ const unsigned tile_height = 8; /* rows in a tile */
+ const unsigned tile_size = 4096; /* bytes in a whole tile (512 * 8) */
+
+ const unsigned cpp = bpp / 8; /* bytes per pixel */
+ const unsigned tile_pixels = tile_width / cpp; /* pixels in one tile row */
+ const unsigned tile_shift = ffs(tile_pixels) - 1; /* log2(tile_pixels), assuming it is a power of two */
+ const unsigned tile_mask = tile_pixels - 1; /* extracts the pixel offset inside a tile row */
+
+ DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+ __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
+ assert(src != dst);
+ assert((dst_x & tile_mask) == (src_x & tile_mask));
+
+ while (height--) {
+ unsigned w = width * cpp; /* bytes still to copy on this row */
+ uint8_t *dst_row = dst;
+ const uint8_t *src_row = src;
+ /* Locate the dst row: band of 8 rows, then row inside the tile, then tile column. */
+ dst_row += dst_y / tile_height * dst_stride * tile_height;
+ dst_row += (dst_y & (tile_height-1)) * tile_width;
+ if (dst_x)
+ dst_row += (dst_x >> tile_shift) * tile_size;
+ dst_y++;
+ /* Locate the src row the same way. */
+ src_row += src_y / tile_height * src_stride * tile_height;
+ src_row += (src_y & (tile_height-1)) * tile_width;
+ if (src_x)
+ src_row += (src_x >> tile_shift) * tile_size;
+ src_y++;
+ /* Leading partial tile: copy from byte offset x up to the end of the tile row (or w, if shorter). */
+ if (dst_x & tile_mask) {
+ const unsigned x = (dst_x & tile_mask) * cpp;
+ const unsigned len = min(tile_width - x, w);
+
+ memcpy(assume_misaligned(dst_row + x, tile_width, x),
+ assume_misaligned(src_row + x, tile_width, x),
+ len);
+
+ dst_row += tile_size; /* same row in the next tile to the right */
+ src_row += tile_size;
+ w -= len;
+ }
+ /* Whole tile rows: 512 bytes at a time, stepping one tile per iteration. */
+ while (w >= tile_width) {
+ memcpy(assume_aligned(dst_row, tile_width),
+ assume_aligned(src_row, tile_width),
+ tile_width);
+ dst_row += tile_size;
+ src_row += tile_size;
+ w -= tile_width;
+ }
+ memcpy(assume_aligned(dst_row, tile_width),
+ assume_aligned(src_row, tile_width),
+ w); /* tail shorter than one tile row; w may be 0 here */
+ }
+}
+
#if defined(sse2) && defined(__x86_64__)
sse2 static force_inline void
@@ -461,7 +526,7 @@ sse2 static void to_memcpy(uint8_t *dst, const uint8_t *src, unsigned len)
while (len >= 64) {
to_sse64(dst, src);
dst += 64;
- src = (const uint8_t *)src + 64;
+ src += 64;
len -= 64;
}
if (len == 0)
@@ -470,22 +535,22 @@ sse2 static void to_memcpy(uint8_t *dst, const uint8_t *src, unsigned len)
if (len & 32) {
to_sse32(dst, src);
dst += 32;
- src = (const uint8_t *)src + 32;
+ src += 32;
}
if (len & 16) {
to_sse16(dst, src);
dst += 16;
- src = (const uint8_t *)src + 16;
+ src += 16;
}
if (len & 8) {
*(uint64_t *)dst = *(uint64_t *)src;
dst += 8;
- src = (const uint8_t *)src + 8;
+ src += 8;
}
if (len & 4) {
*(uint32_t *)dst = *(uint32_t *)src;
dst += 4;
- src = (const uint8_t *)src + 4;
+ src += 4;
}
memcpy(dst, src, len & 3);
}
@@ -820,6 +885,86 @@ memcpy_from_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
}
}
+sse2 static fast_memcpy void /*
+ * Tiled-to-tiled rectangle copy that moves data with the to_memcpy() and
+ * to_sse128xN() helpers rather than memcpy(). It copies a width x height
+ * pixel rectangle from (src_x, src_y) in src to (dst_x, dst_y) in dst, with
+ * both buffers addressed as 4096-byte tiles of 8 rows x 512 bytes.
+ *
+ * bpp is bits per pixel, the strides are byte pitches, and the coordinates
+ * and size are in pixels. src and dst must have the same x offset within a
+ * tile (asserted). Everything that does not change per row (byte width,
+ * band pitch, leading partial span, tile column) is computed before the loop.
+ */
+memcpy_between_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
+ int32_t src_stride, int32_t dst_stride,
+ int16_t src_x, int16_t src_y,
+ int16_t dst_x, int16_t dst_y,
+ uint16_t width, uint16_t height)
+{
+ const unsigned tile_width = 512; /* bytes in one row of a tile */
+ const unsigned tile_height = 8; /* rows in a tile */
+ const unsigned tile_size = 4096; /* bytes in a whole tile (512 * 8) */
+
+ const unsigned cpp = bpp / 8; /* bytes per pixel */
+ const unsigned tile_pixels = tile_width / cpp; /* pixels in one tile row */
+ const unsigned tile_shift = ffs(tile_pixels) - 1; /* log2(tile_pixels), assuming it is a power of two */
+ const unsigned tile_mask = tile_pixels - 1; /* extracts the pixel offset inside a tile row */
+
+ unsigned ox, lx; /* byte offset and byte length of the leading partial tile; ox is only set when lx != 0 */
+
+ DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+ __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
+ assert(src != dst);
+ /* From here on width is in bytes and the strides are the pitch of one band of tile_height rows.
+ * NOTE(review): width is a uint16_t, so the product is truncated to 16 bits;
+ * confirm callers never pass width * cpp >= 65536.
+ */
+ width *= cpp;
+ dst_stride *= tile_height;
+ src_stride *= tile_height;
+
+ assert((dst_x & tile_mask) == (src_x & tile_mask));
+ if (dst_x & tile_mask) {
+ ox = (dst_x & tile_mask) * cpp;
+ lx = min(tile_width - ox, width); /* up to the end of the tile row, or the whole span if shorter */
+ assert(lx != 0);
+ } else
+ lx = 0; /* rectangle starts on a tile boundary: no leading partial copy */
+ /* Fold the starting tile column into the base pointers once. */
+ if (dst_x)
+ dst = (uint8_t *)dst + (dst_x >> tile_shift) * tile_size;
+ if (src_x)
+ src = (const uint8_t *)src + (src_x >> tile_shift) * tile_size;
+
+ while (height--) {
+ const uint8_t *src_row;
+ uint8_t *dst_row;
+ unsigned w = width; /* bytes still to copy on this row */
+ /* Locate the dst row: band of 8 rows, then row inside the tile. */
+ dst_row = dst;
+ dst_row += dst_y / tile_height * dst_stride;
+ dst_row += (dst_y & (tile_height-1)) * tile_width;
+ dst_y++;
+ /* Locate the src row the same way. */
+ src_row = src;
+ src_row += src_y / tile_height * src_stride;
+ src_row += (src_y & (tile_height-1)) * tile_width;
+ src_y++;
+ /* Leading partial tile, then step to the same row of the next tile. */
+ if (lx) {
+ to_memcpy(dst_row + ox, src_row + ox, lx);
+ dst_row += tile_size;
+ src_row += tile_size;
+ w -= lx;
+ }
+ while (w >= tile_width) { /* whole 512-byte tile rows */
+ assert(((uintptr_t)dst_row & (tile_width - 1)) == 0);
+ assert(((uintptr_t)src_row & (tile_width - 1)) == 0);
+ to_sse128xN(assume_aligned(dst_row, tile_width),
+ assume_aligned(src_row, tile_width),
+ tile_width);
+ dst_row += tile_size;
+ src_row += tile_size;
+ w -= tile_width;
+ }
+ if (w) { /* tail shorter than one tile row */
+ assert(((uintptr_t)dst_row & (tile_width - 1)) == 0);
+ assert(((uintptr_t)src_row & (tile_width - 1)) == 0);
+ to_memcpy(assume_aligned(dst_row, tile_width),
+ assume_aligned(src_row, tile_width),
+ w);
+ }
+ }
+}
+
#endif
#define memcpy_to_tiled_x(swizzle) \
@@ -1100,11 +1245,13 @@ void choose_memcpy_tiled_x(struct kgem *kgem, int swizzling, unsigned cpu)
if (cpu & SSE2) {
kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_0__sse2;
kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_0__sse2;
+ kgem->memcpy_between_tiled_x = memcpy_between_tiled_x__swizzle_0__sse2;
} else
#endif
{
kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_0;
kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_0;
+ kgem->memcpy_between_tiled_x = memcpy_between_tiled_x__swizzle_0;
}
break;
case I915_BIT_6_SWIZZLE_9:
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index cd077561..ded8f78f 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -113,6 +113,12 @@ enum {
NUM_MAP_TYPES,
};
+typedef void (*memcpy_box_func)(const void *src, void *dst, int bpp,
+ int32_t src_stride, int32_t dst_stride,
+ int16_t src_x, int16_t src_y,
+ int16_t dst_x, int16_t dst_y,
+ uint16_t width, uint16_t height); /* signature shared by the kgem memcpy_to_tiled_x, memcpy_from_tiled_x and memcpy_between_tiled_x callbacks */
+
struct kgem {
unsigned wedged;
int fd;
@@ -212,16 +218,9 @@ struct kgem {
void (*retire)(struct kgem *kgem);
void (*expire)(struct kgem *kgem);
- void (*memcpy_to_tiled_x)(const void *src, void *dst, int bpp,
- int32_t src_stride, int32_t dst_stride,
- int16_t src_x, int16_t src_y,
- int16_t dst_x, int16_t dst_y,
- uint16_t width, uint16_t height);
- void (*memcpy_from_tiled_x)(const void *src, void *dst, int bpp,
- int32_t src_stride, int32_t dst_stride,
- int16_t src_x, int16_t src_y,
- int16_t dst_x, int16_t dst_y,
- uint16_t width, uint16_t height);
+ memcpy_box_func memcpy_to_tiled_x;
+ memcpy_box_func memcpy_from_tiled_x;
+ memcpy_box_func memcpy_between_tiled_x;
struct kgem_bo *batch_bo;
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index 5a8df06d..f8281e99 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -2298,16 +2298,22 @@ static bool can_copy_cpu(struct sna *sna,
struct kgem_bo *src,
struct kgem_bo *dst)
{
- if (src->tiling != dst->tiling)
- return false;
+ DBG(("%s: tiling=%d:%d, pitch=%d:%d, can_map=%d:%d[%d]\n",
+ __FUNCTION__,
+ src->tiling, dst->tiling,
+ src->pitch, dst->pitch,
+ kgem_bo_can_map__cpu(&sna->kgem, src, false),
+ kgem_bo_can_map__cpu(&sna->kgem, dst, true),
+ sna->kgem.has_wc_mmap));
- if (src->pitch != dst->pitch)
+ if (src->tiling != dst->tiling)
return false;
if (!kgem_bo_can_map__cpu(&sna->kgem, src, false))
return false;
- if (!kgem_bo_can_map__cpu(&sna->kgem, dst, true))
+ if (!kgem_bo_can_map__cpu(&sna->kgem, dst, true) &&
+ !sna->kgem.has_wc_mmap)
return false;
DBG(("%s -- yes, src handle=%d, dst handle=%d\n", __FUNCTION__, src->handle, dst->handle));
@@ -2320,8 +2326,8 @@ memcpy_copy_boxes(struct sna *sna, uint8_t op,
const DrawableRec *dst_draw, struct kgem_bo *dst_bo, int16_t dx, int16_t dy,
const BoxRec *box, int n, unsigned flags)
{
+ memcpy_box_func detile = NULL;
void *dst, *src;
- bool clipped;
if (op != GXcopy)
return false;
@@ -2329,25 +2335,53 @@ memcpy_copy_boxes(struct sna *sna, uint8_t op,
if (src_draw->depth != dst_draw->depth)
return false;
- clipped = (n > 1 ||
- box->x1 + dx > 0 ||
- box->y1 + dy > 0 ||
- box->x2 + dx < dst_draw->width ||
- box->y2 + dy < dst_draw->height);
-
dst = src = NULL;
- if (!clipped && can_copy_cpu(sna, src_bo, dst_bo)) {
- dst = kgem_bo_map__cpu(&sna->kgem, dst_bo);
+ if (can_copy_cpu(sna, src_bo, dst_bo)) {
+ if (src_bo->pitch != dst_bo->pitch ||
+ dx != sx || dy != sy || n > 1 ||
+ box->x1 + dx > 0 ||
+ box->y1 + dy > 0 ||
+ box->x2 + dx < dst_draw->width ||
+ box->y2 + dy < dst_draw->height) {
+ if (dx != sx) /* not implemented in memcpy yet */
+ goto use_gtt;
+
+ switch (dst_bo->tiling) {
+ default:
+ case I915_TILING_Y:
+ goto use_gtt;
+
+ case I915_TILING_X:
+ detile = sna->kgem.memcpy_between_tiled_x;
+ if (detile == NULL)
+ goto use_gtt;
+ break;
+
+ case I915_TILING_NONE:
+ break;
+ }
+ }
+
+ if (kgem_bo_can_map__cpu(&sna->kgem, dst_bo, true))
+ dst = kgem_bo_map__cpu(&sna->kgem, dst_bo);
+ else
+ dst = kgem_bo_map__wc(&sna->kgem, dst_bo);
src = kgem_bo_map__cpu(&sna->kgem, src_bo);
}
if (dst == NULL || src == NULL) {
+use_gtt:
dst = kgem_bo_map__gtt(&sna->kgem, dst_bo);
src = kgem_bo_map__gtt(&sna->kgem, src_bo);
if (dst == NULL || src == NULL)
return false;
+
+ detile = NULL;
} else {
- kgem_bo_sync__cpu_full(&sna->kgem, dst_bo, true);
+ if (dst == dst_bo->map__wc)
+ kgem_bo_sync__gtt(&sna->kgem, dst_bo);
+ else
+ kgem_bo_sync__cpu_full(&sna->kgem, dst_bo, true);
kgem_bo_sync__cpu_full(&sna->kgem, src_bo, false);
}
@@ -2355,7 +2389,16 @@ memcpy_copy_boxes(struct sna *sna, uint8_t op,
__FUNCTION__, sx, sy, dx, dy, n));
if (sigtrap_get() == 0) {
- do {
+ if (detile) {
+ do {
+ detile(src, dst, dst_draw->bitsPerPixel,
+ src_bo->pitch, dst_bo->pitch,
+ box->x1 + sx, box->y1 + sy,
+ box->x1 + dx, box->y1 + dy,
+ box->x2 - box->x1, box->y2 - box->y1);
+ box++;
+ } while (--n);
+ } else do {
memcpy_blt(src, dst, dst_draw->bitsPerPixel,
src_bo->pitch, dst_bo->pitch,
box->x1 + sx, box->y1 + sy,