sna: Use inplace X tiling for LLC uploads

Based on a suggestion by Chad Versace (taken from a patch for mesa). This allows for a faster upload of pixel data through a ShmImage, or for complete replacement of a GPU bo. Using a modified version of x11perf to upload to a pixmap rather than scanout on an IVB i7-3720qm: Before: 40000000 trep @ 0.0007 msec (1410000.0/sec): ShmPutImage 10x10 square 4000000 trep @ 0.0110 msec ( 90700.0/sec): ShmPutImage 100x100 square 160000 trep @ 0.1689 msec ( 5920.0/sec): ShmPutImage 500x500 square After: 40000000 trep @ 0.0007 msec (1450000.0/sec): ShmPutImage 10x10 square 6000000 trep @ 0.0061 msec ( 164000.0/sec): ShmPutImage 100x100 square 400000 trep @ 0.1126 msec ( 8880.0/sec): ShmPutImage 500x500 square However, the real takeaway from this is that the overheads for ShmPutImage are substantial, only hitting around 70% expected efficiency, and overshadowed by PutImage, which for reference is 60000000 trep @ 0.0006 msec (1800000.0/sec): PutImage 10x10 square Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
author: Chris Wilson <chris@chris-wilson.co.uk> 2012-09-11 21:48:24 +0100
committer: Chris Wilson <chris@chris-wilson.co.uk> 2012-09-21 11:56:16 +0100
commit: 0be1d964713ca407f029278a8256d02d925dc9da (patch)
tree: d360eb12a9eed2b0938df9a5c5475da2bf82c1b2 /src/sna/blt.c
parent: d853064e7eebc5719645c12605782f995131a6fe (diff)
1 files changed, 117 insertions, 0 deletions
diff --git a/src/sna/blt.c b/src/sna/blt.c
index 853eb20d..4735d14c 100644
--- a/src/sna/blt.c
+++ b/src/sna/blt.c
@@ -214,6 +214,123 @@ memcpy_blt(const void *src, void *dst, int bpp,
 }
 
 void
+memcpy_to_tiled_x(const void *src, void *dst, int bpp, int swizzling,
+		  int32_t src_stride, int32_t dst_stride,
+		  int16_t src_x, int16_t src_y,
+		  int16_t dst_x, int16_t dst_y,
+		  uint16_t width, uint16_t height)
+{
+	const unsigned tile_width = 512;
+	const unsigned tile_height = 8;
+	const unsigned tile_size = 4096;
+
+	const unsigned cpp = bpp / 8;
+	const unsigned stride_tiles = dst_stride / tile_width;
+	const unsigned swizzle_pixels = (swizzling ? 64 : tile_width) / cpp;
+	const unsigned tile_pixels = ffs(tile_width / cpp) - 1;
+	const unsigned tile_mask = (1 << tile_pixels) - 1;
+
+	unsigned x, y;
+
+	DBG(("%s(bpp=%d, swizzling=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+	     __FUNCTION__, bpp, swizzling, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
+
+	src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
+
+	for (y = 0; y < height; ++y) {
+		const uint32_t dy = y + dst_y;
+		const uint32_t tile_row =
+			(dy / tile_height * stride_tiles * tile_size +
+			 (dy & (tile_height-1)) * tile_width);
+		const uint8_t *src_row = (const uint8_t *)src + src_stride * y;
+		uint32_t dx = dst_x, offset;
+
+		x = width * cpp;
+		if (dx & (swizzle_pixels - 1)) {
+			const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels);
+			const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx;
+			offset = tile_row +
+				(dx >> tile_pixels) * tile_size +
+				(dx & tile_mask) * cpp;
+			switch (swizzling) {
+			case I915_BIT_6_SWIZZLE_NONE:
+				break;
+			case I915_BIT_6_SWIZZLE_9:
+				offset ^= (offset >> 3) & 64;
+				break;
+			case I915_BIT_6_SWIZZLE_9_10:
+				offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
+				break;
+			case I915_BIT_6_SWIZZLE_9_11:
+				offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
+				break;
+			}
+
+			memcpy((char *)dst + offset, src_row, length * cpp);
+
+			src_row += length * cpp;
+			x -= length * cpp;
+			dx += length;
+		}
+		if (swizzling) {
+			while (x >= 64) {
+				offset = tile_row +
+					(dx >> tile_pixels) * tile_size +
+					(dx & tile_mask) * cpp;
+				switch (swizzling) {
+				case I915_BIT_6_SWIZZLE_9:
+					offset ^= (offset >> 3) & 64;
+					break;
+				case I915_BIT_6_SWIZZLE_9_10:
+					offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
+					break;
+				case I915_BIT_6_SWIZZLE_9_11:
+					offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
+					break;
+				}
+
+				memcpy((char *)dst + offset, src_row, 64);
+
+				src_row += 64;
+				x -= 64;
+				dx += swizzle_pixels;
+			}
+		} else {
+			while (x >= 512) {
+				assert((dx & tile_mask) == 0);
+				offset = tile_row + (dx >> tile_pixels) * tile_size;
+
+				memcpy((char *)dst + offset, src_row, 512);
+
+				src_row += 512;
+				x -= 512;
+				dx += swizzle_pixels;
+			}
+		}
+		if (x) {
+			offset = tile_row +
+				(dx >> tile_pixels) * tile_size +
+				(dx & tile_mask) * cpp;
+			switch (swizzling) {
+			case I915_BIT_6_SWIZZLE_NONE:
+				break;
+			case I915_BIT_6_SWIZZLE_9:
+				offset ^= (offset >> 3) & 64;
+				break;
+			case I915_BIT_6_SWIZZLE_9_10:
+				offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
+				break;
+			case I915_BIT_6_SWIZZLE_9_11:
+				offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
+				break;
+			}
+
+			memcpy((char *)dst + offset, src_row, x);
+		}
+	}
+}
+
+void
 memmove_box(const void *src, void *dst,
 	    int bpp, int32_t stride,
 	    const BoxRec *box,
author	Chris Wilson <chris@chris-wilson.co.uk>	2012-09-11 21:48:24 +0100
committer	Chris Wilson <chris@chris-wilson.co.uk>	2012-09-21 11:56:16 +0100
commit	0be1d964713ca407f029278a8256d02d925dc9da (patch)
tree	d360eb12a9eed2b0938df9a5c5475da2bf82c1b2 /src/sna/blt.c
parent	d853064e7eebc5719645c12605782f995131a6fe (diff)