summaryrefslogtreecommitdiff
path: root/src/sna/blt.c
diff options
context:
space:
mode:
author: Chris Wilson <chris@chris-wilson.co.uk> 2012-09-11 21:48:24 +0100
committer: Chris Wilson <chris@chris-wilson.co.uk> 2012-09-21 11:56:16 +0100
commit: 0be1d964713ca407f029278a8256d02d925dc9da (patch)
tree: d360eb12a9eed2b0938df9a5c5475da2bf82c1b2 /src/sna/blt.c
parent: d853064e7eebc5719645c12605782f995131a6fe (diff)
sna: Use inplace X tiling for LLC uploads
Based on a suggestion by Chad Versace (taken from a patch for mesa).

This allows for a faster upload of pixel data through a ShmImage, or for complete replacement of a GPU bo.

Using a modified version of x11perf to upload to a pixmap rather than scanout on an IVB i7-3720qm:

Before:
  40000000 trep @ 0.0007 msec (1410000.0/sec): ShmPutImage 10x10 square
  4000000 trep @ 0.0110 msec ( 90700.0/sec): ShmPutImage 100x100 square
  160000 trep @ 0.1689 msec ( 5920.0/sec): ShmPutImage 500x500 square

After:
  40000000 trep @ 0.0007 msec (1450000.0/sec): ShmPutImage 10x10 square
  6000000 trep @ 0.0061 msec ( 164000.0/sec): ShmPutImage 100x100 square
  400000 trep @ 0.1126 msec ( 8880.0/sec): ShmPutImage 500x500 square

However, the real takeaway from this is that the overheads for ShmPutImage are substantial, only hitting around 70% expected efficiency, and overshadowed by PutImage, which for reference is

  60000000 trep @ 0.0006 msec (1800000.0/sec): PutImage 10x10 square

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Diffstat (limited to 'src/sna/blt.c')
-rw-r--r-- src/sna/blt.c 117
1 files changed, 117 insertions, 0 deletions
diff --git a/src/sna/blt.c b/src/sna/blt.c
index 853eb20d..4735d14c 100644
--- a/src/sna/blt.c
+++ b/src/sna/blt.c
@@ -214,6 +214,123 @@ memcpy_blt(const void *src, void *dst, int bpp,
}
/*
 * memcpy_to_tiled_x - copy a width x height pixel rectangle from a linear
 * source buffer into a destination laid out in tiles of 512 bytes x 8 rows
 * (4096 bytes per tile), optionally applying a bit-6 address swizzle to the
 * destination offset.
 *
 * src, src_stride: source base pointer and its pitch in bytes; reading starts
 *                  at pixel (src_x, src_y).
 * dst, dst_stride: destination base pointer and its pitch in bytes;
 *                  dst_stride / 512 is used as the number of tiles per row of
 *                  tiles (NOTE(review): presumably dst_stride is a multiple
 *                  of 512 - confirm against callers).
 * bpp:             bits per pixel; bpp / 8 is used as a divisor, so bpp must
 *                  be at least 8.
 * swizzling:       one of the I915_BIT_6_SWIZZLE_* modes. Any non-zero value
 *                  restricts each copy to a 64-byte span; only the _9, _9_10
 *                  and _9_11 modes modify the computed offset
 *                  (NOTE(review): assumes I915_BIT_6_SWIZZLE_NONE is 0 -
 *                  confirm).
 * dst_x, dst_y:    position of the rectangle within dst, in pixels / rows.
 * width, height:   size of the rectangle, in pixels / rows.
 */
void
+memcpy_to_tiled_x(const void *src, void *dst, int bpp, int swizzling,
+ int32_t src_stride, int32_t dst_stride,
+ int16_t src_x, int16_t src_y,
+ int16_t dst_x, int16_t dst_y,
+ uint16_t width, uint16_t height)
+{
+ const unsigned tile_width = 512; /* bytes per row of a tile */
+ const unsigned tile_height = 8; /* rows per tile */
+ const unsigned tile_size = 4096; /* bytes per tile (tile_width * tile_height) */
+
+ const unsigned cpp = bpp / 8; /* bytes per pixel; used as a divisor below */
+ const unsigned stride_tiles = dst_stride / tile_width; /* tiles per row of tiles in dst */
+ const unsigned swizzle_pixels = (swizzling ? 64 : tile_width) / cpp; /* pixels per contiguous span: 64 bytes when swizzling, else a whole tile row */
+ const unsigned tile_pixels = ffs(tile_width / cpp) - 1; /* shift converting a pixel x into a tile index (log2 of pixels per tile row when that is a power of two) */
+ const unsigned tile_mask = (1 << tile_pixels) - 1; /* mask selecting the pixel x within a tile row */
+
+ unsigned x, y;
+
+ DBG(("%s(bpp=%d, swizzling=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+ __FUNCTION__, bpp, swizzling, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
+
+ src = (const uint8_t *)src + src_y * src_stride + src_x * cpp; /* advance to the first source pixel */
+
+ for (y = 0; y < height; ++y) {
+ const uint32_t dy = y + dst_y; /* destination row */
+ const uint32_t tile_row = /* byte offset of row dy within the first tile of its row of tiles */
+ (dy / tile_height * stride_tiles * tile_size +
+ (dy & (tile_height-1)) * tile_width);
+ const uint8_t *src_row = (const uint8_t *)src + src_stride * y;
+ uint32_t dx = dst_x, offset;
+
+ x = width * cpp; /* bytes still to copy on this row */
+ if (dx & (swizzle_pixels - 1)) { /* dx is not on a span boundary: copy the leading partial span */
+ const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels);
+ const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx; /* pixels up to the next boundary or the end of the row */
+ offset = tile_row +
+ (dx >> tile_pixels) * tile_size +
+ (dx & tile_mask) * cpp;
+ switch (swizzling) {
+ case I915_BIT_6_SWIZZLE_NONE:
+ break;
+ case I915_BIT_6_SWIZZLE_9:
+ offset ^= (offset >> 3) & 64; /* flip bit 6 when bit 9 is set */
+ break;
+ case I915_BIT_6_SWIZZLE_9_10:
+ offset ^= ((offset ^ (offset >> 1)) >> 3) & 64; /* flip bit 6 by (bit 9 ^ bit 10) */
+ break;
+ case I915_BIT_6_SWIZZLE_9_11:
+ offset ^= ((offset ^ (offset >> 2)) >> 3) & 64; /* flip bit 6 by (bit 9 ^ bit 11) */
+ break;
+ }
+
+ memcpy((char *)dst + offset, src_row, length * cpp);
+
+ src_row += length * cpp;
+ x -= length * cpp;
+ dx += length;
+ }
+ if (swizzling) {
+ while (x >= 64) { /* whole 64-byte spans; the offset is recomputed and swizzled for each */
+ offset = tile_row +
+ (dx >> tile_pixels) * tile_size +
+ (dx & tile_mask) * cpp;
+ switch (swizzling) { /* no default: any other mode leaves offset unchanged */
+ case I915_BIT_6_SWIZZLE_9:
+ offset ^= (offset >> 3) & 64;
+ break;
+ case I915_BIT_6_SWIZZLE_9_10:
+ offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
+ break;
+ case I915_BIT_6_SWIZZLE_9_11:
+ offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
+ break;
+ }
+
+ memcpy((char *)dst + offset, src_row, 64);
+
+ src_row += 64;
+ x -= 64;
+ dx += swizzle_pixels;
+ }
+ } else {
+ while (x >= 512) { /* no swizzle: copy whole 512-byte tile rows */
+ assert((dx & tile_mask) == 0);
+ offset = tile_row + (dx >> tile_pixels) * tile_size;
+
+ memcpy((char *)dst + offset, src_row, 512);
+
+ src_row += 512;
+ x -= 512;
+ dx += swizzle_pixels;
+ }
+ }
+ if (x) { /* trailing partial span: fewer bytes than one full span remain */
+ offset = tile_row +
+ (dx >> tile_pixels) * tile_size +
+ (dx & tile_mask) * cpp;
+ switch (swizzling) {
+ case I915_BIT_6_SWIZZLE_NONE:
+ break;
+ case I915_BIT_6_SWIZZLE_9:
+ offset ^= (offset >> 3) & 64;
+ break;
+ case I915_BIT_6_SWIZZLE_9_10:
+ offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
+ break;
+ case I915_BIT_6_SWIZZLE_9_11:
+ offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
+ break;
+ }
+
+ memcpy((char *)dst + offset, src_row, x);
+ }
+ }
+}
+
+void
memmove_box(const void *src, void *dst,
int bpp, int32_t stride,
const BoxRec *box,