diff options
author | Chris Wilson <chris@chris-wilson.co.uk> | 2012-09-11 21:48:24 +0100 |
---|---|---|
committer | Chris Wilson <chris@chris-wilson.co.uk> | 2012-09-21 11:56:16 +0100 |
commit | 0be1d964713ca407f029278a8256d02d925dc9da (patch) | |
tree | d360eb12a9eed2b0938df9a5c5475da2bf82c1b2 /src/sna/blt.c | |
parent | d853064e7eebc5719645c12605782f995131a6fe (diff) |
sna: Use inplace X tiling for LLC uploads
Based on a suggestion by Chad Versace (taken from a patch for mesa).
This allows for a faster upload of pixel data through a ShmImage, or for
complete replacement of a GPU bo.
Using a modified version of x11perf to upload to a pixmap rather than
scanout on an IVB i7-3720qm:
Before:
40000000 trep @ 0.0007 msec (1410000.0/sec): ShmPutImage 10x10 square
4000000 trep @ 0.0110 msec ( 90700.0/sec): ShmPutImage 100x100 square
160000 trep @ 0.1689 msec ( 5920.0/sec): ShmPutImage 500x500 square
After:
40000000 trep @ 0.0007 msec (1450000.0/sec): ShmPutImage 10x10 square
6000000 trep @ 0.0061 msec ( 164000.0/sec): ShmPutImage 100x100 square
400000 trep @ 0.1126 msec ( 8880.0/sec): ShmPutImage 500x500 square
However, the real takeaway from this is that the overheads for
ShmPutImage are substantial, only hitting around 70% expected efficiency,
and overshadowed by PutImage, which for reference is
60000000 trep @ 0.0006 msec (1800000.0/sec): PutImage 10x10 square
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Diffstat (limited to 'src/sna/blt.c')
-rw-r--r-- | src/sna/blt.c | 117 |
1 files changed, 117 insertions, 0 deletions
diff --git a/src/sna/blt.c b/src/sna/blt.c index 853eb20d..4735d14c 100644 --- a/src/sna/blt.c +++ b/src/sna/blt.c @@ -214,6 +214,123 @@ memcpy_blt(const void *src, void *dst, int bpp, } void +memcpy_to_tiled_x(const void *src, void *dst, int bpp, int swizzling, + int32_t src_stride, int32_t dst_stride, + int16_t src_x, int16_t src_y, + int16_t dst_x, int16_t dst_y, + uint16_t width, uint16_t height) +{ + const unsigned tile_width = 512; + const unsigned tile_height = 8; + const unsigned tile_size = 4096; + + const unsigned cpp = bpp / 8; + const unsigned stride_tiles = dst_stride / tile_width; + const unsigned swizzle_pixels = (swizzling ? 64 : tile_width) / cpp; + const unsigned tile_pixels = ffs(tile_width / cpp) - 1; + const unsigned tile_mask = (1 << tile_pixels) - 1; + + unsigned x, y; + + DBG(("%s(bpp=%d, swizzling=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", + __FUNCTION__, bpp, swizzling, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); + + src = (const uint8_t *)src + src_y * src_stride + src_x * cpp; + + for (y = 0; y < height; ++y) { + const uint32_t dy = y + dst_y; + const uint32_t tile_row = + (dy / tile_height * stride_tiles * tile_size + + (dy & (tile_height-1)) * tile_width); + const uint8_t *src_row = (const uint8_t *)src + src_stride * y; + uint32_t dx = dst_x, offset; + + x = width * cpp; + if (dx & (swizzle_pixels - 1)) { + const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels); + const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx; + offset = tile_row + + (dx >> tile_pixels) * tile_size + + (dx & tile_mask) * cpp; + switch (swizzling) { + case I915_BIT_6_SWIZZLE_NONE: + break; + case I915_BIT_6_SWIZZLE_9: + offset ^= (offset >> 3) & 64; + break; + case I915_BIT_6_SWIZZLE_9_10: + offset ^= ((offset ^ (offset >> 1)) >> 3) & 64; + break; + case I915_BIT_6_SWIZZLE_9_11: + offset ^= ((offset ^ (offset >> 2)) >> 3) & 64; + break; + } + + memcpy((char *)dst + offset, src_row, length * cpp); + + src_row += length * cpp; + x -= length * cpp; + dx += length; + } + if (swizzling) { + while (x >= 64) { + offset = tile_row + + (dx >> tile_pixels) * tile_size + + (dx & tile_mask) * cpp; + switch (swizzling) { + case I915_BIT_6_SWIZZLE_9: + offset ^= (offset >> 3) & 64; + break; + case I915_BIT_6_SWIZZLE_9_10: + offset ^= ((offset ^ (offset >> 1)) >> 3) & 64; + break; + case I915_BIT_6_SWIZZLE_9_11: + offset ^= ((offset ^ (offset >> 2)) >> 3) & 64; + break; + } + + memcpy((char *)dst + offset, src_row, 64); + + src_row += 64; + x -= 64; + dx += swizzle_pixels; + } + } else { + while (x >= 512) { + assert((dx & tile_mask) == 0); + offset = tile_row + (dx >> tile_pixels) * tile_size; + + memcpy((char *)dst + offset, src_row, 512); + + src_row += 512; + x -= 512; + dx += swizzle_pixels; + } + } + if (x) { + offset = tile_row + + (dx >> tile_pixels) * tile_size + + (dx & tile_mask) * cpp; + switch (swizzling) { + case I915_BIT_6_SWIZZLE_NONE: + break; + case I915_BIT_6_SWIZZLE_9: + offset ^= (offset >> 3) & 64; + break; + case I915_BIT_6_SWIZZLE_9_10: + offset ^= ((offset ^ (offset >> 1)) >> 3) & 64; + break; + case I915_BIT_6_SWIZZLE_9_11: + offset ^= ((offset ^ (offset >> 2)) >> 3) & 64; + break; + } + + memcpy((char *)dst + offset, src_row, x); + } + } +} + +void memmove_box(const void *src, void *dst, int bpp, int32_t stride, const BoxRec *box, |