author     Chris Wilson <chris@chris-wilson.co.uk>   2012-09-11 21:48:24 +0100
committer  Chris Wilson <chris@chris-wilson.co.uk>   2012-09-21 11:56:16 +0100
commit     0be1d964713ca407f029278a8256d02d925dc9da (patch)
tree       d360eb12a9eed2b0938df9a5c5475da2bf82c1b2
parent     d853064e7eebc5719645c12605782f995131a6fe (diff)
sna: Use inplace X tiling for LLC uploads
Based on a suggestion by Chad Versace (taken from a patch for mesa).
This allows for a faster upload of pixel data through a ShmImage, or for
complete replacement of a GPU bo.
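[For illustration, the X-tiled addressing that the inplace path relies on
reduces to the arithmetic below. This is a minimal sketch with a
hypothetical helper name (tiled_x_offset), assuming the tile geometry the
patch uses (tiles of 512 bytes x 8 rows = 4096 bytes) and no bit-6
swizzling; the real copy loop is memcpy_to_tiled_x() in src/sna/blt.c
below.

    #include <stdint.h>

    /* Byte offset of pixel (x, y) in an X-tiled surface of the given
     * pitch (bytes per surface row) and cpp (bytes per pixel). */
    static uint32_t tiled_x_offset(uint32_t x, uint32_t y,
                                   uint32_t pitch, uint32_t cpp)
    {
            const uint32_t tile_width  = 512;  /* bytes per tile row */
            const uint32_t tile_height = 8;    /* rows per tile */
            const uint32_t tile_size   = 4096; /* tile_width * tile_height */
            const uint32_t stride_tiles = pitch / tile_width; /* tiles per row */
            const uint32_t xb = x * cpp;       /* x position in bytes */

            /* Start of the row of tiles containing y, plus the row
             * within that tile. */
            uint32_t tile_row = (y / tile_height) * stride_tiles * tile_size +
                                (y & (tile_height - 1)) * tile_width;

            /* Step over whole tiles, then index into the 512-byte tile row. */
            return tile_row +
                   (xb / tile_width) * tile_size +
                   (xb & (tile_width - 1));
    }
]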
Using a modified version of x11perf to upload to a pixmap rather than
scanout on an IVB i7-3720qm:
Before:
40000000 trep @ 0.0007 msec (1410000.0/sec): ShmPutImage 10x10 square
4000000 trep @ 0.0110 msec ( 90700.0/sec): ShmPutImage 100x100 square
160000 trep @ 0.1689 msec ( 5920.0/sec): ShmPutImage 500x500 square
After:
40000000 trep @ 0.0007 msec (1450000.0/sec): ShmPutImage 10x10 square
6000000 trep @ 0.0061 msec ( 164000.0/sec): ShmPutImage 100x100 square
400000 trep @ 0.1126 msec ( 8880.0/sec): ShmPutImage 500x500 square
However, the real takeaway from this is that the overheads for
ShmPutImage are substantial, hitting only around 70% of the expected
efficiency, and it is overshadowed by PutImage, which for reference is:
60000000 trep @ 0.0006 msec (1800000.0/sec): PutImage 10x10 square
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
-rw-r--r--  src/sna/blt.c    | 117
-rw-r--r--  src/sna/kgem.c   |  63
-rw-r--r--  src/sna/kgem.h   |  10
-rw-r--r--  src/sna/sna.h    |   6
-rw-r--r--  src/sna/sna_io.c |  95
5 files changed, 284 insertions(+), 7 deletions(-)
diff --git a/src/sna/blt.c b/src/sna/blt.c
index 853eb20d..4735d14c 100644
--- a/src/sna/blt.c
+++ b/src/sna/blt.c
@@ -214,6 +214,123 @@ memcpy_blt(const void *src, void *dst, int bpp,
 }
 
 void
+memcpy_to_tiled_x(const void *src, void *dst, int bpp, int swizzling,
+                  int32_t src_stride, int32_t dst_stride,
+                  int16_t src_x, int16_t src_y,
+                  int16_t dst_x, int16_t dst_y,
+                  uint16_t width, uint16_t height)
+{
+        const unsigned tile_width = 512;
+        const unsigned tile_height = 8;
+        const unsigned tile_size = 4096;
+
+        const unsigned cpp = bpp / 8;
+        const unsigned stride_tiles = dst_stride / tile_width;
+        const unsigned swizzle_pixels = (swizzling ? 64 : tile_width) / cpp;
+        const unsigned tile_pixels = ffs(tile_width / cpp) - 1;
+        const unsigned tile_mask = (1 << tile_pixels) - 1;
+
+        unsigned x, y;
+
+        DBG(("%s(bpp=%d, swizzling=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+             __FUNCTION__, bpp, swizzling, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
+
+        src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
+
+        for (y = 0; y < height; ++y) {
+                const uint32_t dy = y + dst_y;
+                const uint32_t tile_row =
+                        (dy / tile_height * stride_tiles * tile_size +
+                         (dy & (tile_height-1)) * tile_width);
+                const uint8_t *src_row = (const uint8_t *)src + src_stride * y;
+                uint32_t dx = dst_x, offset;
+
+                x = width * cpp;
+                if (dx & (swizzle_pixels - 1)) {
+                        const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels);
+                        const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx;
+                        offset = tile_row +
+                                (dx >> tile_pixels) * tile_size +
+                                (dx & tile_mask) * cpp;
+                        switch (swizzling) {
+                        case I915_BIT_6_SWIZZLE_NONE:
+                                break;
+                        case I915_BIT_6_SWIZZLE_9:
+                                offset ^= (offset >> 3) & 64;
+                                break;
+                        case I915_BIT_6_SWIZZLE_9_10:
+                                offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
+                                break;
+                        case I915_BIT_6_SWIZZLE_9_11:
+                                offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
+                                break;
+                        }
+
+                        memcpy((char *)dst + offset, src_row, length * cpp);
+
+                        src_row += length * cpp;
+                        x -= length * cpp;
+                        dx += length;
+                }
+                if (swizzling) {
+                        while (x >= 64) {
+                                offset = tile_row +
+                                        (dx >> tile_pixels) * tile_size +
+                                        (dx & tile_mask) * cpp;
+                                switch (swizzling) {
+                                case I915_BIT_6_SWIZZLE_9:
+                                        offset ^= (offset >> 3) & 64;
+                                        break;
+                                case I915_BIT_6_SWIZZLE_9_10:
+                                        offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
+                                        break;
+                                case I915_BIT_6_SWIZZLE_9_11:
+                                        offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
+                                        break;
+                                }
+
+                                memcpy((char *)dst + offset, src_row, 64);
+
+                                src_row += 64;
+                                x -= 64;
+                                dx += swizzle_pixels;
+                        }
+                } else {
+                        while (x >= 512) {
+                                assert((dx & tile_mask) == 0);
+                                offset = tile_row + (dx >> tile_pixels) * tile_size;
+
+                                memcpy((char *)dst + offset, src_row, 512);
+
+                                src_row += 512;
+                                x -= 512;
+                                dx += swizzle_pixels;
+                        }
+                }
+                if (x) {
+                        offset = tile_row +
+                                (dx >> tile_pixels) * tile_size +
+                                (dx & tile_mask) * cpp;
+                        switch (swizzling) {
+                        case I915_BIT_6_SWIZZLE_NONE:
+                                break;
+                        case I915_BIT_6_SWIZZLE_9:
+                                offset ^= (offset >> 3) & 64;
+                                break;
+                        case I915_BIT_6_SWIZZLE_9_10:
+                                offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
+                                break;
+                        case I915_BIT_6_SWIZZLE_9_11:
+                                offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
+                                break;
+                        }
+
+                        memcpy((char *)dst + offset, src_row, x);
+                }
+        }
+}
+
+void
 memmove_box(const void *src, void *dst,
             int bpp, int32_t stride,
             const BoxRec *box,
diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index fc7c8811..0ea14f01 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -4082,6 +4082,56 @@ retry:
 	return (void *)(uintptr_t)mmap_arg.addr_ptr;
 }
 
+void *__kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo)
+{
+        struct drm_i915_gem_mmap mmap_arg;
+
+        DBG(("%s(handle=%d, size=%d, mapped? %d)\n",
+             __FUNCTION__, bo->handle, bytes(bo), (int)__MAP_TYPE(bo->map)));
+        assert(bo->refcnt);
+        assert(!bo->purged);
+        assert(list_is_empty(&bo->list));
+        assert(bo->proxy == NULL);
+
+        if (IS_CPU_MAP(bo->map))
+                return MAP(bo->map);
+
+retry:
+        VG_CLEAR(mmap_arg);
+        mmap_arg.handle = bo->handle;
+        mmap_arg.offset = 0;
+        mmap_arg.size = bytes(bo);
+        if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg)) {
+                ErrorF("%s: failed to mmap %d, %d bytes, into CPU domain: %d\n",
+                       __FUNCTION__, bo->handle, bytes(bo), errno);
+                if (__kgem_throttle_retire(kgem, 0))
+                        goto retry;
+
+                return NULL;
+        }
+
+        VG(VALGRIND_MAKE_MEM_DEFINED(mmap_arg.addr_ptr, bytes(bo)));
+        if (bo->map == NULL) {
+                DBG(("%s: caching CPU vma for %d\n", __FUNCTION__, bo->handle));
+                bo->map = MAKE_CPU_MAP(mmap_arg.addr_ptr);
+        }
+        return (void *)(uintptr_t)mmap_arg.addr_ptr;
+}
+
+void __kgem_bo_unmap__cpu(struct kgem *kgem, struct kgem_bo *bo, void *ptr)
+{
+        DBG(("%s(handle=%d, size=%d)\n",
+             __FUNCTION__, bo->handle, bytes(bo)));
+        assert(bo->refcnt);
+
+        if (IS_CPU_MAP(bo->map)) {
+                assert(ptr == MAP(bo->map));
+                return;
+        }
+
+        munmap(ptr, bytes(bo));
+}
+
 uint32_t kgem_bo_flink(struct kgem *kgem, struct kgem_bo *bo)
 {
 	struct drm_gem_flink flink;
@@ -4961,6 +5011,19 @@ void kgem_bo_set_binding(struct kgem_bo *bo, uint32_t format, uint16_t offset)
 	}
 }
 
+int kgem_bo_get_swizzling(struct kgem *kgem, struct kgem_bo *bo)
+{
+        struct drm_i915_gem_get_tiling tiling;
+
+        VG_CLEAR(tiling);
+        tiling.handle = bo->handle;
+        if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_GET_TILING, &tiling))
+                return 0;
+
+        assert(bo->tiling == tiling.tiling_mode);
+        return tiling.swizzle_mode;
+}
+
 struct kgem_bo *
 kgem_replace_bo(struct kgem *kgem,
                 struct kgem_bo *src,
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 832b3f06..cdbb7cbf 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -262,6 +262,7 @@ struct kgem_bo *kgem_create_cpu_2d(struct kgem *kgem,
 
 uint32_t kgem_bo_get_binding(struct kgem_bo *bo, uint32_t format);
 void kgem_bo_set_binding(struct kgem_bo *bo, uint32_t format, uint16_t offset);
+int kgem_bo_get_swizzling(struct kgem *kgem, struct kgem_bo *bo);
 
 void kgem_bo_retire(struct kgem *kgem, struct kgem_bo *bo);
 bool kgem_retire(struct kgem *kgem);
@@ -419,6 +420,8 @@ void kgem_bo_sync__gtt(struct kgem *kgem, struct kgem_bo *bo);
 void *kgem_bo_map__debug(struct kgem *kgem, struct kgem_bo *bo);
 void *kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo);
 void kgem_bo_sync__cpu(struct kgem *kgem, struct kgem_bo *bo);
+void *__kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo);
+void __kgem_bo_unmap__cpu(struct kgem *kgem, struct kgem_bo *bo, void *ptr);
 uint32_t kgem_bo_flink(struct kgem *kgem, struct kgem_bo *bo);
 
 bool kgem_bo_write(struct kgem *kgem, struct kgem_bo *bo,
@@ -494,7 +497,7 @@ static inline bool kgem_bo_is_mappable(struct kgem *kgem,
 	return bo->presumed_offset + kgem_bo_size(bo) <= kgem->aperture_mappable;
 }
 
-static inline bool kgem_bo_mapped(struct kgem_bo *bo)
+static inline bool kgem_bo_mapped(struct kgem *kgem, struct kgem_bo *bo)
 {
 	DBG(("%s: map=%p, tiling=%d, domain=%d\n",
 	     __FUNCTION__, bo->map, bo->tiling, bo->domain));
@@ -502,12 +505,15 @@ static inline bool kgem_bo_mapped(struct kgem_bo *bo)
 	if (bo->map == NULL)
 		return bo->tiling == I915_TILING_NONE && bo->domain == DOMAIN_CPU;
 
+	if (bo->tiling == I915_TILING_X && !bo->scanout && kgem->has_llc)
+		return IS_CPU_MAP(bo->map);
+
 	return IS_CPU_MAP(bo->map) == !bo->tiling;
 }
 
 static inline bool kgem_bo_can_map(struct kgem *kgem, struct kgem_bo *bo)
 {
-	if (kgem_bo_mapped(bo))
+	if (kgem_bo_mapped(kgem, bo))
 		return true;
 
 	if (!bo->tiling && kgem->has_llc)
diff --git a/src/sna/sna.h b/src/sna/sna.h
index 382c0a52..28dff6d2 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -764,6 +764,12 @@ memcpy_blt(const void *src, void *dst, int bpp,
 	   int16_t dst_x, int16_t dst_y,
 	   uint16_t width, uint16_t height);
 void
+memcpy_to_tiled_x(const void *src, void *dst, int bpp, int swizzling,
+                  int32_t src_stride, int32_t dst_stride,
+                  int16_t src_x, int16_t src_y,
+                  int16_t dst_x, int16_t dst_y,
+                  uint16_t width, uint16_t height);
+void
 memmove_box(const void *src, void *dst,
             int bpp, int32_t stride,
             const BoxRec *box,
diff --git a/src/sna/sna_io.c b/src/sna/sna_io.c
index a466f558..cdaadc01 100644
--- a/src/sna/sna_io.c
+++ b/src/sna/sna_io.c
@@ -482,6 +482,49 @@ fallback:
 	sna->blt_state.fill_bo = 0;
 }
 
+static bool upload_inplace__tiled(struct kgem *kgem, struct kgem_bo *bo)
+{
+        if (kgem->gen < 50) /* bit17 swizzling :( */
+                return false;
+
+        if (bo->tiling != I915_TILING_X)
+                return false;
+
+        if (bo->scanout)
+                return false;
+
+        return bo->domain == DOMAIN_CPU || kgem->has_llc;
+}
+
+static bool
+write_boxes_inplace__tiled(struct kgem *kgem,
+                           const uint8_t *src, int stride, int bpp, int16_t src_dx, int16_t src_dy,
+                           struct kgem_bo *bo, int16_t dst_dx, int16_t dst_dy,
+                           const BoxRec *box, int n)
+{
+        uint8_t *dst;
+        int swizzle;
+
+        assert(bo->tiling == I915_TILING_X);
+
+        dst = __kgem_bo_map__cpu(kgem, bo);
+        if (dst == NULL)
+                return false;
+
+        kgem_bo_sync__cpu(kgem, bo);
+        swizzle = kgem_bo_get_swizzling(kgem, bo);
+        do {
+                memcpy_to_tiled_x(src, dst, bpp, swizzle, stride, bo->pitch,
+                                  box->x1 + src_dx, box->y1 + src_dy,
+                                  box->x1 + dst_dx, box->y1 + dst_dy,
+                                  box->x2 - box->x1, box->y2 - box->y1);
+                box++;
+        } while (--n);
+        __kgem_bo_unmap__cpu(kgem, bo, dst);
+
+        return true;
+}
+
 static bool write_boxes_inplace(struct kgem *kgem,
                                 const void *src, int stride, int bpp, int16_t src_dx, int16_t src_dy,
                                 struct kgem_bo *bo, int16_t dst_dx, int16_t dst_dy,
@@ -492,6 +535,11 @@ static bool write_boxes_inplace(struct kgem *kgem,
 	DBG(("%s x %d, handle=%d, tiling=%d\n",
 	     __FUNCTION__, n, bo->handle, bo->tiling));
 
+	if (upload_inplace__tiled(kgem, bo) &&
+	    write_boxes_inplace__tiled(kgem, src, stride, bpp, src_dx, src_dy,
+	                               bo, dst_dx, dst_dy, box, n))
+		return true;
+
 	if (!kgem_bo_can_map(kgem, bo))
 		return false;
 
@@ -539,7 +587,7 @@ static bool upload_inplace(struct kgem *kgem,
 {
 	unsigned int bytes;
 
-	if (!kgem_bo_can_map(kgem, bo))
+	if (!kgem_bo_can_map(kgem, bo) && !upload_inplace__tiled(kgem, bo))
 		return false;
 
 	if (FORCE_INPLACE)
@@ -871,8 +919,6 @@ write_boxes_inplace__xor(struct kgem *kgem,
                          const BoxRec *box, int n,
                          uint32_t and, uint32_t or)
 {
-	int dst_pitch = bo->pitch;
-	int src_pitch = stride;
 	void *dst;
 
 	DBG(("%s x %d, tiling=%d\n", __FUNCTION__, n, bo->tiling));
@@ -888,10 +934,22 @@ write_boxes_inplace__xor(struct kgem *kgem,
 		     box->x1 + src_dx, box->y1 + src_dy,
 		     box->x1 + dst_dx, box->y1 + dst_dy,
 		     box->x2 - box->x1, box->y2 - box->y1,
-		     bpp, src_pitch, dst_pitch));
+		     bpp, stride, bo->pitch));
+
+		assert(box->x2 > box->x1);
+		assert(box->y2 > box->y1);
+
+		assert(box->x1 + dst_dx >= 0);
+		assert((box->x2 + dst_dx)*bpp <= 8*bo->pitch);
+		assert(box->y1 + dst_dy >= 0);
+		assert((box->y2 + dst_dy)*bo->pitch <= kgem_bo_size(bo));
+
+		assert(box->x1 + src_dx >= 0);
+		assert((box->x2 + src_dx)*bpp <= 8*stride);
+		assert(box->y1 + src_dy >= 0);
 
 		memcpy_xor(src, dst, bpp,
-			   src_pitch, dst_pitch,
+			   stride, bo->pitch,
 			   box->x1 + src_dx, box->y1 + src_dy,
 			   box->x1 + dst_dx, box->y1 + dst_dy,
 			   box->x2 - box->x1, box->y2 - box->y1,
@@ -1282,6 +1340,19 @@ bool sna_replace(struct sna *sna,
 	     pixmap->drawable.bitsPerPixel,
 	     bo->tiling, busy));
 
+	if (!busy && upload_inplace__tiled(kgem, bo)) {
+		BoxRec box;
+
+		box.x1 = box.y1 = 0;
+		box.x2 = pixmap->drawable.width;
+		box.y2 = pixmap->drawable.height;
+
+		if (write_boxes_inplace__tiled(kgem, src,
+		                               stride, pixmap->drawable.bitsPerPixel, 0, 0,
+		                               bo, 0, 0, &box, 1))
+			return true;
+	}
+
 	if ((busy || !kgem_bo_can_map(kgem, bo)) &&
 	    indirect_replace(sna, pixmap, bo, src, stride))
 		return true;
@@ -1304,6 +1375,19 @@ bool sna_replace(struct sna *sna,
 				   (pixmap->drawable.height-1)*stride + pixmap->drawable.width*pixmap->drawable.bitsPerPixel/8))
 			goto err;
 	} else {
+		if (upload_inplace__tiled(kgem, bo)) {
+			BoxRec box;
+
+			box.x1 = box.y1 = 0;
+			box.x2 = pixmap->drawable.width;
+			box.y2 = pixmap->drawable.height;
+
+			if (write_boxes_inplace__tiled(kgem, src,
+			                               stride, pixmap->drawable.bitsPerPixel, 0, 0,
+			                               bo, 0, 0, &box, 1))
+				goto done;
+		}
+
 		if (kgem_bo_is_mappable(kgem, bo)) {
 			dst = kgem_bo_map(kgem, bo);
 			if (!dst)
@@ -1330,6 +1414,7 @@ bool sna_replace(struct sna *sna,
 		}
 	}
 
+done:
 	if (bo != *_bo)
 		kgem_bo_destroy(kgem, *_bo);
 	*_bo = bo;
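[For reference, the swizzle cases repeated in memcpy_to_tiled_x() above
implement the i915 bit-6 address swizzles: bit 6 of the byte offset is
XORed with bit 9, bit 9^10, or bit 9^11, depending on the mode reported
by DRM_IOCTL_I915_GEM_GET_TILING. Below is a self-contained restatement
of that transform; swizzle_offset() is a hypothetical name, and the
I915_BIT_6_SWIZZLE_* constants come from the libdrm header i915_drm.h.

    #include <stdint.h>
    #include <i915_drm.h> /* I915_BIT_6_SWIZZLE_* */

    static uint32_t swizzle_offset(uint32_t offset, int mode)
    {
            switch (mode) {
            case I915_BIT_6_SWIZZLE_9:      /* bit6 ^= bit9 */
                    return offset ^ ((offset >> 3) & 64);
            case I915_BIT_6_SWIZZLE_9_10:   /* bit6 ^= bit9 ^ bit10 */
                    return offset ^ (((offset ^ (offset >> 1)) >> 3) & 64);
            case I915_BIT_6_SWIZZLE_9_11:   /* bit6 ^= bit9 ^ bit11 */
                    return offset ^ (((offset ^ (offset >> 2)) >> 3) & 64);
            default:                        /* I915_BIT_6_SWIZZLE_NONE */
                    return offset;
            }
    }

This is also why upload_inplace__tiled() rejects pre-Gen5 buffers ("bit17
swizzling"): there the swizzle can additionally depend on bit 17 of the
physical page address, which a CPU-side copy cannot reproduce.]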