author		Chris Wilson <chris@chris-wilson.co.uk>	2012-09-11 21:48:24 +0100
committer	Chris Wilson <chris@chris-wilson.co.uk>	2012-09-21 11:56:16 +0100
commit		0be1d964713ca407f029278a8256d02d925dc9da (patch)
tree		d360eb12a9eed2b0938df9a5c5475da2bf82c1b2 /src/sna
parent		d853064e7eebc5719645c12605782f995131a6fe (diff)
sna: Use inplace X tiling for LLC uploads
Based on a suggestion by Chad Versace (taken from a patch for mesa).

This allows for a faster upload of pixel data through a ShmImage, or
for complete replacement of a GPU bo.

Using a modified version of x11perf to upload to a pixmap rather than
to the scanout on an IVB i7-3720qm:

Before:
  40000000 trep @ 0.0007 msec (1410000.0/sec): ShmPutImage 10x10 square
   4000000 trep @ 0.0110 msec (  90700.0/sec): ShmPutImage 100x100 square
    160000 trep @ 0.1689 msec (   5920.0/sec): ShmPutImage 500x500 square

After:
  40000000 trep @ 0.0007 msec (1450000.0/sec): ShmPutImage 10x10 square
   6000000 trep @ 0.0061 msec ( 164000.0/sec): ShmPutImage 100x100 square
    400000 trep @ 0.1126 msec (   8880.0/sec): ShmPutImage 500x500 square

However, the real takeaway is that the overheads for ShmPutImage are
substantial, reaching only around 70% of the expected efficiency, and
it remains overshadowed by plain PutImage, which for reference is:

  60000000 trep @ 0.0006 msec (1800000.0/sec): PutImage 10x10 square

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
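For orientation, here is a minimal standalone sketch (illustration only,
not part of the patch; tiled_x_offset is a hypothetical helper) of the
address calculation that memcpy_to_tiled_x() in blt.c performs for each
span, assuming the 512-byte x 8-row (4KiB) X-tile geometry and the
bit-9 swizzle variant:

  #include <stdint.h>

  /* Map a byte position (x_bytes, y) of a linear image onto an X-tiled
   * surface with the given pitch (in bytes, a multiple of 512). */
  uint32_t tiled_x_offset(uint32_t x_bytes, uint32_t y,
                          uint32_t pitch, int swizzle_bit9)
  {
          const uint32_t tile_w = 512, tile_h = 8, tile_size = 4096;
          uint32_t offset;

          offset  = (y / tile_h) * (pitch / tile_w) * tile_size; /* row of tiles */
          offset += (x_bytes / tile_w) * tile_size;              /* tile within row */
          offset += (y % tile_h) * tile_w;                       /* row within tile */
          offset += x_bytes % tile_w;                            /* byte within row */

          /* I915_BIT_6_SWIZZLE_9: the GTT XORs address bit 6 with bit 9,
           * so a CPU writer must apply the same transform. */
          if (swizzle_bit9)
                  offset ^= (offset >> 3) & 64;

          return offset;
  }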
Diffstat (limited to 'src/sna')
-rw-r--r--  src/sna/blt.c     117
-rw-r--r--  src/sna/kgem.c     63
-rw-r--r--  src/sna/kgem.h     10
-rw-r--r--  src/sna/sna.h       6
-rw-r--r--  src/sna/sna_io.c   95
5 files changed, 284 insertions, 7 deletions
diff --git a/src/sna/blt.c b/src/sna/blt.c
index 853eb20d..4735d14c 100644
--- a/src/sna/blt.c
+++ b/src/sna/blt.c
@@ -214,6 +214,123 @@ memcpy_blt(const void *src, void *dst, int bpp,
}
void
+memcpy_to_tiled_x(const void *src, void *dst, int bpp, int swizzling,
+ int32_t src_stride, int32_t dst_stride,
+ int16_t src_x, int16_t src_y,
+ int16_t dst_x, int16_t dst_y,
+ uint16_t width, uint16_t height)
+{
+ const unsigned tile_width = 512;
+ const unsigned tile_height = 8;
+ const unsigned tile_size = 4096;
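+ /* An Intel X tile is 8 rows of 512 bytes, i.e. 4096 bytes per
+ * tile; the pitch of an X-tiled bo is always a whole number of
+ * 512-byte tile rows. */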
+
+ const unsigned cpp = bpp / 8;
+ const unsigned stride_tiles = dst_stride / tile_width;
+ const unsigned swizzle_pixels = (swizzling ? 64 : tile_width) / cpp;
+ const unsigned tile_pixels = ffs(tile_width / cpp) - 1;
+ const unsigned tile_mask = (1 << tile_pixels) - 1;
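+ /* swizzle_pixels is the longest span that stays contiguous in
+ * the tiled view: one 64-byte cacheline when bit-6 swizzling is
+ * active, otherwise a full 512-byte tile row. tile_pixels and
+ * tile_mask split an x coordinate into the tile column and the
+ * position within the tile row. */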
+
+ unsigned x, y;
+
+ DBG(("%s(bpp=%d, swizzling=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+ __FUNCTION__, bpp, swizzling, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
+
+ src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
+
+ for (y = 0; y < height; ++y) {
+ const uint32_t dy = y + dst_y;
+ const uint32_t tile_row =
+ (dy / tile_height * stride_tiles * tile_size +
+ (dy & (tile_height-1)) * tile_width);
+ const uint8_t *src_row = (const uint8_t *)src + src_stride * y;
+ uint32_t dx = dst_x, offset;
+
+ x = width * cpp;
+ if (dx & (swizzle_pixels - 1)) {
+ const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels);
+ const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx;
+ offset = tile_row +
+ (dx >> tile_pixels) * tile_size +
+ (dx & tile_mask) * cpp;
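+ /* Mirror the GTT's bit-6 swizzle: the hardware may XOR address
+ * bit 6 with bit 9 (and bit 10 or 11) to balance DRAM banks, so
+ * CPU writes must apply the same transform to land where the
+ * GPU's tiled view expects them. */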
+ switch (swizzling) {
+ case I915_BIT_6_SWIZZLE_NONE:
+ break;
+ case I915_BIT_6_SWIZZLE_9:
+ offset ^= (offset >> 3) & 64;
+ break;
+ case I915_BIT_6_SWIZZLE_9_10:
+ offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
+ break;
+ case I915_BIT_6_SWIZZLE_9_11:
+ offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
+ break;
+ }
+
+ memcpy((char *)dst + offset, src_row, length * cpp);
+
+ src_row += length * cpp;
+ x -= length * cpp;
+ dx += length;
+ }
+ if (swizzling) {
+ while (x >= 64) {
+ offset = tile_row +
+ (dx >> tile_pixels) * tile_size +
+ (dx & tile_mask) * cpp;
+ switch (swizzling) {
+ case I915_BIT_6_SWIZZLE_9:
+ offset ^= (offset >> 3) & 64;
+ break;
+ case I915_BIT_6_SWIZZLE_9_10:
+ offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
+ break;
+ case I915_BIT_6_SWIZZLE_9_11:
+ offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
+ break;
+ }
+
+ memcpy((char *)dst + offset, src_row, 64);
+
+ src_row += 64;
+ x -= 64;
+ dx += swizzle_pixels;
+ }
+ } else {
+ while (x >= 512) {
+ assert((dx & tile_mask) == 0);
+ offset = tile_row + (dx >> tile_pixels) * tile_size;
+
+ memcpy((char *)dst + offset, src_row, 512);
+
+ src_row += 512;
+ x -= 512;
+ dx += swizzle_pixels;
+ }
+ }
+ if (x) {
+ offset = tile_row +
+ (dx >> tile_pixels) * tile_size +
+ (dx & tile_mask) * cpp;
+ switch (swizzling) {
+ case I915_BIT_6_SWIZZLE_NONE:
+ break;
+ case I915_BIT_6_SWIZZLE_9:
+ offset ^= (offset >> 3) & 64;
+ break;
+ case I915_BIT_6_SWIZZLE_9_10:
+ offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
+ break;
+ case I915_BIT_6_SWIZZLE_9_11:
+ offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
+ break;
+ }
+
+ memcpy((char *)dst + offset, src_row, x);
+ }
+ }
+}
+
+void
memmove_box(const void *src, void *dst,
int bpp, int32_t stride,
const BoxRec *box,
diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index fc7c8811..0ea14f01 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -4082,6 +4082,56 @@ retry:
return (void *)(uintptr_t)mmap_arg.addr_ptr;
}
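+/* Return a WB CPU mapping of the bo, reusing the cached CPU vma when
+ * present; otherwise a fresh mapping is created via
+ * DRM_IOCTL_I915_GEM_MMAP and cached in bo->map if no other mapping
+ * is already cached. Pairs with __kgem_bo_unmap__cpu(), which only
+ * munmaps vmas that were not cached. */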
+void *__kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo)
+{
+ struct drm_i915_gem_mmap mmap_arg;
+
+ DBG(("%s(handle=%d, size=%d, mapped? %d)\n",
+ __FUNCTION__, bo->handle, bytes(bo), (int)__MAP_TYPE(bo->map)));
+ assert(bo->refcnt);
+ assert(!bo->purged);
+ assert(list_is_empty(&bo->list));
+ assert(bo->proxy == NULL);
+
+ if (IS_CPU_MAP(bo->map))
+ return MAP(bo->map);
+
+retry:
+ VG_CLEAR(mmap_arg);
+ mmap_arg.handle = bo->handle;
+ mmap_arg.offset = 0;
+ mmap_arg.size = bytes(bo);
+ if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg)) {
+ ErrorF("%s: failed to mmap %d, %d bytes, into CPU domain: %d\n",
+ __FUNCTION__, bo->handle, bytes(bo), errno);
+ if (__kgem_throttle_retire(kgem, 0))
+ goto retry;
+
+ return NULL;
+ }
+
+ VG(VALGRIND_MAKE_MEM_DEFINED(mmap_arg.addr_ptr, bytes(bo)));
+ if (bo->map == NULL) {
+ DBG(("%s: caching CPU vma for %d\n", __FUNCTION__, bo->handle));
+ bo->map = MAKE_CPU_MAP(mmap_arg.addr_ptr);
+ }
+ return (void *)(uintptr_t)mmap_arg.addr_ptr;
+}
+
+void __kgem_bo_unmap__cpu(struct kgem *kgem, struct kgem_bo *bo, void *ptr)
+{
+ DBG(("%s(handle=%d, size=%d)\n",
+ __FUNCTION__, bo->handle, bytes(bo)));
+ assert(bo->refcnt);
+
+ if (IS_CPU_MAP(bo->map)) {
+ assert(ptr == MAP(bo->map));
+ return;
+ }
+
+ munmap(ptr, bytes(bo));
+}
+
uint32_t kgem_bo_flink(struct kgem *kgem, struct kgem_bo *bo)
{
struct drm_gem_flink flink;
@@ -4961,6 +5011,19 @@ void kgem_bo_set_binding(struct kgem_bo *bo, uint32_t format, uint16_t offset)
}
}
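+/* Query the kernel for the bit-6 swizzle mode in effect for this bo;
+ * returns I915_BIT_6_SWIZZLE_NONE (0) if the ioctl fails. */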
+int kgem_bo_get_swizzling(struct kgem *kgem, struct kgem_bo *bo)
+{
+ struct drm_i915_gem_get_tiling tiling;
+
+ VG_CLEAR(tiling);
+ tiling.handle = bo->handle;
+ if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_GET_TILING, &tiling))
+ return 0;
+
+ assert(bo->tiling == tiling.tiling_mode);
+ return tiling.swizzle_mode;
+}
+
struct kgem_bo *
kgem_replace_bo(struct kgem *kgem,
struct kgem_bo *src,
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 832b3f06..cdbb7cbf 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -262,6 +262,7 @@ struct kgem_bo *kgem_create_cpu_2d(struct kgem *kgem,
uint32_t kgem_bo_get_binding(struct kgem_bo *bo, uint32_t format);
void kgem_bo_set_binding(struct kgem_bo *bo, uint32_t format, uint16_t offset);
+int kgem_bo_get_swizzling(struct kgem *kgem, struct kgem_bo *bo);
void kgem_bo_retire(struct kgem *kgem, struct kgem_bo *bo);
bool kgem_retire(struct kgem *kgem);
@@ -419,6 +420,8 @@ void kgem_bo_sync__gtt(struct kgem *kgem, struct kgem_bo *bo);
void *kgem_bo_map__debug(struct kgem *kgem, struct kgem_bo *bo);
void *kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo);
void kgem_bo_sync__cpu(struct kgem *kgem, struct kgem_bo *bo);
+void *__kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo);
+void __kgem_bo_unmap__cpu(struct kgem *kgem, struct kgem_bo *bo, void *ptr);
uint32_t kgem_bo_flink(struct kgem *kgem, struct kgem_bo *bo);
bool kgem_bo_write(struct kgem *kgem, struct kgem_bo *bo,
@@ -494,7 +497,7 @@ static inline bool kgem_bo_is_mappable(struct kgem *kgem,
return bo->presumed_offset + kgem_bo_size(bo) <= kgem->aperture_mappable;
}
-static inline bool kgem_bo_mapped(struct kgem_bo *bo)
+static inline bool kgem_bo_mapped(struct kgem *kgem, struct kgem_bo *bo)
{
DBG(("%s: map=%p, tiling=%d, domain=%d\n",
__FUNCTION__, bo->map, bo->tiling, bo->domain));
@@ -502,12 +505,15 @@ static inline bool kgem_bo_mapped(struct kgem_bo *bo)
if (bo->map == NULL)
return bo->tiling == I915_TILING_NONE && bo->domain == DOMAIN_CPU;
+ if (bo->tiling == I915_TILING_X && !bo->scanout && kgem->has_llc)
+ return IS_CPU_MAP(bo->map);
+
return IS_CPU_MAP(bo->map) == !bo->tiling;
}
static inline bool kgem_bo_can_map(struct kgem *kgem, struct kgem_bo *bo)
{
- if (kgem_bo_mapped(bo))
+ if (kgem_bo_mapped(kgem, bo))
return true;
if (!bo->tiling && kgem->has_llc)
diff --git a/src/sna/sna.h b/src/sna/sna.h
index 382c0a52..28dff6d2 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -764,6 +764,12 @@ memcpy_blt(const void *src, void *dst, int bpp,
int16_t dst_x, int16_t dst_y,
uint16_t width, uint16_t height);
void
+memcpy_to_tiled_x(const void *src, void *dst, int bpp, int swizzling,
+ int32_t src_stride, int32_t dst_stride,
+ int16_t src_x, int16_t src_y,
+ int16_t dst_x, int16_t dst_y,
+ uint16_t width, uint16_t height);
+void
memmove_box(const void *src, void *dst,
int bpp, int32_t stride,
const BoxRec *box,
diff --git a/src/sna/sna_io.c b/src/sna/sna_io.c
index a466f558..cdaadc01 100644
--- a/src/sna/sna_io.c
+++ b/src/sna/sna_io.c
@@ -482,6 +482,49 @@ fallback:
sna->blt_state.fill_bo = 0;
}
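+/* Writing directly into an X-tiled bo from the CPU is only safe when
+ * the swizzling is predictable (no bit17 swizzling) and the mapping
+ * is coherent: either the bo is already in the CPU domain, or the
+ * machine has an LLC shared between CPU and GPU. Scanouts are
+ * excluded as they are kept uncached. */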
+static bool upload_inplace__tiled(struct kgem *kgem, struct kgem_bo *bo)
+{
+ if (kgem->gen < 50) /* bit17 swizzling :( */
+ return false;
+
+ if (bo->tiling != I915_TILING_X)
+ return false;
+
+ if (bo->scanout)
+ return false;
+
+ return bo->domain == DOMAIN_CPU || kgem->has_llc;
+}
+
+static bool
+write_boxes_inplace__tiled(struct kgem *kgem,
+ const uint8_t *src, int stride, int bpp, int16_t src_dx, int16_t src_dy,
+ struct kgem_bo *bo, int16_t dst_dx, int16_t dst_dy,
+ const BoxRec *box, int n)
+{
+ uint8_t *dst;
+ int swizzle;
+
+ assert(bo->tiling == I915_TILING_X);
+
+ dst = __kgem_bo_map__cpu(kgem, bo);
+ if (dst == NULL)
+ return false;
+
+ kgem_bo_sync__cpu(kgem, bo);
+ swizzle = kgem_bo_get_swizzling(kgem, bo);
+ do {
+ memcpy_to_tiled_x(src, dst, bpp, swizzle, stride, bo->pitch,
+ box->x1 + src_dx, box->y1 + src_dy,
+ box->x1 + dst_dx, box->y1 + dst_dy,
+ box->x2 - box->x1, box->y2 - box->y1);
+ box++;
+ } while (--n);
+ __kgem_bo_unmap__cpu(kgem, bo, dst);
+
+ return true;
+}
+
static bool write_boxes_inplace(struct kgem *kgem,
const void *src, int stride, int bpp, int16_t src_dx, int16_t src_dy,
struct kgem_bo *bo, int16_t dst_dx, int16_t dst_dy,
@@ -492,6 +535,11 @@ static bool write_boxes_inplace(struct kgem *kgem,
DBG(("%s x %d, handle=%d, tiling=%d\n",
__FUNCTION__, n, bo->handle, bo->tiling));
+ if (upload_inplace__tiled(kgem, bo) &&
+ write_boxes_inplace__tiled(kgem, src, stride, bpp, src_dx, src_dy,
+ bo, dst_dx, dst_dy, box, n))
+ return true;
+
if (!kgem_bo_can_map(kgem, bo))
return false;
@@ -539,7 +587,7 @@ static bool upload_inplace(struct kgem *kgem,
{
unsigned int bytes;
- if (!kgem_bo_can_map(kgem, bo))
+ if (!kgem_bo_can_map(kgem, bo) && !upload_inplace__tiled(kgem, bo))
return false;
if (FORCE_INPLACE)
@@ -871,8 +919,6 @@ write_boxes_inplace__xor(struct kgem *kgem,
const BoxRec *box, int n,
uint32_t and, uint32_t or)
{
- int dst_pitch = bo->pitch;
- int src_pitch = stride;
void *dst;
DBG(("%s x %d, tiling=%d\n", __FUNCTION__, n, bo->tiling));
@@ -888,10 +934,22 @@ write_boxes_inplace__xor(struct kgem *kgem,
box->x1 + src_dx, box->y1 + src_dy,
box->x1 + dst_dx, box->y1 + dst_dy,
box->x2 - box->x1, box->y2 - box->y1,
- bpp, src_pitch, dst_pitch));
+ bpp, stride, bo->pitch));
+
+ assert(box->x2 > box->x1);
+ assert(box->y2 > box->y1);
+
+ assert(box->x1 + dst_dx >= 0);
+ assert((box->x2 + dst_dx)*bpp <= 8*bo->pitch);
+ assert(box->y1 + dst_dy >= 0);
+ assert((box->y2 + dst_dy)*bo->pitch <= kgem_bo_size(bo));
+
+ assert(box->x1 + src_dx >= 0);
+ assert((box->x2 + src_dx)*bpp <= 8*stride);
+ assert(box->y1 + src_dy >= 0);
memcpy_xor(src, dst, bpp,
- src_pitch, dst_pitch,
+ stride, bo->pitch,
box->x1 + src_dx, box->y1 + src_dy,
box->x1 + dst_dx, box->y1 + dst_dy,
box->x2 - box->x1, box->y2 - box->y1,
@@ -1282,6 +1340,19 @@ bool sna_replace(struct sna *sna,
pixmap->drawable.bitsPerPixel,
bo->tiling, busy));
+ if (!busy && upload_inplace__tiled(kgem, bo)) {
+ BoxRec box;
+
+ box.x1 = box.y1 = 0;
+ box.x2 = pixmap->drawable.width;
+ box.y2 = pixmap->drawable.height;
+
+ if (write_boxes_inplace__tiled(kgem, src,
+ stride, pixmap->drawable.bitsPerPixel, 0, 0,
+ bo, 0, 0, &box, 1))
+ return true;
+ }
+
if ((busy || !kgem_bo_can_map(kgem, bo)) &&
indirect_replace(sna, pixmap, bo, src, stride))
return true;
@@ -1304,6 +1375,19 @@ bool sna_replace(struct sna *sna,
(pixmap->drawable.height-1)*stride + pixmap->drawable.width*pixmap->drawable.bitsPerPixel/8))
goto err;
} else {
+ if (upload_inplace__tiled(kgem, bo)) {
+ BoxRec box;
+
+ box.x1 = box.y1 = 0;
+ box.x2 = pixmap->drawable.width;
+ box.y2 = pixmap->drawable.height;
+
+ if (write_boxes_inplace__tiled(kgem, src,
+ stride, pixmap->drawable.bitsPerPixel, 0, 0,
+ bo, 0, 0, &box, 1))
+ goto done;
+ }
+
if (kgem_bo_is_mappable(kgem, bo)) {
dst = kgem_bo_map(kgem, bo);
if (!dst)
@@ -1330,6 +1414,7 @@ bool sna_replace(struct sna *sna,
}
}
+done:
if (bo != *_bo)
kgem_bo_destroy(kgem, *_bo);
*_bo = bo;