author     Chris Wilson <chris@chris-wilson.co.uk>   2012-09-11 21:48:24 +0100
committer  Chris Wilson <chris@chris-wilson.co.uk>   2012-09-21 11:56:16 +0100
commit     0be1d964713ca407f029278a8256d02d925dc9da (patch)
tree       d360eb12a9eed2b0938df9a5c5475da2bf82c1b2
parent     d853064e7eebc5719645c12605782f995131a6fe (diff)
sna: Use inplace X tiling for LLC uploads
Based on a suggestion by Chad Versace (taken from a patch for mesa).
This allows for a faster upload of pixel data through a ShmImage, or for
complete replacement of a GPU bo.
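[For illustration, the X-tiled addressing that the inplace path relies on
reduces to the arithmetic below. This is a minimal sketch with a
hypothetical helper name (tiled_x_offset), assuming the tile geometry the
patch uses (tiles of 512 bytes x 8 rows = 4096 bytes) and no bit-6
swizzling; the real copy loop is memcpy_to_tiled_x() in src/sna/blt.c
below.

    #include <stdint.h>

    /* Byte offset of pixel (x, y) in an X-tiled surface of the given
     * pitch (bytes per surface row) and cpp (bytes per pixel). */
    static uint32_t tiled_x_offset(uint32_t x, uint32_t y,
                                   uint32_t pitch, uint32_t cpp)
    {
            const uint32_t tile_width  = 512;  /* bytes per tile row */
            const uint32_t tile_height = 8;    /* rows per tile */
            const uint32_t tile_size   = 4096; /* tile_width * tile_height */
            const uint32_t stride_tiles = pitch / tile_width; /* tiles per row */
            const uint32_t xb = x * cpp;       /* x position in bytes */

            /* Start of the row of tiles containing y, plus the row
             * within that tile. */
            uint32_t tile_row = (y / tile_height) * stride_tiles * tile_size +
                                (y & (tile_height - 1)) * tile_width;

            /* Step over whole tiles, then index into the 512-byte tile row. */
            return tile_row +
                   (xb / tile_width) * tile_size +
                   (xb & (tile_width - 1));
    }
]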
Using a modified version of x11perf to upload to a pixmap rather than
scanout on an IVB i7-3720qm:
Before:
40000000 trep @ 0.0007 msec (1410000.0/sec): ShmPutImage 10x10 square
4000000 trep @ 0.0110 msec ( 90700.0/sec): ShmPutImage 100x100 square
160000 trep @ 0.1689 msec ( 5920.0/sec): ShmPutImage 500x500 square
After:
40000000 trep @ 0.0007 msec (1450000.0/sec): ShmPutImage 10x10 square
6000000 trep @ 0.0061 msec ( 164000.0/sec): ShmPutImage 100x100 square
400000 trep @ 0.1126 msec ( 8880.0/sec): ShmPutImage 500x500 square
However, the real takeaway from this is that the overheads for
ShmPutImage are substantial, hitting only around 70% of the expected
efficiency, and it is overshadowed by PutImage, which for reference is:
60000000 trep @ 0.0006 msec (1800000.0/sec): PutImage 10x10 square
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
-rw-r--r--  src/sna/blt.c    | 117
-rw-r--r--  src/sna/kgem.c   |  63
-rw-r--r--  src/sna/kgem.h   |  10
-rw-r--r--  src/sna/sna.h    |   6
-rw-r--r--  src/sna/sna_io.c |  95
5 files changed, 284 insertions(+), 7 deletions(-)
diff --git a/src/sna/blt.c b/src/sna/blt.c
index 853eb20d..4735d14c 100644
--- a/src/sna/blt.c
+++ b/src/sna/blt.c
@@ -214,6 +214,123 @@ memcpy_blt(const void *src, void *dst, int bpp,
 }
 
 void
+memcpy_to_tiled_x(const void *src, void *dst, int bpp, int swizzling,
+                  int32_t src_stride, int32_t dst_stride,
+                  int16_t src_x, int16_t src_y,
+                  int16_t dst_x, int16_t dst_y,
+                  uint16_t width, uint16_t height)
+{
+        const unsigned tile_width = 512;
+        const unsigned tile_height = 8;
+        const unsigned tile_size = 4096;
+
+        const unsigned cpp = bpp / 8;
+        const unsigned stride_tiles = dst_stride / tile_width;
+        const unsigned swizzle_pixels = (swizzling ? 64 : tile_width) / cpp;
+        const unsigned tile_pixels = ffs(tile_width / cpp) - 1;
+        const unsigned tile_mask = (1 << tile_pixels) - 1;
+
+        unsigned x, y;
+
+        DBG(("%s(bpp=%d, swizzling=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
+             __FUNCTION__, bpp, swizzling, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
+
+        src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
+
+        for (y = 0; y < height; ++y) {
+                const uint32_t dy = y + dst_y;
+                const uint32_t tile_row =
+                        (dy / tile_height * stride_tiles * tile_size +
+                         (dy & (tile_height-1)) * tile_width);
+                const uint8_t *src_row = (const uint8_t *)src + src_stride * y;
+                uint32_t dx = dst_x, offset;
+
+                x = width * cpp;
+                if (dx & (swizzle_pixels - 1)) {
+                        const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels);
+                        const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx;
+                        offset = tile_row +
+                                (dx >> tile_pixels) * tile_size +
+                                (dx & tile_mask) * cpp;
+                        switch (swizzling) {
+                        case I915_BIT_6_SWIZZLE_NONE:
+                                break;
+                        case I915_BIT_6_SWIZZLE_9:
+                                offset ^= (offset >> 3) & 64;
+                                break;
+                        case I915_BIT_6_SWIZZLE_9_10:
+                                offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
+                                break;
+                        case I915_BIT_6_SWIZZLE_9_11:
+                                offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
+                                break;
+                        }
+
+                        memcpy((char *)dst + offset, src_row, length * cpp);
+
+                        src_row += length * cpp;
+                        x -= length * cpp;
+                        dx += length;
+                }
+                if (swizzling) {
+                        while (x >= 64) {
+                                offset = tile_row +
+                                        (dx >> tile_pixels) * tile_size +
+                                        (dx & tile_mask) * cpp;
+                                switch (swizzling) {
+                                case I915_BIT_6_SWIZZLE_9:
+                                        offset ^= (offset >> 3) & 64;
+                                        break;
+                                case I915_BIT_6_SWIZZLE_9_10:
+                                        offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
+                                        break;
+                                case I915_BIT_6_SWIZZLE_9_11:
+                                        offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
+                                        break;
+                                }
+
+                                memcpy((char *)dst + offset, src_row, 64);
+
+                                src_row += 64;
+                                x -= 64;
+                                dx += swizzle_pixels;
+                        }
+                } else {
+                        while (x >= 512) {
+                                assert((dx & tile_mask) == 0);
+                                offset = tile_row + (dx >> tile_pixels) * tile_size;
+
+                                memcpy((char *)dst + offset, src_row, 512);
+
+                                src_row += 512;
+                                x -= 512;
+                                dx += swizzle_pixels;
+                        }
+                }
+                if (x) {
+                        offset = tile_row +
+                                (dx >> tile_pixels) * tile_size +
+                                (dx & tile_mask) * cpp;
+                        switch (swizzling) {
+                        case I915_BIT_6_SWIZZLE_NONE:
+                                break;
+                        case I915_BIT_6_SWIZZLE_9:
+                                offset ^= (offset >> 3) & 64;
+                                break;
+                        case I915_BIT_6_SWIZZLE_9_10:
+                                offset ^= ((offset ^ (offset >> 1)) >> 3) & 64;
+                                break;
+                        case I915_BIT_6_SWIZZLE_9_11:
+                                offset ^= ((offset ^ (offset >> 2)) >> 3) & 64;
+                                break;
+                        }
+
+                        memcpy((char *)dst + offset, src_row, x);
+                }
+        }
+}
+
+void
 memmove_box(const void *src, void *dst,
             int bpp, int32_t stride,
             const BoxRec *box,
diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index fc7c8811..0ea14f01 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -4082,6 +4082,56 @@ retry:
 	return (void *)(uintptr_t)mmap_arg.addr_ptr;
 }
 
+void *__kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo)
+{
+        struct drm_i915_gem_mmap mmap_arg;
+
+        DBG(("%s(handle=%d, size=%d, mapped? %d)\n",
+             __FUNCTION__, bo->handle, bytes(bo), (int)__MAP_TYPE(bo->map)));
+        assert(bo->refcnt);
+        assert(!bo->purged);
+        assert(list_is_empty(&bo->list));
+        assert(bo->proxy == NULL);
+
+        if (IS_CPU_MAP(bo->map))
+                return MAP(bo->map);
+
+retry:
+        VG_CLEAR(mmap_arg);
+        mmap_arg.handle = bo->handle;
+        mmap_arg.offset = 0;
+        mmap_arg.size = bytes(bo);
+        if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg)) {
+                ErrorF("%s: failed to mmap %d, %d bytes, into CPU domain: %d\n",
+                       __FUNCTION__, bo->handle, bytes(bo), errno);
+                if (__kgem_throttle_retire(kgem, 0))
+                        goto retry;
+
+                return NULL;
+        }
+
+        VG(VALGRIND_MAKE_MEM_DEFINED(mmap_arg.addr_ptr, bytes(bo)));
+        if (bo->map == NULL) {
+                DBG(("%s: caching CPU vma for %d\n", __FUNCTION__, bo->handle));
+                bo->map = MAKE_CPU_MAP(mmap_arg.addr_ptr);
+        }
+        return (void *)(uintptr_t)mmap_arg.addr_ptr;
+}
+
+void __kgem_bo_unmap__cpu(struct kgem *kgem, struct kgem_bo *bo, void *ptr)
+{
+        DBG(("%s(handle=%d, size=%d)\n",
+             __FUNCTION__, bo->handle, bytes(bo)));
+        assert(bo->refcnt);
+
+        if (IS_CPU_MAP(bo->map)) {
+                assert(ptr == MAP(bo->map));
+                return;
+        }
+
+        munmap(ptr, bytes(bo));
+}
+
 uint32_t kgem_bo_flink(struct kgem *kgem, struct kgem_bo *bo)
 {
 	struct drm_gem_flink flink;
@@ -4961,6 +5011,19 @@ void kgem_bo_set_binding(struct kgem_bo *bo, uint32_t format, uint16_t offset)
 	}
 }
 
+int kgem_bo_get_swizzling(struct kgem *kgem, struct kgem_bo *bo)
+{
+        struct drm_i915_gem_get_tiling tiling;
+
+        VG_CLEAR(tiling);
+        tiling.handle = bo->handle;
+        if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_GET_TILING, &tiling))
+                return 0;
+
+        assert(bo->tiling == tiling.tiling_mode);
+        return tiling.swizzle_mode;
+}
+
 struct kgem_bo *
 kgem_replace_bo(struct kgem *kgem,
                 struct kgem_bo *src,
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 832b3f06..cdbb7cbf 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -262,6 +262,7 @@ struct kgem_bo *kgem_create_cpu_2d(struct kgem *kgem,
 
 uint32_t kgem_bo_get_binding(struct kgem_bo *bo, uint32_t format);
 void kgem_bo_set_binding(struct kgem_bo *bo, uint32_t format, uint16_t offset);
+int kgem_bo_get_swizzling(struct kgem *kgem, struct kgem_bo *bo);
 
 void kgem_bo_retire(struct kgem *kgem, struct kgem_bo *bo);
 bool kgem_retire(struct kgem *kgem);
@@ -419,6 +420,8 @@ void kgem_bo_sync__gtt(struct kgem *kgem, struct kgem_bo *bo);
 void *kgem_bo_map__debug(struct kgem *kgem, struct kgem_bo *bo);
 void *kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo);
 void kgem_bo_sync__cpu(struct kgem *kgem, struct kgem_bo *bo);
+void *__kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo);
+void __kgem_bo_unmap__cpu(struct kgem *kgem, struct kgem_bo *bo, void *ptr);
 uint32_t kgem_bo_flink(struct kgem *kgem, struct kgem_bo *bo);
 
 bool kgem_bo_write(struct kgem *kgem, struct kgem_bo *bo,
@@ -494,7 +497,7 @@ static inline bool kgem_bo_is_mappable(struct kgem *kgem,
 	return bo->presumed_offset + kgem_bo_size(bo) <= kgem->aperture_mappable;
 }
 
-static inline bool kgem_bo_mapped(struct kgem_bo *bo)
+static inline bool kgem_bo_mapped(struct kgem *kgem, struct kgem_bo *bo)
 {
 	DBG(("%s: map=%p, tiling=%d, domain=%d\n",
 	     __FUNCTION__, bo->map, bo->tiling, bo->domain));
@@ -502,12 +505,15 @@ static inline bool kgem_bo_mapped(struct kgem_bo *bo)
 	if (bo->map == NULL)
 		return bo->tiling == I915_TILING_NONE && bo->domain == DOMAIN_CPU;
 
+	if (bo->tiling == I915_TILING_X && !bo->scanout && kgem->has_llc)
+		return IS_CPU_MAP(bo->map);
+
 	return IS_CPU_MAP(bo->map) == !bo->tiling;
 }
 
 static inline bool kgem_bo_can_map(struct kgem *kgem, struct kgem_bo *bo)
 {
-	if (kgem_bo_mapped(bo))
+	if (kgem_bo_mapped(kgem, bo))
 		return true;
 
 	if (!bo->tiling && kgem->has_llc)
diff --git a/src/sna/sna.h b/src/sna/sna.h
index 382c0a52..28dff6d2 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -764,6 +764,12 @@ memcpy_blt(const void *src, void *dst, int bpp,
 	   int16_t dst_x, int16_t dst_y,
 	   uint16_t width, uint16_t height);
 void
+memcpy_to_tiled_x(const void *src, void *dst, int bpp, int swizzling,
+                  int32_t src_stride, int32_t dst_stride,
+                  int16_t src_x, int16_t src_y,
+                  int16_t dst_x, int16_t dst_y,
+                  uint16_t width, uint16_t height);
+void
 memmove_box(const void *src, void *dst,
             int bpp, int32_t stride,
             const BoxRec *box,
diff --git a/src/sna/sna_io.c b/src/sna/sna_io.c
index a466f558..cdaadc01 100644
--- a/src/sna/sna_io.c
+++ b/src/sna/sna_io.c
@@ -482,6 +482,49 @@ fallback:
 	sna->blt_state.fill_bo = 0;
 }
 
+static bool upload_inplace__tiled(struct kgem *kgem, struct kgem_bo *bo)
+{
+        if (kgem->gen < 50) /* bit17 swizzling :( */
+                return false;
+
+        if (bo->tiling != I915_TILING_X)
+                return false;
+
+        if (bo->scanout)
+                return false;
+
+        return bo->domain == DOMAIN_CPU || kgem->has_llc;
+}
+
+static bool
+write_boxes_inplace__tiled(struct kgem *kgem,
+                           const uint8_t *src, int stride, int bpp, int16_t src_dx, int16_t src_dy,
+                           struct kgem_bo *bo, int16_t dst_dx, int16_t dst_dy,
+                           const BoxRec *box, int n)
+{
+        uint8_t *dst;
+        int swizzle;
+
+        assert(bo->tiling == I915_TILING_X);
+
+        dst = __kgem_bo_map__cpu(kgem, bo);
+        if (dst == NULL)
+                return false;
+
+        kgem_bo_sync__cpu(kgem, bo);
+        swizzle = kgem_bo_get_swizzling(kgem, bo);
+        do {
+                memcpy_to_tiled_x(src, dst, bpp, swizzle, stride, bo->pitch,
+                                  box->x1 + src_dx, box->y1 + src_dy,
+                                  box->x1 + dst_dx, box->y1 + dst_dy,
+                                  box->x2 - box->x1, box->y2 - box->y1);
+                box++;
+        } while (--n);
+        __kgem_bo_unmap__cpu(kgem, bo, dst);
+
+        return true;
+}
+
 static bool write_boxes_inplace(struct kgem *kgem,
                                 const void *src, int stride, int bpp, int16_t src_dx, int16_t src_dy,
                                 struct kgem_bo *bo, int16_t dst_dx, int16_t dst_dy,
@@ -492,6 +535,11 @@ static bool write_boxes_inplace(struct kgem *kgem,
 	DBG(("%s x %d, handle=%d, tiling=%d\n",
 	     __FUNCTION__, n, bo->handle, bo->tiling));
 
+	if (upload_inplace__tiled(kgem, bo) &&
+	    write_boxes_inplace__tiled(kgem, src, stride, bpp, src_dx, src_dy,
+	                               bo, dst_dx, dst_dy, box, n))
+		return true;
+
 	if (!kgem_bo_can_map(kgem, bo))
 		return false;
 
@@ -539,7 +587,7 @@ static bool upload_inplace(struct kgem *kgem,
 {
 	unsigned int bytes;
 
-	if (!kgem_bo_can_map(kgem, bo))
+	if (!kgem_bo_can_map(kgem, bo) && !upload_inplace__tiled(kgem, bo))
 		return false;
 
 	if (FORCE_INPLACE)
@@ -871,8 +919,6 @@ write_boxes_inplace__xor(struct kgem *kgem,
                          const BoxRec *box, int n,
                          uint32_t and, uint32_t or)
 {
-	int dst_pitch = bo->pitch;
-	int src_pitch = stride;
 	void *dst;
 
 	DBG(("%s x %d, tiling=%d\n", __FUNCTION__, n, bo->tiling));
@@ -888,10 +934,22 @@ write_boxes_inplace__xor(struct kgem *kgem,
 		     box->x1 + src_dx, box->y1 + src_dy,
 		     box->x1 + dst_dx, box->y1 + dst_dy,
 		     box->x2 - box->x1, box->y2 - box->y1,
-		     bpp, src_pitch, dst_pitch));
+		     bpp, stride, bo->pitch));
+
+		assert(box->x2 > box->x1);
+		assert(box->y2 > box->y1);
+
+		assert(box->x1 + dst_dx >= 0);
+		assert((box->x2 + dst_dx)*bpp <= 8*bo->pitch);
+		assert(box->y1 + dst_dy >= 0);
+		assert((box->y2 + dst_dy)*bo->pitch <= kgem_bo_size(bo));
+
+		assert(box->x1 + src_dx >= 0);
+		assert((box->x2 + src_dx)*bpp <= 8*stride);
+		assert(box->y1 + src_dy >= 0);
 
 		memcpy_xor(src, dst, bpp,
-			   src_pitch, dst_pitch,
+			   stride, bo->pitch,
 			   box->x1 + src_dx, box->y1 + src_dy,
 			   box->x1 + dst_dx, box->y1 + dst_dy,
 			   box->x2 - box->x1, box->y2 - box->y1,
@@ -1282,6 +1340,19 @@ bool sna_replace(struct sna *sna,
 	     pixmap->drawable.bitsPerPixel,
 	     bo->tiling, busy));
 
+	if (!busy && upload_inplace__tiled(kgem, bo)) {
+		BoxRec box;
+
+		box.x1 = box.y1 = 0;
+		box.x2 = pixmap->drawable.width;
+		box.y2 = pixmap->drawable.height;
+
+		if (write_boxes_inplace__tiled(kgem, src,
+		                               stride, pixmap->drawable.bitsPerPixel, 0, 0,
+		                               bo, 0, 0, &box, 1))
+			return true;
+	}
+
 	if ((busy || !kgem_bo_can_map(kgem, bo)) &&
 	    indirect_replace(sna, pixmap, bo, src, stride))
 		return true;
@@ -1304,6 +1375,19 @@ bool sna_replace(struct sna *sna,
 				   (pixmap->drawable.height-1)*stride + pixmap->drawable.width*pixmap->drawable.bitsPerPixel/8))
 			goto err;
 	} else {
+		if (upload_inplace__tiled(kgem, bo)) {
+			BoxRec box;
+
+			box.x1 = box.y1 = 0;
+			box.x2 = pixmap->drawable.width;
+			box.y2 = pixmap->drawable.height;
+
+			if (write_boxes_inplace__tiled(kgem, src,
+			                               stride, pixmap->drawable.bitsPerPixel, 0, 0,
+			                               bo, 0, 0, &box, 1))
+				goto done;
+		}
+
 		if (kgem_bo_is_mappable(kgem, bo)) {
 			dst = kgem_bo_map(kgem, bo);
 			if (!dst)
@@ -1330,6 +1414,7 @@ bool sna_replace(struct sna *sna,
 		}
 	}
 
+done:
 	if (bo != *_bo)
 		kgem_bo_destroy(kgem, *_bo);
 	*_bo = bo;
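[For reference, the swizzle cases repeated in memcpy_to_tiled_x() above
implement the i915 bit-6 address swizzles: bit 6 of the byte offset is
XORed with bit 9, bit 9^10, or bit 9^11, depending on the mode reported
by DRM_IOCTL_I915_GEM_GET_TILING. Below is a self-contained restatement
of that transform; swizzle_offset() is a hypothetical name, and the
I915_BIT_6_SWIZZLE_* constants come from the libdrm header i915_drm.h.

    #include <stdint.h>
    #include <i915_drm.h> /* I915_BIT_6_SWIZZLE_* */

    static uint32_t swizzle_offset(uint32_t offset, int mode)
    {
            switch (mode) {
            case I915_BIT_6_SWIZZLE_9:      /* bit6 ^= bit9 */
                    return offset ^ ((offset >> 3) & 64);
            case I915_BIT_6_SWIZZLE_9_10:   /* bit6 ^= bit9 ^ bit10 */
                    return offset ^ (((offset ^ (offset >> 1)) >> 3) & 64);
            case I915_BIT_6_SWIZZLE_9_11:   /* bit6 ^= bit9 ^ bit11 */
                    return offset ^ (((offset ^ (offset >> 2)) >> 3) & 64);
            default:                        /* I915_BIT_6_SWIZZLE_NONE */
                    return offset;
            }
    }

This is also why upload_inplace__tiled() rejects pre-Gen5 buffers ("bit17
swizzling"): there the swizzle can additionally depend on bit 17 of the
physical page address, which a CPU-side copy cannot reproduce.]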